In [1]:
import pandas as pd
import re
import string
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from utils import clean_text

In [2]:
fake = pd.read_csv("dataset/fake.csv")
true = pd.read_csv("dataset/true.csv")

In [3]:
fake["label"] = 0
true["label"] = 1

df = pd.concat([fake, true])
df = df.sample(frac=1).reset_index(drop=True)
df["text"] = df["text"].apply(clean_text)

X = df["text"]
y = df["label"]

In [4]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.7)),
    ('clf', LogisticRegression(solver='liblinear'))
])

In [5]:
params = {
    'clf__C': [0.1, 1, 10],
    'tfidf__ngram_range': [(1,1), (1,2)]
}

grid = GridSearchCV(pipeline, params, cv=5, scoring='accuracy', verbose=1)
grid.fit(X, y)

print("Best Parameters:", grid.best_params_)
print(classification_report(y, grid.predict(X)))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'clf__C': 10, 'tfidf__ngram_range': (1, 1)}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23481
           1       1.00      1.00      1.00     21417

    accuracy                           1.00     44898
   macro avg       1.00      1.00      1.00     44898
weighted avg       1.00      1.00      1.00     44898



In [6]:
with open("model/fake_news_model.pkl", "wb") as f:
    pickle.dump(grid.best_estimator_, f)

with open("model/vectorizer.pkl", "wb") as f:
    pickle.dump(grid.best_estimator_.named_steps['tfidf'], f)