In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib


In [3]:
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

In [4]:
true['label'] = 1
fake['label'] = 0

In [5]:
news = pd.concat([fake, true], axis=0)

In [6]:
news = news.sample(frac=1).reset_index(drop=True)

In [7]:
news['content'] = news['title'] + " " + news['text']

In [8]:
def wordopt(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', '', text)
    text = re.sub(r'\n', ' ', text)
    return text

In [9]:
news['content'] = news['content'].apply(wordopt)

In [10]:
x = news['content']
y = news['label']


In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)


In [12]:
vectorization = TfidfVectorizer()
x_train = vectorization.fit_transform(x_train)
x_test = vectorization.transform(x_test)


In [13]:
logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)
joblib.dump(logistic_model, 'logistic_model.pkl')

['logistic_model.pkl']

In [14]:
dtc = DecisionTreeClassifier()
dtc_params = {'max_depth': [10, 50, 100], 'min_samples_split': [2, 10, 20]}
dtc_grid = GridSearchCV(dtc, dtc_params, cv=5)
dtc_grid.fit(x_train, y_train)
joblib.dump(dtc_grid.best_estimator_, 'dtc_model.pkl')

['dtc_model.pkl']

In [15]:
rfc = RandomForestClassifier()
rfc_params = {'n_estimators': [100, 200], 'max_depth': [10, 50], 'min_samples_split': [2, 10]}
rfc_grid = GridSearchCV(rfc, rfc_params, cv=5)
rfc_grid.fit(x_train, y_train)
joblib.dump(rfc_grid.best_estimator_, 'rfc_model.pkl')

['rfc_model.pkl']

In [16]:
joblib.dump(vectorization, 'vectorizer.pkl')


['vectorizer.pkl']

In [17]:
models = {
    'Logistic Regression': logistic_model,
    'Decision Tree': dtc_grid.best_estimator_,
    'Random Forest': rfc_grid.best_estimator_
}

In [18]:
for model_name, model in models.items():
    y_pred = model.predict(x_test)
    print(f"{model_name} Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.9892204899777283
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5910
           1       0.99      0.99      0.99      5315

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225

Decision Tree Accuracy: 0.9962583518930958
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5910
           1       1.00      1.00      1.00      5315

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225

Random Forest Accuracy: 0.9906458797327394
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5910
           1       0.99      0.99      0.99      5315

    accuracy                           0.99    