In [31]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [32]:
# combine and label data

import pandas as pd

df_fake = pd.read_csv("./data/Fake.csv")
df_real = pd.read_csv("./data/True.csv")

df_fake['label'] = 1  # fake
df_real['label'] = 0  # real

df = pd.concat([df_fake, df_real], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle

In [33]:
# look at data

df.head()
df['label'].value_counts()


label
1    23481
0    21417
Name: count, dtype: int64

In [34]:
!pip install nltk



In [35]:
# clean text; focus on 'text'

import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)  # remove punctuation
    return text

df['cleaned'] = df['text'].apply(clean_text)


In [36]:
!pip install scikit-learn



In [37]:
# vectorize text!

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(df['cleaned'])
y = df['label']


In [38]:
# split into testing and training sets (20% for testing)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)


In [39]:
# create instance of model, then train it

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [40]:
# evaluate model using a confusion matrix

from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[4219   51]
 [  89 4621]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      4270
           1       0.99      0.98      0.99      4710

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



In [41]:
# compare with RandomForestClassifier and MultinomialNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Create models
rf_model = RandomForestClassifier(random_state=42)
nb_model = MultinomialNB()

# Train models
rf_model.fit(X_train, y_train)
nb_model.fit(X_train, y_train)

# Predict on test data
rf_pred = rf_model.predict(X_test)
nb_pred = nb_model.predict(X_test)

# Evaluate Random Forest
print("Random Forest Classifier:")
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test, rf_pred))

# Evaluate MultinomialNB
print("Multinomial Naive Bayes:")
print(confusion_matrix(y_test, nb_pred))
print(classification_report(y_test, nb_pred))


Random Forest Classifier:
[[4257   13]
 [  12 4698]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4270
           1       1.00      1.00      1.00      4710

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

Multinomial Naive Bayes:
[[3929  341]
 [ 319 4391]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      4270
           1       0.93      0.93      0.93      4710

    accuracy                           0.93      8980
   macro avg       0.93      0.93      0.93      8980
weighted avg       0.93      0.93      0.93      8980

