In [None]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
# Read the labelled news dataset from the working directory and display it.
data = pd.read_csv(filepath_or_buffer='fake.csv')
data

Unnamed: 0.1,Unnamed: 0,author,title,label,article
0,295,Brian J. O’Connor,Panic in the Parking Lot for Cash,REAL,Low interest rates and paltry yields prompt mo...
1,3383,Monique Curet,"""British data show the COVID shots are an abys...",FAKE,
2,5184,Daniel Funke,"""We won. We won in a landslide. This was a lan...",FAKE,
3,875,Adam Nossiter,"Khaled Nezzar, General at Center of Algeria’s ...",REAL,"In the 1990s, he oversaw troops that committed..."
4,531,Rory Smith,The Best Player in the Premier League? Look De...,REAL,Rodri’s genius is not in making things happen....
...,...,...,...,...,...
26519,1101,"Brent McDonald, Miguel Soffia and Kristen Will...",‘No Water in the Hydrants’: Communities Left D...,REAL,"Weeks after Chile’s deadliest wildfire, some f..."
26520,5556,Tom Kertscher,"The coronavirus ""was constructed in the Wuhan ...",FAKE,
26521,882,Tom Kertscher,“Skin cancer is a relatively new phenomenon in...,FAKE,
26522,2591,Yacob Reyes,"""Ron DeSantis still hasn't condemned the Jan. ...",FAKE,


In [None]:
# Class distribution: number of rows carrying each label.
data.loc[:, 'label'].value_counts()

FAKE    14424
REAL    12100
Name: label, dtype: int64

In [None]:
# Downsample whichever class is larger so that both labels end up with the
# same number of rows. The result is stored in `df_balanced`; `data` itself
# is left untouched.
# NOTE(review): the later cells in this notebook train on `data`, not on
# `df_balanced` -- confirm whether the balanced frame was meant to be used.
num_real = data[data['label'] == 'REAL'].shape[0]
num_fake = data[data['label'] == 'FAKE'].shape[0]

# Pick the over-represented label. On a tie the excess is 0, so nothing is
# removed regardless of which label is chosen here.
majority_label = 'FAKE' if num_fake > num_real else 'REAL'
excess = abs(num_fake - num_real)

# Fixed seed so the same rows are dropped on every run.
indices_to_remove = (
    data[data['label'] == majority_label]
    .sample(n=excess, random_state=42)
    .index
)

# Always define df_balanced (previously it only existed when FAKE > REAL).
df_balanced = data.drop(index=indices_to_remove)

# Both labels should now report the same count.
print(df_balanced['label'].value_counts())

REAL    12100
FAKE    12100
Name: label, dtype: int64


In [None]:
# Drop the columns the models below do not use.
# errors='ignore' keeps this cell re-runnable: `data` is rebound here, so a
# second execution would otherwise raise KeyError because the columns are
# already gone.
data = data.drop(columns=['article', 'Unnamed: 0'], errors='ignore')
data

Unnamed: 0,author,title,label
0,Brian J. O’Connor,Panic in the Parking Lot for Cash,REAL
1,Monique Curet,"""British data show the COVID shots are an abys...",FAKE
2,Daniel Funke,"""We won. We won in a landslide. This was a lan...",FAKE
3,Adam Nossiter,"Khaled Nezzar, General at Center of Algeria’s ...",REAL
4,Rory Smith,The Best Player in the Premier League? Look De...,REAL
...,...,...,...
26519,"Brent McDonald, Miguel Soffia and Kristen Will...",‘No Water in the Hydrants’: Communities Left D...,REAL
26520,Tom Kertscher,"The coronavirus ""was constructed in the Wuhan ...",FAKE
26521,Tom Kertscher,“Skin cancer is a relatively new phenomenon in...,FAKE
26522,Yacob Reyes,"""Ron DeSantis still hasn't condemned the Jan. ...",FAKE


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Hold out 25% of the rows for testing. The seed is fixed so the split (and
# therefore the fitted vocabulary and model) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data[['author', 'title']], data.label, test_size=0.25, random_state=42
)

# Bag-of-words features over "<author> <title>", with English stop words removed.
# The vectorizer is fitted on the training rows only.
VC = CountVectorizer(stop_words='english')
X_train_vc = VC.fit_transform(X_train['author']+' '+X_train['title'])

# Multinomial Naive Bayes on the token counts.
model = MultinomialNB()
model.fit(X_train_vc, y_train)

# Transform the held-out rows with the already-fitted vectorizer and predict.
X_test_vc = VC.transform(X_test['author']+' '+X_test['title'])
y_pred = model.predict(X_test_vc)
y_pred

array(['FAKE', 'REAL', 'FAKE', ..., 'REAL', 'FAKE', 'FAKE'], dtype='<U4')

In [None]:
import joblib
import json

# Persist the fitted vectorizer and the Naive Bayes model as joblib files.
for artifact, artifact_path in (
    (VC, 'model_nb_vocab.joblib'),
    (model, 'multinomial_nb_model.joblib'),
):
    joblib.dump(artifact, artifact_path)

# Also export the vectorizer's `vocabulary_` mapping as plain JSON.
with open('vocabulary.json', 'w') as vocab_file:
    json.dump(VC.vocabulary_, vocab_file)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Title-only experiment. The seed is fixed so the split is the same on every run.
# NOTE: this cell rebinds X_train / X_test / y_train / y_test / y_pred and `VC`.
# From here on `VC` is the title-only vectorizer used with `rfc`, not the
# author+title vectorizer that the Naive Bayes `model` was trained with.
X_train, X_test, y_train, y_test = train_test_split(
    data.title, data.label, test_size=0.25, random_state=42
)

# Bag-of-words features over the title text, fitted on the training rows only.
VC = CountVectorizer(stop_words='english')
X_train_vc = VC.fit_transform(X_train.values)

# Seed the forest as well: tree construction is randomized.
rfc = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=42)
rfc.fit(X_train_vc, y_train)

# Transform the held-out titles with the already-fitted vectorizer and predict.
X_test_vc = VC.transform(X_test.values)
y_pred = rfc.predict(X_test_vc)
y_pred

array(['REAL', 'REAL', 'FAKE', ..., 'REAL', 'REAL', 'FAKE'], dtype=object)

In [None]:
import joblib

author = 'Madeline Halpert & Brandon Drenon '
title = "Nathan Wade, the special prosecutor in Donald Trump's Georgia election interference case, has resigned after a judge said his affair with District Attorney Fani Willis was inappropriate."
news = author+' '+title

# `model` is the Naive Bayes classifier trained on author+title features, but
# by this point `VC` has been re-fitted on titles only for the random forest,
# so its columns no longer line up with what `model` expects. Reload the
# vectorizer that was saved together with the NB model instead.
# (joblib.load unpickles the file -- only load files this notebook wrote.)
nb_vectorizer = joblib.load('model_nb_vocab.joblib')
user = nb_vectorizer.transform([news])
model.predict(user)

array(['FAKE'], dtype='<U4')

In [None]:

# Hand-written headlines to probe the title-only random forest.
title = [
    "White House Announces Plans for Diplomatic Talks with Russia to Address Escalating Tensions",
    "Congressional Committee Launches Investigation into Allegations of Government Corruption",
    "Supreme Court Nominee Faces Senate Confirmation Hearings Amid Partisan Debate",
    "Senate Passes Bipartisan Bill to Strengthen Cybersecurity Measures and Protect Against Foreign Threats",
    "President Signs Executive Order on Climate Change, Pledges to Rejoin Paris Agreement",
]

# Vectorize and classify one headline at a time, printing its position and label.
for i, headline in enumerate(title):
    user = VC.transform([headline])
    print(i, rfc.predict(user))

0 ['REAL']
1 ['FAKE']
2 ['FAKE']
3 ['FAKE']
4 ['FAKE']


In [None]:
import joblib

author = ['Jeff Cercone', 'Maria Ramirez Uribe', 'Hope Karnopp']
title = ["The Chinese government “owns” TikTok’s parent company, ByteDance.",
         "Biden has implemented a formal policy that illegal aliens who intrude into the United States are granted immunity from deportation.",
         "Since the end of the 2020 fiscal year, President Biden and Senator Baldwin have added over $7.3 trillion of debt, more than the first 228 years of our nation’s history combined."]

# `model` is the Naive Bayes classifier trained on author+title features, but
# `VC` has since been re-fitted on titles only for the random forest. Use the
# vectorizer that was saved together with the NB model so the feature columns
# match. (joblib.load unpickles the file -- only load files this notebook wrote.)
nb_vectorizer = joblib.load('model_nb_vocab.joblib')

# Classify each "<author> <title>" string and print its position and label.
for i, (writer, headline) in enumerate(zip(author, title)):
    news = writer+' '+headline
    user = nb_vectorizer.transform([news])
    print(i, model.predict(user))

0 ['FAKE']
1 ['FAKE']
2 ['FAKE']


In [None]:
import joblib

author = "Raja Abdulrahim and Ameera Harouda"
title = "Witnesses Describe Fear and Deprivation at Besieged Hospital in Gaza"

# `model` is the Naive Bayes classifier trained on author+title features, but
# `VC` has since been re-fitted on titles only for the random forest. Use the
# vectorizer that was saved together with the NB model so the feature columns
# match. (joblib.load unpickles the file -- only load files this notebook wrote.)
nb_vectorizer = joblib.load('model_nb_vocab.joblib')
news = nb_vectorizer.transform([author+' '+title])
model.predict(news)

array(['FAKE'], dtype='<U4')

In [None]:
import joblib
# Persist the fitted random forest; the cell output is the value joblib.dump returns.
rf_model_path = 'Fake_News_model.pkl'
joblib.dump(rfc, rf_model_path)

['Fake_News_model.pkl']

In [None]:
import joblib
import json
import numpy as np

# Export the `vocabulary_` mapping of whichever vectorizer `VC` currently
# refers to (the title-only one when the cells are run top to bottom).
with open('count_vectorizer_vocabulary.json', 'w') as vocab_file:
    json.dump(VC.vocabulary_, vocab_file)

# Export the hyper-parameters the random forest was configured with.
model_params = dict(
    n_estimators=rfc.n_estimators,
    max_depth=rfc.max_depth,
    # Add other parameters if needed
)
with open('random_forest_classifier_params.json', 'w') as params_file:
    json.dump(model_params, params_file)

# Convert NumPy objects to built-in Python types for JSON serialization
def convert_numpy_arrays(obj):
    """Fallback converter for ``json.dump(..., default=convert_numpy_arrays)``.

    Parameters
    ----------
    obj : object
        A value the JSON encoder could not serialize by itself.

    Returns
    -------
    list or scalar
        ``obj.tolist()`` for a NumPy array, ``obj.item()`` for a NumPy scalar.

    Raises
    ------
    TypeError
        For any other type. A ``default`` hook must raise rather than hand the
        same object back: returning it unchanged makes ``json`` fail with a
        misleading "Circular reference detected" ValueError.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # NumPy scalars (np.int64, np.float32, ...) are not JSON serializable as-is.
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

# Collect the raw state of every fitted tree in the forest and write the
# whole list to a single JSON file; NumPy arrays inside each state are
# converted through convert_numpy_arrays.
decision_trees_serializable = [
    fitted_tree.tree_.__getstate__() for fitted_tree in rfc.estimators_
]
with open('random_forest_decision_trees.json', 'w') as trees_file:
    json.dump(decision_trees_serializable, trees_file, default=convert_numpy_arrays)
