In [1]:
import pandas as pd
import numpy as np
import nltk
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
!pip install nltk
nltk.download('stopwords')




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from google.colab import files
uploaded = files.upload()


Saving Fake.csv to Fake.csv
Saving True.csv to True.csv


In [4]:
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

fake['label'] = 0   # 0 for fake news
true['label'] = 1   # 1 for real news

# Combine them
df = pd.concat([fake, true])
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle the rows


In [5]:
df.head()


Unnamed: 0,title,text,subject,date,label
0,STUNNING DEVELOPMENT: OBAMA GIVES $75 MILLION ...,You have got to be kidding me! After all the t...,politics,"Sep 17, 2016",0
1,White House says opposes House bill on restaur...,WASHINGTON (Reuters) - The White House said on...,politicsNews,"February 10, 2016",1
2,BREAKING: At Least 14 US Coalition Military Of...,Syrian Army soldier holds up Al Nusra Front (...,Middle-east,"December 16, 2016",0
3,Unclear if anyone will attend Mugabe cabinet m...,HARARE (Reuters) - Zimbabwe s information mini...,worldnews,"November 21, 2017",1
4,Ex-Catalan leader urges unity as window for se...,MADRID (Reuters) - Catalonia s deposed leader ...,worldnews,"November 7, 2017",1


In [6]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


In [7]:
def clean_text(text):
    text = text.lower()  # lowercase
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    words = text.split()  # tokenize
    words = [word for word in words if word not in stop_words]  # remove stopwords
    return ' '.join(words)


In [8]:
df['text'] = df['text'].apply(clean_text)


In [9]:
df[['text', 'label']].head()


Unnamed: 0,text,label
0,got kidding times trump attacked democratic si...,0
1,washington reuters white house said wednesday ...,1
2,syrian army soldier holds al nusra front al qa...,0
3,harare reuters zimbabwe information minister s...,1
4,madrid reuters catalonia deposed leader carles...,1


In [10]:
tfidf = TfidfVectorizer(max_features=5000)  # You can increase limit later if needed
X = tfidf.fit_transform(df['text']).toarray()
y = df['label']


In [11]:
print("TF-IDF matrix shape:", X.shape)
print("Sample label array:", y[:5])


TF-IDF matrix shape: (44898, 5000)
Sample label array: 0    0
1    1
2    0
3    1
4    1
Name: label, dtype: int64


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [13]:
model = MultinomialNB()
model.fit(X_train, y_train)


In [14]:
y_pred = model.predict(X_test)


In [15]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9457683741648107

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.94      0.95      4590
           1       0.93      0.96      0.95      4390

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980


Confusion Matrix:
 [[4297  293]
 [ 194 4196]]


In [16]:
import pickle

# Save model
with open('news_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save TF-IDF vectorizer
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)


In [19]:
from google.colab import files

files.download('news_model.pkl')
files.download('tfidf.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
df[df['label'] == 0]['text'].sample(1).values[0]


'religion peace tolerance strikes shopkeeper murdered fellow muslim wished christian friends peaceful easterasad shah stabbed 30 times shop praised life jesus beloved christian nation left lying pool blood 40yearold died hospital police questioning 32yearold suspect last night said killing religiously motivatedmohammad faisal family friend said bearded muslim wearing long religious robe entered mr shah shop spoke native language stabbing head kitchen knifemr shah brother working next door rushed find killer laughing sitting glasgow newsagent bleeding chest brother dragged mr shah away guy continued attacking blade said mr faisal struggled bus stop asad collapsed clearcut revenge attack posting messages peace messages greeting fellow christians jews video asad shah posted inspired brutal murder fellow muslim death mr shah wished friends good friday happy easter especially beloved christian nation final post wrote let follow real footstep beloved holy jesus christ get real success worlds

In [21]:
df[df['label'] == 1]['text'].sample(1).values[0]


'beijing reuters china said tuesday would take actions defend interests united states damages trade ties us president donald trump authorized inquiry china’s alleged theft intellectual property nl2n1l01g0 united states respect objective facts act prudently destroy principles multilateralism china’s ministry commerce said statement'