In [1]:
import pandas as pd

fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

# Add labels
fake['label'] = 0  # 0 = fake
real['label'] = 1  # 1 = real

# Combine the datasets
data = pd.concat([fake, real])

# Shuffle data
data = data.sample(frac=1).reset_index(drop=True)

# Check data
print(data.head())


                                               title  \
0  TEACHER QUITS JOB After 5th, 6th Grade Muslim ...   
1  NC TEACHER SIGNS UP FIRST GRADERS For Black Li...   
2  WATCH: TRANS ARTIST DEMONSTRATES How She Colle...   
3   London’s New Muslim Mayor Has Some VERY Choic...   
4  Top Democrat says not clear if Americans helpe...   

                                                text       subject  \
0  You re never to young to commit jihad Teachers...     left-news   
1   This so-called movement is out of control and...     left-news   
2  So much hate. So much anger. So much time on t...     left-news   
3  While the United States has as one of its pres...          News   
4  WASHINGTON (Reuters) - U.S. Representative Ada...  politicsNews   

              date  label  
0      May 9, 2017      0  
1     Mar 17, 2016      0  
2     Oct 21, 2017      0  
3     May 13, 2016      0  
4  March 20, 2017       1  


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()  # Lowercase
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]  # Remove stopwords and stem
    return ' '.join(words)

data['text'] = data['title'] + " " + data['text']  # Combine title and article
data['text'] = data['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['text']).toarray()
y = data['label']
print(X.shape)      # Shows (number of samples, 5000 features)
print(X[:2])        # Shows first 2 rows of TF-IDF matrix
print(y[:10])  

(44898, 5000)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
0    0
1    0
2    0
3    0
4    1
5    0
6    1
7    0
8    1
9    1
Name: label, dtype: int64


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Try one model
model = LogisticRegression()
# model = MultinomialNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9902004454342984
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4640
           1       0.99      0.99      0.99      4340

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [9]:
import pickle

# Save model and vectorizer
pickle.dump(model, open('model.pkl', 'wb'))
pickle.dump(tfidf, open('vectorizer.pkl', 'wb'))
print("✅ Model and vectorizer saved successfully!")

✅ Model and vectorizer saved successfully!


In [10]:
import os

print("Files in current directory:")
print(os.listdir())


Files in current directory:
['.anaconda', '.conda', '.condarc', '.continuum', '.ipynb_checkpoints', '.ipython', '.jupyter', '.matplotlib', '.vscode', '3D Objects', 'anaconda3', 'AppData', 'Application Data', 'breast-cancer.csv', 'Contacts', 'Cookies', 'data.csv', 'Desktop', 'Documents', 'Downloads', 'Fake.csv', 'Favorites', 'heart.csv', 'heart_decision_tree', 'IntelGraphicsProfiles', 'Iris.csv', 'iris_tree', 'Links', 'Local Settings', 'Mall_Customers.csv', 'model.pkl', 'Music', 'My Documents', 'NetHood', 'NTUSER.DAT', 'ntuser.dat.LOG1', 'ntuser.dat.LOG2', 'NTUSER.DAT{c7bd9da4-51c2-11f0-8aaa-f1434417f81f}.TxR.0.regtrans-ms', 'NTUSER.DAT{c7bd9da4-51c2-11f0-8aaa-f1434417f81f}.TxR.1.regtrans-ms', 'NTUSER.DAT{c7bd9da4-51c2-11f0-8aaa-f1434417f81f}.TxR.2.regtrans-ms', 'NTUSER.DAT{c7bd9da4-51c2-11f0-8aaa-f1434417f81f}.TxR.blf', 'NTUSER.DAT{c7bd9da5-51c2-11f0-8aaa-f1434417f81f}.TM.blf', 'NTUSER.DAT{c7bd9da5-51c2-11f0-8aaa-f1434417f81f}.TMContainer00000000000000000001.regtrans-ms', 'NTUSER.DAT{c