In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Read the raw article table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='fake_news_dataset.csv')

In [3]:
# Quick sanity check: show the first five rows of the loaded frame.
print(df.head(n=5))

                                  title  \
0               Foreign Democrat final.   
1   To offer down resource great point.   
2          Himself church myself carry.   
3                  You unit its should.   
4  Billion believe employee summer how.   

                                                text        date    source  \
0  more tax development both store agreement lawy...  2023-03-10  NY Times   
1  probably guess western behind likely next inve...  2022-05-25  Fox News   
2  them identify forward present success risk sev...  2022-09-01       CNN   
3  phone which item yard Republican safe where po...  2023-02-07   Reuters   
4  wonder myself fact difficult course forget exa...  2023-04-03       CNN   

                 author    category label  
0          Paula George    Politics  real  
1           Joseph Hill    Politics  fake  
2        Julia Robinson    Business  fake  
3  Mr. David Foster DDS     Science  fake  
4         Austin Walker  Technology  fake  


In [4]:
# Build one text field per article from its headline and body.
# Missing titles/texts are treated as empty strings: concatenating a string
# with NaN yields NaN, which would throw away the other field and leave a
# non-string document for the vectorizer downstream.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

In [5]:
# Features are the combined article text; the target is the label column.
X, y = df['content'], df['label']

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [9]:
# TF-IDF features: English stop words are removed and terms that occur in
# more than 70% of the training documents are ignored.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary/IDF weights from the training split only, then reuse
# the fitted transformer on the held-out split.
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)

In [12]:
# Drop training rows whose label is missing, then fit the classifier.
# NOTE(review): only the training split is filtered here; y_test is passed
# unfiltered to the evaluation cells — confirm it has no missing labels.
nan_indices = y_train.isna()
X_train_cleaned = X_train[~nan_indices]
y_train_cleaned = y_train[~nan_indices]

# tfidf_train already holds the TF-IDF rows for X_train in the same row
# order, so pick the labelled rows from it instead of vectorizing the
# training text a second time.
tfidf_train_cleaned = tfidf_train[(~nan_indices).to_numpy()]

# Fit the model with the cleaned data
model = LogisticRegression()
model.fit(tfidf_train_cleaned, y_train_cleaned)

In [13]:
# Predict a label for every held-out article.
y_pred = model.predict(X=tfidf_test)

In [14]:
# Print each evaluation summary for the held-out predictions, one after the
# other: overall accuracy, the per-class report, then the confusion matrix.
for heading, metric_fn in (
    ("\nAccuracy Score:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric_fn(y_test, y_pred))


Accuracy Score: 0.5021398002853067

Classification Report:
               precision    recall  f1-score   support

        fake       0.52      0.51      0.51      1440
        real       0.49      0.49      0.49      1364

    accuracy                           0.50      2804
   macro avg       0.50      0.50      0.50      2804
weighted avg       0.50      0.50      0.50      2804


Confusion Matrix:
 [[735 705]
 [691 673]]
