In [None]:
import pandas as pd

# Locations of the two source CSVs, relative to the notebook's working directory.
FAKE_CSV = '../data/archive/Fake.csv'
TRUE_CSV = '../data/archive/True.csv'

# Read each file and tag every row with its class in a 'label' column.
df_fake = pd.read_csv(FAKE_CSV).assign(label="FAKE")
df_true = pd.read_csv(TRUE_CSV).assign(label="TRUE")

# Stack the two frames (fake rows first) under a fresh 0..n-1 index.
df = pd.concat([df_fake, df_true], ignore_index=True)

# Report the combined dimensions, then preview the first ten rows.
print("Shape:", df.shape)
df.head(10)


In [None]:
# Print a structural summary of the combined frame (columns, non-null counts, dtypes)
# to spot missing values before any text processing.
df.info()

In [None]:
# Count the rows per class ("FAKE" vs "TRUE") to check how balanced the dataset is.
df['label'].value_counts()

In [None]:
# Combine title and text into a single column.
# Missing titles/bodies are replaced with '' first: concatenating a string with
# NaN yields NaN, and a NaN entry would later crash the text cleaning step,
# which calls .lower() on every value.
df['combined_text'] = df['title'].fillna('') + " " + df['text'].fillna('')

# Check the result alongside the class label.
df[['combined_text', 'label']].head()

In [None]:
import re

#now we'll just lowercase all of the text and remove the numbers and punctuation

def clean_text(text):
    """Normalize one document for vectorization.

    The text is lowercased and every character that is not an ASCII letter
    (a-z) or whitespace is deleted. Removed characters are dropped, not
    replaced by a space, so "u.s." becomes "us".

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if isinstance(text, float) and text != text:
            return ""
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove everything except letters and whitespace
    return text

# Run the cleaner over every combined document and keep the result in its own column.
df['clean_text'] = df['combined_text'].map(clean_text)

# Preview the cleaned documents next to their labels.
df[['clean_text', 'label']].head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the vocabulary size used for the TF-IDF representation.
MAX_TFIDF_FEATURES = 5000

# Target variable: the "FAKE"/"TRUE" class of each row.
y = df['label']

# Vectorizer with English stop words removed and a capped vocabulary.
tfidf = TfidfVectorizer(stop_words='english', max_features=MAX_TFIDF_FEATURES)

# Learn the vocabulary/IDF weights and build the document-term matrix in one pass.
# NOTE(review): this fits on the *entire* corpus, while the train/test split
# happens in a later cell, so the IDF statistics also see the test rows.
# Consider splitting first and fitting on the training portion only.
corpus = df['clean_text']
X = tfidf.fit_transform(corpus)

In [None]:
# NOW WE'LL WORK ON TRAINING THE DATASET

# Split the dataset into train (80%) and test (20%) sets.
from sklearn.model_selection import train_test_split

# stratify=y keeps the FAKE/TRUE proportions identical in both partitions, so
# the held-out metrics are not skewed by an uneven class mix; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Logistic regression classifier for the TF-IDF features
from sklearn.linear_model import LogisticRegression

# Build the estimator with its default settings and fit it on the training
# split in one step (fit returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)
model  # echo the fitted estimator as the cell output

In [None]:
# Score the fitted model on the held-out test split
from sklearn.metrics import accuracy_score, classification_report

# Predicted class for every test document
y_pred = model.predict(X_test)

# Overall accuracy plus the per-class precision/recall/F1 table
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Use the classifier's own class ordering for both the counts and the tick labels,
# so rows/columns of the matrix line up with the displayed names.
class_labels = model.classes_

# Tally true-vs-predicted classes on the test split
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Render the matrix as a heatmap with integer cell annotations
# (plot() returns the display object, so it can be chained).
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=class_labels,
).plot(cmap='Blues', values_format='d')
plt.title("Confusion Matrix")
plt.show()