<a href="https://colab.research.google.com/github/kv772/Accenture1D_AIStudio/blob/main/Accenture_1D_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Accenture 1D: Content Moderation

In [None]:
# Mandatory Installs
# Used for language analysis
%pip install langdetect
%pip install wordfreq

In [None]:
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from multiprocessing import Pool
from wordfreq import zipf_frequency
from multiprocessing import Pool
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import sys, subprocess
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# File Upload

In [None]:
# Connect Google Colab and Drive: mount the Drive under /content/drive so the
# dataset folder referenced below (a MyDrive path) is readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder on the mounted Drive from which True.csv and Fake.csv are read
datasets = "/content/drive/MyDrive/BTT Accenture 1D!/Fake News Detection Datasets"

In [None]:
# Read both CSVs from the dataset folder, then report each frame's (rows, columns)
true_df, fake_df = (pd.read_csv(f"{datasets}/{stem}.csv") for stem in ("True", "Fake"))
for frame in (true_df, fake_df):
    print(frame.shape)

# Exploratory Data Analysis

## Location Analysis (Adriena Jiang)

**What is this about:** This analysis is checking the number of articles (in both datasets) that contain a location (specifically a city or a country).

**What I found:** 19588/21417 true articles contain a location and 18862/23481 fake articles contain a location.

**Why this matters:** Initially, as a team, we thought that location mentions might cause our model to overfit, because locations seemed to always be attached to Reuters mentions. It turns out that most articles in both datasets contain a location.

**What I suggest**: I suggest we keep locations. Location mentions are throughout both data sets so there is no need to clean them out.

In [None]:
# Analyzing how many articles have location attached

# GeoText has built in list of cities and countries
!pip install geotext
from geotext import GeoText

def contains_location(text):
    """Report whether an article body mentions a city or a country.

    Anything that is not a string, or a string that is empty/whitespace-only,
    yields False without consulting GeoText. Otherwise the result is True when
    GeoText reports at least one city or at least one country in the text.
    """
    usable = isinstance(text, str) and bool(text.strip())
    if not usable:
        return False
    # GeoText exposes the matched place names as .cities and .countries
    found = GeoText(text)
    if found.cities:
        return True
    return bool(found.countries)

# Flag every article in each dataset with a boolean has_location column
for frame in (true_df, fake_df):
    frame["has_location"] = frame["text"].astype(str).apply(contains_location)

# Statistics: how many articles per dataset mention at least one location
for name, df in (("True", true_df), ("Fake", fake_df)):
    loc_count = df["has_location"].sum()
    total = len(df)
    share = loc_count / total
    print(f"{name} articles with a detected location: {loc_count} / {total} ({share:.1%})")


In [None]:
# Preview the first rows of the true dataset to check the new has_location column
true_df.head()

In [None]:
# Preview the first rows of the fake dataset to check the new has_location column
fake_df.head()

## Punctuation, Misspelling, and Emoji Analysis (Ousman)

In [None]:
!pip install emoji pyspellchecker
import re
import emoji
from collections import Counter
from spellchecker import SpellChecker

spell = SpellChecker()

def count_punctuation(text):
    """Count the characters of str(text) that are neither word characters nor whitespace."""
    return sum(1 for _ in re.finditer(r'[^\w\s]', str(text)))

def extract_emojis(text):
    """Return, in order of appearance, each character of str(text) that is a key of emoji.EMOJI_DATA."""
    found = []
    for ch in str(text):
        if ch in emoji.EMOJI_DATA:
            found.append(ch)
    return found

def count_misspellings(text):
    """Count the alphabetic words of str(text) that the spell checker does not know.

    The text is lower-cased and split into runs of ASCII letters before the lookup.
    NOTE(review): pyspellchecker's `unknown` is documented to return a set, so a
    word misspelled several times would be counted once - confirm if totals matter.
    """
    tokens = re.findall(r'\b[a-zA-Z]+\b', str(text).lower())
    return len(spell.unknown(tokens))

def run_analysis(df, title=""):
    """Print punctuation, emoji and misspelling statistics for one dataset.

    Args:
        df: DataFrame with a "text" column.
        title: Heading printed above the report.

    Returns:
        (analysis, emoji_counts): `analysis` is a DataFrame holding the text plus
        the punctuation_count, emoji_count and misspelling_count columns;
        `emoji_counts` is a Counter over every emoji character found.
    """
    print(f"\n=========== {title} ===========")

    analysis = pd.DataFrame()
    analysis["text"] = df["text"]

    # Extract emojis once per article; both the per-article count and the
    # overall frequency table are derived from this single pass.
    emojis_per_article = analysis["text"].apply(extract_emojis)

    # Compute metrics
    analysis["punctuation_count"] = analysis["text"].apply(count_punctuation)
    analysis["emoji_count"] = emojis_per_article.apply(len)
    analysis["misspelling_count"] = analysis["text"].apply(count_misspellings)

    # Summary outputs
    for heading, column in (
        ("Punctuation", "punctuation_count"),
        ("Emoji", "emoji_count"),
        ("Misspelling", "misspelling_count"),
    ):
        print(f"\n{heading} Statistics:")
        print(analysis[column].describe())

    # Emoji frequency across the whole dataset
    emoji_counts = Counter()
    for found in emojis_per_article:
        emoji_counts.update(found)

    print("\nMost Common Emojis:")
    print(emoji_counts.most_common(10))

    return analysis, emoji_counts
# Run the report for each dataset; keep the per-article metrics and emoji counters for the plots below
true_analysis, true_emoji_counts = run_analysis(true_df, "TRUE NEWS ANALYSIS")
fake_analysis, fake_emoji_counts = run_analysis(fake_df, "FAKE NEWS ANALYSIS")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
sns.set(style="whitegrid")

import pandas as pd


# Emoji: put the 15 most frequent emojis of each dataset side by side
true_emoji_df = pd.DataFrame(true_emoji_counts.most_common(15), columns=["emoji", "true_freq"])
fake_emoji_df = pd.DataFrame(fake_emoji_counts.most_common(15), columns=["emoji", "fake_freq"])

# Outer merge keeps emojis that appear in only one dataset; their missing count becomes 0
emoji_compare = true_emoji_df.merge(fake_emoji_df, on="emoji", how="outer").fillna(0)

# Long format: one row per (emoji, dataset) pair, as expected by the grouped bar plot
emoji_melted = pd.melt(
    emoji_compare,
    id_vars="emoji",
    value_vars=["true_freq", "fake_freq"],
    var_name="dataset",
    value_name="frequency",
)

if not emoji_melted.empty:
    sns.barplot(data=emoji_melted, x="frequency", y="emoji", hue="dataset")
    plt.title("Emoji Frequency: True vs Fake")
    plt.show()
else:
    print("No emojis found in either dataset.")


In [None]:
# Average misspellings per article, True vs Fake
true_avg, fake_avg = (
    frame["misspelling_count"].mean() for frame in (true_analysis, fake_analysis)
)

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(["True News", "Fake News"], [true_avg, fake_avg], color=["blue", "orange"])
ax.set_title("Average Misspellings: True vs Fake")
ax.set_ylabel("Average Misspelling Count")
plt.show()

print("True Avg Misspellings:", true_avg)
print("Fake Avg Misspellings:", fake_avg)




In [None]:
# Punctuation: average count per article, then the ten most punctuation-heavy articles
true_punc_avg, fake_punc_avg = (
    frame["punctuation_count"].mean() for frame in (true_analysis, fake_analysis)
)

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(["True News", "Fake News"], [true_punc_avg, fake_punc_avg], color=["blue", "orange"])
ax.set_title("Average Punctuation Count: True vs Fake")
ax.set_ylabel("Average Punctuation Count")
plt.show()

print("True Avg Punctuation:", true_punc_avg)
print("Fake Avg Punctuation:", fake_punc_avg)

for heading, frame in (
    ("TRUE Punctuation Count", true_analysis),
    ("FAKE Punctuation Count", fake_analysis),
):
    print(heading)
    display(frame.sort_values("punctuation_count", ascending=False).head(10))

## Inspecting the Data Set (Adriena Jiang)

In [None]:
# Inspect the shape and the column names of the true dataset, then of the fake dataset
for label, frame in (("True", true_df), ("Fake", fake_df)):
    print(f"{label} shape:", frame.shape)
    print(f"{label} Columns: ", frame.columns)

In [None]:
# Preview the first rows of the true dataset
true_df.head()

In [None]:
# Preview the first rows of the fake dataset
fake_df.head()

## Quality Issue Analysis (Harshika)

**What I found:** Fake news articles are noisier: they contain blank texts (2.7%), short texts (1.0%), and many spikes in special-character counts. Sentence lengths also vary more, with many outliers in Fake news.

**Why this matters:** The model can learn to classify based on noise or structure (e.g. formatting, length), not actual content which leads to overfitting or data leakage.

**What I suggest:** Remove blank/short texts, cap extreme special chars, and normalize formatting to ensure content-focused learning.

In [None]:
#Harshika Text quality issue analysis
import re

def count_chars(text):
    """Count special characters (anything but ASCII letters, digits and whitespace), e.g. @, #, !.

    Missing (NaN/None) or blank text counts as 0.
    """
    if pd.isna(text):
        return 0
    if not text.strip():
        return 0
    return sum(1 for _ in re.finditer(r'[^a-zA-Z0-9\s]', text))

def count_sents(text):
    """Count sentences, taken as the non-blank pieces obtained by splitting on '.'.

    Missing (NaN/None) or blank text counts as 0.
    """
    if pd.isna(text):
        return 0
    if not text.strip():
        return 0
    return sum(1 for piece in text.split('.') if piece.strip())

def sent_len(text):
    """Average number of whitespace-separated words per sentence (sentences as counted by count_sents).

    Returns 0 for missing or blank text, and when no sentence is found.
    """
    if pd.isna(text) or not text.strip():
        return 0
    n_words = len(text.split())
    n_sents = count_sents(text)
    if n_sents <= 0:
        return 0
    return n_words / n_sents

def word_cnt(text):
    """Number of whitespace-separated words; 0 for missing (NaN/None) or blank text."""
    is_empty = pd.isna(text) or not text.strip()
    return 0 if is_empty else len(text.split())

# Add the same quality columns to the true data, then to the fake data
for frame in (true_df, fake_df):
    frame['chars'] = frame['text'].apply(count_chars)
    frame['sent_len'] = frame['text'].apply(sent_len)
    frame['words'] = frame['text'].apply(word_cnt)
    # blank: text is missing or whitespace-only
    frame['blanks'] = frame['text'].apply(lambda x: pd.isna(x) or x.strip() == "")
    # short: at least one word but fewer than ten
    frame['short'] = frame['words'].apply(lambda x: 0 < x < 10)

# Count missing (NaN) text and title values per dataset
true_nan_text, true_nan_title = true_df['text'].isna().sum(), true_df['title'].isna().sum()
fake_nan_text, fake_nan_title = fake_df['text'].isna().sum(), fake_df['title'].isna().sum()

quality_inputs = (
    ("True", true_df, true_nan_text, true_nan_title),
    ("Fake", fake_df, fake_nan_text, fake_nan_title),
)

# Quality summary per dataset: counts with their share of all rows, then averages
for label, frame, nan_text, nan_title in quality_inputs:
    n_rows = len(frame)
    n_blank = frame['blanks'].sum()
    n_short = frame['short'].sum()
    print(f"\n {label} News Quality ---")
    print(f"Missing Text: {nan_text} ({100 * nan_text / n_rows:.1f}%)")
    print(f"Missing Title: {nan_title} ({100 * nan_title / n_rows:.1f}%)")
    print(f"Blank Text: {n_blank} ({100 * n_blank / n_rows:.1f}%)")
    print(f"Short Text (<10 words): {n_short} ({100 * n_short / n_rows:.1f}%)")
    print(f"Avg Special Chars: {frame['chars'].mean():.1f}")
    print(f"Avg Sentence Length: {frame['sent_len'].mean():.1f} words")

# Problem articles: the five rows with the most special characters in each dataset
for label, frame, _, _ in quality_inputs:
    print(f"\n {label}: Top 5 Noisy Articles ---")
    print(frame.nlargest(5, 'chars')[['text', 'chars', 'blanks', 'short']])

# Special characters: overlaid histograms for both datasets
fig, ax = plt.subplots(figsize=(8, 4))
for frame, color, label in ((true_df, 'blue', 'True'), (fake_df, 'red', 'Fake')):
    sns.histplot(frame['chars'], bins=15, color=color, label=label, alpha=0.5, ax=ax)
ax.set_title("Special Characters in Text")
ax.set_xlabel("Special Chars")
ax.set_ylabel("Articles")
ax.legend()
plt.show()

# Sentence length: one box per dataset (position 0 = True, position 1 = Fake)
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(data=[true_df['sent_len'], fake_df['sent_len']], palette=['blue', 'red'], ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(['True', 'Fake'])
ax.set_title("Words per Sentence")
ax.set_ylabel("Words")
plt.show()

## Word Length Analysis (Ousman Bah)

In [None]:
# Per-article length: whitespace-separated word count and raw character count
for frame in (fake_df, true_df):
    text_col = frame['text']
    frame['word_count'] = text_col.str.split().str.len()
    frame['char_count'] = text_col.str.len()

length_columns = ['word_count', 'char_count']
print("FAKE :", fake_df[length_columns].describe())
print()
print("TRUE :", true_df[length_columns].describe())

## Blank Text (Ousman Bah)

**What I found:**
Only one article from the True News dataset was blank, while about 630 articles from the Fake News dataset contained no text.

In [None]:
# --- OUSMAN BAH: FULL BLANK ARTICLES PRINT & VISUALIZATION ---

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming your datasets already exist: fake_df and true_df

# --- Count Blank vs Non-Blank Articles ---
# An article is blank when its text is empty after stripping whitespace
blank_fake = (fake_df['text'].str.strip() == "").sum()
blank_true = (true_df['text'].str.strip() == "").sum()
non_blank_fake = len(fake_df) - blank_fake
non_blank_true = len(true_df) - blank_true

# --- Print Counts ---
print("\nBlank Fake Articles:", blank_fake)
print("Non-Blank Fake Articles:", non_blank_fake)
print("Blank True Articles:", blank_true)
print("Non-Blank True Articles:", non_blank_true)

# --- Visualization ---
bar_labels = ["Fake Blank", "Fake Non-Blank", "True Blank", "True Non-Blank"]
bar_heights = [blank_fake, non_blank_fake, blank_true, non_blank_true]

plt.figure(figsize=(8, 5))
sns.barplot(x=bar_labels, y=bar_heights, palette=["red", "red", "blue", "blue"])
plt.title("Blank vs Non-Blank Articles in Fake and True Datasets")
plt.ylabel("Number of Articles")
plt.tight_layout()
plt.show()


## Articles Ending with '...' (Ousman Bah)

**What I found:** Only two articles from the True dataset ended with "...".

In [None]:
# Articles whose text ends with "..." (optionally followed by whitespace); missing text never matches
ends_with_ellipsis = r"\.\.\.\s*$"
fake_trunc = fake_df[fake_df['text'].str.contains(ends_with_ellipsis, na=False)]
true_trunc = true_df[true_df['text'].str.contains(ends_with_ellipsis, na=False)]

print("Fake Articles ending with ellipses:", len(fake_trunc))
print()
print("--------------------------------")
print()
print("True Articles ending with ellipses:", len(true_trunc))

## Articles Length Visualization: Line Graph (Ousman Bah)
**WHAT I FOUND:**

The plot compares the word counts of all articles in both datasets:

**The red line** (Fake Articles) has more sharp spikes, showing high variation in article length — some are extremely long.

**The blue line** (True Articles) is denser and flatter, indicating more uniform article lengths.

**Overall, the Fake dataset shows greater inconsistency and outliers, while the True dataset maintains a more stable pattern.**

In [None]:
# Articles longer than 2000 words: how many there are, followed by their text
long_fake = fake_df[fake_df['word_count'] > 2000]
print("Long Fake Articles (>2000 words):", len(long_fake))

print(long_fake['text'].head(20000))

print()
print("--------------------------------")
print()

long_true = true_df[true_df['word_count'] > 2000]
print("Long True Articles (>2000 words):", len(long_true))

print(long_true['text'].head(20000))


import matplotlib.pyplot as plt
import seaborn as sns

# Compute word counts (whitespace-separated tokens per article)
for frame in (fake_df, true_df):
    frame['word_count'] = frame['text'].str.split().str.len()

# Longest article across both datasets; used as the top of the y-axis and in the title
max_len = max(fake_df['word_count'].max(), true_df['word_count'].max())

# Fresh 0..n-1 index so the x-axis is simply the article position
fake_df = fake_df.reset_index(drop=True)
true_df = true_df.reset_index(drop=True)

fig, ax = plt.subplots(figsize=(14, 6))

# One line per dataset: red for Fake, blue for True
for frame, color, label in (
    (fake_df, 'red', 'Fake Articles'),
    (true_df, 'blue', 'True Articles'),
):
    ax.plot(frame.index, frame['word_count'], color=color, alpha=0.6, label=label)

# Labels and formatting
ax.set_title(f"Article Lengths (0–{max_len} words)")
ax.set_xlabel("Article Index")
ax.set_ylabel("Word Count (words)")
ax.set_ylim(0, max_len)
ax.legend()
plt.tight_layout()
plt.show()

## Missing Value Analysis (Kashvi Vijay)

In [None]:
# KASHVI VIJAY - MISSING VAL ANALYSIS
# Per-column count of null values in each dataset
nan_count_true, nan_count_fake = true_df.isnull().sum(), fake_df.isnull().sum()

print("True NaN Count:\n", nan_count_true)
print("\n")
print("Fake NaN Count:\n", nan_count_fake)

## Unique Words (Kashvi Vijay)



In [None]:
# Add a unique_words column to both datasets: the set of lower-cased word
# tokens (letters/digits/underscore/apostrophe runs) found in each article's text
word_pattern = re.compile(r"\b[\w']+\b")
for frame in (true_df, fake_df):
    frame['unique_words'] = frame['text'].apply(
        lambda x: set(word_pattern.findall(str(x).lower()))
    )
true_df.head()

## English Detection (Kashvi Vijay)

In [None]:
def is_english(word_set, threshold=0.75, min_freq=1.5):
    """
    Decide whether a collection of words looks like English.

    A word counts as English when its Zipf frequency in 'en' is above
    `min_freq`; the collection is English when the share of such words is at
    least `threshold`. An empty (or otherwise falsy) collection is not English.
    """
    if not word_set:
        return False

    english_hits = 0
    for word in word_set:
        if zipf_frequency(word, 'en') > min_freq:
            english_hits += 1

    share = english_hits / len(word_set)
    return share >= threshold


# Apply in parallel: one worker pool scores every article's word set; map keeps input order
with Pool() as workers:
    true_english_flags = workers.map(is_english, true_df['unique_words'])
    fake_english_flags = workers.map(is_english, fake_df['unique_words'])

true_df['is_english'] = true_english_flags
fake_df['is_english'] = fake_english_flags


### Visualization: English vs Non-English (Adriena Jiang)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,5), sharey=True)

# (axis, dataset, title, color of the "English" bar) for each panel
panels = [
    (ax[0], true_df, "True Articles - English vs Non-English", "skyblue"),
    (ax[1], fake_df, "Fake Articles - English vs Non-English", "salmon"),
]

for axis, frame, panel_title, english_color in panels:
    # value_counts() orders by frequency and omits absent values, so pin the
    # order to [True, False]; otherwise the fixed tick labels and colors below
    # could be attached to the wrong bar (or not match the number of bars).
    counts = frame['is_english'].value_counts().reindex([True, False], fill_value=0)
    counts.plot(kind="bar", ax=axis, color=[english_color, "lightgray"])
    axis.set_title(panel_title)
    axis.set_xticklabels(["English", "Not English"], rotation=0)

ax[0].set_ylabel("Count")

plt.suptitle("Language Distribution by Dataset")
plt.show()

## Sentiment Analysis (Kashvi Vijay)

In [None]:

# Sentiment analysis on a small sample (the first 20 true and the first 20 fake
# articles, 40 in total) to assess whether there is a relationship between the
# sentiment and the veracity of an article
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the analyzer's polarity scores for `text` (a mapping that includes 'compound')."""
    return analyzer.polarity_scores(text)

true_sample = true_df['text'].head(20)
fake_sample = fake_df['text'].head(20)

# Scored sequentially: for 40 short texts a process pool costs more than it
# saves, and pool workers cannot always import functions defined interactively
# in a notebook (depends on the platform's process start method).
true_sentiments = [analyze_sentiment(text) for text in true_sample]
fake_sentiments = [analyze_sentiment(text) for text in fake_sample]

true_sample_sentiment = true_sample.to_frame()
true_sample_sentiment['compound_sentiment'] = [s['compound'] for s in true_sentiments]

fake_sample_sentiment = fake_sample.to_frame()
fake_sample_sentiment['compound_sentiment'] = [s['compound'] for s in fake_sentiments]

print("True sample with sentiment:")
display(true_sample_sentiment)

print("\nFake sample with sentiment:")
display(fake_sample_sentiment)
# close to -1 is negative, close to 1 is positive, and close to 0 is neutral


### Visualization: Distribution of Sentiment Scores (Adriena Jiang)

In [None]:
# Overlaid histograms of the compound sentiment score for both samples
fig, ax = plt.subplots(figsize=(8, 5))
for sample, label, color in (
    (true_sample_sentiment, 'True', 'skyblue'),
    (fake_sample_sentiment, 'Fake', 'salmon'),
):
    ax.hist(sample['compound_sentiment'], bins=10, alpha=0.6, label=label, color=color)
ax.set_xlabel("Compound Sentiment Score (-1 = Neg, 0 = Neutral, 1 = Pos)")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of Sentiment Scores (True vs Fake, Sample)")
ax.legend()
plt.show()

### Visualization: Distribution/boxplot for Compound Sentiment Scores (Kashvi Vijay)

In [None]:
#KASHVI - visualization of sentiment analysis
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# One histogram (with KDE) of the compound sentiment per sample: true first, then fake
for sample, color, label in (
    (true_sample_sentiment, 'blue', 'True'),
    (fake_sample_sentiment, 'red', 'Fake'),
):
    plt.figure(figsize=(8, 5))
    sns.histplot(sample['compound_sentiment'], bins=20, kde=True, color=color)
    plt.title(f'Distribution of Compound Sentiment for {label} News Sample')
    plt.xlabel('Compound Sentiment Score')
    plt.ylabel('Frequency')
    plt.show()

# Stack both samples into one frame, tagged by origin, for the side-by-side box plot
true_tagged = true_sample_sentiment[['compound_sentiment']].assign(News_Type='True Sample')
fake_tagged = fake_sample_sentiment[['compound_sentiment']].assign(News_Type='Fake Sample')
combined_sentiment = pd.concat([true_tagged, fake_tagged])

plt.figure(figsize=(8, 5))
sns.boxplot(x='News_Type', y='compound_sentiment', data=combined_sentiment, palette=['blue', 'red'])
plt.title('Box Plot of Compound Sentiment for True and Fake News Samples')
plt.xlabel('News Type')
plt.ylabel('Compound Sentiment Score')
plt.show()

**MORE SENTIMENT ANALYSIS (OUSMAN BAH)**

TO VERIFY IF FAKE AND TRUE ARTICLES ARE BIASED TOWARDS A SPECIFIC SUBJECT

CONCLUSION: FAKE TEXT CONTAINS THE WORDS "DONALD" AND "EVEN" A LOT, WHILE THEY ARE NOT AS FREQUENT IN THE TRUE TEXT

In [None]:


from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import matplotlib.pyplot as plt

# --- Step 1: Find each dataset's top 20 words and merge them into one vocabulary ---
vectorizer = CountVectorizer(max_features=20)  # Top 20 frequent words
top_true_words = set(vectorizer.fit(true_df['text']).get_feature_names_out())
top_fake_words = set(vectorizer.fit(fake_df['text']).get_feature_names_out())
shared_vocabulary = sorted(top_true_words | top_fake_words)

# --- Step 2: Count the shared vocabulary in BOTH datasets ---
# Counting the same words on both sides avoids showing a word as 0 merely
# because it missed the other dataset's top 20 (costs one extra pass per dataset).
vectorizer = CountVectorizer(vocabulary=shared_vocabulary)
true_counts = vectorizer.transform(true_df['text'])
fake_counts = vectorizer.transform(fake_df['text'])
true_freq = pd.Series(np.asarray(true_counts.sum(axis=0)).ravel(), index=shared_vocabulary)
fake_freq = pd.Series(np.asarray(fake_counts.sum(axis=0)).ravel(), index=shared_vocabulary)

# --- Step 3: Combine for Comparison ---
freq_df = pd.DataFrame({
    'True News': true_freq,
    'Fake News': fake_freq
})

# Keep the 15 words with the highest combined frequency (not the first 15 alphabetically)
top_words = freq_df.sum(axis=1).sort_values(ascending=False).head(15).index

# --- Step 4: Plot Side-by-Side Bar Chart ---
# Draw on an explicit axis: DataFrame.plot otherwise opens its own figure and
# leaves the pre-sized one empty.
fig, ax = plt.subplots(figsize=(12, 6))
freq_df.loc[top_words].plot(kind='bar', width=0.8, ax=ax)
ax.set_title('Top Word Frequency Comparison: True vs Fake News')
ax.set_xlabel('Top Words')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

## Subject Finding Analysis (Lin Zhang)

**What I found:**
There is zero overlap between true-news subjects and fake-news subjects, except for politics which still impacts most of the articles

**Why this matters:** This enables model cheating: the model can learn to recognize the subject pattern instead of actually learning from the content. It is a form of data leakage if we do not address it.

**What I suggest**: Remove the subject column and perform content-only training.



In [None]:
# Subject findings
print(f"True News: {len(true_df):,} articles")
print(f"Fake News: {len(fake_df):,} articles")

# Independent copies: cleaning applied to the *_cleaned frames later on must
# not alter the frames that were loaded from the CSV files.
true_df_cleaned = true_df.copy()
fake_df_cleaned = fake_df.copy()

# Number of articles per subject in each dataset
true_subjects = true_df['subject'].value_counts()
fake_subjects = fake_df['subject'].value_counts()

print(f"\nTrue News Subject")
for subject, count in true_subjects.items():
  percentage = (count/len(true_df))*100
  print(f" {subject}: {count:,} articles ({percentage:.1f}%)")
print(f"\nFake News Subject")
for subject, count in fake_subjects.items():
  percentage = (count/len(fake_df))*100
  print(f" {subject}: {count:,} articles ({percentage:.1f}%)")


true_unique_subjects = true_df['subject'].unique()
fake_unique_subjects = fake_df['subject'].unique()

true_subject_set = set(true_subjects.index)
fake_subject_set = set(fake_subjects.index)

# Overlap and dataset-exclusive subjects
common_subject = true_subject_set.intersection(fake_subject_set)
true_only_subject = true_subject_set-fake_subject_set
fake_only_subject = fake_subject_set-true_subject_set

#graph:
fig, ax = plt.subplots(figsize=(12, 8))

# All unique subjects, true-news subjects first; dict.fromkeys drops repeats
# while keeping order, so a subject present in both datasets is plotted once.
subjects = list(dict.fromkeys(list(true_subjects.index) + list(fake_subjects.index)))
true_counts = [true_subjects.get(s, 0) for s in subjects]
fake_counts = [fake_subjects.get(s, 0) for s in subjects]

#bar chart: two bars per subject, centered on the subject's tick
x = np.arange(len(subjects))
width = 0.35

bars1 = ax.bar(x - width/2, true_counts, width, label='True News', color='#2E8B57', alpha=0.8)
bars2 = ax.bar(x + width/2, fake_counts, width, label='Fake News', color='#DC143C', alpha=0.8)

ax.set_xlabel('Subject Categories')
ax.set_ylabel('Number of Articles')
ax.set_title('Subject Distribution: True vs Fake News')
ax.set_xticks(x)
ax.set_xticklabels(subjects, rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)

# Write the count above every non-empty bar
for bars in (bars1, bars2):
    for bar in bars:
        height = bar.get_height()
        if height > 0:
            ax.text(bar.get_x() + bar.get_width()/2., height + 100,
                    f'{int(height):,}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

print("True:", len(true_df_cleaned))
print("Fake:", len(fake_df_cleaned))




## Duplication Analysis (Lin Zhang)

**What is this about:** This analysis is to go over the duplication articles in both true and fake news dataset.

**What I found:** There is total 21,417 articles in true news, and 23,481 articles in fake news.
After inspecting the duplicates, I found that 427 articles in the true news dataset are duplicated, and 10,671 articles in the fake news dataset are duplicated.

**Why this matters:** Duplicated articles do not add any value to our model, so there is no benefit in keeping them.

**What I suggest**: I suggest we drop all the duplicated articles so that our dataset contains only unique articles. After deduplication we will end up with 21,197 true articles and 17,908 fake articles. As a ratio that is 1.18:1 (true:fake), which will not have much effect on later training.

In [None]:
# Rows that belong to a duplicate group: keep=False flags every copy, including
# the first occurrence, so this is larger than the number of rows removed below.
true_duplicates = true_df_cleaned.duplicated(subset=['title', 'text'], keep=False)
fake_duplicates = fake_df_cleaned.duplicated(subset=['title', 'text'], keep=False)

true_duplicate_count = true_duplicates.sum()
fake_duplicate_count = fake_duplicates.sum()

true_original = len(true_df)
fake_original = len(fake_df)

# Remove duplicates from the cleaned dataframes, keeping the first copy.
# Rebinding (rather than inplace=True) leaves true_df / fake_df untouched even
# when the *_cleaned names refer to the very same objects.
true_df_cleaned = true_df_cleaned.drop_duplicates(subset=['title', 'text'], keep='first')
fake_df_cleaned = fake_df_cleaned.drop_duplicates(subset=['title', 'text'], keep='first')

true_after = len(true_df_cleaned)
fake_after = len(fake_df_cleaned)

# Rows actually dropped
true_removed = true_original - true_after
fake_removed = fake_original - fake_after


# graph
fig, ax = plt.subplots(1, 1, figsize=(12,8))
fig.suptitle('Duplicate Removal - Before vs After', fontsize=16, fontweight='bold')

# Named dataset_labels so the `datasets` folder path used for loading the CSVs is not overwritten
dataset_labels = ['True News', 'Fake News']
before_counts = [true_original, fake_original]
after_counts = [true_after, fake_after]

x = np.arange(len(dataset_labels))
width = 0.35
bars1 = ax.bar(x - width/2, before_counts, width, label='Before Duplicate Removal', color='grey', alpha=0.8)
bars2 = ax.bar(x + width/2, after_counts, width, label='After Duplicate Removal', color='green', alpha=0.8)


ax.set_xlabel('Dataset')
ax.set_ylabel('Number of Articles')
ax.set_title('Article Count: Before vs After Duplicate Removal')
ax.set_xticks(x)
ax.set_xticklabels(dataset_labels)
ax.legend()
ax.grid(True, alpha=0.3)

# Write the article count above every bar
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 100,
                f'{int(height):,}', ha='center', va='bottom', fontweight='bold')

# Annotate each dataset with the number of rows removed
ax.text(0, before_counts[0] + 500, f'Removed: {true_removed:,}',
        ha='center', va='bottom', fontweight='bold', color='red')
ax.text(1, before_counts[1] + 500, f'Removed: {fake_removed:,}',
        ha='center', va='bottom', fontweight='bold', color='red')

plt.tight_layout()
plt.show()

print(f"BEFORE duplicate removal:")
print(f"  True News: {true_original:,} articles")
print(f"  Fake News: {fake_original:,} articles")
print(f"  Ratio: {true_original/fake_original:.3f}:1 (True:Fake)")

print(f"\nAFTER duplicate removal:")
print(f"  True News: {true_after:,} articles")
print(f"  Fake News: {fake_after:,} articles")
print(f"  Ratio: {true_after/fake_after:.3f}:1 (True:Fake)")

print(f"\nDUPLICATES REMOVED:")
print(f"  True News: {true_removed:,} rows removed ({(true_removed/true_original*100):.2f}%); "
      f"{true_duplicate_count:,} rows were part of a duplicate group")
print(f"  Fake News: {fake_removed:,} rows removed ({(fake_removed/fake_original*100):.2f}%); "
      f"{fake_duplicate_count:,} rows were part of a duplicate group")

print("Original files:")
print(f"True.csv: {len(true_df):,} articles")
print(f"Fake.csv: {len(fake_df):,} articles")




## Date Analysis (Sanskriti Khadka)

**Date Standardization / Analysis:**

In this part of the analysis, I standardized the article dates and checked for invalid or missing values, which were very rare. The visualizations showed strong temporal clustering, with true news articles concentrated in late 2017 while fake news was more spread out. Because this could create bias, I suggest removing dates from the modeling step.

In [None]:
# Date format standardization / Analysis

# Date Standardization: parse the raw date strings; values that cannot be
# parsed become NaT (errors='coerce') and are counted as invalid below.
true_df['date_clean'] = pd.to_datetime(true_df['date'], format='mixed', errors='coerce')
fake_df['date_clean'] = pd.to_datetime(fake_df['date'], format='mixed', errors='coerce')

print("Date standardization complete.")

# Valid vs Invalid Dates, in [True News, Fake News] order
valid_counts = [
    true_df['date_clean'].notna().sum(),
    fake_df['date_clean'].notna().sum()
]
invalid_counts = [
    true_df['date_clean'].isna().sum(),
    fake_df['date_clean'].isna().sum()
]

print(f"True articles: {valid_counts[0]} valid dates")
print(f"Fake articles: {valid_counts[1]} valid dates")

print(f"True invalid dates: {invalid_counts[0]}")
print(f"Fake invalid dates: {invalid_counts[1]}")

# Stacked bars: invalid counts are drawn on top of the valid counts
fig, ax = plt.subplots(figsize=(6, 4))
bars_valid = sns.barplot(
    x=['True News', 'Fake News'],
    y=valid_counts,
    color='green',
    label='Valid Dates',
    ax=ax
)
bars_invalid = sns.barplot(
    x=['True News', 'Fake News'],
    y=invalid_counts,
    color='red',
    bottom=valid_counts,
    label='Invalid Dates',
    ax=ax
)
# Finish the chart so it is titled, shows its legend and renders before the later figures
ax.set_title("Valid vs Invalid Dates")
ax.set_ylabel("Number of Articles")
ax.legend()
plt.show()

# Timeline Distribution: overlaid histograms of the parsed dates (unparsed dates dropped)
fig, ax = plt.subplots(figsize=(12, 5))
for frame, color, label in (
    (true_df, 'blue', 'True News'),
    (fake_df, 'red', 'Fake News'),
):
    sns.histplot(frame['date_clean'].dropna(), bins=50, color=color, label=label, alpha=0.6, ax=ax)
ax.set_title("Timeline Distribution of Articles")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Articles")
ax.legend()
plt.show()

# Monthly Trends: number of articles per calendar month, in chronological order
true_month = true_df['date_clean'].dt.to_period("M").value_counts().sort_index()
fake_month = fake_df['date_clean'].dt.to_period("M").value_counts().sort_index()

# Align both series on one shared, sorted month axis. String x-values are
# placed in first-seen order, so plotting each series with its own months would
# put months that occur only in the second series after the first series' last month.
all_months = true_month.index.union(fake_month.index)
true_month_aligned = true_month.reindex(all_months, fill_value=0)
fake_month_aligned = fake_month.reindex(all_months, fill_value=0)
month_labels = all_months.astype(str)

plt.figure(figsize=(12,6))
plt.plot(month_labels, true_month_aligned.values, label='True News', color='blue')
plt.plot(month_labels, fake_month_aligned.values, label='Fake News', color='red')
plt.xticks(rotation=90)
plt.title("Monthly Article Counts")
plt.xlabel("Month")
plt.ylabel("Number of Articles")
plt.legend()
plt.show()

## Source Analysis (Sanskriti Khadka)

**Source Analysis:**

In this part of the analysis, I looked at which sources were mentioned in the articles. Real news mostly cited professional outlets like Reuters and Independent, while fake news relied more on social media platforms like Twitter, YouTube, and Facebook. This shows clear differences in sourcing patterns, but I suggest removing the top sources from the dataset since they could create bias and make the model overfit to specific names instead of content.

In [None]:
#Sanskriti Khadka - Source Analysis

from collections import Counter

# Outlets, platforms and agencies searched for, in the order they are reported.
_SOURCE_NAMES = (
    'CNN', 'Fox News', 'MSNBC', 'NBC', 'ABC', 'CBS', 'PBS',
    'New York Times', 'Washington Post', 'Wall Street Journal', 'USA Today', 'Los Angeles Times', 'Chicago Tribune',
    'Reuters', 'Associated Press', 'AP', 'Bloomberg', 'UPI',
    'BBC', 'Guardian', 'Telegraph', 'Independent', 'Sky News',
    'Politico', 'Huffington Post', 'BuzzFeed', 'Vox', 'Axios', 'Daily Beast', 'Slate', 'Salon', 'The Hill',
    'Breitbart', 'Daily Wire', 'National Review', 'Weekly Standard', 'Washington Examiner', 'New York Post',
    'The Nation', 'Mother Jones', 'Daily Kos', 'ThinkProgress',
    'Daily Mail', 'Sun', 'Mirror', 'Express',
    'Twitter', 'Facebook', 'Instagram', 'YouTube', 'TikTok',
    'Pentagon', 'FBI', 'CIA', 'State Department', 'Justice Department', 'Department of Defense'
)

# Compiled once at definition time instead of being rebuilt for every article.
_SOURCE_PATTERNS = [
    (name, re.compile(r'\b' + re.escape(name) + r'\b', re.IGNORECASE))
    for name in _SOURCE_NAMES
]


def find_sources(text):
    """Return the known source names mentioned in ``text``.

    Matching is whole-word and case-insensitive. Each source appears at most
    once in the result, in the order of ``_SOURCE_NAMES`` (not in order of
    appearance in the text).

    Parameters
    ----------
    text : str or missing value
        Article body. Missing values (as judged by ``pd.isna``) yield ``[]``;
        anything else is converted with ``str()`` before matching.

    Returns
    -------
    list of str
        Canonical names of the sources found.
    """
    # NOTE(review): because matching ignores case, names that are also ordinary
    # words ('Independent', 'Sun', 'Mirror', 'Express', 'Guardian', 'Slate',
    # 'Salon', 'AP') will also match their everyday lowercase uses, which can
    # inflate their counts — confirm whether that is acceptable for the analysis.
    if pd.isna(text):
        return []

    text_str = str(text)
    return [name for name, pattern in _SOURCE_PATTERNS if pattern.search(text_str)]

print("ANALYZING SAMPLE OF DATASET...")

# Fixed-seed random sample of each dataset so the scan stays fast and repeatable
fake_sample = fake_df.sample(n=3000, random_state=1234)
true_sample = true_df.sample(n=3000, random_state=1234)

# Flatten the per-article source lists into one list per dataset
fake_sources = [hit for article in fake_sample['text'] for hit in find_sources(article)]
real_sources = [hit for article in true_sample['text'] for hit in find_sources(article)]

fake_counts = Counter(fake_sources)
real_counts = Counter(real_sources)

# Text report: ten most frequently mentioned sources per dataset
for heading, tally in (
    (f"\nTOP 10 SOURCES IN FAKE articles (sample):", fake_counts),
    (f"\nTOP 10 SOURCES IN REAL articles (sample):", real_counts),
):
    print(heading)
    for source, count in tally.most_common(10):
        print(f"{source}: {count}")

# Data Visualization
fake_top10 = pd.DataFrame(fake_counts.most_common(10), columns=["Source", "Count"])
real_top10 = pd.DataFrame(real_counts.most_common(10), columns=["Source", "Count"])

# One horizontal bar chart per dataset: fake in reds, real in blues
for top10, palette, title in (
    (fake_top10, "Reds_r", "Top 10 Sources in Fake News (Sample)"),
    (real_top10, "Blues_r", "Top 10 Sources in Real News (Sample)"),
):
    plt.figure(figsize=(10,5))
    sns.barplot(x="Count", y="Source", data=top10, hue="Source", dodge=False, legend=False, palette=palette)
    plt.title(title)
    plt.xlabel("Mentions")
    plt.ylabel("Source")
    plt.show()

## URL Analysis (Nancy Huang)

**URL/Link Analysis Conclusion**

Urls are unique to the *FAKE NEWS* dataset.
t.co showed up 2,055 times (shortened version of twitter.com, now known as X), this suggests that many fake news articles pulled in tweets directly, quoted them, or used Twitter (X) as a primary "source." The pattern that surfaces is the presence of social media links strongly correlates with fake news. *TRUE NEWS* has no Urls.

**My suggestion/approach:** Replace all Urls with a generic <URL> token when cleaning, we should not leave any raw Urls because this risks overfitting or we can add a numeric value to capture the difference.

In [None]:
# Capture the host of the first URL in each article: the text right after
# "://", skipping an optional "www." prefix, up to the next "/" or whitespace.
fake_df['domains'] = fake_df['text'].str.extract(r'://(?:www\.)?([^/\s]+)', expand=False)
print("Highest links (domains) in Fake News dataset: ")
fake_domain_counts = fake_df['domains'].value_counts()
print(fake_domain_counts.head(20))

In [None]:
# Same first-URL host extraction as for the fake dataset, applied to True news.
true_df['domains'] = true_df['text'].str.extract(r'://(?:www\.)?([^/\s]+)')
# The header previously said "Fake News" although the counts below come from true_df.
print("Top links (domains) in True News dataset: ")
print(true_df['domains'].value_counts().head(20))

## Source Analysis with Focus on Twitter (Adriena Jiang)

**Findings:**

Both real and fake news have an abundance of news that comes from twitter. Twitter news is more likely fake than real.

**Meaning:**

It may matter for model cheating. The model may learn this pattern and lean towards predicting fake. Potential data leakage.

**Suggestions**:

Even though it skews towards fake, we should keep Twitter mentions (unlike Reuters mentions) because a good portion of the articles that mention Twitter are still in the true news dataset.

In [None]:
# Further analysis on Twitter occurrences.

# Boolean masks: does the article text mention "twitter" (any case)?
real_twitter_mask = true_df['text'].str.contains(r'twitter', case=False, na=False)
fake_twitter_mask = fake_df['text'].str.contains(r'twitter', case=False, na=False)

# Number of articles that mention 'twitter'
real_twitter = real_twitter_mask.sum()
fake_twitter = fake_twitter_mask.sum()
print(f"Twitter in Real News: {real_twitter}")
print(f"Twitter in Fake News: {fake_twitter}")

# Bar chart of the raw counts
plt.bar(["Real News", "Fake News"], [real_twitter, fake_twitter], color=["blue","red"])
plt.title("Articles Mentioning Twitter")
plt.ylabel("Number of Articles")
plt.show()

# Share of each dataset (in percent) that contains the word "twitter"
real_share = real_twitter / len(true_df) * 100
fake_share = fake_twitter / len(fake_df) * 100
print(f"Real News mentioning Twitter: {real_share:.2f}%")
print(f"Fake News mentioning Twitter: {fake_share:.2f}%")

# Pie chart comparing the two percentages against each other
plt.pie([real_share, fake_share], labels=["Real News","Fake News"],
        autopct="%1.1f%%", colors=["blue","red"], startangle=90)
plt.title("Share of Articles Mentioning Twitter")
plt.show()

# Print five fixed-seed example articles per dataset that mention twitter
for heading, examples in (
    ("\nReal News Examples:", true_df.loc[real_twitter_mask, 'text']),
    ("\nFake News Examples:", fake_df.loc[fake_twitter_mask, 'text']),
):
    print(heading)
    for text in examples.sample(5, random_state=42):
        print("\n--- Article ---")
        print(text)

## Author Bylines Analysis (Nancy Huang)

**Author Byline Takeaways**

The number of bylines for both Fake and True news is very low. *TRUE NEWS* has 0 bylines, likely because many true news datasets are cleaned to keep only the article body so that there is consistency. In contrast, *FAKE NEWS* datasets are often taken from websites or social media posts.

**My suggested solution:** We should remove bylines from all the fake news articles so our model can learn from the actual content instead of the inconsistencies in our dataset.

In [None]:
import re
# A byline is taken to be "By Firstname Lastname ..." at the very start of the
# text: the word "By" followed by one or more capitalised words.
for frame in (fake_df, true_df):
    frame['byline'] = frame['text'].str.extract(
        r'^\s*By\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)', expand=False
    )

fake_byline_total = fake_df['byline'].notna().sum()
true_byline_total = true_df['byline'].notna().sum()
print("Fake with byline:", fake_byline_total, "out of", len(fake_df))
print("True with byline:", true_byline_total, "out of", len(true_df))

# Most frequent byline names per dataset
print("Top fake bylines:")
print(fake_df['byline'].value_counts().head(20))
print("\nTop true bylines:")
print(true_df['byline'].value_counts().head(20))

## Stop Word Analysis (Sanskriti Khadka)

In this part of the analysis, I looked at stop words, common words like the, is, and, etc. Since these words don't add much meaning, I checked their frequencies and removed them to clean up the text. I also compared article lengths before and after removal to see how much noise was reduced. I suggest using the cleaned version for modeling since it highlights meaningful words, but keeping the original text for context and interpretation.

In [None]:
# Stop Word Anaylsis

import nltk
from nltk.corpus import stopwords
from collections import Counter

# Fetch the NLTK stop-word corpus, then keep the English list
# as a set so membership checks in get_stopword_counts are O(1).
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def get_stopword_counts(texts, stopword_set=None):
    """Count how often each stop word occurs across a Series of documents.

    Each document is converted to ``str``, lowercased and split on whitespace;
    tokens are counted only when they are exactly equal to a stop word (so a
    token with attached punctuation such as ``"the,"`` is not counted).

    Documents are processed one at a time instead of being joined into a single
    corpus-sized string, which keeps peak memory low on large datasets while
    producing the same counts.

    Parameters
    ----------
    texts : pandas.Series
        Documents to scan.
    stopword_set : collection of str, optional
        Words to count. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    collections.Counter
        Mapping of stop word -> number of occurrences.
    """
    if stopword_set is None:
        stopword_set = stop_words

    counts = Counter()
    for document in texts.astype(str):
        counts.update(word for word in document.lower().split() if word in stopword_set)
    return counts

# Count stopwords in True and Fake datasets
true_stop_counts = get_stopword_counts(true_df['text'])
fake_stop_counts = get_stopword_counts(fake_df['text'])

# Top 20 stop words for each
true_top = pd.DataFrame(true_stop_counts.most_common(20), columns=["Word","Count"])
fake_top = pd.DataFrame(fake_stop_counts.most_common(20), columns=["Word","Count"])

# Plot comparison.
# The y axes are NOT shared: each panel has its own list of words in its own
# rank order, and a shared categorical axis would show a single set of tick
# labels for both panels, mislabelling or reordering one of them.
fig, axes = plt.subplots(1,2, figsize=(14,6), sharey=False)

sns.barplot(data=true_top, x="Count", y="Word", ax=axes[0], color="blue")
axes[0].set_title("Top Stopwords in True News")

sns.barplot(data=fake_top, x="Count", y="Word", ax=axes[1], color="red")
axes[1].set_title("Top Stopwords in Fake News")

plt.suptitle("Stop Word Frequency (Before Removal)")
plt.tight_layout()
plt.show()

# Data Cleaning

## Subject Removal Implementation (Lin Zhang)


In [None]:
#COPY DATA FOR VISUALIZATION BEFORE AND AFTER
# Snapshot the text from the raw frames: the *_cleaned frames are only created
# in the next cell, so reading them here fails on a fresh Restart & Run All.
# Dropping 'subject' does not touch the text, so the snapshot is the same.
true_before = true_df["text"].copy()
fake_before = fake_df["text"].copy()

In [None]:

print("BEFORE subject removal:")
print(f"True News: {len(true_df):,} articles, {len(true_df.columns)} columns")
print(f"Fake News: {len(fake_df):,} articles, {len(fake_df.columns)} columns")
print(f"Columns: {list(true_df.columns)}")


# drop() returns new frames, so the raw true_df / fake_df stay untouched.
true_df_cleaned = true_df.drop('subject', axis=1)
fake_df_cleaned = fake_df.drop('subject', axis=1)

print("AFTER subject removal:")
print(f"True News: {len(true_df_cleaned):,} articles, {len(true_df_cleaned.columns)} columns")
print(f"Fake News: {len(fake_df_cleaned):,} articles, {len(fake_df_cleaned.columns)} columns")
print(f"Columns: {list(true_df_cleaned.columns)}")


# Persist each cleaned frame to its own file. Fake_cleaned.csv was previously
# written from true_df_cleaned, which silently saved the True articles twice.
true_df_cleaned.to_csv('True_cleaned.csv', index=False)
fake_df_cleaned.to_csv('Fake_cleaned.csv', index=False)



**Subject Removal Completed.**

## Clean `Reuter` mentions in `True.csv` Data Set (Adriena Jiang)

In [None]:
# Matches the word "reuters" with optional surrounding parentheses/whitespace
# and one optional trailing dash or colon, e.g. "(Reuters) -".
REUTERS_PATTERN = r'\(?\s*reuters\s*\)?\s*(?:[-–—:])?'

def remove_reuters(df):
    """Return a copy of ``df`` with Reuters mentions removed from ``'text'``.

    Every case-insensitive match of ``REUTERS_PATTERN`` is replaced by a single
    space, runs of whitespace are collapsed to one space, and the result is
    stripped. The input frame is not modified.

    Missing text stays missing. The column is deliberately not cast with
    ``astype(str)``, because that turns missing values into the literal string
    ``"nan"``, which later ``dropna(subset=['text'])`` calls can no longer detect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the cleaned ``'text'`` column.
    """
    # work on a copy and avoid chained-assignment warnings
    out = df.copy()
    out.loc[:, 'text'] = (
        out['text']
          # Remove Reuters mentions
          .str.replace(REUTERS_PATTERN, ' ', regex=True, flags=re.I)
          # Collapse spaces
          .str.replace(r'\s+', ' ', regex=True)
          .str.strip()
    )
    return out

# Strip Reuters mentions from both cleaned frames
true_df_cleaned, fake_df_cleaned = [
    remove_reuters(frame) for frame in (true_df_cleaned, fake_df_cleaned)
]

In [None]:
# Inspect data frame.


from collections import Counter

# Fixed-seed samples of the cleaned frames (sampled for speed)
fake_after_sample = fake_df_cleaned.sample(n=3000, random_state=1234)
true_after_sample = true_df_cleaned.sample(n=3000, random_state=1234)

# Flatten per-article source hits into one list per dataset
fake_after_sources = [hit for article in fake_after_sample['text'] for hit in find_sources(article)]
real_after_sources = [hit for article in true_after_sample['text'] for hit in find_sources(article)]

fake_after_counts = Counter(fake_after_sources)
real_after_counts = Counter(real_after_sources)

fake_after_top10 = pd.DataFrame(fake_after_counts.most_common(10), columns=["Source", "Count"])
real_after_top10 = pd.DataFrame(real_after_counts.most_common(10), columns=["Source", "Count"])

# One chart per dataset after cleaning: fake in reds, real in blues
for after_top10, palette, title in (
    (fake_after_top10, "Reds_r", "Top 10 Sources in Fake News (AFTER Cleaning)"),
    (real_after_top10, "Blues_r", "Top 10 Sources in Real News (AFTER Cleaning)"),
):
    plt.figure(figsize=(10,5))
    sns.barplot(x="Count", y="Source", data=after_top10, hue="Source",
                dodge=False, legend=False, palette=palette)
    plt.title(title)
    plt.xlabel("Mentions")
    plt.ylabel("Source")
    plt.show()



## URL Tokenization (Nancy Huang)

This block standardizes all URLs in the Fake and True dataset by replacing them with a generic "< URL >" token. The goal is to prevent downstream models from memorizing a specific domain/link.

What I did & Why:
1. I made a copy of the "text" column in the dataset to keep the original raw text.
2. PRE: This step is for detecting which row contains Urls before masking the datasets.
3. MASKING: This is when the Urls in the text are replaced with the generic URL token.
4. POST: This rechecks which rows contains the URL token after the masking process.
5. RESIDUALS: This checks that there are no raw Urls left in the datasets
6. Printed results

Outcome/Results: All Urls are normalized to "< URL >", which ensures consistency for modeling.

Results:

*FAKE NEWS DATASET*

**Pre:** 89.69% of rows contained at least one raw Url. This is very high which means the fake news article in our dataset almost always include links.

**Post:** After masking it is still 89.69% which makes sense because the < URL> token exists which means every Url got replaced, no Urls were lost, and no extra ones were created.

*TRUE NEWS DATASET*

**Pre:** 6.6% of rows had a raw Url, which means the true news articles rarely included links.

**Post:** After masking, it was still 6.6% which means the Url link were successfully replaced.

We know that the Post results are factual (that the links were successfuly replaced) due to the residual raw Url data. It shows that there are no leftover raw Urls in the text after masking.

In [None]:
import re
import numpy as np

# URL patterns: explicit scheme, leading "www.", or a bare "name.tld[/path]"
PROTO_URL = r'https?://[^\s)>\]}]+'
WWW_URL   = r'www\.[^\s)>\]}]+'
BARE_DOM  = r'\b(?:[a-z0-9-]+\.)+[a-z]{2,}(?:/[^\s)>\]}]+)?'
URL_REGEX = re.compile(f'(?:{PROTO_URL})|(?:{WWW_URL})|(?:{BARE_DOM})', flags=re.IGNORECASE)

# News outlets, platforms and agencies whose names are stripped from the text
ORG_WORDS = [
    r'reuters', r'associated press', r'\bap\b', r'bbc', r'guardian',
    r'fox news', r'cnn', r'msnbc', r'nbc', r'abc', r'cbs',
    r'new york times', r'washington post', r'wsj', r'usa today',
    r'breitbart', r'daily mail', r'daily wire', r'vox', r'axios',
    r'twitter', r'facebook', r'instagram', r'youtube', r'tiktok',
    r'pentagon', r'fbi', r'cia', r'state department'
]
ORG_REGEX = re.compile(r'\b(' + r'|'.join(ORG_WORDS) + r')\b', re.IGNORECASE)

def clean_text_for_bert(text):
    """Strip URLs, outlet names and boilerplate from one article body.

    Steps, in order:
      1. URLs and organisation names are each replaced by a space.
      2. Image-credit / video boilerplate is deleted.
      3. A leading all-caps run (dateline-like prefix) is deleted.
      4. "source:" / "via:" text through the end of its line is deleted.
      5. @mentions and #hashtags are deleted.
      6. Whitespace is collapsed and the result stripped.

    Non-string or blank input returns the empty string.
    """
    if not isinstance(text, str) or not text.strip():
        return ""

    cleaned = ORG_REGEX.sub(' ', URL_REGEX.sub(' ', text))

    # (pattern, flags) pairs deleted outright; the order matters and is preserved.
    deletions = (
        (r'image via.*?(?=\.|$)', re.I),
        (r'featured image.*?(?=\.|$)', re.I),
        (r'getty images', re.I),
        (r'video screen capture', re.I),
        (r'watch:?\s*video', re.I),
        (r'^\s*[A-Z\s/,]{3,50}\s+[-–—]?\s*', 0),
        (r'source:.*$', re.I | re.M),
        (r'via:.*$', re.I | re.M),
        (r'@\w+', 0),
        (r'#\w+', 0),
    )
    for pattern, flags in deletions:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    return re.sub(r'\s+', ' ', cleaned).strip()

print("Cleaning text...")
true_df_cleaned['text_clean'] = true_df_cleaned['text'].apply(clean_text_for_bert)
fake_df_cleaned['text_clean'] = fake_df_cleaned['text'].apply(clean_text_for_bert)

# Keep only articles with more than 100 characters of cleaned text.
# .copy() makes each result an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning on a filtered slice.
true_df_cleaned = true_df_cleaned[true_df_cleaned['text_clean'].str.len() > 100].copy()
fake_df_cleaned = fake_df_cleaned[fake_df_cleaned['text_clean'].str.len() > 100].copy()

print(f"After cleaning:")
print(f"True articles: {len(true_df_cleaned)}")
print(f"Fake articles: {len(fake_df_cleaned)}")

# clean_text_for_bert removes URLs / organisation names rather than inserting
# placeholder tokens, so these counts only confirm that no literal "<URL>" or
# "<ORG>" strings are present in the cleaned text.
print("\nVerifying no URL tokens:")
print(f"True has '<URL>': {true_df_cleaned['text_clean'].str.contains('<URL>').sum()}")
print(f"Fake has '<URL>': {fake_df_cleaned['text_clean'].str.contains('<URL>').sum()}")

print("\nVerifying no ORG tokens:")
print(f"True has '<ORG>': {true_df_cleaned['text_clean'].str.contains('<ORG>').sum()}")
print(f"Fake has '<ORG>': {fake_df_cleaned['text_clean'].str.contains('<ORG>').sum()}")

# sample outputs to verify
print("\n" + "="*70)
print("SAMPLE CLEANED FAKE NEWS:")
print("="*70)
print(fake_df_cleaned['text_clean'].iloc[0][:500])

print("\n" + "="*70)
print("SAMPLE CLEANED TRUE NEWS:")
print("="*70)
print(true_df_cleaned['text_clean'].iloc[0][:500])



In [None]:
#VISUALIZE URL Before VS After

# BEFORE: rows whose original 'text' still contains a URL / organisation name.
# (Counted on the rows that survived the length filter; 'text' itself is not
# modified by clean_text_for_bert, which writes to 'text_clean'.)
before_true_url = true_df_cleaned["text"].str.contains(URL_REGEX).sum()
before_fake_url = fake_df_cleaned["text"].str.contains(URL_REGEX).sum()

before_true_org = true_df_cleaned["text"].str.contains(ORG_REGEX).sum()
before_fake_org = fake_df_cleaned["text"].str.contains(ORG_REGEX).sum()

# AFTER: rows whose cleaned text STILL matches the same patterns.
# The cleaner deletes matches instead of inserting "<URL>" / "<ORG>" tokens, so
# searching for those literal tokens would always report zero regardless of
# whether the cleaning worked. Re-applying the regexes measures real residue.
after_true_url = true_df_cleaned['text_clean'].str.contains(URL_REGEX).sum()
after_fake_url = fake_df_cleaned['text_clean'].str.contains(URL_REGEX).sum()

after_true_org = true_df_cleaned['text_clean'].str.contains(ORG_REGEX).sum()
after_fake_org = fake_df_cleaned['text_clean'].str.contains(ORG_REGEX).sum()


compare_df = pd.DataFrame({
    "Token": ["URL", "URL", "ORG", "ORG"],
    "Dataset": ["True", "Fake", "True", "Fake"],
    "Before": [before_true_url, before_fake_url, before_true_org, before_fake_org],
    "After":  [after_true_url,  after_fake_url,  after_true_org,  after_fake_org]
})


fig, axes = plt.subplots(1,2, figsize=(14,6))

# BEFORE
sns.barplot(ax=axes[0], data=compare_df, x="Token", y="Before", hue="Dataset")
axes[0].set_title("BEFORE Cleaning: URL & ORG Tokens")
axes[0].set_ylabel("Count")

# AFTER
sns.barplot(ax=axes[1], data=compare_df, x="Token", y="After", hue="Dataset")
axes[1].set_title("AFTER Cleaning: URL & ORG Tokens")
axes[1].set_ylabel("Count")

plt.tight_layout()
plt.show()

## Dropping Blank Text Data (Ousman Bah)


***Fake News Dataset:***

**Pre Dropping:** 17,908 rows
**Post Dropping:** 17,462 rows

**Details:** A total of 446 rows were dropped because their text fields were empty or missing. These rows wouldn’t add value to analysis or model training, so removing them helped keep only meaningful articles.

***True News Dataset:***

**Pre Dropping:** 21,197 rows
**Post Dropping:** 21,196 rows

Details: Only 1 row was removed due to blank text. This shows the True dataset was already clean and consistent, requiring almost no data removal for text completeness.



In [None]:
#DROPPING BLANK TEXT DATA

print("True cleaned shape:", true_df_cleaned.shape)
print("Fake cleaned shape:", fake_df_cleaned.shape)

# Row counts immediately BEFORE the blank-text filter. They are taken from the
# cleaned frames (not the raw true_df / fake_df) so the before/after difference
# reflects only the rows dropped in this cell, not rows removed by earlier
# cleaning steps.
true_before = int(len(true_df_cleaned))
fake_before = int(len(fake_df_cleaned))

# --- Clean Fake dataset ---
print("\nBefore dropping blanks, Fake rows:", fake_before)

fake_df_cleaned = fake_df_cleaned.dropna(subset=['text'])
fake_df_cleaned = fake_df_cleaned[fake_df_cleaned['text'].str.strip() != ""].copy()

print("Fake_cleaned shape after drop:", fake_df_cleaned.shape)

# --- Clean True dataset ---
print("\nBefore dropping blanks, True rows:", true_before)

true_df_cleaned = true_df_cleaned.dropna(subset=['text'])
true_df_cleaned = true_df_cleaned[true_df_cleaned['text'].str.strip() != ""].copy()

# Count after
print("True_cleaned shape after drop:", true_df_cleaned.shape)

true_after = int(len(true_df_cleaned))
fake_after = int(len(fake_df_cleaned))

drop_compare = pd.DataFrame({
    "Dataset": ["True Before", "True After", "Fake Before", "Fake After"],
    "Count": [true_before, true_after, fake_before, fake_after]
})

plt.figure(figsize=(8,5))
# hue + legend=False matches the other bar charts in this notebook and avoids
# seaborn's deprecation warning for passing `palette` without `hue`.
sns.barplot(data=drop_compare, x="Dataset", y="Count", hue="Dataset", palette="Blues", legend=False)
plt.title("Before vs After Dropping Blank Text Data")
plt.ylabel("Number of Articles")
plt.xticks(rotation=25)
plt.show()

In [None]:
#Text with Stop Words Ousman Bah.

# Added as advised by Abdul: keep a copy of each article's text from before
# stop-word removal, in a 'text_with_stopwords' column.
for df in (true_df_cleaned, fake_df_cleaned):
    # Use the cleaned text when that column exists, otherwise the 'text' column
    source_col = 'text_clean' if 'text_clean' in df.columns else 'text'
    df['text_with_stopwords'] = df[source_col].astype(str)

## Stop Word Removal (Sanskriti Khadka)

In [None]:
import re, nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available, downloading it on first use
try:
    stopwords.words("english")
except LookupError:
    nltk.download("stopwords")

# Negation-style words that are kept in the text (never dropped)
KEEP = {"no","not","nor","n't","without","against"}
# English stop words minus the protected negations
BASE = set(stopwords.words("english")) - KEEP
# Extra filler words dropped in addition to the stop words
FILL = {"said","says","say","mr","ms","mrs"}

# Tokenizer: maximal runs of ASCII letters
_tok = re.compile(r"[A-Za-z]+")

# remove stops
# The set of words to drop depends only on BASE and FILL, so it is built once
# here instead of being rebuilt for every article inside rm_stops.
_DROP = BASE | FILL

def rm_stops(s):
    """Lowercase ``s``, tokenize it and drop stop/filler words.

    The input is converted with ``str()``, lowercased and split into runs of
    ASCII letters (``_tok``). Tokens in ``BASE | FILL`` and single-letter
    tokens are removed; the remaining tokens are joined with single spaces.

    Returns
    -------
    str
        Space-separated remaining tokens (possibly the empty string).
    """
    toks = _tok.findall(str(s).lower())
    return " ".join(t for t in toks if t not in _DROP and len(t) > 1)

# For each frame: keep the pre-removal text in 'text_raw' (created only once),
# then overwrite 'text' with the stop-word-free version.
for df in (true_df_cleaned, fake_df_cleaned):
    if "text_raw" not in df.columns:
        source_col = "text_clean" if "text_clean" in df.columns else "text"
        df["text_raw"] = df[source_col].astype(str)
    df["text"] = df["text_raw"].apply(rm_stops)


def _avg_word_count(column):
    """Mean number of whitespace-separated tokens per row of a text column."""
    return column.str.split().str.len().mean()


# Average words per article before vs after stop-word removal
true_before = _avg_word_count(true_df_cleaned['text_raw'])
true_after  = _avg_word_count(true_df_cleaned['text'])

fake_before = _avg_word_count(fake_df_cleaned['text_raw'])
fake_after  = _avg_word_count(fake_df_cleaned['text'])

print("True avg:", round(true_before,1), "→", round(true_after,1))
print("Fake avg:", round(fake_before,1), "→", round(fake_after,1))



import matplotlib.pyplot as plt

# Bar chart of the four averages
labels = ["True Before", "True After", "Fake Before", "Fake After"]
values = [true_before, true_after, fake_before, fake_after]

plt.figure(figsize=(8, 5))
plt.bar(labels, values)
plt.title("Average Word Count Before vs After Stopword Removal")
plt.ylabel("Average Word Count")
plt.show()

## Removal of Non-English Text (Kashvi Vijay)

In [None]:
# Removal of non-English text: keep only the rows flagged as English.
# NOTE(review): the boolean 'is_english' column is not created in this section;
# presumably it comes from the earlier language analysis — verify it exists on
# the cleaned frames before running this cell.
true_english_mask = true_df_cleaned["is_english"]
fake_english_mask = fake_df_cleaned["is_english"]
true_df_cleaned = true_df_cleaned.loc[true_english_mask]
fake_df_cleaned = fake_df_cleaned.loc[fake_english_mask]
print("True articles remaining:", len(true_df_cleaned))
print("Fake articles remaining:", len(fake_df_cleaned))



## Date Standardization (Kashvi Vijay)

In [None]:
#Kashvi - final date standardization
# Parse each cleaned frame's own 'date' column; unparseable values become NaT.
true_df_cleaned['date_clean'] = pd.to_datetime(true_df_cleaned['date'], format='mixed', errors='coerce')
fake_df_cleaned['date_clean'] = pd.to_datetime(fake_df_cleaned['date'], format='mixed', errors='coerce')

# Convert valid datetimes to 'MM-DD-YYYY' strings and keep the original date
# string wherever parsing failed.
# This reads the 'date_clean' column computed just above on the cleaned frames.
# It previously read true_df['date_clean'] / fake_df['date_clean'] from the RAW
# frames, which only exist if an earlier EDA cell happened to create them and
# are not the values parsed in this cell.
true_df_cleaned['date'] = (
    true_df_cleaned['date_clean'].dt.strftime('%m-%d-%Y').fillna(true_df_cleaned['date'])
)
fake_df_cleaned['date'] = (
    fake_df_cleaned['date_clean'].dt.strftime('%m-%d-%Y').fillna(fake_df_cleaned['date'])
)

print("Date standardization to MM-DD-YYYY complete.")

# Feature Engineering

In [None]:
# Preview the first rows of the cleaned True frame before feature engineering
true_df_cleaned.head()

In [None]:
# Preview the first rows of the cleaned Fake frame before feature engineering
fake_df_cleaned.head()

## One Hot Encoding (Lin Zhang)
Why This Matters:
- One-hot encoding converts categorical data into binary features (0/1) that models can process
- Text length is a meaningful feature because fake news articles often have different lengths than real news

What It Means:
- Each article gets 4 binary columns: `text_length_category_short`, `text_length_category_medium`, `text_length_category_long`, `text_length_category_empty`
- Only ONE column is 1 (the category the article belongs to), others are 0

Why We Suggest This:
- Easy to understand which length category affects predictions
- No loss of information compared to label encoding
- Common practice for categorical ML features.

In [None]:
# Frame shapes before the length-category columns are added
print(f"True dataset shape encoding: {true_df_cleaned.shape}")
print(f"Fake dataset shape encoding: {fake_df_cleaned.shape}")
def categorize_text_length(text):
    """Categorize text into length categories by whitespace word count.

    Parameters
    ----------
    text : object
        Article text. Missing values count as empty; any other value is
        converted with ``str()`` before counting, so non-string values no
        longer raise ``AttributeError``.

    Returns
    -------
    str
        ``"empty"``  - missing, or no words at all (blank / whitespace only)
        ``"short"``  - fewer than 100 words
        ``"medium"`` - 100 to 499 words
        ``"long"``   - 500 words or more
    """
    if pd.isna(text):
        return "empty"

    # A string made only of whitespace splits into zero words.
    word_count = len(str(text).split())
    if word_count == 0:
        return "empty"
    if word_count < 100:
        return "short"
    if word_count < 500:
        return "medium"
    return "long"

# Label every article in both frames with its length category
for df in (true_df_cleaned, fake_df_cleaned):
    df['text_length_category'] = df['text'].apply(categorize_text_length)

def perform_one_hot_encoding(df, categorical_columns):
    """Perform one-hot encoding on specified categorical columns.

    For every listed column present in ``df``, one 0/1 ``uint8`` column per
    distinct value is appended, named ``<column>_<value>``. The original
    categorical column is kept. Listed columns missing from ``df`` are skipped.

    The function is safe to re-run: if ``df`` already carries dummy columns with
    the same names (for example from an earlier run of the same cell), they are
    replaced rather than duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    categorical_columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the dummy columns appended.
    """
    df_encoded = df.copy()
    for col in categorical_columns:
        if col not in df_encoded.columns:
            continue
        # Create dummy variables (one-hot encoding) as 0/1 integers
        dummies = pd.get_dummies(df_encoded[col], prefix=col, dtype=np.uint8)
        # Drop stale dummy columns so a re-run cannot produce duplicate names
        stale = [name for name in dummies.columns if name in df_encoded.columns]
        df_encoded = pd.concat([df_encoded.drop(columns=stale), dummies], axis=1)
    return df_encoded

print("Available columns for one-hot encoding:")
print("True dataset columns:", list(true_df_cleaned.columns))
print("Fake dataset columns:", list(fake_df_cleaned.columns))

# Label every article with its length category (done once, right before encoding)
true_df_cleaned['text_length_category'] = true_df_cleaned['text'].apply(categorize_text_length)
fake_df_cleaned['text_length_category'] = fake_df_cleaned['text'].apply(categorize_text_length)
print("Text length categories in True dataset:")
print(true_df_cleaned['text_length_category'].value_counts())

print("\nText length categories in Fake dataset:")
print(fake_df_cleaned['text_length_category'].value_counts())

#Perform One Hot Encoding
true_df_encoded = perform_one_hot_encoding(true_df_cleaned, ['text_length_category'])
fake_df_encoded = perform_one_hot_encoding(fake_df_cleaned, ['text_length_category'])

# get_dummies only creates columns for categories that actually occur in a
# frame. Add any missing category as an all-zero column so both frames always
# expose the same four columns (otherwise concatenating them later would leave
# NaN in whichever column one frame lacks).
LENGTH_CATEGORIES = ("empty", "short", "medium", "long")
for category in LENGTH_CATEGORIES:
    dummy_col = f"text_length_category_{category}"
    for encoded in (true_df_encoded, fake_df_encoded):
        if dummy_col not in encoded.columns:
            encoded[dummy_col] = np.uint8(0)

print("\nTrue dataset after one-hot encoding:")
print("New columns:", [col for col in true_df_encoded.columns if 'text_length_category' in col])

print("\nFake dataset after one-hot encoding:")
print("New columns:", [col for col in fake_df_encoded.columns if 'text_length_category' in col])

print("\nSample of encoded True data:")
print(true_df_encoded[['text_length_category'] + [col for col in true_df_encoded.columns if 'text_length_category_' in col]].head())

print("\nSample of encoded Fake data:")
print(fake_df_encoded[['text_length_category'] + [col for col in fake_df_encoded.columns if 'text_length_category_' in col]].head())

# From here on the cleaned frames carry the encoded columns
true_df_cleaned = true_df_encoded
fake_df_cleaned = fake_df_encoded

print(f"True dataset shape after encoding: {true_df_cleaned.shape}")
print(f"Fake dataset shape after encoding: {fake_df_cleaned.shape}")
print(f"True dataset columns after encoding: {true_df_cleaned.columns}")
print(f"Fake dataset columns after encoding: {fake_df_cleaned.columns}")


# Only the last bare expression of a cell is rendered, so both previews are
# displayed explicitly.
display(true_df_cleaned.head())
display(fake_df_cleaned.head())




In [None]:
# # Get encoded columns
# encoded_cols = [col for col in true_df_cleaned.columns if 'text_length_category_' in col]

# if encoded_cols:
#     # Create the heatmap
#     plt.figure(figsize=(12, 8))

#     # Sample first 100 rows for visualization
#     encoded_matrix = true_df_cleaned[encoded_cols].head(100).values

#     # Create heatmap
#     im = plt.imshow(encoded_matrix.T, cmap='RdYlBu', aspect='auto')

#     plt.title('Text Length One-Hot Encoding Matrix\n(Each row = 1, others = 0)', fontsize=14, fontweight='bold')
#     plt.xlabel('Data Points (First 100)', fontsize=12)
#     plt.ylabel('Encoded Categories', fontsize=12)

#     plt.yticks(range(len(encoded_cols)),
#                [col.replace('text_length_category_', '') for col in encoded_cols])

#     plt.colorbar(im, label='Value (0 or 1)')

#     plt.grid(True, alpha=0.3)

#     # Save and show
#     plt.tight_layout()
#     plt.show()

## Word N-gram (Adriena Jiang)

Notes: Using n-gram, we can vectorize with TF-IDF or counts.

In [None]:
def word_ngrams(df, col, n=1, top_k=20, min_df=1):
    """Return the ``top_k`` most frequent word n-grams of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text.
    col : str
        Column the n-grams are computed on.
    n : int
        N-gram size: 1 = unigram, 2 = bigram, 3 = trigram, and so on.
    top_k : int
        How many n-grams to return.
    min_df : int or float
        Passed to ``CountVectorizer`` as ``min_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ngram`` and ``count``, sorted by count descending,
        with a fresh 0..k-1 index.
    """
    documents = df[col].astype(str).tolist()
    vectorizer = CountVectorizer(
        ngram_range=(n, n),
        lowercase=True,
        token_pattern=r"(?u)\b[\w-]+\b",
        min_df=min_df,
    )
    doc_term = vectorizer.fit_transform(documents)

    # Column sums of the document-term matrix = total occurrences per n-gram
    totals = np.asarray(doc_term.sum(axis=0)).ravel()
    table = pd.DataFrame({"ngram": vectorizer.get_feature_names_out(), "count": totals})
    ranked = table.sort_values("count", ascending=False)
    return ranked.head(top_k).reset_index(drop=True)

### Word n-gram for Text (Adriena Jiang)

In [None]:
# Top 20 unigrams/bigrams/trigrams in TRUE articles
top_uni_true, top_bi_true, top_tri_true = [
    word_ngrams(true_df_cleaned, col="text", n=size, top_k=20) for size in (1, 2, 3)
]

# Top 20 unigrams/bigrams/trigrams in FAKE articles
top_uni_fake, top_bi_fake, top_tri_fake = [
    word_ngrams(fake_df_cleaned, col="text", n=size, top_k=20) for size in (1, 2, 3)
]

In [None]:
# Show the tables: True then Fake for each n-gram size
for table in (
    top_uni_true, top_uni_fake,
    top_bi_true, top_bi_fake,
    top_tri_true, top_tri_fake,
):
    display(table)

### Word n-gram for Title (Adriena Jiang)

In [None]:
# Top n-grams on TITLE (True vs Fake); n-grams must occur in at least 2 titles
top_uni_title_true, top_bi_title_true, top_tri_title_true = [
    word_ngrams(true_df_cleaned, col="title", n=size, top_k=20, min_df=2) for size in (1, 2, 3)
]

top_uni_title_fake, top_bi_title_fake, top_tri_title_fake = [
    word_ngrams(fake_df_cleaned, col="title", n=size, top_k=20, min_df=2) for size in (1, 2, 3)
]

In [None]:
display(top_uni_title_true)
display(top_uni_title_fake)
display(top_bi_title_true)
display(top_bi_title_fake)
display(top_tri_title_true)
display(top_tri_title_fake)

## Character N-gram (Adriena Jiang)

In [None]:
def char_ngrams(df, col, ngram_range=(3,5), top_k=20, min_df=2, max_df=0.95, analyzer="char_wb"):
    """Return the ``top_k`` most frequent character n-grams of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text.
    col : str
        Column the n-grams are computed on.
    ngram_range : tuple of int
        (min_n, max_n) character n-gram lengths.
    top_k : int
        How many n-grams to return.
    min_df, max_df : int or float
        Document-frequency bounds passed to ``CountVectorizer``.
    analyzer : str
        ``CountVectorizer`` analyzer name.

    Returns
    -------
    pandas.DataFrame
        Columns ``ngram`` and ``count``, sorted by count descending,
        with a fresh 0..k-1 index.
    """
    documents = df[col].astype(str).tolist()
    vectorizer = CountVectorizer(
        analyzer=analyzer,
        ngram_range=ngram_range,
        lowercase=True,
        min_df=min_df,
        max_df=max_df,
    )
    doc_term = vectorizer.fit_transform(documents)

    # Column sums of the document-term matrix = total occurrences per n-gram
    totals = np.asarray(doc_term.sum(axis=0)).ravel()
    table = pd.DataFrame({"ngram": vectorizer.get_feature_names_out(), "count": totals})
    ranked = table.sort_values("count", ascending=False)
    return ranked.head(top_k).reset_index(drop=True)

### Character n-gram for Text (Adriena Jiang)

In [None]:
# Top 20 character 3- to 5-grams of the article text, per dataset
top_char_text_true, top_char_text_fake = [
    char_ngrams(frame, col="text", ngram_range=(3,5), top_k=20)
    for frame in (true_df_cleaned, fake_df_cleaned)
]

In [None]:
# Show the text tables: True first, then Fake
for table in (top_char_text_true, top_char_text_fake):
    display(table)

### Character n-gram for Title (Adriena Jiang)

In [None]:
# Top 20 character 3- to 5-grams of the titles, per dataset
top_char_title_true, top_char_title_fake = [
    char_ngrams(frame, col="title", ngram_range=(3,5), top_k=20)
    for frame in (true_df_cleaned, fake_df_cleaned)
]

In [None]:
# Show the title tables: True first, then Fake
for table in (top_char_title_true, top_char_title_fake):
    display(table)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Tag each top-20 table with the dataset it came from (assign returns a copy)
true_text = top_char_text_true.assign(Dataset="True")
fake_text = top_char_text_fake.assign(Dataset="Fake")
compare_text = pd.concat([true_text, fake_text])

true_title = top_char_title_true.assign(Dataset="True")
fake_title = top_char_title_fake.assign(Dataset="Fake")
compare_title = pd.concat([true_title, fake_title])

# One grouped horizontal bar chart for TEXT, then one for TITLES
for comparison, title in (
    (compare_text, "Top Character N-grams in TEXT: True vs Fake"),
    (compare_title, "Top Character N-grams in TITLES: True vs Fake"),
):
    plt.figure(figsize=(12,7))
    sns.barplot(data=comparison, x="count", y="ngram", hue="Dataset")
    plt.title(title, fontsize=15)
    plt.xlabel("Frequency")
    plt.ylabel("Character N-gram")
    plt.tight_layout()
    plt.show()


## Sentiment Features (Sanskriti Khadka)

In [None]:
# Sentiment Features - Sanskriti Khadka
# takes 2 mins to run

# Column whose content is scored by VADER.
# NOTE(review): by this point 'text' has been overwritten with the lowercased,
# stop-word-stripped tokens from the stop-word removal step, while the
# pre-removal text is kept in 'text_raw'. Scoring the stripped text may weaken
# VADER's scores — consider switching to 'text_raw'; confirm with the team.
TEXT_COL = 'text'

# Initialize VADER analyzer
analyzer = SentimentIntensityAnalyzer()

def get_vader_scores(text):
    """Score one article with VADER.

    The input is converted with ``str()`` and passed to the notebook-level
    ``analyzer``. The four polarity scores are returned as a ``pandas.Series``
    indexed ``vader_neg``, ``vader_neu``, ``vader_pos``, ``vader_compound``.
    """
    polarity = analyzer.polarity_scores(str(text))
    # (output label, VADER key) pairs, in output order
    fields = (
        ('vader_neg', 'neg'),
        ('vader_neu', 'neu'),
        ('vader_pos', 'pos'),
        ('vader_compound', 'compound'),
    )
    return pd.Series({label: polarity[key] for label, key in fields})

# Applying to both datasets
true_df_cleaned[['vader_neg','vader_neu','vader_pos','vader_compound']] = true_df_cleaned[TEXT_COL].apply(get_vader_scores)
fake_df_cleaned[['vader_neg','vader_neu','vader_pos','vader_compound']] = fake_df_cleaned[TEXT_COL].apply(get_vader_scores)

# Sample results
print(f"\nSample TRUE news sentiment scores:")
display(true_df_cleaned[['text', 'vader_neg','vader_neu','vader_pos','vader_compound']].head())

print(f"\nSample FAKE news sentiment scores:")
display(fake_df_cleaned[['text', 'vader_neg','vader_neu','vader_pos','vader_compound']].head())

In [None]:
#Visualize the sentiment analysis
sent_cols = ["vader_neg", "vader_neu", "vader_pos", "vader_compound"]

# Mean of every VADER score per dataset.
avg_true = true_df_cleaned[sent_cols].mean()
avg_fake = fake_df_cleaned[sent_cols].mean()

avg_df = pd.DataFrame({
    "sentiment": sent_cols,
    "True": avg_true.values,
    "Fake": avg_fake.values
})

# Draw on an explicitly created Axes. DataFrame.plot() opens its own figure
# when no `ax` is passed, so a preceding plt.figure() call only leaves an
# empty extra figure in the cell output.
fig, ax = plt.subplots(figsize=(10, 5))
avg_df.set_index("sentiment").plot(kind="bar", color=["blue", "red"], rot=0, ax=ax)
ax.set_title("Average Sentiment Scores – TRUE vs FAKE")
ax.set_ylabel("Score")
plt.show()


## Train Test Split (Kashvi Vijay)



In [None]:
from sklearn.model_selection import train_test_split

# Label both source frames in place (0 = true news, 1 = fake news), then stack them.
true_df_cleaned['label'] = 0
fake_df_cleaned['label'] = 1
combined_df = pd.concat([true_df_cleaned, fake_df_cleaned], ignore_index=True)

# Only the 'text' column is used as the feature for now; more can be added later.
X, y = combined_df['text'], combined_df['label']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

for caption, part in (
    ("Train set shape (X_train):", X_train),
    ("Test set shape (X_test):", X_test),
    ("Train set shape (y_train):", y_train),
    ("Test set shape (y_test):", y_test),
):
    print(caption, part.shape)

In [None]:
# Visualize the train/test split

import matplotlib.pyplot as plt

sizes = [len(X_train), len(X_test)]
labels = ['Training Set (80%)', 'Test Set (20%)']
colors = ['#5A8DEE', '#FF6F61']
split_title = "Train–Test Split (80/20)"

# Share of each partition as a pie chart.
pie_fig, pie_ax = plt.subplots(figsize=(6, 6))
pie_ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
pie_ax.set_title(split_title, fontsize=14, fontweight='bold')
pie_ax.axis('equal')
plt.show()

# Absolute number of samples per partition as a bar chart.
bar_fig, bar_ax = plt.subplots(figsize=(8, 5))
bar_ax.bar(['Train', 'Test'], sizes, color=colors)
bar_ax.set_title(split_title, fontsize=14, fontweight='bold')
bar_ax.set_ylabel("Number of Samples")
plt.show()



## TF-IDF (Nancy Huang)

This section converts the raw article text into numerical features using TF-IDF, at both the word level and the character level. The two resulting matrices are then combined and used to train the classification models.


In [None]:
# Word-level vectorizer: unigrams + bigrams, English stop words dropped.
WORD_TFIDF_PARAMS = dict(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95,
    max_features=10000,
    stop_words="english",
)
# Character-level vectorizer: n-grams of 3 to 5 characters.
CHAR_TFIDF_PARAMS = dict(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=5,
    max_features=10000,
)
word_tfidf = TfidfVectorizer(**WORD_TFIDF_PARAMS)
char_tfidf = TfidfVectorizer(**CHAR_TFIDF_PARAMS)

# Vocabulary and IDF weights are learned from the training text only ...
X_train_word_tfidf = word_tfidf.fit_transform(X_train)
X_train_char_tfidf = char_tfidf.fit_transform(X_train)

# ... and then applied unchanged to the test text.
X_test_word = word_tfidf.transform(X_test)
X_test_char = char_tfidf.transform(X_test)

# Put word and character features side by side in one CSR matrix per split.
X_train_tfidf = hstack([X_train_word_tfidf, X_train_char_tfidf]).tocsr()
X_test_tfidf = hstack([X_test_word, X_test_char]).tocsr()

# Rows = articles, columns = features (vocabulary size).
print("TF-IDF shapes:")
for caption, matrix in (
    ("  X_train_word :", X_train_word_tfidf),
    ("  X_train_char :", X_train_char_tfidf),
    ("  X_train_tfidf:", X_train_tfidf),
    ("  X_test_word  :", X_test_word),
    ("  X_test_tfidf :", X_test_tfidf),
):
    print(caption, matrix.shape)

In [None]:
#Visualize TF-IDF
# Number of columns (features) in each TF-IDF matrix.
n_word_features = X_train_word_tfidf.shape[1]
n_char_features = X_train_char_tfidf.shape[1]
n_combined_features = X_train_tfidf.shape[1]

tfidf_sizes = pd.DataFrame({
    "Type": ["Word TF-IDF", "Char TF-IDF", "Combined TF-IDF"],
    "Features": [n_word_features, n_char_features, n_combined_features],
})

# --- Figure 1: feature count per representation ---------------------------
fig, ax = plt.subplots(figsize=(8, 5))
# `palette` without `hue` is deprecated in seaborn 0.13, so colour by the x
# variable explicitly; dodge=False keeps one full-width bar per category.
sns.barplot(
    data=tfidf_sizes, x="Type", y="Features",
    hue="Type", palette="Purples", dodge=False, ax=ax,
)
# The hue legend would only repeat the x tick labels; drop it if one was drawn.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()
ax.set_title("TF-IDF Feature Space Size")
ax.set_ylabel("Number of Features")
ax.tick_params(axis="x", rotation=20)
plt.show()

# --- Figure 2: how the combined matrix is composed ------------------------
sizes = [n_word_features, n_char_features]

fig, ax = plt.subplots(figsize=(7, 5))
# Grey bar spans the full combined height; the two coloured bars are stacked over it.
ax.bar(["Combined"], [sum(sizes)], color="lightgray", edgecolor="black")
ax.bar(["Combined"], [sizes[0]], color="blue", label="Word TF-IDF")
ax.bar(["Combined"], [sizes[1]], bottom=[sizes[0]], color="purple", label="Char TF-IDF")

ax.set_title("Composition of Combined TF-IDF Features")
ax.set_ylabel("Number of Features")
ax.legend()
plt.show()


# Modeling

## Logistic Regression (Kashvi Vijay)



**Kashvi Vijay - Model Optimizations**


Originally, I tried using GridSearchCV, but noticed that it was inefficient, so I switched my approach. This approach is more efficient than traditional GridSearchCV because HalvingGridSearchCV adaptively focuses on the most promising hyperparameters instead of testing every single combination. By caching TF-IDF transformations and using fewer cross-validation folds, the tuning process runs much faster while maintaining similar accuracy. Overall, it’s a quicker and more practical way to optimize models for text-based machine learning tasks.

In [None]:
# imports
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import Memory
import tempfile
import numpy as np

# -------------------- sampling --------------------
# Tune on a random ~10% of the training rows (e.g., 4k if 40k total),
# drawn without replacement from a seeded generator.
TUNING_FRACTION = 0.1
rng = np.random.default_rng(42)
sample_size = int(TUNING_FRACTION * len(X_train))
idx = rng.choice(len(X_train), size=sample_size, replace=False)

# robust indexing (works for pandas, numpy, or list)
def subset(data, indices):
    """Pick the entries of ``data`` found at the given positions.

    Objects exposing ``.iloc`` are sliced positionally and keep their type;
    anything else that supports ``[]`` is indexed one element at a time and
    the picked elements are returned as a plain list.

    Raises:
        TypeError: if ``data`` offers neither ``.iloc`` nor ``[]`` indexing.
    """
    if hasattr(data, "iloc"):
        return data.iloc[indices]
    if not hasattr(data, "__getitem__"):
        raise TypeError("Unsupported data type for X_train/y_train")
    picked = []
    for position in indices:
        picked.append(data[position])
    return picked

from sklearn.base import clone

X_sub = subset(X_train, idx)
y_sub = subset(y_train, idx)

# -------------------- cache --------------------
# Fitted pipeline transformers are cached on disk in a fresh temp directory.
cachedir = tempfile.mkdtemp()
memory = Memory(cachedir, verbose=0)

# -------------------- tf-idf --------------------
# Work on an unfitted copy: calling set_params on `char_tfidf` itself would
# silently change the hyper-parameters of the vectorizer that was already
# fitted (with a larger vocabulary) in the TF-IDF section.
char_tfidf_tuning = clone(char_tfidf).set_params(max_features=3000)

# -------------------- pipeline --------------------
pipeline = Pipeline([
    ('tfidf', char_tfidf_tuning),
    ('logreg', LogisticRegression(
        solver='saga',
        max_iter=80,
        tol=5e-3,
        random_state=42
    ))
], memory=memory)

# -------------------- lean grid --------------------
param_grid = {
    'logreg__C': [0.5, 1.0],
    'logreg__penalty': ['l2']
}

# -------------------- halving search --------------------
grid_search = HalvingGridSearchCV(
    pipeline,
    param_grid,
    cv=2,
    factor=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0,
    aggressive_elimination=True
)

print(f"Running quick halving grid search on {sample_size:,} samples…")
grid_search.fit(X_sub, y_sub)

# -------------------- evaluate on full test --------------------
# best_estimator_ was fitted on the tuning sub-sample (X_sub) only; it is
# scored here on the complete held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("\nBest params:", grid_search.best_params_)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification report:\n", classification_report(y_test, y_pred, zero_division=0))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# # grid search gives the best model
# best_model = grid_search.best_estimator_

# # predictions on test
# y_pred = best_model.predict(X_test)

# # evaluate model
# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)
# conf_matrix = confusion_matrix(y_test, y_pred)

# print(f"\nModel Accuracy: {accuracy:.4f}")
# print("\nClassification Report:")
# print(report)
# print("\nConfusion Matrix:")
# print(conf_matrix)

## BERT (Nancy Huang, Lin Zhang, Adriena Jiang)

RESULTS TOO SUSPICIOUS: we need to re-evaluate data cleaning; data leakage may (definitely) have occurred.

Discussion topics during meeting: we have to go back to data cleaning and resolve some issues that are causing our data leak, as well as aspects we did not fully consider before.

To start --> I decided to remove all URL tokens from the articles. After thinking about it, having "< URL >" in the text also gives away that an article is fake, because only fake news articles really had those links, so it was reasonable to drop them completely.

Some additional patterns I noticed:

- we should drop the subject column (I remember we decided to reinstate this but there is a lot of political news which makes it easy for the model to catch up on the fake/true news)
- stop-word removal: after doing some research, it seems that removing stop words hurts the BERT model's training, so we should consider cloning the data into a separate dataframe dedicated to BERT (or using the one where stop words aren't removed): text_for_bert
- thoughts on removing: location/social media handles
- ngrams showed that the fake news articles also had image urls and some videos which does not help us in any way so thoughts on removing them?

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
import gc
import torch

In [None]:
# Report whether PyTorch can see a CUDA device.
gpu_available = torch.cuda.is_available()
print("GPU Available:", gpu_available)

BERT stands for Bidirectional Encoder Representations from Transformers.

It's a pretrained language model that understands the context of the text in both directions so left to right and right to left.

In [None]:
#for quick testing, will affect the results
SAMPLE_SIZE = 5000
print(f"\nUsing {SAMPLE_SIZE} training samples for faster training")

# Sample the texts first, then fetch the labels by index so both stay aligned.
n_train_rows = min(SAMPLE_SIZE, len(X_train))
X_train_sample = X_train.sample(n=n_train_rows, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

# Same approach for the evaluation subset, capped at 1000 rows.
n_test_rows = min(1000, len(X_test))
X_test_sample = X_test.sample(n=n_test_rows, random_state=42)
y_test_sample = y_test.loc[X_test_sample.index]

# converts raw text into numbers so that BERT can understand them through tokenization
class NewsDataset(Dataset):
    """Torch dataset that tokenizes one article per item.

    Args:
        texts: positionally indexable (``.iloc``) collection of article texts.
        labels: positionally indexable (``.iloc``) collection of class labels,
            in the same order as ``texts``.
        tokenizer: callable tokenizer; it is invoked with the text plus
            padding/truncation keyword arguments and must return a mapping
            with 'input_ids' and 'attention_mask' tensors.
        max_length: length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized article at position ``idx`` with its label."""
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in
        # transformers in favour of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True, #CLS (start) and SEP (end) tokens
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The encoded tensors are flattened to 1-D so that the DataLoader can
        # stack them into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

print("Loading BERT model...")
# Load the pretrained 'bert-base-uncased' tokenizer and a 2-class classification model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

print("Creating datasets...")
train_dataset = NewsDataset(X_train_sample, y_train_sample, tokenizer)
test_dataset = NewsDataset(X_test_sample, y_test_sample, tokenizer)

# Batch the datasets; only the training batches are shuffled.
# (Training runs on `device`, i.e. the GPU when one is available.)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
print(f"Training batches: {len(train_loader)}")
print(f"Testing batches: {len(test_loader)}")

# Adam updates the model weights to reduce the training loss.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
epochs = 2
# Each epoch = one full training pass followed by an evaluation pass.
for epoch in range(epochs):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"{'='*50}")

    model.train()
    train_loss = 0
    batch_count = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagate the loss, then apply one optimizer step.
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        batch_count += 1

        if batch_count % 50 == 0:
            print(f"Batch {batch_count}/{len(train_loader)}, Loss: {loss.item():.4f}")

        if batch_count % 100 == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()

    avg_train_loss = train_loss / len(train_loader)
    print(f"\nAverage Training Loss: {avg_train_loss:.4f}")

    # Evaluate on the sampled test set after every epoch. No training happens
    # here: the model is in eval mode and gradients are disabled.
    print("Evaluating on test set...")
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    print(f"Test Accuracy: {accuracy:.4f}")

# `predictions`, `true_labels` and `accuracy` now hold the values from the
# last epoch; the final report below is based on them.
print("\n" + "="*50)
print("FINAL RESULTS")
print("="*50)
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['True News', 'Fake News']))

print("\nConfusion Matrix:")
print(confusion_matrix(true_labels, predictions))

# `model` is deliberately kept alive (the `del` is commented out); the
# model-saving cell at the end of the notebook references it.
# del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
from collections import Counter

# Class balance of the sampled splits ...
for caption, sampled_labels in (
    ("Train distribution:", y_train_sample),
    ("\nTest distribution:", y_test_sample),
):
    print(caption)
    print(sampled_labels.value_counts())

# ... followed by how often each class was predicted.
print("\nPrediction distribution:")
print(Counter(predictions))

Model without exclamation and question marks

In [None]:

# Choose the baseline accuracy for the punctuation test. Preference order:
# the location-removal result, then the with-locations result, and finally
# the `accuracy` left behind by the original BERT run.
for _baseline_name in ("accuracy_no_locations", "acc_with_locations"):
    if _baseline_name in globals():
        acc_baseline = globals()[_baseline_name]
        break
else:
    # Neither optional experiment was run in this session.
    acc_baseline = accuracy
print(f"\nBaseline accuracy for punctuation test: {acc_baseline:.4f} ({acc_baseline*100:.2f}%)")


"""## BERT Data Leakage Analysis: Exclamation & Question Mark Removal (Lin Zhang)

Testing if exclamation marks and question marks are causing data leakage.
Analysis showed fake news has 12x more exclamation marks and question marks.
"""

def remove_exclamation_questions(text):
    """Turn every '!' and '?' in ``text`` into a period.

    Runs of two or more periods are then collapsed into a single one, runs of
    whitespace are collapsed into a single space, and the result is stripped.
    Values that are not strings, and empty or whitespace-only strings, are
    handed back unchanged.
    """
    if isinstance(text, str) and text.strip():
        # Periods (rather than deletion) keep the sentence boundaries intact.
        as_periods = text.translate(str.maketrans({'!': '.', '?': '.'}))
        single_periods = re.sub(r'\.{2,}', '.', as_periods)
        single_spaces = re.sub(r'\s+', ' ', single_periods)
        return single_spaces.strip()
    return text

print("\n" + "="*80)
print("BERT DATA LEAKAGE ANALYSIS: Testing Exclamation & Question Mark Removal")
print("="*80)

# Create version without exclamation/question marks
# (same sampled rows as the baseline run; only the text is rewritten, so the
# existing label samples stay aligned).
print("\nRemoving exclamation marks (!) and question marks (?) from text...")
X_train_no_punct = X_train_sample.apply(lambda x: remove_exclamation_questions(str(x)))
X_test_no_punct = X_test_sample.apply(lambda x: remove_exclamation_questions(str(x)))

print("Creating datasets without exclamation/question marks...")
# Recreate tokenizer if needed
if 'tokenizer' not in locals() or tokenizer is None:
    print("Recreating tokenizer...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_dataset_no_punct = NewsDataset(X_train_no_punct, y_train_sample, tokenizer)
test_dataset_no_punct = NewsDataset(X_test_no_punct, y_test_sample, tokenizer)

# Use smaller batch size to save memory
# NOTE(review): the baseline BERT run uses batch_size=16, so the two runs
# differ in batch size as well as in punctuation.
BATCH_SIZE_NO_PUNCT = 8
train_loader_no_punct = DataLoader(train_dataset_no_punct, batch_size=BATCH_SIZE_NO_PUNCT, shuffle=True)
test_loader_no_punct = DataLoader(test_dataset_no_punct, batch_size=BATCH_SIZE_NO_PUNCT, shuffle=False)

print("Loading BERT model for punctuation-removed training...")
# Clear any remaining memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Start again from the pretrained weights so nothing carries over from the baseline model.
model_no_punct = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model_no_punct.to(device)

# Verify memory
if torch.cuda.is_available():
    print(f"Model loaded. GPU memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

# Same optimizer, learning rate and epoch count as the baseline run.
optimizer_no_punct = torch.optim.Adam(model_no_punct.parameters(), lr=2e-5)
epochs = 2

print(f"\nTraining BERT WITHOUT exclamation/question marks ({epochs} epochs)...")
for epoch in range(epochs):
    print(f"\n{'='*50}")
    print(f"Epoch {epoch + 1}/{epochs} (NO ! ?)")
    print(f"{'='*50}")

    model_no_punct.train()
    train_loss = 0
    batch_count = 0

    for batch in train_loader_no_punct:
        optimizer_no_punct.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model_no_punct(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backpropagate the loss, then apply one optimizer step.
        loss.backward()
        optimizer_no_punct.step()

        train_loss += loss.item()
        batch_count += 1

        if batch_count % 50 == 0:
            print(f"Batch {batch_count}/{len(train_loader_no_punct)}, Loss: {loss.item():.4f}")
            # Clear cache more frequently to save memory
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    avg_train_loss = train_loss / len(train_loader_no_punct)
    print(f"\nAverage Training Loss: {avg_train_loss:.4f}")

# Evaluate without exclamation/question marks
# (runs once, after the last epoch; the baseline loop evaluates after every epoch).
print("\nEvaluating on test set (NO ! ?)...")
model_no_punct.eval()
predictions_no_punct = []
true_labels_no_punct = []

with torch.no_grad():
    for batch in test_loader_no_punct:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model_no_punct(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        predictions_no_punct.extend(preds.cpu().numpy())
        true_labels_no_punct.extend(labels.cpu().numpy())

accuracy_no_punct = accuracy_score(true_labels_no_punct, predictions_no_punct)
print(f"Test Accuracy (NO ! ?): {accuracy_no_punct:.4f}")

# Clean up
print("\nCleaning up GPU memory...")
# The objects below are intentionally kept alive (deletes are commented out).
# del model_no_punct
# del train_loader_no_punct
# del test_loader_no_punct
# del train_dataset_no_punct
# del test_dataset_no_punct
# del optimizer_no_punct
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    # total_memory and memory_allocated() are both byte counts; convert to MB
    # so the printed number matches its unit label.
    free_bytes = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0)
    print(f"Cleanup complete. Free GPU memory: {free_bytes / 1024**2:.2f} MB")

# ============================================================================
# COMPARISON: WITH vs WITHOUT Exclamation/Question Marks
# ============================================================================
banner = "=" * 80
# Absolute accuracy difference between the two runs, as a fraction and in percent.
accuracy_gap = abs(acc_baseline - accuracy_no_punct)
gap_pct = accuracy_gap * 100

print("\n" + banner)
print("COMPARISON: WITH vs WITHOUT Exclamation/Question Marks")
print(banner)

print("\nResults:")
print("┌─────────────────────────────────────────────────────────┐")
print(f"│ WITH ! ? (baseline):     {acc_baseline:.4f} ({acc_baseline*100:.2f}%)                    │")
print(f"│ WITHOUT ! ?:             {accuracy_no_punct:.4f} ({accuracy_no_punct*100:.2f}%)                    │")
print(f"│ Difference:              {accuracy_gap:.4f} ({gap_pct:.2f}%)                    │")
print("└─────────────────────────────────────────────────────────┘")

# Three cases: a drop of more than 5 points, a gain of more than 2 points,
# or anything in between.
print("\nAnalysis:")
if acc_baseline > accuracy_no_punct + 0.05:
    analysis_lines = [
        f"WARNING: Removing ! ? reduced accuracy by {gap_pct:.2f}%",
        "   -> This suggests exclamation/question marks WERE contributing to data leakage",
        "   -> Model was learning '! ? = fake news' instead of content patterns",
        "   -> This is a legitimate signal but may be dataset-specific",
    ]
elif accuracy_no_punct > acc_baseline + 0.02:
    analysis_lines = [
        f"INFO: Removing ! ? improved accuracy by {gap_pct:.2f}%",
        "   -> This suggests punctuation was adding slight noise",
        "   -> Use model WITHOUT punctuation",
    ]
else:
    analysis_lines = [f"KEY FINDING: Punctuation removal had {gap_pct:.2f}% impact"]
    if accuracy_gap < 0.03:
        analysis_lines += [
            "   -> Exclamation/question marks are NOT the main source of high accuracy",
            "   -> Other writing style features are more important",
        ]
    else:
        analysis_lines.append(
            "   -> Exclamation/question marks contribute to accuracy but not the only factor"
        )
for line in analysis_lines:
    print(line)

# Judge the punctuation-free accuracy on its own: above 85%, within 65-85%, or below 65%.
print("\nAccuracy Assessment:")
assessment_lines = []
if accuracy_no_punct > 0.85:
    assessment_lines = [
        f"WARNING: WITHOUT ! ?: {accuracy_no_punct*100:.2f}% is still high - other leakage sources exist",
        "   -> Need to investigate other writing style features",
    ]
elif 0.65 <= accuracy_no_punct <= 0.85:
    assessment_lines = [
        f"SUCCESS: WITHOUT ! ?: {accuracy_no_punct*100:.2f}% is in realistic range (65-85%)",
        "   -> Exclamation/question marks were causing data leakage",
    ]
elif accuracy_no_punct < 0.65:
    assessment_lines = [
        f"WARNING: WITHOUT ! ?: {accuracy_no_punct*100:.2f}% is low - may need more training",
    ]
for line in assessment_lines:
    print(line)

print("\n" + banner)
print("CONCLUSION: Data Leakage Investigation")
print(banner)
print("\nKey Findings:")
print(f"  1. Exclamation/Question removal: {gap_pct:.2f}% impact")
print(f"  2. Baseline (with ! ?): {acc_baseline*100:.2f}%")
print(f"  3. Without ! ?: {accuracy_no_punct*100:.2f}%")
print("\nInterpretation:")





# LR MODEL: Content Only (50 Features)

   What We Used:
   - Text content only (no metadata)
   - Removes URLs, news organization names (Reuters, CNN, etc.), and social media artifacts
   - Removes stop words (is, the, a, etc.)
   - Converts text into numerical features using TF-IDF
   - Top 50 most common words
   - Strong regularization (C=0.001); Regularization reduces overfitting
   - Cross-validation (5-fold)

**Test accuracy: ~74.27%**

**Cross-validation accuracy: ~72.45% ± 0.45%**

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.dummy import DummyClassifier
from scipy.sparse import hstack
import re
import warnings

from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# ============================================================================
# CROSS-VALIDATION SETUP
# ============================================================================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# ============================================================================
# MODEL 1: Content Only - Minimal Features (50)
# ============================================================================


print("\n" + "=" * 80)
print("MODEL Content Only (50 Features)")
print("=" * 80)
print("\n   What We Used:")
print("   - Text content only (no metadata)")
print("   - TF-IDF word features (unigrams only)")
print("   - Top 50 most common words")
print("   - Strong regularization (C=0.001)")
print("   - Cross-validation (5-fold)")

# Unigram TF-IDF limited to 50 features; very rare (min_df) and very common
# (max_df) words are excluded before the cut.
tfidf_50 = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 1),
    min_df=500,
    max_df=0.60,
    max_features=50,
    stop_words="english"
)

X_train_50 = tfidf_50.fit_transform(X_train)
X_test_50 = tfidf_50.transform(X_test)

lr_50 = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1, C=0.001)
lr_50.fit(X_train_50, y_train)

test_pred_50 = lr_50.predict(X_test_50)
test_acc_50 = accuracy_score(y_test, test_pred_50)

# Cross-validate the vectorizer and the classifier together. Scoring the
# classifier on X_train_50 would reuse a vocabulary and IDF weights that were
# learned from all of X_train, including each fold's validation rows. With a
# pipeline the TF-IDF step is refitted on the training part of every fold.
cv_pipeline_50 = make_pipeline(clone(tfidf_50), clone(lr_50))
cv_scores_50 = cross_val_score(cv_pipeline_50, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)

train_acc_50 = lr_50.score(X_train_50, y_train)

print(f"\n   Results:")
print(f"   Test Accuracy: {test_acc_50:.4f} ({test_acc_50*100:.2f}%)")
print(f"   CV Accuracy:   {cv_scores_50.mean():.4f} ({cv_scores_50.mean()*100:.2f}%) ± {cv_scores_50.std():.4f}")
print(f"   Train-Test Gap: {abs(train_acc_50 - test_acc_50):.4f}")


## Neural Networks (Sanskriti K, Ousman, Harshika)

In this section, I use TensorFlow/Keras to train two neural networks that can tell apart real vs. fake news based on the text content.
Both models take the article text, turn it into word tokens, and learn patterns directly from the words.

1. AVG Model (Fast Baseline) – a simple network that averages word embeddings.


2. CNN Model (1D Convolution) – a slightly deeper model that learns local patterns (like short word phrases) using filters. It can capture context better.

Both models performed very well on the test data.
The AVG model reached 96.3% accuracy.
The CNN model performed even better, with 98.4% accuracy.
The confusion matrices confirm that both models made very few mistakes, especially on real news articles.

However, overfitting may exist.

In [None]:
# Neural Networks with TensorFlow/Keras
import os, gc, numpy as np, tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

# SAMPLING FOR FASTER TRAINING
SAMPLE_SIZE = 5000
VAL_FRACTION = 0.2  # share of the training sample held out for early stopping

# Sample from train/test
# NOTE: these four names are also assigned by the BERT section (with a
# different seed); running this cell replaces those samples.
X_train_sample = X_train.sample(n=min(SAMPLE_SIZE, len(X_train)), random_state=1234)
y_train_sample = y_train.loc[X_train_sample.index]

X_test_sample = X_test.sample(n=min(1000, len(X_test)), random_state=1234)
y_test_sample = y_test.loc[X_test_sample.index]

SEED = 1234
tf.keras.utils.set_random_seed(SEED)

# Hold out part of the *training* sample for validation. The callbacks below
# monitor val_loss to stop training and to restore the best weights; feeding
# them the test set would tune the models on the very data they are scored on.
fit_index, val_index = train_test_split(
    X_train_sample.index.to_numpy(),
    test_size=VAL_FRACTION,
    random_state=SEED,
    stratify=y_train_sample.values,
)

# DATA PREPARATION
# Use the cleaned text column directly
TEXT_COL_DL = 'text'  # This has stop words removed and is cleaned

# get sampled texts
train_texts = combined_df.loc[fit_index, TEXT_COL_DL].astype(str).values
val_texts   = combined_df.loc[val_index, TEXT_COL_DL].astype(str).values
test_texts  = combined_df.loc[X_test_sample.index, TEXT_COL_DL].astype(str).values
y_train_np  = y_train_sample.loc[fit_index].values.astype(np.int32)
y_val_np    = y_train_sample.loc[val_index].values.astype(np.int32)
y_test_np   = y_test_sample.values.astype(np.int32)

# text -> token ids
VOCAB_SIZE = 20000  # reduced from 50000 for speed
SEQUENCE_LENGTH = 200  # reduced from 300 for speed

vectorizer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
    standardize="lower_and_strip_punctuation",
)
# The vocabulary is learned from the fitting texts only.
vectorizer.adapt(train_texts)

# fast input pipeline
BATCH_SIZE = 128  # increased for faster training
AUTOTUNE = tf.data.AUTOTUNE


def make_ds(texts, labels, training=True):
    """Build a batched, prefetched dataset of (text, label) pairs.

    Training datasets are shuffled with a fixed seed; others keep their order.
    """
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))
    if training:
        ds = ds.shuffle(min(len(texts), 5000), seed=SEED)
    ds = ds.batch(BATCH_SIZE).prefetch(AUTOTUNE)
    return ds


def vec_map(x, y):
    """Replace a batch of raw texts with token ids; labels pass through."""
    return vectorizer(x), y


train_ds = make_ds(train_texts, y_train_np, training=True).map(vec_map, num_parallel_calls=AUTOTUNE)
val_ds   = make_ds(val_texts,   y_val_np,   training=False).map(vec_map, num_parallel_calls=AUTOTUNE)
test_ds  = make_ds(test_texts,  y_test_np,  training=False).map(vec_map, num_parallel_calls=AUTOTUNE)

# training helpers
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-5, verbose=1)


# Model A: Embedding + GlobalAverage
def build_avg_model(vocab_size=VOCAB_SIZE, seq_len=SEQUENCE_LENGTH, emb_dim=64, dropout=0.3):
    """Build and compile the averaging baseline.

    Token embeddings are averaged over the sequence, passed through one
    hidden dense layer (with dropout before and after) and a sigmoid output.
    """
    inputs = layers.Input(shape=(seq_len,), dtype=tf.int64)
    x = layers.Embedding(vocab_size, emb_dim)(inputs)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs, name="avg_pool_model")
    model.compile(optimizer=keras.optimizers.Adam(1e-3),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model


# Model B: Embedding + 1D CNN
def build_cnn_model(vocab_size=VOCAB_SIZE, seq_len=SEQUENCE_LENGTH,
                    emb_dim=64, num_filters=64, kernel_size=5, dropout=0.3):
    """Build and compile the 1D-convolution model.

    Token embeddings go through one Conv1D layer and global max pooling,
    then one hidden dense layer (with dropout before and after) and a
    sigmoid output.
    """
    inputs = layers.Input(shape=(seq_len,), dtype=tf.int64)
    x = layers.Embedding(vocab_size, emb_dim)(inputs)
    x = layers.Conv1D(num_filters, kernel_size, padding="same", activation="relu")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs, name="cnn_1d_model")
    model.compile(optimizer=keras.optimizers.Adam(1e-3),  # increased from 2e-4
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model


def fit_and_evaluate(model, tag, epochs):
    """Train ``model`` with validation-based callbacks, then score it on the test sample.

    Args:
        model: compiled Keras model.
        tag: short model name used in the printed accuracy line.
        epochs: maximum number of epochs (early stopping may end sooner).

    Returns:
        (history, probs, pred, acc): the Keras History, the predicted
        probabilities, the 0/1 predictions (threshold 0.5) and the test accuracy.
    """
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=[early_stop, reduce_lr],
        verbose=0  # silent training
    )
    print(f"Final Val Accuracy: {history.history['val_accuracy'][-1]:.4f}")

    # eval
    probs = model.predict(test_ds, verbose=0).ravel()
    pred = (probs >= 0.5).astype(int)
    acc = accuracy_score(y_test_np, pred)
    print(f"\n[{tag} Model] Accuracy: {acc:.4f}")
    print(classification_report(y_test_np, pred, target_names=['True', 'Fake']))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test_np, pred))
    return history, probs, pred, acc


print("\nTraining AVG Model")
print("-"*50)
avg_model = build_avg_model()
history_avg, avg_probs, avg_pred, avg_acc = fit_and_evaluate(avg_model, "AVG", epochs=5)

print("\nTraining CNN Model")
print("-"*50)
cnn_model = build_cnn_model()
history_cnn, cnn_probs, cnn_pred, cnn_acc = fit_and_evaluate(cnn_model, "CNN", epochs=10)

# cleanup
# The models are kept alive on purpose; the saving cell below references them.
# del avg_model, cnn_model
gc.collect()

## Save Models

In [None]:
import joblib
import torch
from tensorflow import keras
import os

# --- Setup ---
model_dir = 'Trained_models'
os.makedirs(model_dir, exist_ok=True)
print(f"Saving models to directory: {model_dir}/")
print("-" * 50)

# --- 1. Logistic Regression Model (Scikit-learn/joblib) ---
# Prefer a model named `log_reg` when one exists; otherwise fall back to
# `best_model`, the tuned TF-IDF + LogisticRegression pipeline produced by the
# halving grid search. Without the fallback this step is always skipped,
# because no cell in the modeling section assigns `log_reg`.
try:
    try:
        lr_to_save = log_reg
    except NameError:
        lr_to_save = best_model
    joblib.dump(lr_to_save, os.path.join(model_dir, 'logistic_regression_model.joblib'))
    print("✅ Logistic Regression Model saved.")
except NameError:
    print("⚠️ Logistic Regression model not found. Skipping...")

# --- 2. BERT Model (PyTorch/Hugging Face) ---
# Hugging Face models and tokenizers are written with save_pretrained(),
# both into the same sub-directory.
try:
    bert_dir = os.path.join(model_dir, 'bert_model')
    model.save_pretrained(bert_dir)
    tokenizer.save_pretrained(bert_dir)
    print("✅ BERT Model saved.")
except NameError:
    print("⚠️ BERT model not found. Skipping...")

# --- 3. AVG Model (Keras/TensorFlow) ---
try:
    avg_model.save(os.path.join(model_dir, 'avg_pool_model.keras'))
    print("✅ AVG Pool Keras Model saved.")
except NameError:
    print("⚠️ AVG Pool model not found. Skipping...")

# --- 4. CNN Model (Keras/TensorFlow) ---
try:
    cnn_model.save(os.path.join(model_dir, 'cnn_1d_model.keras'))
    print("✅ CNN 1D Keras Model saved.")
except NameError:
    print("⚠️ CNN model not found. Skipping...")

print("-" * 50)
print("Model saving process completed.")