# Part 4

In [1]:
import csv
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import load, dump
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.neural_network import MLPClassifier

test_df = pd.read_csv('test_data.csv')
# print(f"Test set size: {len(test_df)}")
# print(f"Test set distribution: \n{test_df['type'].value_counts()}")

# Output locations, resolved against the directory the notebook is run from.
plots_dir = os.path.join(os.getcwd(), 'plots')
models_dir = os.path.join(os.getcwd(), 'models')
# plt.savefig() does not create missing directories, so both output folders
# are created up front (previously only models_dir was).
os.makedirs(plots_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# Add and then apply binary classification
# Every FakeNewsCorpus 'type' other than 'reliable' is collapsed into 'fake'.
reliable_types = ['reliable']
fake_types = ['fake', 'conspiracy', 'junksci', 'hate', 'unreliable',
              'bias', 'satire', 'clickbait', 'political', 'rumor', 'unknown']


def create_binary_label(news_type):
    """Map a FakeNewsCorpus ``type`` value to a binary label.

    Args:
        news_type: Raw value from the ``type`` column (may be missing).

    Returns:
        ``'reliable'`` if the value is in ``reliable_types``, ``'fake'`` if it
        is in ``fake_types``, and ``np.nan`` for missing or unrecognised values.
    """
    if pd.isna(news_type):
        return np.nan
    if news_type in reliable_types:
        return 'reliable'
    if news_type in fake_types:
        return 'fake'
    return np.nan


if 'binary_type' not in test_df.columns:
    test_df['binary_type'] = test_df['type'].apply(create_binary_label)

# Rows without text or without a usable label cannot be vectorised or scored
# (NaN in y makes f1_score raise), so they are removed before evaluation.
n_test_rows_loaded = len(test_df)
test_df = test_df.dropna(subset=['content', 'binary_type']).reset_index(drop=True)
if len(test_df) < n_test_rows_loaded:
    print(f"Dropped {n_test_rows_loaded - len(test_df)} test rows with missing content or binary label")

#print(f"Binary test set distribution: \n{test_df['binary_type'].value_counts()}")

# Every artifact below is needed by the evaluation cells, so the cached copies
# are only reused when ALL of them exist; otherwise everything is retrained.
# (Previously only the two model files were checked, so a missing vectorizer
# file raised on load instead of triggering a retrain.)
artifact_paths = {
    artifact_name: os.path.join(models_dir, f'{artifact_name}.joblib')
    for artifact_name in ('baseline_model', 'advanced_model', 'count_vectorizer', 'tfidf_vectorizer')
}

if all(os.path.exists(artifact_path) for artifact_path in artifact_paths.values()):
    # NOTE(review): joblib.load unpickles arbitrary objects - only load files
    # that were written by this notebook / a trusted source.
    baseline_model = load(artifact_paths['baseline_model'])
    advanced_model = load(artifact_paths['advanced_model'])
    count_vectorizer = load(artifact_paths['count_vectorizer'])
    tfidf_vectorizer = load(artifact_paths['tfidf_vectorizer'])
else:
    # Train baseline model
    train_df = pd.read_csv('train_data.csv')
    # Add binary labels to training data if needed
    if 'binary_type' not in train_df.columns:
        print("Creating binary classification labels for training data...")
        reliable_types = ['reliable']
        fake_types = ['fake', 'conspiracy', 'junksci', 'hate', 'unreliable',
                      'bias', 'satire', 'clickbait', 'political', 'rumor', 'unknown']

        train_df['binary_type'] = train_df['type'].apply(lambda x: 'reliable' if x in reliable_types else 'fake' if x in fake_types else np.nan)
    # Unmapped/missing types become NaN above; NaN labels or NaN text make
    # fit()/fit_transform() raise, so those rows are excluded from training.
    train_df = train_df.dropna(subset=['content', 'binary_type']).reset_index(drop=True)
    y_train = train_df['binary_type']
    # Baseline model (Logistic Regression with CountVectorizer)
    count_vectorizer = CountVectorizer(max_features=10000)
    X_train_count = count_vectorizer.fit_transform(train_df['content'])
    baseline_model = LogisticRegression(max_iter=1000, random_state=42)
    baseline_model.fit(X_train_count, y_train)
    # Advanced model (Neural Network with TF-IDF)
    tfidf_vectorizer = TfidfVectorizer(max_features=10000, min_df=5, max_df=0.8, ngram_range=(1, 2), sublinear_tf=True)
    X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['content'])
    advanced_model = MLPClassifier(hidden_layer_sizes=(100,), alpha=0.0001, random_state=42, max_iter=300, early_stopping=True)
    advanced_model.fit(X_train_tfidf, y_train)
    # Save models for future use
    dump(baseline_model, artifact_paths['baseline_model'])
    dump(advanced_model, artifact_paths['advanced_model'])
    dump(count_vectorizer, artifact_paths['count_vectorizer'])
    dump(tfidf_vectorizer, artifact_paths['tfidf_vectorizer'])

# Task 1: Evaluate on FakeNewsCorpus test set

In [2]:
# Vectorise the FakeNewsCorpus test articles with the vectorizers fitted on the
# training data (transform only - never re-fit on test data).
X_test_count = count_vectorizer.transform(test_df['content'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['content'])
y_test = test_df['binary_type']

# Evaluate baseline model
baseline_pred = baseline_model.predict(X_test_count)
fakenews_baseline_f1 = f1_score(y_test, baseline_pred, pos_label='fake')
print(f"\nBaseline Model (Logistic Regression) - F1 Score: {fakenews_baseline_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, baseline_pred))

# Evaluate advanced model
advanced_pred = advanced_model.predict(X_test_tfidf)
fakenews_advanced_f1 = f1_score(y_test, advanced_pred, pos_label='fake')
print(f"\nAdvanced Model (Neural Network) - F1 Score: {fakenews_advanced_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, advanced_pred))

# Create confusion matrices
# confusion_matrix() orders classes alphabetically ('fake', 'reliable') unless
# `labels` is given. The same list is therefore passed to confusion_matrix and
# used for the tick labels, so rows/columns and their captions always agree.
class_labels = ['reliable', 'fake']
cm_baseline = confusion_matrix(y_test, baseline_pred, labels=class_labels)
cm_advanced = confusion_matrix(y_test, advanced_pred, labels=class_labels)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, cm, model_name in zip(axes, (cm_baseline, cm_advanced), ('Baseline', 'Advanced')):
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name} Model (FakeNewsCorpus)', fontsize=14)
    ax.set_xlabel('Predicted', fontsize=12)
    ax.set_ylabel('Actual', fontsize=12)
fig.tight_layout()
fig.savefig(os.path.join(plots_dir, 'Part4Task1_confusion_matrices_fakenews.png'))
# Close this specific figure so it is neither rendered inline nor kept in memory.
plt.close(fig)


Baseline Model (Logistic Regression) - F1 Score: 0.9412

Classification Report:
              precision    recall  f1-score   support

        fake       0.90      0.99      0.94        73
    reliable       0.93      0.64      0.76        22

    accuracy                           0.91        95
   macro avg       0.92      0.81      0.85        95
weighted avg       0.91      0.91      0.90        95


Advanced Model (Neural Network) - F1 Score: 0.9419

Classification Report:
              precision    recall  f1-score   support

        fake       0.89      1.00      0.94        73
    reliable       1.00      0.59      0.74        22

    accuracy                           0.91        95
   macro avg       0.95      0.80      0.84        95
weighted avg       0.92      0.91      0.90        95



# Task 2: Evaluate on LIAR dataset

In [3]:
# Define column names based on the LIAR dataset description
# Define column names based on the LIAR dataset description
columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info',
          'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts',
          'mostly_true_counts', 'pants_on_fire_counts', 'context']
# Load the dataset combine it into one dataframe and map LIAR labels to binary labels based on the LIAR paper
# The LIAR files are raw TSV: statements contain unescaped double quotes, and
# with pandas' default quoting an unbalanced '"' swallows following lines into a
# single field (silently losing rows). QUOTE_NONE reads every line as one row.
# The splits get LIAR-specific names so they do not overwrite the
# FakeNewsCorpus `test_df` / `train_df` used by the earlier cells.
liar_read_options = dict(sep='\t', header=None, names=columns, quoting=csv.QUOTE_NONE)
liar_test_df = pd.read_csv("liar_dataset/test.tsv", **liar_read_options)
liar_val_df = pd.read_csv("liar_dataset/valid.tsv", **liar_read_options)
liar_train_df = pd.read_csv("liar_dataset/train.tsv", **liar_read_options)
liar_df = pd.concat([liar_train_df, liar_val_df, liar_test_df], ignore_index=True)
print(f"Loaded LIAR dataset with {len(liar_df)} rows")
print(f"LIAR label distribution: \n{liar_df['label'].value_counts()}")
reliable_labels = ['true', 'mostly-true', 'half-true']
fake_labels = ['barely-true', 'false', 'pants-fire']
liar_df['binary_type'] = liar_df['label'].apply(lambda x: 'reliable' if x in reliable_labels else 'fake' if x in fake_labels else np.nan)
# Rows without a statement or with an unmapped label cannot be vectorised or
# scored (NaN in y makes f1_score raise), so they are excluded.
liar_df = liar_df.dropna(subset=['statement', 'binary_type']).reset_index(drop=True)
print(f"Binary LIAR label distribution: \n{liar_df['binary_type'].value_counts()}")

# Reuse the vectorizers fitted on FakeNewsCorpus: this measures cross-domain
# generalisation, so nothing is re-fitted on LIAR.
X_liar_count = count_vectorizer.transform(liar_df['statement'])
X_liar_tfidf = tfidf_vectorizer.transform(liar_df['statement'])
y_liar = liar_df['binary_type']

# Evaluate baseline model
liar_baseline_pred = baseline_model.predict(X_liar_count)
liar_baseline_f1 = f1_score(y_liar, liar_baseline_pred, pos_label='fake')
print(f"\nBaseline Model (Logistic Regression) - F1 Score: {liar_baseline_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_liar, liar_baseline_pred))

# Evaluate advanced model
liar_advanced_pred = advanced_model.predict(X_liar_tfidf)
liar_advanced_f1 = f1_score(y_liar, liar_advanced_pred, pos_label='fake')
print(f"\nAdvanced Model (Neural Network) - F1 Score: {liar_advanced_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_liar, liar_advanced_pred))

# Create confusion matrices
# confusion_matrix() orders classes alphabetically ('fake', 'reliable') unless
# `labels` is given; the same list drives both the matrix order and the tick
# labels so the heatmap captions match the counts they describe.
liar_class_labels = ['reliable', 'fake']
liar_cm_baseline = confusion_matrix(y_liar, liar_baseline_pred, labels=liar_class_labels)
liar_cm_advanced = confusion_matrix(y_liar, liar_advanced_pred, labels=liar_class_labels)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, cm, model_name in zip(axes, (liar_cm_baseline, liar_cm_advanced), ('Baseline', 'Advanced')):
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=liar_class_labels, yticklabels=liar_class_labels, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name} Model (LIAR)', fontsize=14)
    ax.set_xlabel('Predicted', fontsize=12)
    ax.set_ylabel('Actual', fontsize=12)
fig.tight_layout()
fig.savefig(os.path.join(plots_dir, 'Part4Task2_confusion_matrices_liar.png'))
# Close this specific figure so it is neither rendered inline nor kept in memory.
plt.close(fig)

Loaded LIAR dataset with 12791 rows
LIAR label distribution: 
label
half-true      2627
false          2507
mostly-true    2454
barely-true    2103
true           2053
pants-fire     1047
Name: count, dtype: int64
Binary LIAR label distribution: 
binary_type
reliable    7134
fake        5657
Name: count, dtype: int64

Baseline Model (Logistic Regression) - F1 Score: 0.6133

Classification Report:
              precision    recall  f1-score   support

        fake       0.44      1.00      0.61      5657
    reliable       0.67      0.00      0.00      7134

    accuracy                           0.44     12791
   macro avg       0.55      0.50      0.31     12791
weighted avg       0.57      0.44      0.27     12791


Advanced Model (Neural Network) - F1 Score: 0.6090

Classification Report:
              precision    recall  f1-score   support

        fake       0.45      0.96      0.61      5657
    reliable       0.62      0.05      0.09      7134

    accuracy                     

# Task 3: Compare results

In [4]:
def compare_results(fakenews_baseline_f1, fakenews_advanced_f1, liar_baseline_f1, liar_advanced_f1):
    """Print a comparison table of F1 scores and save a grouped bar chart.

    Args:
        fakenews_baseline_f1: F1 of the baseline model on FakeNewsCorpus.
        fakenews_advanced_f1: F1 of the advanced model on FakeNewsCorpus.
        liar_baseline_f1: F1 of the baseline model on LIAR, or ``None`` if
            it was not evaluated.
        liar_advanced_f1: F1 of the advanced model on LIAR, or ``None`` if
            it was not evaluated.

    Returns:
        None. The table is printed to stdout; when both LIAR scores are
        available the chart is written to
        ``plots_dir/Part4Task3_model_comparison_across_datasets.png``.
    """
    def _score_or_na(score):
        # Only a missing score is 'N/A'; an F1 of 0.0 is a real result.
        return 'N/A' if score is None else score

    def _relative_drop(source_f1, target_f1):
        # Relative drop in percent from source to target. Undefined when the
        # target is missing or the source is missing/zero (division by zero).
        if target_f1 is None or not source_f1:
            return 'N/A'
        return f"{((source_f1 - target_f1) / source_f1 * 100):.2f}%"

    results = {
        'Model': ['Baseline (LogReg)', 'Advanced (NeuralNet)'],
        'FakeNewsCorpus F1': [fakenews_baseline_f1, fakenews_advanced_f1],
        'LIAR F1': [_score_or_na(liar_baseline_f1), _score_or_na(liar_advanced_f1)],
        'Performance Drop': [
            _relative_drop(fakenews_baseline_f1, liar_baseline_f1),
            _relative_drop(fakenews_advanced_f1, liar_advanced_f1),
        ],
    }
    results_df = pd.DataFrame(results)
    print("\nResults Comparison Table:")
    print(results_df.to_string(index=False))

    # Create visualization (needs both LIAR scores to draw the LIAR group)
    if liar_baseline_f1 is None or liar_advanced_f1 is None:
        return

    # Set up grouped bar chart
    datasets = ['FakeNewsCorpus', 'LIAR']
    baseline_scores = [fakenews_baseline_f1, liar_baseline_f1]
    advanced_scores = [fakenews_advanced_f1, liar_advanced_f1]
    x = np.arange(len(datasets))
    width = 0.35
    # A single figure is created here; a separate plt.figure() call beforehand
    # would leave an empty figure open that the notebook renders as
    # "<Figure ... with 0 Axes>".
    fig, ax = plt.subplots(figsize=(10, 6))
    rects1 = ax.bar(x - width/2, baseline_scores, width, label='Baseline (LogReg)')
    rects2 = ax.bar(x + width/2, advanced_scores, width, label='Advanced (NeuralNet)')
    # Add labels and legend
    ax.set_ylabel('F1 Score')
    ax.set_title('Model Performance Comparison Across Datasets')
    ax.set_xticks(x)
    ax.set_xticklabels(datasets)
    ax.legend()
    # Add value labels on bars
    for rect in list(rects1) + list(rects2):
        height = rect.get_height()
        ax.annotate(f'{height:.4f}', xy=(rect.get_x() + rect.get_width()/2, height), xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')
    fig.tight_layout()
    fig.savefig(os.path.join(plots_dir, 'Part4Task3_model_comparison_across_datasets.png'))
    plt.close(fig)

# Summarise both models on both datasets from the F1 scores computed in the
# FakeNewsCorpus and LIAR evaluation cells.
compare_results(
    fakenews_baseline_f1=fakenews_baseline_f1,
    fakenews_advanced_f1=fakenews_advanced_f1,
    liar_baseline_f1=liar_baseline_f1,
    liar_advanced_f1=liar_advanced_f1,
)


Results Comparison Table:
               Model  FakeNewsCorpus F1  LIAR F1 Performance Drop
   Baseline (LogReg)           0.941176 0.613274           34.84%
Advanced (NeuralNet)           0.941935 0.608953           35.35%


<Figure size 1000x600 with 0 Axes>