# Part 2

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy.sparse import hstack
from sklearn.preprocessing import OneHotEncoder
import warnings
import os

# Silence UserWarning-category warnings so cell output stays readable.
warnings.filterwarnings("ignore", category=UserWarning)

# Pre-split datasets; later cells read the 'cleaned_content', 'binary_type'
# and 'domain' columns from these frames.
train_df = pd.read_csv('train_data.csv')
val_df = pd.read_csv('validation_data.csv')
test_df = pd.read_csv('test_data.csv')

# Every figure in this notebook is written below this directory. Create it up
# front: plt.savefig does not create missing parent directories, so without
# this a fresh checkout fails with FileNotFoundError on the first savefig call.
plots_dir = os.path.join(os.getcwd(), 'plots')
os.makedirs(plots_dir, exist_ok=True)

# Display distribution of types
# print("\nOriginal distribution of news types:")
# print(train_df['type'].value_counts())
# print("\nBinary class distribution (Training set):")
# print(train_df['binary_type'].value_counts())
# print("\nBinary class distribution (Validation set):")
# print(val_df['binary_type'].value_counts())
# print("\nBinary class distribution (Test set):")
# print(test_df['binary_type'].value_counts())

# Task 1: Implement simple logistic regression

In [2]:
# Bag-of-words features restricted to the 10,000 most frequent training terms.
# The vectorizer is fit on the training split only and then reused, so the
# validation/test matrices share the same columns.
count_vectorizer = CountVectorizer(max_features=10000)
X_train_count = count_vectorizer.fit_transform(train_df['cleaned_content'])
X_val_count = count_vectorizer.transform(val_df['cleaned_content'])
X_test_count = count_vectorizer.transform(test_df['cleaned_content'])
y_train = train_df['binary_type']
y_val = val_df['binary_type']
y_test = test_df['binary_type']

# Class weights to handle imbalance: 'reliable' is scaled by n_fake / n_reliable
# so that both classes carry the same total weight in the loss.
class_weights = {
    'fake': 1.0,
    'reliable': len(train_df[train_df['binary_type'] == 'fake']) / len(train_df[train_df['binary_type'] == 'reliable'])
}

# Train logistic regression model with class weights
baseline_model = LogisticRegression(max_iter=1000, random_state=42, class_weight=class_weights)
baseline_model.fit(X_train_count, y_train)

# Evaluate on validation set ('fake' is treated as the positive class for F1)
baseline_val_pred = baseline_model.predict(X_val_count)
baseline_val_f1 = f1_score(y_val, baseline_val_pred, pos_label='fake')
print("\nBaseline model performance on validation set:")
print(f"F1 Score: {baseline_val_f1:.4f}")
print("\nDetailed classification report:")
print(classification_report(y_val, baseline_val_pred))

# Confusion matrix. The label order is passed explicitly and reused for the
# tick labels: without `labels=`, confusion_matrix sorts string labels
# alphabetically ('fake', 'reliable'), which would not match the
# ['reliable', 'fake'] ticks and would swap the classes in the plot.
cm_labels = ['reliable', 'fake']
cm_baseline = confusion_matrix(y_val, baseline_val_pred, labels=cm_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_baseline, annot=True, fmt='d', cmap='Blues', xticklabels=cm_labels, yticklabels=cm_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Baseline Model')
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'Part2Task1_baseline_confusion_matrix.png'))
plt.close()


Baseline model performance on validation set:
F1 Score: 0.9517

Detailed classification report:
              precision    recall  f1-score   support

        fake       0.96      0.95      0.95        73
    reliable       0.83      0.86      0.84        22

    accuracy                           0.93        95
   macro avg       0.89      0.90      0.90        95
weighted avg       0.93      0.93      0.93        95



# Task 2: Inclusion of meta-data

In [7]:
# One-hot encode the source domain. handle_unknown='ignore' makes domains that
# were not seen during fit encode as an all-zero row instead of raising.
# NOTE(review): if labels in this corpus were assigned per source domain, this
# feature leaks the label and the score below is optimistic - confirm.
domain_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
train_domains = domain_encoder.fit_transform(train_df[['domain']])
val_domains = domain_encoder.transform(val_df[['domain']])

# Combine text features with domain features (column-wise, both sparse)
X_train_with_meta = hstack([X_train_count, train_domains])
X_val_with_meta = hstack([X_val_count, val_domains])

# Train model with metadata, using the same hyper-parameters and class weights
# as the baseline so the two validation scores are comparable.
meta_model = LogisticRegression(max_iter=1000, random_state=42, class_weight=class_weights)
meta_model.fit(X_train_with_meta, y_train)

# Evaluate ('fake' is treated as the positive class for F1)
meta_val_pred = meta_model.predict(X_val_with_meta)
meta_val_f1 = f1_score(y_val, meta_val_pred, pos_label='fake')
print("\nModel with metadata performance on validation set:")
print(f"F1 Score: {meta_val_f1:.4f}")
print("\nDetailed classification report:")
print(classification_report(y_val, meta_val_pred))

# Confusion matrix. The label order is passed explicitly and reused for the
# tick labels: without `labels=`, confusion_matrix sorts string labels
# alphabetically ('fake', 'reliable'), which would not match the
# ['reliable', 'fake'] ticks and would swap the classes in the plot.
cm_labels = ['reliable', 'fake']
cm_meta = confusion_matrix(y_val, meta_val_pred, labels=cm_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_meta, annot=True, fmt='d', cmap='Blues',
            xticklabels=cm_labels,
            yticklabels=cm_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Model with Metadata')
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'Part2Task2_metadata_confusion_matrix.png'))
plt.close()


Model with metadata performance on validation set:
F1 Score: 0.9733

Detailed classification report:
              precision    recall  f1-score   support

        fake       0.95      1.00      0.97        73
    reliable       1.00      0.82      0.90        22

    accuracy                           0.96        95
   macro avg       0.97      0.91      0.94        95
weighted avg       0.96      0.96      0.96        95



# Task 3: Scraped articles from assignment 2

In [6]:
# Load the scraped BBC articles into a DataFrame.
with open('bbc_articles_content.json', 'r', encoding='utf-8') as f:
    bbc_data = json.load(f)
bbc_df = pd.DataFrame(bbc_data)

# Drop articles whose 'text' field has zero length.
has_text = bbc_df['text'].str.len() > 0
bbc_df = bbc_df[has_text].copy()
# Clean the text using the same function from Part 1
def clean_text(text):
    """Normalise raw article text with a fixed sequence of regex substitutions.

    Non-string input returns an empty string. Otherwise the text is
    lower-cased, then URLs, e-mail addresses, dates (d/m/y and d-m-y forms)
    and standalone integers are replaced by the placeholders <URL>, <EMAIL>,
    <DATE> and <NUM>; finally whitespace runs are collapsed to a single space
    and the result is stripped.
    """
    if not isinstance(text, str):
        return ""

    # Applied in order: later rules see the output of earlier ones, so dates
    # must be replaced before the generic number rule.
    substitutions = (
        (r'https?://\S+|www\.\S+', '<URL>'),       # URLs
        (r'\S+@\S+', '<EMAIL>'),                   # e-mail addresses
        (r'\d{1,2}/\d{1,2}/\d{2,4}', '<DATE>'),    # slash-separated dates
        (r'\d{1,2}-\d{1,2}-\d{2,4}', '<DATE>'),    # dash-separated dates
        (r'\b\d+\b', '<NUM>'),                     # standalone numbers
        (r'\s+', ' '),                             # excess whitespace
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Clean the scraped BBC text and label every BBC article as reliable.
bbc_df['cleaned_content'] = bbc_df['text'].apply(clean_text)
bbc_df['type'] = 'reliable'
bbc_df['binary_type'] = 'reliable'
# Only these three columns are carried over; any other train_df column
# (e.g. 'domain') is NaN for the BBC rows after the concat.
bbc_subset = bbc_df[['cleaned_content', 'type', 'binary_type']].copy()
combined_train_df = pd.concat([train_df, bbc_subset], ignore_index=True)
# print(f"Added {len(bbc_subset)} BBC articles to the training data")
# print(f"New training data distribution:")
# print(combined_train_df['binary_type'].value_counts())

# Features for the combined dataset. The vectorizer fitted on the original
# training split is reused (transform, not fit), so the columns stay aligned
# with X_val_count, which is used for evaluation below.
X_combined_train = count_vectorizer.transform(combined_train_df['cleaned_content'])
y_combined_train = combined_train_df['binary_type']

# Recalculate class weights: the added BBC rows change the fake/reliable ratio.
combined_class_weights = {
    'fake': 1.0,
    'reliable': len(combined_train_df[combined_train_df['binary_type'] == 'fake']) /
                len(combined_train_df[combined_train_df['binary_type'] == 'reliable'])
}

# Train model with combined data
combined_model = LogisticRegression(max_iter=1000, random_state=42, class_weight=combined_class_weights)
combined_model.fit(X_combined_train, y_combined_train)

# Evaluate on validation set ('fake' is treated as the positive class for F1)
combined_val_pred = combined_model.predict(X_val_count)
combined_val_f1 = f1_score(y_val, combined_val_pred, pos_label='fake')
print("\nModel with BBC data performance on validation set:")
print(f"F1 Score: {combined_val_f1:.4f}")
print("\nDetailed classification report:")
print(classification_report(y_val, combined_val_pred))

# Confusion matrix. The label order is passed explicitly and reused for the
# tick labels: without `labels=`, confusion_matrix sorts string labels
# alphabetically ('fake', 'reliable'), which would not match the
# ['reliable', 'fake'] ticks and would swap the classes in the plot.
cm_labels = ['reliable', 'fake']
cm_combined = confusion_matrix(y_val, combined_val_pred, labels=cm_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_combined, annot=True, fmt='d', cmap='Blues', xticklabels=cm_labels, yticklabels=cm_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Model with BBC Data')
plt.tight_layout()
plt.savefig(os.path.join(plots_dir, 'Part2Task3_bbc_confusion_matrix.png'))
plt.close()

# Bar chart of the validation F1 score ('fake' as positive class) per model.
comparison = {
    'Baseline': baseline_val_f1,
    'With Metadata': meta_val_f1,
    'With BBC Data': combined_val_f1,
}
models = list(comparison.keys())
f1_scores = list(comparison.values())
comparison_path = os.path.join(plots_dir, 'Part2Task3_model_comparison.png')

plt.figure(figsize=(10, 6))
sns.barplot(x=models, y=f1_scores)
plt.ylabel('F1 Score')
plt.title('F1 Score Comparison')
plt.ylim(0, 1)  # F1 is bounded to [0, 1]; show the full range
plt.tight_layout()
plt.savefig(comparison_path)
plt.close()


Model with BBC data performance on validation set:
F1 Score: 0.9379

Detailed classification report:
              precision    recall  f1-score   support

        fake       0.94      0.93      0.94        73
    reliable       0.78      0.82      0.80        22

    accuracy                           0.91        95
   macro avg       0.86      0.87      0.87        95
weighted avg       0.91      0.91      0.91        95

