# Part 2: Simple Model

## Task 0

In [None]:
import re
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from cleantext import clean
import nltk
from nltk.corpus import stopwords
from functools import reduce
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher

In [None]:
# Load the article dataset from CSV (presumably the output of an earlier
# cleaning step - confirm). The cells below read its 'type', 'content' and
# 'authors' columns.
df_big_cleaned = pd.read_csv('cleaned_dataset.csv')

In [None]:
# Drop rows without a label and the labels that are excluded from the binary
# task entirely. .copy() makes the filtered frame independent of the original,
# so the column assignments below cannot trigger SettingWithCopyWarning.
excluded_types = ['unknown', 'unreliable', 'rumor']
df_big_cleaned = df_big_cleaned.dropna(subset=['type'])
df_big_cleaned = df_big_cleaned[~df_big_cleaned['type'].isin(excluded_types)].copy()

# Collapse the remaining labels into two classes:
#   'bias', 'conspiracy', 'fake', 'hate', 'junksci', 'satire' -> 'fake'
#   'political', 'reliable', 'clickbait'                       -> 'reliable'
df_big_cleaned['type'] = df_big_cleaned['type'].replace(
    ['bias', 'conspiracy', 'fake', 'hate', 'junksci', 'satire'], 'fake')
df_big_cleaned['type'] = df_big_cleaned['type'].replace(
    ['political', 'reliable', 'clickbait'], 'reliable')

# Report the resulting class balance as percentages.
type_distribution = df_big_cleaned['type'].value_counts()
percentage_distribution = type_distribution / type_distribution.sum() * 100
print(percentage_distribution)

In [None]:
# 80/10/10 split: hold out 20% of the rows, then halve that hold-out into the
# validation and test sets. Both splits use a fixed random_state so the
# partition is reproducible.
x = df_big_cleaned.drop(columns=['type'])
y = df_big_cleaned['type']
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x, y, test_size=0.2, random_state=42)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42)

## Task 1

In [None]:
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Baseline: logistic regression on bag-of-words counts of the article content.
# Missing content is replaced by the placeholder string "nan" so the
# vectorizer only ever sees strings.
vectorizer = CountVectorizer()

# The vocabulary is learned from the training split only; the validation
# split is projected onto that same vocabulary.
x_train_content = vectorizer.fit_transform(x_train['content'].fillna("nan"))
x_validation_content = vectorizer.transform(x_validation['content'].fillna("nan"))

# Persist the fitted vectorizer so the same vocabulary can be reused later.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

model = LogisticRegression(solver='sag', max_iter=10000)
model.fit(x_train_content, y_train)

# Accuracy on the validation split.
y_pred = model.predict(x_validation_content)
acc = accuracy_score(y_validation, y_pred)
print(acc)

# Persist the trained classifier.
with open('trained_model_content.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

## Task 2

In [None]:
#Training the model on the authors and content of the articles
import pickle
from sklearn.feature_extraction import FeatureHasher
import pandas as pd
import numpy as np
from scipy.sparse import hstack

# Missing authors are replaced by the placeholder string "nan".
x_train_authors = x_train['authors'].fillna("nan")
x_val_authors = x_validation['authors'].fillna("nan")

# FeatureHasher(input_type='string') expects every sample to be an iterable of
# string tokens. Wrapping each author value in a one-element list makes the
# whole string a single token instead of it being iterated character by
# character. str() guards against stray non-string values in the column.
x_train_authors = x_train_authors.apply(lambda author: [str(author)])
x_val_authors = x_val_authors.apply(lambda author: [str(author)])

#Initializing FeatureHasher
hasher = FeatureHasher(n_features=7500, input_type='string')

# Hash encode the authors. FeatureHasher is stateless (nothing is learned in
# fit), so transform() is used for both splits and they share one encoding.
hashed_features_train_author = hasher.transform(x_train_authors)
hashed_features_val_author = hasher.transform(x_val_authors)

# The hashed matrices are kept sparse on purpose: densifying them with
# .toarray() would allocate n_samples x 7500 values and nothing needs it.
combined_train_features = hstack([x_train_content, hashed_features_train_author])
combined_val_features = hstack([x_validation_content, hashed_features_val_author])

#Initializing logistic regression model
model2 = LogisticRegression(max_iter=2000)

model2.fit(combined_train_features, y_train)

#Predicting on the validation set
y_pred = model2.predict(combined_val_features)

#Evaluating performance
accuracy = accuracy_score(y_validation, y_pred)
print("Accuracy:", accuracy)

with open('trained_model2.pkl', 'wb') as f:
    pickle.dump(model2, f)

## Task 3

In [None]:
# Extend the dataset with the additionally scraped articles, all of which are
# labelled 'reliable'.
reliable = pd.read_csv('reliable_scraped_data.csv').assign(type='reliable')
print(reliable.shape)

concatenated_data = pd.concat([df_big_cleaned, reliable])

# Same 80/10/10 scheme as for the original data: hold out 20%, then halve the
# hold-out into validation and test sets.
x = concatenated_data.drop(columns=['type'])
y = concatenated_data['type']
x_train_concat, x_holdout_concat, y_train_concat, y_holdout_concat = train_test_split(
    x, y, test_size=0.2, random_state=42)
x_validation_concat, x_test_concat, y_validation_concat, y_test_concat = train_test_split(
    x_holdout_concat, y_holdout_concat, test_size=0.5, random_state=42)

In [None]:
#Training the model with the extra reliable data on the content of the articles
import pickle

# Missing content is replaced by the placeholder string "nan" so the
# vectorizer only ever sees strings.
x_train_concat_content = x_train_concat['content'].fillna("nan")
x_validation_concat_content = x_validation_concat['content'].fillna("nan")
x_test_concat_content = x_test_concat['content'].fillna("nan")

vectorizer_concat = CountVectorizer()

# The vocabulary is learned from the training split only; validation and test
# are projected onto that same vocabulary.
x_train_concat_content = vectorizer_concat.fit_transform(x_train_concat_content)
x_validation_concat_content = vectorizer_concat.transform(x_validation_concat_content)
x_test_concat_content = vectorizer_concat.transform(x_test_concat_content)

# Persist the vectorizer fitted on the concatenated data. This must be
# vectorizer_concat: its vocabulary is the one model5 is trained against.
with open('vectorizer_concat.pkl', 'wb') as f:
    pickle.dump(vectorizer_concat, f)

#Initializing logistic regression model
model5 = LogisticRegression(solver= 'sag',max_iter=10000)
model5.fit(x_train_concat_content, y_train_concat)

#Predicting on the validation set
y_pred_concat = model5.predict(x_validation_concat_content)

#Evaluating performance
acc = accuracy_score(y_validation_concat, y_pred_concat)
print(acc)

with open('trained_model_content_concat.pkl', 'wb') as f:
    pickle.dump(model5, f)

# Part 4: Evaluation of the simple model 

In [None]:
#Function to make confusion matrix
import seaborn as sns
def make_confusion_matrix(cf, group_names=None, categories='auto', count=True, percent=True, cbar=True, xyticks=True,
                          xyplotlabels=True, sum_stats=True, figsize=None, cmap='Blues', title=None):
    """Plot a confusion matrix as an annotated seaborn heatmap.

    Args:
        cf: 2-D NumPy array holding the confusion matrix counts.
        group_names: Optional names written in the cells, one per cell in
            row-major order. Ignored unless its length equals ``cf.size``.
        categories: Tick labels for both axes, forwarded to ``sns.heatmap``.
        count: If True, show the raw count in each cell.
        percent: If True, show each cell's share of the matrix total.
        cbar: If True, draw the colour bar.
        xyticks: If False, tick labels are suppressed.
        xyplotlabels: If True, label the axes 'True label' / 'Predicted label'.
        sum_stats: Accepted for backward compatibility; currently unused.
        figsize: Figure size; defaults to matplotlib's configured size.
        cmap: Colormap name passed to ``sns.heatmap``.
        title: Optional plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """

    # Build the text shown inside each cell: optional name, count, percentage.
    def generate_labels(cf, count, percent):
        blanks = [''] * cf.size
        if group_names and len(group_names) == cf.size:
            group_labels = ["{}\n".format(value) for value in group_names]
        else:
            group_labels = blanks

        if count:
            group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
        else:
            group_counts = blanks

        if percent:
            total = np.sum(cf)
            # An all-zero matrix would otherwise divide by zero.
            fractions = cf.flatten() / total if total else np.zeros(cf.size)
            group_percentages = ["{0:.2%}".format(value) for value in fractions]
        else:
            group_percentages = blanks

        box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in
                      zip(group_labels, group_counts, group_percentages)]
        return np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    #Generating labels
    box_labels = generate_labels(cf, count, percent)

    #Setting figure parameters according to other arguments
    if figsize is None:
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        categories = False

    #Make the heatmap visualization
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar, xticklabels=categories, yticklabels=categories,
                mask=cf == 0)  # Mask zeros to avoid displaying empty cells

    # Axis labels are optional; previously this flag was accepted but ignored.
    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

    if title:
        plt.title(title)

    plt.show()

In [None]:
#Calculating accuracy on the test set and generating an f1-score and confusion matrix
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import pickle

#Evaluating on the test set
y_pred_test = model5.predict(x_test_concat_content)
acc_test = accuracy_score(y_test_concat, y_pred_test)
print('accuracy:', acc_test)

# F1-score with 'reliable' as the positive class. The test-set predictions
# must be used here: y_pred_concat holds the validation-set predictions and
# does not correspond row-for-row to y_test_concat.
f1 = f1_score(y_test_concat, y_pred_test, pos_label='reliable')
print("F1-score:", f1)

# Confusion matrix on the test set. The class order is pinned so that row and
# column 0 are 'fake' (negative) and 1 are 'reliable' (positive), matching the
# group names and tick labels below.
class_order = ['fake', 'reliable']
conf_matrix = confusion_matrix(y_test_concat, y_pred_test, labels=class_order)
print("Confusion Matrix:\n", conf_matrix)
labels = ['True Neg','False Pos','False Neg','True Pos']
categories = ['Fake', 'Real']

make_confusion_matrix(conf_matrix, group_names=labels, categories=categories, cmap='Blues')