# Part 2: Simple Model

## Task 0

In [3]:
import re
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from cleantext import clean
import nltk
from nltk.corpus import stopwords
from functools import reduce
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import FeatureHasher

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliagrundemar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#Loading the cleaned dataset from disk
df_big_cleaned = pd.read_csv(filepath_or_buffer='cleaned_dataset.csv')

In [5]:
#Types that are omitted from the dataset altogether
omitted_types = ['unknown', 'unreliable', 'rumor']
#Types that are grouped into the label 'fake'
fake_types = ['bias', 'conspiracy', 'fake', 'hate', 'junksci', 'satire']
#Types that are grouped into the label 'reliable'
reliable_types = ['political', 'reliable', 'clickbait']

#Keeping only rows that have a type and whose type is not one of the omitted ones
has_usable_type = df_big_cleaned['type'].notna() & ~df_big_cleaned['type'].isin(omitted_types)

#Building the relabelled frame with .assign instead of assigning a column on a filtered frame,
#which avoids pandas' SettingWithCopyWarning. Types in neither list are left unchanged.
df_big_cleaned = df_big_cleaned[has_usable_type].assign(
    type=lambda frame: (
        frame['type']
        .replace(fake_types, 'fake')
        .replace(reliable_types, 'reliable')
    )
)

#Share of each label in percent
type_distribution = df_big_cleaned['type'].value_counts()
percentage_distribution = type_distribution / type_distribution.sum() * 100
print(percentage_distribution)

type
reliable    52.238806
fake        47.761194
Name: count, dtype: float64


In [6]:
#Holding out 20% of the rows and halving the hold-out: 80% training, 10% validation, 10% test
y = df_big_cleaned['type']
x = df_big_cleaned.drop('type', axis=1)
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x, y, test_size=0.2, random_state=42
)
x_validation, x_test, y_validation, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

## Task 1

In [7]:
#pickle is imported here because the import cell at the top does not provide it;
#CountVectorizer, LogisticRegression and accuracy_score come from that import cell
import pickle

#Training the model on the content of the articles
#Missing content is replaced by the placeholder string "nan" so the vectorizer only sees strings
x_train_content = x_train['content'].fillna("nan")
x_validation_content = x_validation['content'].fillna("nan")

#The vocabulary is learned from the training split only and reused for the validation split
vectorizer = CountVectorizer()
x_train_content = vectorizer.fit_transform(x_train_content)
x_validation_content = vectorizer.transform(x_validation_content)

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

#The 'sag' solver is stochastic, so random_state is fixed to make the fit reproducible
model = LogisticRegression(solver='sag', max_iter=10000, random_state=42)
model.fit(x_train_content, y_train)

#Predicting on the validation set
y_pred = model.predict(x_validation_content)

#Evaluating performance
acc = accuracy_score(y_validation, y_pred)
print(acc)

with open('trained_model_content.pkl', 'wb') as f:
    pickle.dump(model, f)

0.7


## Task 2

In [8]:
#Training the model on the authors and content of the articles

#Missing authors are replaced by the placeholder string "nan"
x_train_authors = x_train['authors'].fillna("nan")
x_val_authors = x_validation['authors'].fillna("nan")

#Wrapping each author string in a one-element list, because FeatureHasher(input_type='string')
#expects one iterable of strings per sample; non-string entries are passed through unchanged
x_train_authors = x_train_authors.apply(lambda author: [author] if isinstance(author, str) else author)
x_val_authors = x_val_authors.apply(lambda author: [author] if isinstance(author, str) else author)

#Initializing FeatureHasher
hasher = FeatureHasher(n_features=7500, input_type='string')

#Hash encoding the authors; the validation split is only transformed, never fitted on
hashed_features_train_author = hasher.fit_transform(x_train_authors)
hashed_features_val_author = hasher.transform(x_val_authors)

#Stacking the sparse matrices directly; no dense copy of the hashed features is needed
combined_train_features = hstack([x_train_content, hashed_features_train_author])
combined_val_features = hstack([x_validation_content, hashed_features_val_author])

#Initializing logistic regression model
model2 = LogisticRegression(max_iter=2000)

model2.fit(combined_train_features, y_train)

#Predicting on the validation set
y_pred = model2.predict(combined_val_features)

#Evaluating performance
accuracy = accuracy_score(y_validation, y_pred)
print("Accuracy:", accuracy)

with open('trained_model2.pkl', 'wb') as f:
    pickle.dump(model2, f)

Accuracy: 0.775


In [9]:
#Training the model on the domains and contents of the articles

#Hashing over domains, with missing domains replaced by the placeholder string "nan"
x_train_domain = x_train['domain'].fillna("nan")
x_val_domain = x_validation['domain'].fillna("nan")

#Training domains followed by validation domains, each string wrapped in its own list
combined_domains = [
    [domain] if isinstance(domain, str) else domain
    for domain in pd.concat([x_train_domain, x_val_domain], ignore_index=True)
]

#Hash encoding all domains in one pass
hasher = FeatureHasher(n_features=500, input_type='string')
hashed_features = hasher.fit_transform(combined_domains)

#The first len(x_train_domain) rows are the training rows, the remaining rows the validation rows
n_train_rows = len(x_train_domain)
hashed_features_train = hashed_features[:n_train_rows]
hashed_features_val = hashed_features[n_train_rows:]

#Appending the hashed domain columns to the content features
combined_train_features = hstack([x_train_content, hashed_features_train])
combined_val_features = hstack([x_validation_content, hashed_features_val])

#Fitting the logistic regression model
model3 = LogisticRegression(max_iter=2000)
model3.fit(combined_train_features, y_train)

#Scoring the predictions for the validation set
y_pred = model3.predict(combined_val_features)
accuracy = accuracy_score(y_validation, y_pred)
print("Accuracy:", accuracy)

with open('trained_model3.pkl', 'wb') as f:
    pickle.dump(model3, f)

Accuracy: 0.7875


In [10]:
#Training the model on contents, authors and domains of the articles

#Combining hashed domain and author features with content features
combined_train_features = hstack([x_train_content, hashed_features_train, hashed_features_train_author])
combined_val_features = hstack([x_validation_content, hashed_features_val, hashed_features_val_author])

#Initializing logistic regression model
#The 'sag' solver is stochastic, so random_state is fixed to make the fit reproducible
model4 = LogisticRegression(solver='sag', max_iter=10000, random_state=42)

model4.fit(combined_train_features, y_train)

#Predicting on the validation set
y_pred = model4.predict(combined_val_features)

#Evaluating performance
accuracy = accuracy_score(y_validation, y_pred)
print("Accuracy:", accuracy)

with open('trained_model4.pkl', 'wb') as f:
    pickle.dump(model4, f)

Accuracy: 0.7625


## Task 3

In [11]:
#Loading the extra scraped data and labelling every row as 'reliable'
reliable = pd.read_csv('reliable_scraped_data.csv').assign(type='reliable')
print(reliable.shape)

#Appending the extra rows below the existing dataset
concatenated_data = pd.concat([df_big_cleaned, reliable])

#Holding out 20% of the rows and halving the hold-out: 80% training, 10% validation, 10% test
y = concatenated_data['type']
x = concatenated_data.drop('type', axis=1)
x_train_concat, x_holdout_concat, y_train_concat, y_holdout_concat = train_test_split(
    x, y, test_size=0.2, random_state=42
)
x_validation_concat, x_test_concat, y_validation_concat, y_test_concat = train_test_split(
    x_holdout_concat, y_holdout_concat, test_size=0.5, random_state=42
)

(4782, 5)


In [12]:
#Training the model with the extra reliable data on the content of the articles
#Missing content is replaced by the placeholder string "nan" so the vectorizer only sees strings
x_train_concat_content = x_train_concat['content'].fillna("nan")
x_validation_concat_content = x_validation_concat['content'].fillna("nan")

#The vocabulary is learned from the training split only and reused for the validation split
vectorizer_concat = CountVectorizer()
x_train_concat_content = vectorizer_concat.fit_transform(x_train_concat_content)
x_validation_concat_content = vectorizer_concat.transform(x_validation_concat_content)

#Saving the vectorizer fitted in this cell (vectorizer_concat), since model5 is trained on its
#vocabulary and not on that of the earlier `vectorizer`
with open('vectorizer_concat.pkl', 'wb') as f:
    pickle.dump(vectorizer_concat, f)

#Initializing logistic regression model
#The 'sag' solver is stochastic, so random_state is fixed to make the fit reproducible
model5 = LogisticRegression(solver='sag', max_iter=10000, random_state=42)
model5.fit(x_train_concat_content, y_train_concat)

#Predicting on the validation set
y_pred_concat = model5.predict(x_validation_concat_content)

#Evaluating performance
acc = accuracy_score(y_validation_concat, y_pred_concat)

print(acc)

with open('trained_model_content_concat.pkl', 'wb') as f:
    pickle.dump(model5, f)

0.9445438282647585
