In [None]:
import sys

In [None]:
# Install the third-party packages this notebook imports. Invoking pip through
# `sys.executable` targets the interpreter that is running this kernel.
# NOTE(review): no versions are pinned, so installs may differ between runs.
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install scikit-learn

In [None]:
import ast
import os
import random

import numpy as np
import pandas as pd

In [None]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

<h3>Importing the dataset</h3>

In [None]:
# Input and output folders are resolved relative to the directory the kernel
# was started in.
cwd = os.getcwd()
dataset_dir = os.path.join(cwd, 'Dataset')
result_dir = os.path.join(cwd, 'Results')

# Every model section writes CSV/TXT files into result_dir; create it up front
# so those writes do not fail on a fresh checkout.
os.makedirs(result_dir, exist_ok=True)

In [None]:
# Read only the label and the two pre-tokenised text columns from the
# preprocessed training file.
df = pd.read_csv(
    os.path.join(dataset_dir, 'train_news_preprocessed.csv'),
    usecols=['label', 'clean_news_tokens', 'clean_headline_tokens'],
    low_memory=False,
)
# Columns deliberately left unloaded (per the original note): 'headline',
# 'news', 'headline_len', 'news_len', 'caps_in_headline', 'caps_in_news'.

<h3>Setting random seeds and display options</h3>

In [None]:
# Single seed shared by the stdlib RNG, NumPy's global RNG and every
# estimator / splitter below that accepts `random_state`.
RANDOM_STATE = 1973
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Keep pandas reprs short when frames or long sequences are displayed.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_seq_items', 20)

<h3>Starting model implementation</h3>

In [None]:
# Each `clean_news_tokens` cell is read as the text of a Python list literal
# (assumption: that is how the preprocessing step serialised the tokens - confirm).
# ast.literal_eval parses literals only, so unlike eval() it cannot execute
# code that might be embedded in the CSV.
X = df['clean_news_tokens'].apply(
    lambda tokens: ' '.join(ast.literal_eval(tokens))
)  # Concatenate tokens into space-separated strings
y = df['label']  # Target variable (fake or not fake)

In [None]:
# Hold out 20% of the rows for the final test reports; the split is seeded so
# it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Fit the TF-IDF vocabulary and weights on the training texts only, then apply
# that same fitted vectorizer to the held-out texts.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [None]:
# Settings shared by every RandomizedSearchCV below.
Iters = 100             # passed as n_iter: candidates sampled per search
cross_val_works = 5     # passed as cv: number of cross-validation folds
parallel_workers = -1   # passed as n_jobs: -1 means use all processors
verbose = 10            # passed as verbose: progress logging level

<h3>Logistic Regression</h3>

In [None]:
# Candidate values sampled by the logistic-regression search.
# C covers 0.01 up to (but excluding) 100 in steps of 0.01.
LR_para = dict(
    C=np.arange(1e-2, 1e2, 1e-2),
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    max_iter=[1000],
    random_state=[RANDOM_STATE],
)

In [None]:
# Randomized hyper-parameter search for logistic regression.
random_search = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=LR_para,
    n_iter=Iters,
    n_jobs=parallel_workers,
    cv=cross_val_works,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    # With several scorers, `refit` must name the metric used to pick the best
    # candidate; `refit=True` makes fit() raise ValueError. F1 is used here.
    refit='f1',
    return_train_score=True,
    # Seed the candidate sampling so re-running this cell draws the same set.
    random_state=RANDOM_STATE,
    verbose=verbose
)

In [None]:
# Run the cross-validated search on the TF-IDF training matrix.
random_search.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Per-candidate results of the search, plus one zero-padded identifier per
# requested candidate (LR-0001, LR-0002, ...).
val = random_search.cv_results_
ID_lst = ['LR-{:04d}'.format(j) for j in range(1, Iters + 1)]

In [None]:
# One row per sampled parameter set, tagged with its ID, saved as CSV.
para_df = pd.DataFrame(val['params']).assign(ID=ID_lst)
para_df.to_csv(os.path.join(result_dir, 'LR_parameters.csv'))

In [None]:
# Mean training-fold scores (the `mean_train_*` entries of cv_results_) and
# mean fit time for every candidate, saved as CSV.
result_test_df = pd.DataFrame({
    'ID': ID_lst,
    'Accuracy': list(val['mean_train_accuracy']),
    'F1-Score': list(val['mean_train_f1']),
    'Recall': list(val['mean_train_recall']),
    'Precision': list(val['mean_train_precision']),
    'Time (Sec)': list(val['mean_fit_time']),
})
result_test_df.to_csv(os.path.join(result_dir, 'LR_Train_result.csv'))

In [None]:
# Predict the held-out split through the fitted search object and summarise
# the per-class metrics as text.
y_pred = random_search.predict(X=X_test_tfidf)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# Save the logistic-regression test report as plain text.
with open(os.path.join(result_dir, 'LR_Test_report.txt'), mode='w') as f:
    f.write(report)

<h3>SVC</h3>

In [None]:
# Candidate values sampled by the SVC search.
# C and the numeric gamma values cover 0.01 up to (but excluding) 10 in steps
# of 0.01; gamma may also be one of the named 'scale' / 'auto' settings.
SVC_para = dict(
    kernel=['linear', 'poly', 'rbf'],
    degree=range(1, 6),
    C=np.arange(1e-2, 10, 1e-2),
    gamma=['scale', 'auto'] + list(np.arange(1e-2, 10, 1e-2)),
    random_state=[RANDOM_STATE],
)

In [None]:
# Randomized hyper-parameter search for the support-vector classifier.
random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=SVC_para,
    n_iter=Iters,
    n_jobs=parallel_workers,
    cv=cross_val_works,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    # With several scorers, `refit` must name the metric used to pick the best
    # candidate; `refit=True` makes fit() raise ValueError. F1 is used here.
    refit='f1',
    return_train_score=True,
    # Seed the candidate sampling so re-running this cell draws the same set.
    random_state=RANDOM_STATE,
    verbose=verbose
)

In [None]:
# Run the cross-validated SVC search on the TF-IDF training matrix.
random_search.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Per-candidate results of the search, plus one zero-padded identifier per
# requested candidate (SVC-0001, SVC-0002, ...).
val = random_search.cv_results_
ID_lst = ['SVC-{:04d}'.format(j) for j in range(1, Iters + 1)]

In [None]:
# One row per sampled parameter set, tagged with its ID, saved as CSV.
para_df = pd.DataFrame(val['params']).assign(ID=ID_lst)
para_df.to_csv(os.path.join(result_dir, 'SVM_parameters.csv'))

In [None]:
# Mean training-fold scores (the `mean_train_*` entries of cv_results_) and
# mean fit time for every candidate, saved as CSV.
result_test_df = pd.DataFrame({
    'ID': ID_lst,
    'Accuracy': list(val['mean_train_accuracy']),
    'F1-Score': list(val['mean_train_f1']),
    'Recall': list(val['mean_train_recall']),
    'Precision': list(val['mean_train_precision']),
    'Time (Sec)': list(val['mean_fit_time']),
})
result_test_df.to_csv(os.path.join(result_dir, 'SVM_Train_result.csv'))

In [None]:
# Predict the held-out split through the fitted search object and summarise
# the per-class metrics as text.
y_pred = random_search.predict(X=X_test_tfidf)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# Save the SVC test report as plain text.
with open(os.path.join(result_dir, 'SVM_Test_report.txt'), mode='w') as f:
    f.write(report)

<h3>Naive Bayes</h3>

In [None]:
# Candidate values sampled by the multinomial naive Bayes search.
# alpha covers 0.01 up to (but excluding) 2 in steps of 0.01.
MNB_para = dict(
    alpha=np.arange(1e-2, 2, 1e-2),
    fit_prior=[True, False],
)

In [None]:
# Randomized hyper-parameter search for multinomial naive Bayes.
random_search = RandomizedSearchCV(
    estimator=MultinomialNB(),
    param_distributions=MNB_para,
    n_iter=Iters,
    n_jobs=parallel_workers,
    cv=cross_val_works,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    # With several scorers, `refit` must name the metric used to pick the best
    # candidate; `refit=True` makes fit() raise ValueError. F1 is used here.
    refit='f1',
    return_train_score=True,
    # Seed the candidate sampling so re-running this cell draws the same set.
    random_state=RANDOM_STATE,
    verbose=verbose
)

In [None]:
# Run the cross-validated naive Bayes search on the TF-IDF training matrix.
random_search.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Per-candidate results of the search, plus one zero-padded identifier per
# requested candidate (MNB-0001, MNB-0002, ...).
val = random_search.cv_results_
ID_lst = ['MNB-{:04d}'.format(j) for j in range(1, Iters + 1)]

In [None]:
# One row per sampled parameter set, tagged with its ID, saved as CSV.
para_df = pd.DataFrame(val['params']).assign(ID=ID_lst)
para_df.to_csv(os.path.join(result_dir, 'MNB_parameters.csv'))

In [None]:
# Mean training-fold scores (the `mean_train_*` entries of cv_results_) and
# mean fit time for every candidate, saved as CSV.
result_test_df = pd.DataFrame({
    'ID': ID_lst,
    'Accuracy': list(val['mean_train_accuracy']),
    'F1-Score': list(val['mean_train_f1']),
    'Recall': list(val['mean_train_recall']),
    'Precision': list(val['mean_train_precision']),
    'Time (Sec)': list(val['mean_fit_time']),
})
result_test_df.to_csv(os.path.join(result_dir, 'MNB_Train_result.csv'))

In [None]:
# Predict the held-out split through the fitted search object and summarise
# the per-class metrics as text.
y_pred = random_search.predict(X=X_test_tfidf)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# Save the naive Bayes test report as plain text.
with open(os.path.join(result_dir, 'MNB_Test_report.txt'), mode='w') as f:
    f.write(report)

<h3>Random Forest</h3>

In [None]:
# Candidate values sampled by the random-forest search.
# min_samples_split / min_samples_leaf accept either an absolute count (int)
# or a fraction of the samples (float). A fraction of 0.0 is rejected by
# scikit-learn, so the fractional candidates run 0.1 ... 0.9; they are built
# from integers to avoid float-step drift at the range boundaries.
RFC_para = {
    'n_estimators': range(100, 551, 50),
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [None] + list(range(1, 100)),
    'min_samples_split': [k / 10 for k in range(1, 10)] + list(range(2, 100)),
    'min_samples_leaf': [k / 10 for k in range(1, 10)] + list(range(1, 100)),
    'random_state': [RANDOM_STATE]
}

In [None]:
# Randomized hyper-parameter search for the random forest.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    # Must be the random-forest space: the SVC space holds parameters
    # (kernel, degree, C, gamma) that RandomForestClassifier does not accept.
    param_distributions=RFC_para,
    n_iter=Iters,
    n_jobs=parallel_workers,
    cv=cross_val_works,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
    # With several scorers, `refit` must name the metric used to pick the best
    # candidate; `refit=True` makes fit() raise ValueError. F1 is used here.
    refit='f1',
    return_train_score=True,
    # Seed the candidate sampling so re-running this cell draws the same set.
    random_state=RANDOM_STATE,
    verbose=verbose
)

In [None]:
# Run the cross-validated random-forest search on the TF-IDF training matrix.
random_search.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Per-candidate results of the search, plus one zero-padded identifier per
# requested candidate (RFC-0001, RFC-0002, ...).
val = random_search.cv_results_
ID_lst = ['RFC-{:04d}'.format(j) for j in range(1, Iters + 1)]

In [None]:
# One row per sampled parameter set, tagged with its ID, saved as CSV.
para_df = pd.DataFrame(val['params']).assign(ID=ID_lst)
para_df.to_csv(os.path.join(result_dir, 'RFC_parameters.csv'))

In [None]:
# Mean training-fold scores (the `mean_train_*` entries of cv_results_) and
# mean fit time for every candidate, saved as CSV.
result_test_df = pd.DataFrame({
    'ID': ID_lst,
    'Accuracy': list(val['mean_train_accuracy']),
    'F1-Score': list(val['mean_train_f1']),
    'Recall': list(val['mean_train_recall']),
    'Precision': list(val['mean_train_precision']),
    'Time (Sec)': list(val['mean_fit_time']),
})
result_test_df.to_csv(os.path.join(result_dir, 'RFC_Train_result.csv'))

In [None]:
# Predict the held-out split through the fitted search object and summarise
# the per-class metrics as text.
y_pred = random_search.predict(X=X_test_tfidf)
report = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# Save the random-forest test report as plain text.
with open(os.path.join(result_dir, 'RFC_Test_report.txt'), mode='w') as f:
    f.write(report)