In [1]:
import ast
import os
import random

import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import classification_report

<h3>Importing the dataset</h3>

In [3]:
# Project folders, resolved against the directory the notebook is started from.
cwd = os.getcwd()
dataset_dir = os.path.join(cwd, 'Dataset')
result_dir = os.path.join(cwd, 'Results')
# The logging helpers write straight into result_dir; create it up front so
# a long hyper-parameter search cannot fail at the save step.
os.makedirs(result_dir, exist_ok=True)

In [4]:
# Read only the target column and the tokenised article text.
# Other columns previously listed for this file and not loaded here:
# 'headline', 'news', 'headline_len', 'news_len', 'caps_in_headline',
# 'caps_in_news', 'clean_headline_tokens'
df = pd.read_csv(
    os.path.join(dataset_dir, 'train_news_preprocessed_bf.csv'),
    usecols=['label', 'clean_news_tokens'],
    low_memory=False,
)

<h3>Setting random seeds and display options</h3>

In [5]:
# Seed value; also passed as random_state to the split and the estimators below.
RANDOM_STATE = 1973
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

# Keep printed frames and sequences compact.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_seq_items', 20)

<h3>Starting model implementation</h3>

In [6]:
# Each 'clean_news_tokens' cell is parsed from its string form into a sequence
# of tokens (presumably the repr of a Python list - confirm against the
# preprocessing step) and re-joined into one space-separated document.
# ast.literal_eval only accepts Python literals, so a malformed or malicious
# cell raises an error instead of executing arbitrary code as eval() would.
X = df['clean_news_tokens'].apply(lambda x: ' '.join(ast.literal_eval(x)))
y = df['label']  # Target variable (fake or not fake)

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [8]:
# Fit the TF-IDF vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the validation texts.
# NOTE(review): the searches below cross-validate on X_train_tfidf, so every CV
# fold was vectorised with a vectorizer fitted on all training rows, including
# that fold's held-out part. Wrapping vectorizer + estimator in a Pipeline would
# avoid this - confirm whether it matters for the reported CV scores.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_valid_tfidf = vectorizer.transform(X_valid)

In [9]:
# Settings shared by every RandomizedSearchCV run in this notebook.
Iters = 100            # candidates sampled per search (passed as n_iter)
cross_val_works = 5    # cross-validation folds (passed as cv)
parallel_workers = 9   # parallel jobs (passed as n_jobs)
verbose = 4            # progress-log verbosity of the search

In [10]:
# Metrics recorded for every candidate. The keys are the names that
# model_logging later reads back as 'mean_train_<key>' / 'mean_test_<key>'.
scorers = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('recall', recall_score),
        ('precision', precision_score),
    )
}

In [11]:
def _score_table(cv_results, ids, split, time_key):
    """Build one score table (ID, four mean metrics, timing) for the 'train' or 'test' split."""
    return pd.DataFrame({
        'ID': ids,
        'Accuracy': list(cv_results[f'mean_{split}_accuracy']),
        'F1-Score': list(cv_results[f'mean_{split}_f1']),
        'Recall': list(cv_results[f'mean_{split}_recall']),
        'Precision': list(cv_results[f'mean_{split}_precision']),
        'Time (Sec)': list(cv_results[time_key]),
    })


def model_logging(modelname, Iters, random_search):
    """Write the results of a fitted hyper-parameter search to CSV files.

    Three files are written into ``result_dir``:

    * ``{modelname}_Parameters.csv``   - sampled parameters of every candidate
    * ``{modelname}_Train_result.csv`` - mean scores on the training folds,
      with the mean fit time
    * ``{modelname}_Test_result.csv``  - mean scores on the held-out folds,
      with the mean scoring time

    Rows are keyed by an ID of the form ``{modelname}-0001``.

    Parameters
    ----------
    modelname : str
        Prefix for the file names and candidate IDs.
    Iters : int
        Number of candidates that was requested from the search. Kept for
        backward compatibility; the number of rows is taken from
        ``cv_results_`` because the search can evaluate fewer candidates
        than requested when the parameter space is smaller.
    random_search : fitted search object
        Must expose ``cv_results_`` containing 'params', the
        accuracy/f1/recall/precision train and test means, and the
        'mean_fit_time' / 'mean_score_time' entries.
    """
    val = random_search.cv_results_

    # Size the ID list from the actual results so column lengths always match.
    n_candidates = len(val['params'])
    ID_lst = [f'{modelname}-{str(j).zfill(4)}' for j in range(1, 1 + n_candidates)]

    # Make sure the output folder exists before the first write.
    os.makedirs(result_dir, exist_ok=True)

    para_df = pd.DataFrame(list(val['params']))
    para_df['ID'] = ID_lst
    para_df.to_csv(os.path.join(result_dir, f'{modelname}_Parameters.csv'))

    # Fit time belongs with the training table, scoring time with the
    # held-out table (the two were previously swapped).
    result_train_df = _score_table(val, ID_lst, 'train', 'mean_fit_time')
    result_train_df.to_csv(os.path.join(result_dir, f'{modelname}_Train_result.csv'))

    result_test_df = _score_table(val, ID_lst, 'test', 'mean_score_time')
    result_test_df.to_csv(os.path.join(result_dir, f'{modelname}_Test_result.csv'))

In [12]:
def model_logging_valid(modelname, random_search, X_valid_tfidf, y_valid):
    """Save a classification report for the validation split.

    Predicts ``X_valid_tfidf`` with the fitted search object, compares the
    predictions with ``y_valid`` and writes the 5-digit text report to
    ``{modelname}_Valid_report.txt`` inside ``result_dir``.
    """
    predictions = random_search.predict(X_valid_tfidf)
    report_text = classification_report(y_valid, predictions, digits=5)

    report_path = os.path.join(result_dir, f'{modelname}_Valid_report.txt')
    with open(report_path, 'w') as report_file:
        report_file.write(report_text)

<h3>Naive Bayes</h3>

In [13]:
# Search space for MultinomialNB: smoothing strength and the fit_prior switch.
MNB_para = dict(
    alpha=np.arange(1e-2, 2, 1e-2),
    fit_prior=[True, False],
)

In [14]:
# Randomised hyper-parameter search for Multinomial Naive Bayes.
# random_state pins the sampled candidates, so re-running this cell draws the
# same parameter sets regardless of how much of numpy's global RNG earlier
# cells have consumed. The best candidate by accuracy is refitted.
random_search = RandomizedSearchCV(
    estimator=MultinomialNB(),
    param_distributions=MNB_para,
    n_iter=Iters,
    n_jobs=parallel_workers,
    cv=cross_val_works,
    scoring=scorers,
    refit='accuracy',
    return_train_score=True,
    random_state=RANDOM_STATE,
    verbose=verbose
)

In [15]:
# Run the Naive Bayes search on the TF-IDF training matrix.
random_search.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [16]:
# Save the CV tables and the validation report under the 'MNB' prefix.
model_logging(modelname='MNB', Iters=Iters, random_search=random_search)
model_logging_valid(
    modelname='MNB',
    random_search=random_search,
    X_valid_tfidf=X_valid_tfidf,
    y_valid=y_valid,
)

<h3>Logistic Regression</h3>

In [17]:
# Search space for LogisticRegression: regularisation strength, penalty type
# and solver; max_iter and random_state are held fixed.
LR_para = dict(
    C=np.arange(1e-2, 1e2, 1e-2),
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    max_iter=[1000],
    random_state=[RANDOM_STATE],
)

In [18]:
# Randomised hyper-parameter search for Logistic Regression.
# random_state pins the sampled candidates, so re-running this cell draws the
# same parameter sets regardless of how much of numpy's global RNG earlier
# cells have consumed. The best candidate by accuracy is refitted.
random_search = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=LR_para,
    n_iter=Iters,
    n_jobs=parallel_workers,
    cv=cross_val_works,
    scoring=scorers,
    refit='accuracy',
    return_train_score=True,
    random_state=RANDOM_STATE,
    verbose=verbose
)

In [19]:
# Run the Logistic Regression search on the TF-IDF training matrix.
random_search.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [20]:
# Save the CV tables and the validation report under the 'LR' prefix.
model_logging(modelname='LR', Iters=Iters, random_search=random_search)
model_logging_valid(
    modelname='LR',
    random_search=random_search,
    X_valid_tfidf=X_valid_tfidf,
    y_valid=y_valid,
)

<h3>Gradient Boosting</h3>

In [21]:
# Search space for GradientBoostingClassifier. min_samples_split and
# min_samples_leaf mix fractional values with absolute counts; criterion and
# random_state are held fixed.
GBC_para = dict(
    n_estimators=range(100, 551, 50),
    criterion=['friedman_mse'],
    learning_rate=np.arange(1e-2, 0.1, 1e-2),
    max_features=['sqrt', 'log2'],
    max_depth=[None, *range(1, 100)],
    min_samples_split=[*np.arange(0.1, 1, 0.1), *range(2, 100)],
    min_samples_leaf=[*np.arange(0.1, 0.5, 0.1), *range(1, 100)],
    random_state=[RANDOM_STATE],
)

In [22]:
# Randomised hyper-parameter search for Gradient Boosting.
# random_state pins the sampled candidates, so re-running this cell draws the
# same parameter sets regardless of how much of numpy's global RNG earlier
# cells have consumed. The best candidate by accuracy is refitted.
random_search = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(),
    param_distributions=GBC_para,
    n_iter=Iters,
    n_jobs=parallel_workers,
    cv=cross_val_works,
    scoring=scorers,
    refit='accuracy',
    return_train_score=True,
    random_state=RANDOM_STATE,
    verbose=verbose
)

In [23]:
# Run the Gradient Boosting search on the TF-IDF training matrix.
random_search.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [24]:
# Save the CV tables and the validation report under the 'GBC' prefix.
model_logging(modelname='GBC', Iters=Iters, random_search=random_search)
model_logging_valid(
    modelname='GBC',
    random_search=random_search,
    X_valid_tfidf=X_valid_tfidf,
    y_valid=y_valid,
)

<h3>Random Forest</h3>

In [25]:
# Search space for RandomForestClassifier. min_samples_split and
# min_samples_leaf mix fractional values with absolute counts; random_state
# is held fixed.
RFC_para = dict(
    n_estimators=range(100, 551, 50),
    criterion=['gini', 'entropy'],
    max_features=['sqrt', 'log2'],
    max_depth=[None, *range(1, 100)],
    min_samples_split=[*np.arange(0.1, 1, 0.1), *range(2, 100)],
    min_samples_leaf=[*np.arange(0.1, 0.5, 0.1), *range(1, 100)],
    random_state=[RANDOM_STATE],
)

In [26]:
# Randomised hyper-parameter search for Random Forest.
# random_state pins the sampled candidates, so re-running this cell draws the
# same parameter sets regardless of how much of numpy's global RNG earlier
# cells have consumed. The best candidate by accuracy is refitted.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=RFC_para,
    n_iter=Iters,
    n_jobs=parallel_workers,
    cv=cross_val_works,
    scoring=scorers,
    refit='accuracy',
    return_train_score=True,
    random_state=RANDOM_STATE,
    verbose=verbose
)

In [27]:
# Run the Random Forest search on the TF-IDF training matrix.
random_search.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [28]:
# Save the CV tables and the validation report under the 'RFC' prefix.
model_logging(modelname='RFC', Iters=Iters, random_search=random_search)
model_logging_valid(
    modelname='RFC',
    random_search=random_search,
    X_valid_tfidf=X_valid_tfidf,
    y_valid=y_valid,
)

<h3>SVM</h3>

In [29]:
# Search space for SVC: kernel, polynomial degree, C, and gamma given either
# as a named heuristic or as a numeric value; random_state is held fixed.
SVM_para = dict(
    kernel=['linear', 'poly', 'rbf'],
    degree=range(1, 6),
    C=np.arange(1e-2, 10, 1e-2),
    gamma=['scale', 'auto', *np.arange(1e-2, 10, 1e-2)],
    random_state=[RANDOM_STATE],
)

In [30]:
# Randomised hyper-parameter search for the support vector classifier.
# random_state pins the sampled candidates, so re-running this cell draws the
# same parameter sets regardless of how much of numpy's global RNG earlier
# cells have consumed. The best candidate by accuracy is refitted.
random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=SVM_para,
    n_iter=Iters,
    n_jobs=parallel_workers,
    cv=cross_val_works,
    scoring=scorers,
    refit='accuracy',
    return_train_score=True,
    random_state=RANDOM_STATE,
    verbose=verbose
)

In [31]:
# Run the SVM search on the TF-IDF training matrix.
random_search.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [32]:
# Save the CV tables and the validation report under the 'SVM' prefix.
model_logging(modelname='SVM', Iters=Iters, random_search=random_search)
model_logging_valid(
    modelname='SVM',
    random_search=random_search,
    X_valid_tfidf=X_valid_tfidf,
    y_valid=y_valid,
)