# Mount Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

# Check GPU Resources

In [None]:
# Shell escape: print the NVIDIA GPU status report for this runtime.
!nvidia-smi

# Check CPU Resources

In [None]:
# Shell escape: dump the kernel's CPU description (one entry per logical core).
!cat /proc/cpuinfo

In [None]:
# Shell escape: dump the kernel's memory statistics for this runtime.
!cat /proc/meminfo

# Install AutoTime to Measure the Runtime

In [None]:
# Install the autotime extension, then load it.
# NOTE(review): hypopt is installed here but is not imported by any cell
# visible in this notebook - confirm it is still needed.
!pip install ipython-autotime hypopt
%load_ext autotime

# Import Libraries

In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, KFold, cross_val_score, RandomizedSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import SGDClassifier, LogisticRegression
from joblib import dump, load
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
import numpy as np
import itertools

from time import time

# Common Methods

## Plot The Confusion Matrix

In [None]:
## https://github.com/parthpatwa/covid19-fake-news-detection/blob/main/ml_baseline-test.ipynb
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heatmap and show it.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (any 2-D array-like of counts is accepted)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to keep the default numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (defaults to 'Blues' when None)

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum)

    Returns
    -------
    None. The figure is rendered with plt.show().

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    cm = np.asarray(cm)

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        # Normalise BEFORE drawing so the heatmap colours, the cell text and
        # the text-contrast threshold all refer to the same values.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any sample stay at 0 instead of becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.figure(figsize=(5, 4))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than this threshold get white text, the others black.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are taken into account.
    plt.tight_layout()
    plt.show()

In [None]:
# English stop words, built once so cleantext() can do set lookups.
stops = set(stopwords.words("english"))
def cleantext(string):
    """Normalise one text for the bag-of-words models.

    Steps, in order: lower-case and collapse whitespace, blank out tokens
    starting at "http" or "www", remove the "&amp" HTML entity, spell the
    remaining "&" as "and", replace every run of non-alphanumeric characters
    with a space, and drop English stop words.

    Arguments
    ---------
    string: the raw text (must be a str)

    Returns
    -------
    The cleaned text as a single space-separated string.
    """
    text = " ".join(string.lower().split())
    text = re.sub(r"http(\S)+",' ',text)
    text = re.sub(r"www(\S)+",' ',text)
    # The entity has to go before the generic "&" rule below; otherwise
    # "&amp;" turns into " and amp;" and leaves a stray "amp" token.
    text = text.replace('&amp',' ')
    text = re.sub(r"&",' and ',text)
    text = re.sub(r"[^0-9a-zA-Z]+",' ',text)
    return " ".join(w for w in text.split() if w not in stops)

# Import Dataset and Define the Random State

In [None]:
# Dataset can be obtained from
# NOTE(review): the source link is missing above - presumably the repository
# cited elsewhere in this notebook (parthpatwa/covid19-fake-news-detection);
# confirm and fill it in.

# Folder on the mounted Google Drive that holds the three CSV splits.
PATH = '/content/drive/MyDrive/UNS/dataset/'
train = pd.read_csv(PATH + 'Constraint_Train.csv')
val = pd.read_csv(PATH + 'Constraint_Val.csv')
test = pd.read_csv(PATH + 'english_test_with_labels.csv')

# Seed passed as random_state to the tuned classifiers and SMOTE samplers below.
RANDOM_STATE = 21

In [None]:
# Run cleantext over the 'tweet' and 'label' columns of every split, in place.
# The labels go through the same cleaner as the tweets.
train['tweet'] = train['tweet'].map(cleantext)
train['label'] = train['label'].map(cleantext)
val['tweet'] = val['tweet'].map(cleantext)
val['label'] = val['label'].map(cleantext)
test['tweet'] = test['tweet'].map(cleantext)
test['label'] = test['label'].map(cleantext)
# Alias of the cleaned training labels.
new_train = train['label']

In [None]:
# Print the cleaned training labels and tweets as a sanity check of the cleaning step.
print(train['label'])
print(train['tweet'])

In [None]:
# Encode the cleaned training labels as integers: "fake" -> 0, "real" -> 1.
# Any other label value is skipped.
labels = [0 if name == "fake" else 1
          for name in train['label'].values
          if name in ("fake", "real")]
new_train_label = labels

b = np.array(labels)
a = np.where(b == 1)   # positions of the "real" samples
c = b[b == 1]          # all "real" entries
d = b[b == 0]          # all "fake" entries

# Class sizes: number of "real" samples first, then number of "fake" samples.
print (len(c))
print (len(d))

In [None]:
# Class frequencies per split. value_counts() sorts by frequency, so the
# validation and test counts are re-indexed to the training order; otherwise a
# split whose majority class differs would have its bars swapped.
train_counts = train['label'].value_counts()
class_order = train_counts.index
val_counts = val['label'].value_counts().reindex(class_order, fill_value=0)
test_counts = test['label'].value_counts().reindex(class_order, fill_value=0)

# set width of bar
bar_width = 0.25
fig, ax = plt.subplots(figsize=(12, 8))

# Set position of bar on X axis: one group per class, three bars per group
train_pos = np.arange(len(class_order))
val_pos = train_pos + bar_width
test_pos = val_pos + bar_width

# Make the plot (counts are shown in hundreds)
ax.bar(train_pos, train_counts.values / 100, fill=False, width=bar_width,
       edgecolor='grey', hatch='x', label='Training Data')
ax.bar(val_pos, val_counts.values / 100, fill=False, width=bar_width,
       edgecolor='grey', hatch='+', label='Validation Data')
ax.bar(test_pos, test_counts.values / 100, fill=False, width=bar_width,
       edgecolor='grey', hatch='o', label='Testing Data')

# Adding Xticks: centred on the middle bar and labelled from the data itself,
# so the names always match the bars they sit under.
ax.set_xlabel('Social Media News', fontsize=16)
ax.set_ylabel('Count (x 100)', fontsize=16)
ax.set_xticks(val_pos)
ax.set_xticklabels([str(name).capitalize() for name in class_order], fontsize=14)
ax.tick_params(axis='y', labelsize=14)
ax.set_title('Dataset Composition', fontweight='bold', fontsize=20)

ax.legend(fontsize=16)
plt.show()

## Define Metrics

In [None]:
def print_metrices(pred,true):
    """Print and return accuracy plus weighted precision, recall and F1.

    Arguments
    ---------
    pred: predicted labels
    true: ground-truth labels, same length as `pred`

    Returns
    -------
    (accuracy, precision, recall, f1) as a tuple.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order. Passing them
    # the other way round swaps the roles of precision and recall and weights
    # the averages by the predicted class sizes instead of the true ones.
    accuracy = accuracy_score(true, pred)
    precision = precision_score(true, pred, average = 'weighted')
    recall = recall_score(true, pred, average = 'weighted')
    f1 = f1_score(true, pred, average = 'weighted')
    print("Accuracy : ",accuracy)
    print("Precison : ",precision)
    print("Recall : ",recall)
    print("F1 : ",f1)
    return accuracy, precision, recall, f1

# Original Baseline for Training, Validation, and Testing

## Method for Training and Validation

In [None]:
# This method is adapted from the original source-code on:
# https://github.com/parthpatwa/covid19-fake-news-detection/blob/main/ml_baseline.ipynb
def train_val_baseline(pipeline, method_title):
  """Fit `pipeline` on the training split and evaluate it on the validation split.

  Prints the elapsed fit+predict time and the metrics, plots the raw-count
  confusion matrix, and writes the misclassified rows of the original
  (uncleaned) validation CSV to '<method_title>_val_misclassified.csv' under PATH.

  Arguments
  ---------
  pipeline:     estimator exposing fit/predict; it is fitted in place
  method_title: label used in the printed header, the plot title and the
                output file name
  """
  print(f'Training and Validation of {method_title}')

  # Time fitting and prediction together.
  started = time()
  pipeline.fit(train['tweet'], train['label'])
  val_pred = pipeline.predict(val['tweet'])
  print(f'Time: {time() - started} \nPerformance score:')

  # Scores first, then the confusion matrix with raw counts.
  print_metrices(val_pred, val['label'])
  val_cm = confusion_matrix(val['label'], val_pred)
  plot_confusion_matrix(val_cm,
                        target_names=['fake','real'],
                        normalize = False,
                        title = f'Confusion matix of {method_title} on val data')

  # Select the wrongly predicted rows from a fresh copy of the validation
  # file so the saved tweets are the original, uncleaned text.
  wrong = val_pred != val['label']
  raw_val = pd.read_csv(PATH + 'Constraint_Val.csv')
  misclassified = raw_val[wrong]
  misclassified.info()
  misclassified.to_csv(PATH + f'{method_title}_val_misclassified.csv', index=False)

## Method for Testing

In [None]:
# This method is adapted from the original source-code on:
# https://github.com/parthpatwa/covid19-fake-news-detection/blob/main/ml_baseline-test.ipynb
def test_baseline(pipeline, method_title):
  """Fit `pipeline` on the training split and evaluate it on the test split.

  Prints the elapsed fit+predict time and the metrics, plots the raw-count
  confusion matrix, and writes the misclassified rows of the original test
  CSV to '<method_title>_test_misclassified.csv' under PATH.

  Arguments
  ---------
  pipeline:     estimator exposing fit/predict; it is (re)fitted in place
  method_title: label used in the printed header, the plot title and the
                output file name
  """
  print(f'Testing Baseline of {method_title}')

  # get the starting time
  t0 = time()

  # fit the model with the data
  pipeline.fit(train['tweet'],train['label'])

  pred=pipeline.predict(test['tweet'])

  # display the elapsed time and performance scores
  print(f'Time: {time() - t0} \nPerformance score:')

  print_metrices(pred, test['label'])
  # `pred` holds predictions for the test split, so the matrix must be built
  # from the test labels (not the validation labels).
  plot_confusion_matrix(confusion_matrix(test['label'], pred),
                        target_names=['fake','real'],
                        normalize = False,
                        title = f'Confusion matix of {method_title} on Test data')

  # NOTE(review): `test` is loaded from 'english_test_with_labels.csv' while
  # the raw copy below comes from 'Constraint_Test.csv'; the boolean mask only
  # lines up if both files hold the same rows in the same order - confirm.
  test_ori = pd.read_csv(PATH + 'Constraint_Test.csv')
  test_misclass_df = test_ori[pred!=test['label']]
  test_misclass_df.info()
  test_misclass_df.to_csv(PATH + f'{method_title}_test_misclassified.csv', index=False)

## Gradient Boosting

In [None]:
# Baseline GDBT: bag-of-words counts -> TF-IDF -> GradientBoostingClassifier
# with default settings.
# minor variations in final results due to randomness (no random_state is
# passed to the classifier in this cell)
pipeline = Pipeline([
        ('bow', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('c', GradientBoostingClassifier())
    ])

# Evaluate on the validation split, then fit again and evaluate on the test split.
train_val_baseline(pipeline, 'GDBT')
print('\n\n')
test_baseline(pipeline, 'GDBT')

## Decision Tree

In [None]:
# Baseline DT: bag-of-words counts -> TF-IDF -> DecisionTreeClassifier with
# default settings.
# minor variations in final results due to randomness (no random_state is
# passed to the classifier in this cell)
pipeline = Pipeline([
        ('bow', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('c', tree.DecisionTreeClassifier())
    ])

# Evaluate on the validation split, then fit again and evaluate on the test split.
train_val_baseline(pipeline, 'DT')
print('\n\n')
test_baseline(pipeline, 'DT')

## Multinomial Naive Bayes

In [None]:
# Baseline MNB: bag-of-words counts -> TF-IDF -> MultinomialNB(alpha=0.1).
#minor variations in final results due to randomness
# NOTE(review): none of these steps takes a random seed here, and
# MultinomialNB is not expected to be stochastic - the remark above may not
# apply to this cell; confirm.
pipeline = Pipeline([
        ('bow', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('c', MultinomialNB(alpha=0.1))
    ])

# Evaluate on the validation split, then fit again and evaluate on the test split.
train_val_baseline(pipeline, 'MNB')
print('\n\n')
test_baseline(pipeline, 'MNB')

## Complement Naive Bayes

In [None]:
# Baseline MNC: bag-of-words counts -> TF-IDF -> ComplementNB(alpha=0.1).
#minor variations in final results due to randomness
# NOTE(review): none of these steps takes a random seed here, and
# ComplementNB is not expected to be stochastic - the remark above may not
# apply to this cell; confirm.
pipeline = Pipeline([
        ('bow', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('c', ComplementNB(alpha=0.1))
    ])

# Evaluate on the validation split, then fit again and evaluate on the test split.
train_val_baseline(pipeline, 'MNC')
print('\n\n')
test_baseline(pipeline, 'MNC')

# Choosing the Right Hyperparameters for Training

## Use Hyperparameter Searching Method

In [None]:
def hyper_param_search(pipeline, parameter, file_name, cv_range = range(5,11)):
  """Run one grid search per cross-validation setting and save a summary CSV.

  For every value in `cv_range` a GridSearchCV over `parameter` is fitted on
  the training split; the best parameters, the best cross-validation score,
  the score on the validation split and the runtime are printed and
  collected. The collected rows are written to PATH + file_name.

  Arguments
  ---------
  pipeline:  estimator/pipeline handed to GridSearchCV
  parameter: parameter grid (dict of name -> list of candidate values)
  file_name: name of the CSV written under PATH
  cv_range:  iterable of `cv` values to try (default 5..10)

  Returns
  -------
  None. Results are printed and written to disk.
  """
  performance_array = []
  # Perform the grid search
  for i in cv_range:
      # Use the grid passed as argument; relying on a notebook-level
      # `parameters` variable would silently search the wrong (or a stale) grid.
      grid_search = GridSearchCV(pipeline, parameter, cv=i, n_jobs=2, verbose=1)
      t0 = time()
      grid_search.fit(train['tweet'], train['label'])
      gs_time = time() - t0

      # Collect the best parameters, their cross-validation score on the
      # training split, and the score on the validation split
      gs_best_param = grid_search.best_params_
      gs_best_score = grid_search.best_score_
      gs_val_score = grid_search.score(val['tweet'], val['label'])

      # Print the best parameters and score for each cross-validation value
      print("Cross-validation value: ", i)
      print("Best parameters: ", gs_best_param)
      print("Best Train score: ", gs_best_score)
      print('Validation Score:', gs_val_score)
      print('Runtime:', gs_time)

      # Populate the parameters, scores, and running time for later table
      performance_array.append([i, gs_best_param,
                                gs_best_score,
                                gs_val_score,
                                gs_time])

  # convert the previously collected performance array into Panda DataFrame
  # and arrange the columns
  data_df = pd.DataFrame(performance_array, columns=['k', 'best_param',
                                                     'best_score', 'val_score',
                                                     'time'])

  # Write the DataFrame to a CSV file
  data_df.to_csv(PATH + file_name, index=False)

##GradientBoosting

###Without SMOTE

####Finding the best parameters

In [None]:
# Search pipeline: bag-of-words counts -> TF-IDF -> gradient boosting with a
# fixed seed.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE))
])

# Define the parameters to be tuned
parameters = {
    'gb__n_estimators': [750, 1000, 1500],
    'gb__learning_rate': [1.0, 0.1, 0.01],
    # Explicit values instead of np.arange(0.5, 1.1, 0.1): with a float step
    # arange can also emit the end point (~1.1), which is outside the valid
    # subsample range (0, 1] and only produces failed fits.
    'gb__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

# Perform hyperparameter search
hyper_param_search(pipeline, parameters, 'GB_HYPERPARAM.csv')

####Pipeline

In [None]:
# Tuned GB pipeline with a fixed random_state.
# The hard-coded learning_rate / n_estimators / subsample are presumably the
# best values reported by the search above - TODO confirm against GB_HYPERPARAM.csv.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE,
                                      learning_rate=0.1, n_estimators=1500,
                                      subsample=0.7999999999999999))
])


####Validation

In [None]:
# Fit on train, report validation metrics and save the misclassified validation rows.
train_val_baseline(pipeline, 'GB with Hyperparameter Tuned')


#### Testing

In [None]:
# Fit again on train, report test metrics and save the misclassified test rows.
test_baseline(pipeline, 'GB with Hyperparameter Tuned')

###With SMOTE

####Finding the best parameters

In [None]:
# Search pipeline: the GB settings hard-coded below stay fixed; only the SMOTE
# step placed between TF-IDF and the classifier is tuned.
# NOTE(review): the `n_jobs` argument of SMOTE is deprecated in recent
# imbalanced-learn releases - confirm against the installed version.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE(random_state=RANDOM_STATE,
                       sampling_strategy='minority',
                       n_jobs=-1)),
    ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE,
                                      learning_rate=0.1, n_estimators=1500,
                                      subsample=0.7999999999999999))
])

# Define the parameters to be tuned
# 3 strategies x 100 neighbour counts = 300 candidates per cross-validation
# setting, and hyper_param_search repeats the search for cv = 5..10 by default.
parameters = {
    'sampling__sampling_strategy': ['minority', 'not minority', 'all'],
    'sampling__k_neighbors': np.arange(1, 101).tolist()
}

# Perform hyperparameter search
hyper_param_search(pipeline, parameters, 'GB_SMOTE_HYPERPARAM.csv')

####Pipeline

In [None]:
# Tuned GB + SMOTE pipeline; both the sampler and the classifier use RANDOM_STATE.
# k_neighbors=2 is presumably the value selected by the search above - TODO
# confirm against GB_SMOTE_HYPERPARAM.csv.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE(random_state=RANDOM_STATE,
                       sampling_strategy='minority',
                       k_neighbors=2)),
    ('gb', GradientBoostingClassifier(random_state=RANDOM_STATE,
                                      learning_rate=0.1, n_estimators=1500,
                                      subsample=0.7999999999999999))
])

####Validation

In [None]:
# Fit on train, report validation metrics and save the misclassified validation rows.
train_val_baseline(pipeline, 'GB and SMOTE (k=2) with Hyperparameter Tuned')

####Testing

In [None]:
# Fit again on train, report test metrics and save the misclassified test rows.
test_baseline(pipeline, 'GB and SMOTE (k=2) with Hyperparameter Tuned')

##Decision Tree

###Without SMOTE

####Finding the best parameters

In [None]:
# Search pipeline: bag-of-words counts -> TF-IDF -> decision tree with a fixed seed.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('dt', tree.DecisionTreeClassifier(random_state=RANDOM_STATE))
])

# Define the parameters to be tuned
# min_samples_split takes the even values 2..20; with the two class_weight
# options that is 20 candidates per cross-validation setting.
parameters = {
    'dt__min_samples_split': np.arange(2, 21, 2).tolist(),
    'dt__class_weight': [None, 'balanced']
}

# Perform hyperparameter search
hyper_param_search(pipeline, parameters, 'DT_HYPERPARAM.csv')

####Pipeline

In [None]:
# Random State to ensure a reproducible result.
# class_weight / min_samples_split are presumably the best values reported by
# the search above - TODO confirm against DT_HYPERPARAM.csv.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('dt', tree.DecisionTreeClassifier(random_state=RANDOM_STATE,
                                       class_weight=None,
                                       min_samples_split=16))
])


####Validation

In [None]:
# Fit on train, report validation metrics and save the misclassified validation rows.
train_val_baseline(pipeline, 'DT with Hyperparameter Tuned')


#### Testing

In [None]:
# Fit again on train, report test metrics and save the misclassified test rows.
test_baseline(pipeline, 'DT with Hyperparameter Tuned')

###With SMOTE

####Finding the best parameters

In [None]:
# Search pipeline: the decision-tree settings hard-coded below stay fixed;
# only the SMOTE step placed between TF-IDF and the classifier is tuned.
# NOTE(review): the `n_jobs` argument of SMOTE is deprecated in recent
# imbalanced-learn releases - confirm against the installed version.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE(random_state=RANDOM_STATE, n_jobs=-1)),
    ('dt', tree.DecisionTreeClassifier(random_state=RANDOM_STATE,
                                       min_samples_split=16,
                                       class_weight=None))
])

# Define the parameters to be tuned
# 3 strategies x 100 neighbour counts = 300 candidates per cross-validation setting.
parameters = {
    'sampling__sampling_strategy': ['minority', 'not minority', 'all'],
    'sampling__k_neighbors': np.arange(1, 101).tolist()
}

# Perform hyperparameter search
hyper_param_search(pipeline, parameters, 'DT_SMOTE_HYPERPARAM.csv')

####Pipeline

In [None]:
# Tuned DT + SMOTE pipeline; both the sampler and the classifier use RANDOM_STATE.
# k_neighbors=29 is presumably the value selected by the search above - TODO
# confirm against DT_SMOTE_HYPERPARAM.csv.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE(random_state=RANDOM_STATE,
                       k_neighbors=29,
                       sampling_strategy='minority')),
    ('dt', tree.DecisionTreeClassifier(random_state=RANDOM_STATE,
                                       class_weight=None,
                                       min_samples_split=16))
])


####Validation

In [None]:
# Fit on train, report validation metrics and save the misclassified validation rows.
train_val_baseline(pipeline, 'DT and SMOTE (k=29) with Hyperparameter Tuned')


#### Testing

In [None]:
# Fit again on train, report test metrics and save the misclassified test rows.
test_baseline(pipeline, 'DT and SMOTE (k=29) with Hyperparameter Tuned')

##Multinomial Naive Bayes

###Without SMOTE

####Finding the best parameters

In [None]:
# Search pipeline: bag-of-words counts -> TF-IDF -> MultinomialNB with default settings.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB())
])

# Define the parameters to be tuned
# Only the smoothing parameter alpha is searched (5 candidates).
parameters = {
    'mnb__alpha': [100, 10, 1.0, 0.1, 0.01]#,
}

# Perform hyperparameter search
hyper_param_search(pipeline, parameters, 'MNB_HYPERPARAM.csv')


####Pipeline

In [None]:
# Tuned MNB pipeline.
# alpha=0.1 is presumably the value selected by the search above - TODO
# confirm against MNB_HYPERPARAM.csv.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(alpha=0.1))
])


####Validation

In [None]:
# Fit on train, report validation metrics and save the misclassified validation rows.
train_val_baseline(pipeline, 'MNB with Hyperparameter Tuned')


#### Testing

In [None]:
# Fit again on train, report test metrics and save the misclassified test rows.
test_baseline(pipeline, 'MNB with Hyperparameter Tuned')

###With SMOTE

####Finding the best parameters

In [None]:
# Search pipeline: MultinomialNB(alpha=0.1) stays fixed; only the SMOTE step
# placed between TF-IDF and the classifier is tuned.
# NOTE(review): the `n_jobs` argument of SMOTE is deprecated in recent
# imbalanced-learn releases - confirm against the installed version.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE(random_state=RANDOM_STATE, n_jobs=-1)),
    ('mnb', MultinomialNB(alpha=0.1))
])

# Define the parameters to be tuned
# 3 strategies x 100 neighbour counts = 300 candidates per cross-validation setting.
parameters = {
    'sampling__sampling_strategy': ['minority', 'not minority', 'all'],
    'sampling__k_neighbors': np.arange(1, 101).tolist()
}

# Perform hyperparameter search
hyper_param_search(pipeline, parameters, 'MNB_SMOTE_HYPERPARAM.csv')

####Pipeline

In [None]:
# Tuned MNB + SMOTE pipeline; the sampler uses RANDOM_STATE.
# k_neighbors=45 is presumably the value selected by the search above - TODO
# confirm against MNB_SMOTE_HYPERPARAM.csv.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE(random_state=RANDOM_STATE,
                       k_neighbors=45,
                       sampling_strategy='minority')),
    ('mnb', MultinomialNB(alpha=0.1))
])


####Validation

In [None]:
# Fit on train, report validation metrics and save the misclassified validation rows.
train_val_baseline(pipeline, 'MNB and SMOTE (k=45) with Hyperparameter Tuned')


#### Testing

In [None]:
# Fit again on train, report test metrics and save the misclassified test rows.
test_baseline(pipeline, 'MNB and SMOTE (k=45) with Hyperparameter Tuned')

##Complement Naive Bayes

###Without SMOTE

####Finding the best parameters

In [None]:
# Search pipeline: bag-of-words counts -> TF-IDF -> ComplementNB with default settings.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('mnc', ComplementNB())
])

# Define the parameters to be tuned
# 2 norm options x 5 alpha values = 10 candidates per cross-validation setting.
parameters = {
    'mnc__norm': [True, False],
    'mnc__alpha': [100, 10, 1.0, 0.1, 0.01]
}

# Perform hyperparameter search
hyper_param_search(pipeline, parameters, 'MNC_HYPERPARAM.csv')


####Pipeline

In [None]:
# Tuned MNC pipeline.
# alpha=0.1 is presumably the value selected by the search above; `norm` was
# also searched but is left at its default here - TODO confirm against
# MNC_HYPERPARAM.csv.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('mnc', ComplementNB(alpha=0.1))
])


####Validation

In [None]:
# Fit on train, report validation metrics and save the misclassified validation rows.
train_val_baseline(pipeline, 'MNC Hyperparameter Tuned')


#### Testing

In [None]:
# Fit again on train, report test metrics and save the misclassified test rows.
test_baseline(pipeline, 'MNC Hyperparameter Tuned')

###With SMOTE

####Finding the best parameters

In [None]:
# Search pipeline: ComplementNB(alpha=0.1) stays fixed; only the SMOTE step
# placed between TF-IDF and the classifier is tuned.
# NOTE(review): the `n_jobs` argument of SMOTE is deprecated in recent
# imbalanced-learn releases - confirm against the installed version.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE(random_state=RANDOM_STATE, n_jobs=-1)),
    ('mnc', ComplementNB(alpha=0.1))
])

# Define the parameters to be tuned
# 3 strategies x 100 neighbour counts = 300 candidates per cross-validation setting.
parameters = {
    'sampling__sampling_strategy': ['minority', 'not minority', 'all'],
    'sampling__k_neighbors': np.arange(1, 101).tolist()
}

# Perform hyperparameter search
hyper_param_search(pipeline, parameters, 'MNC_SMOTE_HYPERPARAM.csv')

####Pipeline

In [None]:
# Tuned MNC + SMOTE pipeline; the sampler uses RANDOM_STATE.
# k_neighbors=45 is presumably the value selected by the search above - TODO
# confirm against MNC_SMOTE_HYPERPARAM.csv.
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('sampling', SMOTE(random_state=RANDOM_STATE, k_neighbors=45,
                       sampling_strategy='minority')),
    ('mnc', ComplementNB(alpha=0.1))
])


####Validation

In [None]:
# Fit on train, report validation metrics and save the misclassified validation rows.
train_val_baseline(pipeline, 'MNC and SMOTE (k=45) Hyperparameter Tuned')


#### Testing

In [None]:
# Fit again on train, report test metrics and save the misclassified test rows.
test_baseline(pipeline, 'MNC and SMOTE (k=45) Hyperparameter Tuned')