# Perform necessary Imports

In [1]:
import pandas as pd
import xgboost
import re
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import f1_score


# Read data

In [2]:
totalData = pd.read_csv('../datasets/nela10.csv')

# Drop all columns except title and reliability(label)

In [3]:
totalData = totalData.drop(['id','date','source','content','author','url','published','published_utc','collection_utc'],axis=1)

# How it looks like?

In [4]:
totalData

Unnamed: 0,title,Reliability
0,"Who is Stacey Abrams, the Democrat who will re...",0
1,Ban government shutdowns? Some Republicans and...,0
2,Democrats renew push for equal pay for equal work,0
3,Trump attacks intel chiefs after they contradi...,0
4,Border security battle begins as Congress grap...,0
...,...,...
57152,Hong Kong Officer Faces Death Threats After Fi...,2
57153,"Enough ""Quid Pro Quo"" Gaslighting!",2
57154,"""Born For This? I Don't Think So"" - Trump Mock...",2
57155,How Iran Used Google To Disrupt 5% Of Global O...,2


# Check  if label=1(mixed) exists 

In [5]:
totalData.Reliability.unique()

array([0, 2])

# Clean Dataset

In [6]:
totalData.title=totalData.title.astype(str)

In [7]:
totalData = totalData.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = []

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
totalData.title = totalData.title.apply(clean_text)
totalData.title = totalData.title.str.replace('\d+', '')

# test train split 

In [8]:
# split the dataset into training and validation datasets 
from sklearn.model_selection import train_test_split
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train['tweet'], train['label'])
train_x, valid_x, train_y, valid_y = train_test_split(totalData['title'], totalData['Reliability'], test_size=0.20)

# Develop Unigram and Bigram Tf-Idf feature vectors from data

In [9]:
# unigram level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=None)
tfidf_vect.fit(totalData['title'])
xtrain_tfidf_unigram =  tfidf_vect.transform(train_x)
xvalid_tfidf_unigram =  tfidf_vect.transform(valid_x)

# bigram level tf-idf (bigram in this case)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,2), max_features=None)
tfidf_vect_ngram.fit(totalData['title'])
xtrain_tfidf_bigram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_bigram =  tfidf_vect_ngram.transform(valid_x)


# Build Model functions

In [10]:
def train_model(classifier, feature_vector_train, label,  feature_vector_valid, valid_y,is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    #print("In Validation Data",metrics.accuracy_score(predictions, valid_y))
    print("f1 score: ",f1_score(valid_y,predictions,average='weighted'))        
    return metrics.accuracy_score(valid_y,predictions)
    

# Base Line Model Used
## 1. Naive Bayes
## 2. Linear Classifier
## 3. Bagging
## 4. Boosting
## 5. SVM

#  Naive Bayes Model 

In [11]:
# Naive Bayes on Word Level TF IDF Vectors
print("For Unigram Tf-IDF features vectors using Naive Bayes")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram, valid_y)
print ("Accuracy: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
print("For Bigram Tf-IDF features vectors using Naive Bayes")
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram, valid_y)
print ("Accuracy: ", accuracy)

For Unigram Tf-IDF features vectors using Naive Bayes
f1 score:  0.7113740893743468
Accuracy:  0.7870013995801259
For Bigram Tf-IDF features vectors using Naive Bayes
f1 score:  0.699537036800892
Accuracy:  0.783852344296711


# Linear Classifier

In [12]:
# Linear Classifier on Word Level TF IDF Vectors
print("For Unigram TF-Idf feature vectors using Logistic Regression")
accuracy = train_model(linear_model.LogisticRegression(max_iter=500), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram, valid_y)
print("accuracy: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
print("For Bigram TF-Idf feature vectors using Logistic Regression")
accuracy = train_model(linear_model.LogisticRegression(max_iter=500),  xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram, valid_y)
print("Accuracy: ", accuracy)

For Unigram TF-Idf feature vectors using Logistic Regression
f1 score:  0.7882306617831591
accuracy:  0.8206787963610916
For Bigram TF-Idf feature vectors using Logistic Regression
f1 score:  0.6923977487925838
Accuracy:  0.7804408677396781


# Bagging Model

In [13]:
# RF on Word Level TF IDF Vectors
print("For Unigram Tf-Idf feature vectors using Random Forest Classifier")
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_unigram, train_y, xvalid_tfidf_unigram, valid_y)
print ("Accuracy: ", accuracy)

# RF on ngram Level TF IDF Vectors
print("For Bigram Tf-Idf feature vectors using Random Forest Classifier")
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_bigram, train_y, xvalid_tfidf_bigram, valid_y)
print ("Accuracy: ", accuracy)

For Unigram Tf-Idf feature vectors using Random Forest Classifier
f1 score:  0.7945991992559391
Accuracy:  0.8280265920223933
For Bigram Tf-Idf feature vectors using Random Forest Classifier
f1 score:  0.7125370023304538
Accuracy:  0.7032015395381386


# Boosting Model

In [14]:
print("For Unigram Tf-Idf feature vectors using Extreme Gradient Boosting")
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_unigram.tocsc(), train_y, xvalid_tfidf_unigram.tocsc(), valid_y)
print("Accuracy: ", accuracy)

# Extereme Gradient Boosting on ngram Level TF IDF Vectors
print("For Bigram Tf-Idf feature vectors using Extreme Gradient Boosting")
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_bigram.tocsc(), train_y, xvalid_tfidf_bigram.tocsc(), valid_y)
print("Accuracy: ", accuracy)

For Unigram Tf-Idf feature vectors using Extreme Gradient Boosting
f1 score:  0.7780950529194122
Accuracy:  0.8152554233729881
For Bigram Tf-Idf feature vectors using Extreme Gradient Boosting
f1 score:  0.7121169688114273
Accuracy:  0.7857767669699091


# SVM Model

In [15]:
print("For Unigram Tf-Idf feature vectors using SVM")
#SVM Model on Unigram TF-IDF
accuracy = train_model(svm.SVC(), xtrain_tfidf_unigram.tocsc(), train_y, xvalid_tfidf_unigram.tocsc(), valid_y)
print("Accuracy: ", accuracy)

# SVM Model on Bigram TF-IDF
print("For Bigram Tf-Idf feature vectors using SVM")
accuracy = train_model(svm.SVC(), xtrain_tfidf_bigram.tocsc(), train_y, xvalid_tfidf_bigram.tocsc(), valid_y)
print("Accuracy: ", accuracy)

For Unigram Tf-Idf feature vectors using SVM
f1 score:  0.8012498224867763
Accuracy:  0.8313505948215535
For Bigram Tf-Idf feature vectors using SVM
f1 score:  0.7279851890373011
Accuracy:  0.7967984604618614
