- Database: Covid-fake
- Function: cleaning
- Description: NA

# Necessary Imports...

In [None]:
# Start writing code here...
import pandas as pd
import xgboost
import re
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import f1_score


# Read test and training data

In [None]:
# Load the two CSV sheets into DataFrames. Note that the frame named `test`
# is read from the "Val" sheet, i.e. the validation file serves as the
# held-out evaluation set for the rest of the notebook.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/covid/Constraint_English_Train - Sheet1.csv',
        '../datasets/covid/Constraint_English_Val - Sheet1.csv',
    )
)

In [None]:
# Preview the first ten rows of the training frame as loaded (labels are still strings here).
train.head(10)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real
5,6,"Covid Act Now found ""on average each person in...",real
6,7,If you tested positive for #COVID19 and have n...,real
7,8,Obama Calls Trump’s Coronavirus Response A Cha...,fake
8,9,"???Clearly, the Obama administration did not l...",fake
9,10,Retraction—Hydroxychloroquine or chloroquine w...,fake


In [None]:
# Class names; the position in this list is the integer class id.
labels = ['fake', 'real']


def label_encode(val):
    """Map a label string to its integer class id.

    'fake' -> 0 and 'real' -> 1, following the order of `labels`.
    Raises ValueError (from list.index) when `val` is not a known label.
    """
    class_id = labels.index(val)
    return class_id

# Label Encoding

In [None]:
# Replace the string labels with their integer ids (fake=0, real=1).
# Not idempotent: a second run fails because the column already holds ints.
train['label'] = train['label'].apply(label_encode)

In [None]:
# Preview the first ten rows again to confirm the label column now holds integer ids.
train.head(10)

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,1
1,2,States reported 1121 deaths a small rise from ...,1
2,3,Politically Correct Woman (Almost) Uses Pandem...,0
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,1
4,5,Populous states can generate large case counts...,1
5,6,"Covid Act Now found ""on average each person in...",1
6,7,If you tested positive for #COVID19 and have n...,1
7,8,Obama Calls Trump’s Coronavirus Response A Cha...,0
8,9,"???Clearly, the Obama administration did not l...",0
9,10,Retraction—Hydroxychloroquine or chloroquine w...,0


# Cleaning training and test data

In [None]:
# Rebuild a fresh positional index on the training frame, discarding the previous one.
train = train.reset_index(drop=True)
# Separator punctuation: each occurrence is replaced by a single space.
# Raw strings avoid the invalid-escape warnings that '\[' / '\]' / '\|' trigger
# in ordinary string literals.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character other than digits, lowercase a-z, space, '#', '+' and '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Tokens removed during cleaning; empty here, so no stopword filtering takes place.
STOPWORDS = []

def clean_text(text):
    """Normalise a tweet for vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. replace separator punctuation (REPLACE_BY_SPACE_RE) with a space;
      3. delete every remaining character matched by BAD_SYMBOLS_RE;
      4. collapse whitespace and drop tokens listed in STOPWORDS.

    Args:
        text: a string.

    Returns:
        The cleaned string, tokens separated by single spaces.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # split()/join also collapses the runs of spaces left behind by the substitutions
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
# Normalise every training tweet, then strip digit runs.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` treats
# the pattern as a literal string by default, which would leave all digits in place.
train.tweet = train.tweet.apply(clean_text)
train.tweet = train.tweet.str.replace(r'\d+', '', regex=True)

### Preparing Test Data

In [None]:
# Apply the same preparation to the held-out frame as to the training frame:
# integer-encode the labels, reset the index, clean the tweet text, strip digits.
test.label = test.label.apply(label_encode)
test = test.reset_index(drop=True)
test.tweet = test.tweet.apply(clean_text)
# regex=True is explicit because pandas >= 2.0 defaults to literal matching,
# under which '\d+' would never match and digits would survive.
test.tweet = test.tweet.str.replace(r'\d+', '', regex=True)

In [None]:
# Spot-check ten randomly drawn cleaned training tweets (unseeded, so the rows differ per run).
train.tweet.sample(10)

5688    our daily update is published weve now tracked...
530     new podcast can wearable tech + ai converge to...
2313    coroanvirus northern ireland to see tighter re...
3341    whitwyatt the pending data from california has...
6031    deaths rose only marginally nationally from la...
4908    a liquid mixture is being sold with claims tha...
5491    shopper was wearing facemask backtofront https...
6252    #indiafightscorona india has the highest recov...
3401    bill de blasio the mayor of new york city said...
2155    update from the minhealthnz today we have  new...
Name: tweet, dtype: object

# Baseline Models Used
## 1. Naive Bayes
## 2. Linear Classifier
## 3. Bagging
## 4. Boosting
## 5. SVM

### Building Model

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, valid_y, test_data, test_label, is_neural_net=False):
    """Fit `classifier` and score it on the test set.

    Args:
        classifier: estimator exposing `fit(X, y)` and `predict(X)`.
        feature_vector_train: training features passed to `fit`.
        label: training targets passed to `fit`.
        feature_vector_valid: validation features; predictions are computed
            for them but not scored.
        valid_y: validation targets; currently unused.
        test_data: test features.
        test_label: test targets used for both metrics.
        is_neural_net: when True, `predict` output is reduced with
            `argmax(axis=-1)` before scoring.

    Returns:
        Accuracy on the test set. The test F1 score is printed as a side effect.
    """
    classifier.fit(feature_vector_train, label)

    def _predict_labels(features):
        # Reduce the raw predict() output over its last axis only for neural-net models.
        raw_output = classifier.predict(features)
        return raw_output.argmax(axis=-1) if is_neural_net else raw_output

    # Validation predictions are still produced, but no metric is derived from them.
    _predict_labels(feature_vector_valid)

    test_predictions = _predict_labels(test_data)
    print("f1 score: ", f1_score(test_label, test_predictions))

    return metrics.accuracy_score(test_label, test_predictions)
    

### 1. Splitting the Data into Train and Validation

In [None]:
# Split the training frame into training and validation parts (33% held out).
# random_state is pinned so the split, and every score derived from it,
# is identical on each Restart & Run All.
from sklearn.model_selection import train_test_split
train_x, valid_x, train_y, valid_y = train_test_split(
    train['tweet'], train['label'], test_size=0.33, random_state=42
)

### 2. Applying word-level TF-IDF and n-gram (bi-/tri-gram) TF-IDF

In [None]:
# Word-level TF-IDF: single-token features, vocabulary capped at 5000 terms.
# The vectorizer is fitted on the full training frame (train and validation rows).
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train['tweet'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)
test_tfidf   =  tfidf_vect.transform(test['tweet'])

# N-gram level TF-IDF: ngram_range=(2,3) yields bigrams and trigrams.
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train['tweet'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
# The test set must go through the n-gram vectorizer too. Transforming it with
# the word-level `tfidf_vect` gives columns that do not correspond to the
# features the n-gram models were trained on.
test_tfidf_ngram   =  tfidf_vect_ngram.transform(test['tweet'])

#  Naive Bayes Model 

In [None]:
# Multinomial Naive Bayes, once per feature space (word-level, then n-gram).
# A fresh estimator is built for each run.
for report_tag, (fit_x, val_x, eval_x) in (
    ("NB, WordLevel TF-IDF: ", (xtrain_tfidf, xvalid_tfidf, test_tfidf)),
    ("NB, Bi-Gram Vectors: ", (xtrain_tfidf_ngram, xvalid_tfidf_ngram, test_tfidf_ngram)),
):
    accuracy = train_model(naive_bayes.MultinomialNB(), fit_x, train_y, val_x, valid_y, eval_x, test['label'])
    print(report_tag, accuracy)

f1 score:  0.9053461875547766
NB, WordLevel TF-IDF:  0.8990654205607477
f1 score:  0.5816285599679103
NB, Bi-Gram Vectors:  0.5126168224299066


# Linear Classifier

In [None]:
# Logistic regression on the word-level TF-IDF features.
accuracy = train_model(
    classifier=linear_model.LogisticRegression(),
    feature_vector_train=xtrain_tfidf,
    label=train_y,
    feature_vector_valid=xvalid_tfidf,
    valid_y=valid_y,
    test_data=test_tfidf,
    test_label=test['label'],
)
print("LR, WordLevel TF-IDF: ", accuracy)

# Logistic regression on the n-gram TF-IDF features.
accuracy = train_model(
    classifier=linear_model.LogisticRegression(),
    feature_vector_train=xtrain_tfidf_ngram,
    label=train_y,
    feature_vector_valid=xvalid_tfidf_ngram,
    valid_y=valid_y,
    test_data=test_tfidf_ngram,
    test_label=test['label'],
)
print("LR, Bi-Gram Vectors: ", accuracy)

f1 score:  0.9130434782608695
LR, WordLevel TF-IDF:  0.908411214953271
f1 score:  0.6202581149784905
LR, Bi-Gram Vectors:  0.5462616822429907


# Bagging Model

In [None]:
# Random forest (bagging) on word-level features, then on n-gram features.
# Each run gets its own freshly constructed forest.
for report_tag, (fit_x, val_x, eval_x) in (
    ("RF, WordLevel TF-IDF: ", (xtrain_tfidf, xvalid_tfidf, test_tfidf)),
    ("RF, Bi-gram TF-IDF: ", (xtrain_tfidf_ngram, xvalid_tfidf_ngram, test_tfidf_ngram)),
):
    accuracy = train_model(ensemble.RandomForestClassifier(), fit_x, train_y, val_x, valid_y, eval_x, test['label'])
    print(report_tag, accuracy)

f1 score:  0.9034543844109832
RF, WordLevel TF-IDF:  0.8981308411214953
f1 score:  0.18512898330804248
RF, Bi-gram TF-IDF:  0.4981308411214953


# Boosting Model

In [None]:

# Extreme Gradient Boosting on word-level TF-IDF vectors.
# The sparse matrices are converted to CSC format before being handed to XGBoost.
word_train_csc = xtrain_tfidf.tocsc()
word_valid_csc = xvalid_tfidf.tocsc()
word_test_csc = test_tfidf.tocsc()
accuracy = train_model(xgboost.XGBClassifier(), word_train_csc, train_y, word_valid_csc, valid_y, word_test_csc, test['label'])
print("Xgb, WordLevel TF-IDF: ", accuracy)

# Extreme Gradient Boosting on n-gram TF-IDF vectors.
ngram_train_csc = xtrain_tfidf_ngram.tocsc()
ngram_valid_csc = xvalid_tfidf_ngram.tocsc()
ngram_test_csc = test_tfidf_ngram.tocsc()
accuracy = train_model(xgboost.XGBClassifier(), ngram_train_csc, train_y, ngram_valid_csc, valid_y, ngram_test_csc, test['label'])
print("Xgb, Bi-gram TF-IDF: ", accuracy)

f1 score:  0.9133510167992925
Xgb, WordLevel TF-IDF:  0.908411214953271
f1 score:  0.06627017841971113
Xgb, Bi-gram TF-IDF:  0.4864485981308411


# SVM Model

In [None]:

# Support vector classifier on the unigram TF-IDF features, then on the
# n-gram TF-IDF features. Inputs are converted to CSC format for each run.
for report_tag, feature_triple in (
    ("SVM, WordLevel TF-IDF: ", (xtrain_tfidf, xvalid_tfidf, test_tfidf)),
    ("SVM, Bi-gram TF-IDF: ", (xtrain_tfidf_ngram, xvalid_tfidf_ngram, test_tfidf_ngram)),
):
    fit_x, val_x, eval_x = (matrix.tocsc() for matrix in feature_triple)
    accuracy = train_model(svm.SVC(), fit_x, train_y, val_x, valid_y, eval_x, test['label'])
    print(report_tag, accuracy)

f1 score:  0.9226642825212338
SVM, WordLevel TF-IDF:  0.9191588785046729
f1 score:  0.5454545454545454
SVM, Bi-gram TF-IDF:  0.5280373831775701
