In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn import tree

In [2]:
# read files
train_filename = "Constraint_English_Train.xlsx"
val_filename = "Constraint_English_Val.xlsx"
test_filename = "Constraint_English_Test.xlsx"

train_data = pd.read_excel(train_filename)
validation_data = pd.read_excel(val_filename)
test_data = pd.read_excel(test_filename)

In [3]:
#check size
print("Size of the training data: ", train_data.shape)
print("Size of the validation data: ", validation_data.shape)
print("Size of the test data: ", test_data.shape)

Size of the training data:  (6420, 3)
Size of the validation data:  (2140, 3)
Size of the test data:  (2140, 3)


In [4]:
# label transform
train_data['training_label']= train_data.label.replace({'real':0,'fake':1})
validation_data['validation_label']= validation_data.label.replace({'real':0,'fake':1})
test_data['validation_label']= test_data.label.replace({'real':0,'fake':1})

In [5]:
# data cleaning
punctuations = '''’'!()-[]{};:'"\,<>./?@#$%^&*_~�'''

def remove_punctuation_url(d):
    d = d.lower()
    #remove url
    d = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', d, flags=re.MULTILINE)
    review = d.replace('\n', '')
    no_punct = ""
    #remove punc
    for char in review:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct

def remove_stopwords(d):
    text_tokens = word_tokenize(d.lower())
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words('english')]
    ls = ""
    for w in tokens_without_sw:
        ls = ls +" "+w.lower()
    return ls

train_data['cleaned'] = train_data['tweet'].apply(remove_punctuation_url)
validation_data['cleaned'] = validation_data['tweet'].apply(remove_punctuation_url)
test_data['cleaned'] = test_data['tweet'].apply(remove_punctuation_url)

In [6]:
def print_metrices(pred,true):
    print(confusion_matrix(true,pred))
    print(classification_report(true,pred,))
    print("Accuracy : ",accuracy_score(pred,true))
    print("Precison : ",precision_score(pred,true, average = 'weighted'))
    print("Recall : ",recall_score(pred,true,  average = 'weighted'))
    print("F1 : ",f1_score(pred,true,  average = 'weighted'))
    

In [39]:
#########LinearSVM#########

pipeline = Pipeline([
        ('bow', CountVectorizer()),  
        ('tfidf', TfidfTransformer()),  
        ('c', LinearSVC())
    ])
fit = pipeline.fit(train_data['tweet'],train_data['label'])
print('SVM(Linear)')
print ('val:')
pred=pipeline.predict(validation_data['tweet'])
print_metrices(pred,validation_data['label'])
pred=pipeline.predict(test_data['tweet'])
print_metrices(pred,test_data['label'])

SVM(Linear)
val:
[[ 936   84]
 [  48 1072]]
              precision    recall  f1-score   support

        fake       0.95      0.92      0.93      1020
        real       0.93      0.96      0.94      1120

    accuracy                           0.94      2140
   macro avg       0.94      0.94      0.94      2140
weighted avg       0.94      0.94      0.94      2140

Accuracy :  0.9383177570093458
Precison :  0.9389821723081755
Recall :  0.9383177570093458
F1 :  0.9383839682296299
[[ 942   78]
 [  51 1069]]
              precision    recall  f1-score   support

        fake       0.95      0.92      0.94      1020
        real       0.93      0.95      0.94      1120

    accuracy                           0.94      2140
   macro avg       0.94      0.94      0.94      2140
weighted avg       0.94      0.94      0.94      2140

Accuracy :  0.9397196261682244
Precison :  0.9401099259797377
Recall :  0.9397196261682244
F1 :  0.9397649209453509


In [40]:
#########SVM(poly3)#########

pipeline = Pipeline([
        ('bow', CountVectorizer()),  
        ('tfidf', TfidfTransformer()),  
        ("svm_clf", SVC(kernel="poly", degree=3, coef0=2, C=10))
    ])
fit = pipeline.fit(train_data['tweet'],train_data['label'])
print('SVM(poly3)')
print ('val:')
pred=pipeline.predict(validation_data['tweet'])
print_metrices(pred,validation_data['label'])
pred=pipeline.predict(test_data['tweet'])
print_metrices(pred,test_data['label'])

SVM(poly3)
val:
[[ 940   80]
 [  42 1078]]
              precision    recall  f1-score   support

        fake       0.96      0.92      0.94      1020
        real       0.93      0.96      0.95      1120

    accuracy                           0.94      2140
   macro avg       0.94      0.94      0.94      2140
weighted avg       0.94      0.94      0.94      2140

Accuracy :  0.9429906542056075
Precison :  0.9437174729704966
Recall :  0.9429906542056075
F1 :  0.943056206960435
[[ 945   75]
 [  51 1069]]
              precision    recall  f1-score   support

        fake       0.95      0.93      0.94      1020
        real       0.93      0.95      0.94      1120

    accuracy                           0.94      2140
   macro avg       0.94      0.94      0.94      2140
weighted avg       0.94      0.94      0.94      2140

Accuracy :  0.9411214953271028
Precison :  0.941435443336213
Recall :  0.9411214953271028
F1 :  0.9411598857369308


In [42]:
#########SVM(rbf)#########
pipeline = Pipeline([
        ('bow', CountVectorizer()),  
        ('tfidf', TfidfTransformer()),  
        ("svm_clf", SVC(kernel="rbf", gamma=1, C=10))
    ])
fit = pipeline.fit(train_data['tweet'],train_data['label'])
print('SVM(rbf)')
print ('val:')
pred=pipeline.predict(validation_data['tweet'])
print_metrices(pred,validation_data['label'])
pred=pipeline.predict(test_data['tweet'])
print_metrices(pred,test_data['label'])

SVM(rbf)
val:
[[ 940   80]
 [  48 1072]]
              precision    recall  f1-score   support

        fake       0.95      0.92      0.94      1020
        real       0.93      0.96      0.94      1120

    accuracy                           0.94      2140
   macro avg       0.94      0.94      0.94      2140
weighted avg       0.94      0.94      0.94      2140

Accuracy :  0.9401869158878504
Precison :  0.9407188669860467
Recall :  0.9401869158878504
F1 :  0.9402422952654304
[[ 947   73]
 [  50 1070]]
              precision    recall  f1-score   support

        fake       0.95      0.93      0.94      1020
        real       0.94      0.96      0.95      1120

    accuracy                           0.94      2140
   macro avg       0.94      0.94      0.94      2140
weighted avg       0.94      0.94      0.94      2140

Accuracy :  0.9425233644859813
Precison :  0.9428127536061154
Recall :  0.9425233644859813
F1 :  0.9425589877601502


In [43]:
#########Decision Tree#########
pipeline = Pipeline([
        ('bow', CountVectorizer()),  
        ('tfidf', TfidfTransformer()),  
        ('c', tree.DecisionTreeClassifier())
    ])
fit = pipeline.fit(train_data['tweet'],train_data['label'])
print('Decision Tree')
print ('val:')
pred=pipeline.predict(validation_data['tweet'])
print_metrices(pred,validation_data['label'])
pred=pipeline.predict(test_data['tweet'])
print_metrices(pred,test_data['label'])

Decision Tree
val:
[[887 133]
 [131 989]]
              precision    recall  f1-score   support

        fake       0.87      0.87      0.87      1020
        real       0.88      0.88      0.88      1120

    accuracy                           0.88      2140
   macro avg       0.88      0.88      0.88      2140
weighted avg       0.88      0.88      0.88      2140

Accuracy :  0.8766355140186916
Precison :  0.8766480634309799
Recall :  0.8766355140186916
F1 :  0.876641021848765
[[876 144]
 [128 992]]
              precision    recall  f1-score   support

        fake       0.87      0.86      0.87      1020
        real       0.87      0.89      0.88      1120

    accuracy                           0.87      2140
   macro avg       0.87      0.87      0.87      2140
weighted avg       0.87      0.87      0.87      2140

Accuracy :  0.8728971962616823
Precison :  0.8730982486452525
Recall :  0.8728971962616823
F1 :  0.8729488597229612


In [44]:
#########Logistic Regression#########
pipeline = Pipeline([
        ('bow', CountVectorizer()),  
        ('tfidf', TfidfTransformer()),  
        ('c', LogisticRegression())
    ])
fit = pipeline.fit(train_data['tweet'],train_data['label'])
print('Logistic Regression')
print ('val:')
pred=pipeline.predict(validation_data['tweet'])
print_metrices(pred,validation_data['label'])
pred=pipeline.predict(test_data['tweet'])
print_metrices(pred,test_data['label'])

Logistic Regression
val:
[[ 925   95]
 [  62 1058]]
              precision    recall  f1-score   support

        fake       0.94      0.91      0.92      1020
        real       0.92      0.94      0.93      1120

    accuracy                           0.93      2140
   macro avg       0.93      0.93      0.93      2140
weighted avg       0.93      0.93      0.93      2140

Accuracy :  0.9266355140186916
Precison :  0.9272181045315322
Recall :  0.9266355140186916
F1 :  0.9267060977562901
[[ 928   92]
 [  63 1057]]
              precision    recall  f1-score   support

        fake       0.94      0.91      0.92      1020
        real       0.92      0.94      0.93      1120

    accuracy                           0.93      2140
   macro avg       0.93      0.93      0.93      2140
weighted avg       0.93      0.93      0.93      2140

Accuracy :  0.927570093457944
Precison :  0.9280301104086495
Recall :  0.927570093457944
F1 :  0.9276294760384962
