In [1]:
import pandas as pd
import numpy as np
import nltk as nlp
import sklearn
import matplotlib as plt
%matplotlib inline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
from sklearn import metrics

In [4]:
#fake['label'] = 0
#real['label'] = 1
data = pd.read_csv('./Snopes-data.csv')
data

Unnamed: 0,Text,Label
0,[During anti-police-brutality protests that sw...,0
1,"[In June 2020, a rumor started to circulate on...",0
2,[Rumors are surging in the wake of George Floy...,0
3,"[On June 4, 2020, a security fence was erected...",0
4,"[In early June 2020, social media users shared...",0
...,...,...
10370,"When you book a hotel room online, you expect ...",4
10371,It’s tough enough to find a job or start your ...,4
10372,The Federal Trade Commission cracked down on a...,4
10373,"""There is currently money available NOW right ...",4


In [5]:
data = data.sample(frac=1)

In [6]:
X = data['Text']
y = data['Label']

In [None]:
#preprocessing:
X = X.str.lower()
X = X.str.replace('[^\w\s]','')
X = X.apply(lambda row: nlp.word_tokenize(row))
stop = nlp.corpus.stopwords.words('english')
X = X.apply(lambda x: [item for item in x if item not in stop])

In [7]:
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english', lowercase=False, max_features=1000)

In [8]:
features = vectorizer.fit_transform(X)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [10]:
classifier = LinearSVC()

In [11]:
classifier.fit(X_train_tfidf, y_train.values.ravel())

LinearSVC()

In [None]:
count_train = [0]*5
count_test = [0]*5
for a in y_train:
    count_train[a] = count_train[a] + 1
print(count_train)
for a in y_test:
    count_test[a] = count_test[a] + 1
print(count_test)

In [None]:
pieLabels = ['Fake','Real','Mixture','Miscaptioned','Scam']
figureObject, axesObject = plt.pyplot.subplots()
axesObject.pie(count_train, labels = pieLabels, autopct='%1.2f', startangle=90)
axesObject.axis('equal')
plt.pyplot.show()
figureObject, axesObject = plt.pyplot.subplots()
axesObject.pie(count_test, labels = pieLabels, autopct='%1.2f', startangle=90)
axesObject.axis('equal')
plt.pyplot.show()

In [12]:
X_test_counts = count_vect.transform(X_test)

In [13]:
prediction = classifier.predict(X_test_counts)

In [14]:
accuracy = metrics.accuracy_score(y_test, prediction)

In [15]:
accuracy

0.6276903308705429

In [16]:
#classification report
#precision: Number of true positives divided by total number of data points labelled positive, ie. TP/(TP + FP)
#recall: proportion of positives which were labelled correctly, ie. TP/(TP + FN)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, prediction))
print(confusion_matrix(y_test,prediction))

              precision    recall  f1-score   support

           0       0.42      0.98      0.59       819
           1       1.00      0.65      0.78       908
           2       0.87      0.40      0.55       579
           3       0.97      0.43      0.59       392
           4       0.92      0.40      0.55       415

    accuracy                           0.63      3113
   macro avg       0.83      0.57      0.61      3113
weighted avg       0.81      0.63      0.63      3113

[[801   0  18   0   0]
 [283 587  17   6  15]
 [346   0 233   0   0]
 [224   0   0 168   0]
 [249   1   0   0 165]]


In [None]:
#hyperparameter tuning
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.1,1,10,100,1000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001], 'kernel':['rbf']}
grid = GridSearchCV(classifier, param_grid, refit = True, verbose = 3)
grid.fit(X_train_tfidf, y_train.values.ravel())

In [None]:
print(grid.best_params_)

In [None]:
print(grid.best_estimator_)

In [None]:
fine_grid_params = {'C': [90,95,100,105,110],'gamma':[0.009,0.01,0.02], 'kernel':['rbf']}
fine_grid = GridSearchCV(classifier, fine_grid_params, refit = True, verbose = 3)
fine_grid.fit(X_train_tfidf, y_train.values.ravel())

In [None]:
#grid_predictions = grid.predict(X_test_counts)
#print(classification_report(y_test, grid_predictions))
#accuracy = metrics.accuracy_score(y_test, grid_predictions)
#print(accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
labels = y
models = [RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0), LinearSVC(), MultinomialNB(), LogisticRegression(random_state=0),]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
import seaborn as sns
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)


In [None]:
text = ['NEW DELHI: An Air India aircraft winging its way from Delhi to Moscow on Saturday morning had to be called back to Delhi from over Uzbekistan after the airline realised that one of the pilots onboard had tested corona positive. An oversight by the team checking pre-flight test reports of crew members had mistakenly read this captain’s positive report as negative and released him for the ferry flight (meaning with no passengers and only crew) to fly back Indians from Moscow.\nThe Airbus A-320 Neo (VT-EXR) returned to Delhi at about 12.30 pm and now the crew will be quarantined as per norms. This plane will be fumigated and the airline is sending another A320 Neo to Moscow later on Saturday afternoon to fly back Indians from there.']