In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#Visualization imports
import seaborn as sns
%matplotlib inline
import plotly.express as px

#NLP imports
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer, TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import recall_score, f1_score, accuracy_score, precision_score
import heapq
import re
import nltk
import networkx as nx
from gensim.models import word2vec

#Scikit Learn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

import pickle

In [3]:
# Load the train and test CSV files from the sibling data directory
train_path, test_path = '../data/train.csv', '../data/test.csv'
df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
# Build the cleaned training corpus: one normalised, space-joined string per row of df['text'].
lemmatizer = WordNetLemmatizer()
# Load the stop-word list once, outside the loop. Calling stopwords.words('english')
# for every single word repeated the same lookup needlessly; a set also gives
# constant-time membership tests.
stop_words = set(stopwords.words('english'))
corpus = []
for text in df['text']:
    review = re.sub(r"http\S+", "", text)  # remove links from the text
    # Replace every character other than letters, digits and '+' with a space.
    # Raw string: '\d' is an invalid escape sequence in a plain string literal.
    review = re.sub(r'[^a-zA-Z\d+]', ' ', review)
    review = re.sub('[0-9]', '', review)  # drop the remaining digits
    review = review.lower().split()
    # Drop stop words, then lemmatise as verb, noun and adjective in turn
    review = [lemmatizer.lemmatize(word, pos='v') for word in review if word not in stop_words]
    review = [lemmatizer.lemmatize(word, pos='n') for word in review]
    review = [lemmatizer.lemmatize(word, pos='a') for word in review]
    review = ' '.join(review)
    corpus.append(review)

### Baseline Calculation

In [None]:
# Share of each target class as a percentage (normalize=True returns fractions, scaled here by 100)
df['target'].value_counts(normalize = True)*100

##### If a model has an accuracy score of more than 57%, it is doing better than the majority-class baseline

In [None]:
# Display the first cleaned document
corpus[0]

##### Looking at the first line of corpus we can see that the data is clean

In [None]:
## Tally how often every token occurs across the whole corpus
wordfreq = {}
for sentence in corpus:
    for token in nltk.word_tokenize(sentence):
        # get() supplies 0 the first time a token is seen
        wordfreq[token] = wordfreq.get(token, 0) + 1

In [None]:
## Using the heapq module to keep the 200 most frequent words (most_freq is the vocabulary
## of the manual bag of words built below) and display the first 10 of them
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)
most_freq[0:10]

In [None]:
## Manual bag of words: for every document, one 0/1 indicator per entry of most_freq
## (CountVectorizer can do this for us)
sentence_vectors = []
for sentence in corpus:
    sentence_tokens = nltk.word_tokenize(sentence)
    sentence_vectors.append([1 if token in sentence_tokens else 0 for token in most_freq])
sentence_vectors = np.asarray(sentence_vectors)
sentence_vectors

In [None]:
# Instantiate a CountVectorizer capped at 1000 features over unigrams and bigrams
cv = CountVectorizer(max_features=1000, ngram_range = (1,2))
# NOTE(review): the vectorizer is fitted on the full corpus before the train/test split
# in the next code cell, so the vocabulary is chosen using rows that end up in the test split.
X = cv.fit_transform(corpus).toarray()
y = df['target']

##### The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.
[Ref](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

In [None]:
### Hold out 30% of the rows, then fit a Multinomial Naive Bayes classifier on the rest
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42,
)
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred_clf = clf.predict(X_test)

In [None]:
# Accuracy of the Naive Bayes classifier on the training and the held-out rows
nb_train_score = clf.score(X_train, y_train)
print(f"Training set score using Naive Bayes Classifier: {nb_train_score:.2f}")
nb_test_score = clf.score(X_test, y_test)
print(f"Testing set score using Naive Bayes Classifier: {nb_test_score:.2f}")

In [None]:
# Confusion matrix of the Naive Bayes classifier on the test split
plot_confusion_matrix(clf,X_test, y_test, cmap = 'Blues')
plt.title("Confusion Matrix using Naive Bayes Classifier");  # trailing ';' hides the text repr of the title

##### This model produces 329 false negatives and 163 false positives; the rest were predicted correctly. I also tried different parameters, such as binary = True and ngram_range = (2,3), which resulted in a lower accuracy score

In [None]:
# Calculating Predicted Probabilities
y_pred_proba = clf.predict_proba(X_test)[:,1]
y_pred_proba

In [None]:
print(f'Accuracy score using Naive Bayes Classifier: {round(accuracy_score(y_test, y_pred_clf),2)}')
print(f'Recall score using Naive Bayes Classifier: {round(recall_score(y_test, y_pred_clf),2)}')
print(f'F1 score using Naive Bayes Classifier: {round(f1_score(y_test, y_pred_clf),2)}')
print(f'Precision score using Naive Bayes Classifier: {round(precision_score(y_test, y_pred_clf),2)}')

In [None]:
fpr,tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr,tpr, color = 'orange', lw =2)
plt.plot([0, 1], [0, 1], lw=2, linestyle='--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title("ROC Curve of Real and Fake tweets using NB Classifier", color = 'blue')
plt.xlabel('False Possitive Rate(1-Specificity)')
plt.ylabel('True Possitive Rate(Sensitivity)')
plt.legend()
plt.grid(True)

In [None]:
print("The area under ROC CURVE using Naive Bayes {:.2f}".format(roc_auc_score(y_test, y_pred_proba)))

##### We can see that the model is definitely performing better than the baseline model, and the area under the curve is 0.84. AUC ranges from 0 to 1; a higher AUC means better performance of the model in differentiating the positive and negative classes

In [None]:
# Fit a default Logistic Regression on the count features and report its accuracy
lr = LogisticRegression()
train = lr.fit(X_train, y_train)  # fit() returns the estimator, so `train` is the fitted model
y_pred = lr.predict(X_test)
train_accuracy = train.score(X_train, y_train)
print(f'Training Accuracy score using Logistic Regression:   {train_accuracy:.2f}')
test_accuracy = train.score(X_test, y_test)
print(f'Test Accuracy score:   {test_accuracy:.2f}')

In [None]:
# Confusion matrix of the logistic regression model on the test split
plot_confusion_matrix(lr, X_test, y_test, cmap = 'Blues')
plt.title("Confusion Matrix using Logistic Regression");

##### The confusion matrix suggests that the logistic regression model classifies slightly better than the Naive Bayes model: it produces 195 false positives and 277 false negatives. The results are similar to those of the Naive Bayes classifier, but slightly better

In [None]:
# Per-class precision, recall and F1 for the logistic regression predictions
print(classification_report(y_test, y_pred))

##### The F1score, accuracy, precision and recall scores are also slightly better than Naive Bayes model

In [None]:
# Predicted probability of the second class (column 1) for every test row
y_pred_proba_lr = lr.predict_proba(X_test)[:,1]
y_pred_proba_lr

In [None]:
fpr,tpr, thresholds = roc_curve(y_test, y_pred_proba_lr)
plt.plot(fpr,tpr, lw =2, color = 'orange')
plt.plot([0, 1], [0, 1], linestyle='--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title("ROC Curve of real and fake tweets using Logistic Regression")
plt.xlabel('False Possitive Rate(1-Specificity)')
plt.ylabel('True Possitive Rate(Sensitivity)')
plt.legend()
plt.grid(True)

In [None]:
print("The area under ROC CURVE using Logistic Regression {:.2f}".format(roc_auc_score(y_test, y_pred_proba_lr)))

In [None]:
# Random search space for the RandomForestClassifier
# Number of trees in the forest: 100 evenly spaced values from 100 to 1100, truncated to int
n_estimators = [int(i) for i in np.linspace(100, 1100, 100)]
# Number of features to consider at every split.
# 'auto' was removed: for classifiers it means sqrt(n_features), i.e. the same as 'sqrt',
# so it only duplicated candidates (and it is deprecated in newer scikit-learn releases).
max_features = ['sqrt', 'log2']
# Max number of levels in tree (None lets the trees grow without a depth limit)
max_depth = [None, 1,2,3,4,5,6,7]
# Minimum number of samples required to split a node.
# An integer value must be at least 2; the original list started at 1, which
# scikit-learn rejects, so every sampled candidate using it failed to fit.
min_samples_split = [2,3,4,5,7,9]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2,4,6,8]
# Split quality criterion
criterion = ['entropy', 'gini']
rf_grid = {'n_estimators' : n_estimators,
              'max_features': max_features,
              'max_depth' : max_depth,
              'min_samples_split' : min_samples_split,
              'min_samples_leaf' : min_samples_leaf,
              'criterion' : criterion}

In [None]:
# Randomised search over rf_grid: 100 sampled candidates, 5-fold cross-validation, all cores.
# NOTE(review): random_state is set on the search only; the forest itself is unseeded.
rf1 = RandomForestClassifier()
rscv = RandomizedSearchCV(rf1, 
                          param_distributions = rf_grid, 
                          n_iter = 100,
                          cv = 5,
                          n_jobs = -1,
                          verbose =2,
                          random_state = 42)

In [None]:
%%time
# Run the search, then print the score on the training and the test split
rscv.fit(X_train, y_train)
print(rscv.score(X_train, y_train))
print(rscv.score(X_test, y_test))

##### The Random Forest Classifier with RandomizedSearchCV took about 9 hours, and the results were similar to the previously fitted Logistic Regression and Naive Bayes models

In [None]:
# Show the estimator selected by the randomised search
rscv.best_estimator_

In [None]:
#Instantiate RFC with GridsearchCV
rf = RandomForestClassifier()
# Exhaustive grid: three forest sizes x eight depth settings, with fixed leaf/split minimums
rf_params = {
    'n_estimators': [400,500,600],
    'max_depth': [None, 1, 2, 3, 4, 5, 6, 7],
    'min_samples_leaf': [2],
    'min_samples_split': [5]
}

In [None]:
%%time
# 5-fold grid search on all cores; print the best cross-validated score and show the winning parameters
gs = GridSearchCV(rf, param_grid=rf_params,
                  n_jobs = -1,
                  cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# Score of the grid search on the training and the test split
print(gs.score(X_train, y_train))
print(gs.score(X_test, y_test))

##### Using the best hyperparameters from RandomizedSearchCV, GridSearchCV results in the same accuracy score

In [None]:
# Peek at four of the feature importances (positions 1 to 4) of the best forest
gs.best_estimator_.feature_importances_[1:5]

In [None]:
# Wrap the training matrix in a DataFrame whose columns are the CountVectorizer feature names
X_train_df = pd.DataFrame(X_train, 
                          columns=cv.get_feature_names())

In [None]:
X_train_df.columns

In [None]:
df1 = pd.DataFrame({'feature_names':X_train_df.columns,
                   'feature_importance':gs.best_estimator_.feature_importances_})

#Sort the DataFrame in order decreasing feature importance
df1.sort_values(by=['feature_importance'], ascending=False,inplace=True)

In [None]:
df1.set_index('feature_names', inplace = True)

In [None]:
# Horizontal bar chart of the ten largest importances (ascending sort + tail(10) puts the largest bar on top)
df1.sort_values(by = 'feature_importance', ascending = True).tail(10).plot(kind = 'barh', color = 'teal',
                                                                            edgecolor = 'black',
                                                                          figsize = (6,3))
plt.title("Top Ten Features")
plt.ylabel('Feature Names')
plt.savefig('./images/topfeatures.png');

In [None]:
# NOTE(review): the file name has a double dot ('topfeatures..csv') and the path is './data',
# whereas the input CSVs are read from '../data' — confirm both are intended.
df1.to_csv('./data/topfeatures..csv')

In [None]:
df1.head(10)

In [None]:
# Store the cleaned text next to the raw data
df['corpus'] = corpus

In [None]:
# From here on X is the cleaned text itself (vectorising happens inside the pipelines below)
X = df['corpus']
y = df['target']

In [None]:
# NOTE(review): re-binds `lr`; this new estimator is not referenced again in the visible cells
lr = LogisticRegression()

In [None]:
# 70/30 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, 
                                                   random_state = 42, stratify = y)

In [None]:
#creating corpus for test data for predictions (same cleaning steps as the training corpus)
# Load the stop-word list once, outside the loop. Calling stopwords.words('english')
# for every single word repeated the same lookup needlessly; a set also gives
# constant-time membership tests.
stop_words = set(stopwords.words('english'))
corpus_test = []
for text in df_test['text']:
    review = re.sub(r"http\S+", "", text)  # remove links from the text
    # Replace every character other than letters, digits and '+' with a space.
    # Raw string: '\d' is an invalid escape sequence in a plain string literal.
    review = re.sub(r'[^a-zA-Z\d+]', ' ', review)
    review = re.sub('[0-9]', '', review)  # drop the remaining digits
    review = review.lower().split()
    # Drop stop words, then lemmatise as verb, noun and adjective in turn
    review = [lemmatizer.lemmatize(word, pos='v') for word in review if word not in stop_words]
    review = [lemmatizer.lemmatize(word, pos='n') for word in review]
    review = [lemmatizer.lemmatize(word, pos='a') for word in review]
    review = ' '.join(review)
    corpus_test.append(review)

In [None]:
# Pipeline: binary TF-IDF features (1000 max, unigrams + bigrams) feeding a Logistic Regression with C = 1.5
pipe1 = Pipeline([
    ('tf', TfidfVectorizer(max_features = 1000, ngram_range = (1,2), binary = True)),
    ('lr_cv', LogisticRegression(C = 1.5))
])

In [None]:
# The vectorizer is fitted inside the pipeline, on the training text only
pipe1.fit(X_train, y_train)

In [None]:
print(f'Train Accuracy score using TFIDF Logistic Regression: {round(pipe1.score(X_train, y_train),3)}')
print(f'Test Accuracy score using TFIDF Logistic Regression: {round(pipe1.score(X_test, y_test),3)}')

In [None]:
# Persist the fitted pipeline; writes relative to the current working directory
with open('models/logistic_tfidf.pkl', 'wb') as f:
    pickle.dump(pipe1, f)

In [None]:
corpus_test[0]

In [None]:
#Predicting on test set (shows the prediction for the first test document only)
pipe1.predict(corpus_test)[0]

In [None]:
# Confusion matrix of the TF-IDF + Logistic Regression pipeline on the held-out split
plot_confusion_matrix(pipe1, X_test, y_test, cmap = 'Blues')
plt.title("Confusion Matrix using Logistic Regression with 1.5 Penalty");

##### The Logistic Regression model using the TF-IDF vectorizer with 1000 max features, an n-gram range of (1, 2) and an l2 penalty with C = 1.5 (the inverse of the regularization strength) performed similarly to our other models and predicted 172 false positives and 286 false negatives

In [None]:
# Hard class labels predicted by the pipeline for the held-out split (predict, not predict_proba)
y_predlr = pipe1.predict(X_test)
y_predlr

In [None]:
#Checking whether various thresholds can make a difference to our model
my_threshold = 0.6
y_preds_tr2 = (y_predlr > my_threshold)
confusion_matrix(y_test, y_preds_tr2)

##### There is no difference in prediction after changing the threshold to 0.6 and above

In [None]:
#Checking whether various thresholds can make a difference to our model
my_threshold = 0.6
y_preds_tr2 = (y_predlr < my_threshold)
confusion_matrix(y_test, y_preds_tr2)

##### Changing the threshold to 0.6 and below effected the model and it completely reversed the model predictions. It appears that changing the thresholds does not help our model predictions

In [None]:
# Class probabilities for every document of the cleaned test corpus
pipe1.predict_proba(corpus_test)

In [None]:
corpus_test[0]

In [None]:
# Predicted label of the first test document
pipe1.predict(corpus_test)[0]

##### It appears that the model above predicts "happen terrible car crash" as a real disaster

In [None]:
# Fresh 70/30 split of the cleaned text.
# NOTE(review): unlike the previous split this one is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3,
                                                   random_state = 42)

In [None]:
#Creating a Pipeline with XGBoost Classifier
pipe2 = Pipeline([
    ('cvec', CountVectorizer()),
    ('xgb', XGBClassifier())
])

In [None]:
# Search space for the CountVectorizer + XGBoost pipeline (step-name__parameter keys)
pipe2_params = {
    'cvec__max_features': [500, 1000, 2_000, 3_000],
    # min_df=0 selects exactly the same vocabulary as min_df=1 (every term occurs in at
    # least one document), so searching both only doubled the number of fits.
    'cvec__min_df': [1],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1, 1), (1, 2)],
    'xgb__n_estimators': [100, 200, 300, 600],
    'xgb__max_depth': [None, 1, 2, 3]
}

# Instantiate GridSearchCV: 5-fold cross-validation over pipe2_params on all cores.

gs1= GridSearchCV(pipe2, 
                  n_jobs = -1,
                  param_grid = pipe2_params,
                  cv = 5)

In [None]:
%%time
# Run the search, then report accuracy on the training and the test split
gs1.fit(X_train, y_train)
print(f'Training Accuracy Score using XGBoost Classifier is : {round(gs1.score(X_train, y_train),2)}')
print("----------------")
print(f'Testing Accuracy Score using XGBoost Classifier is : {round(gs1.score(X_test, y_test),2)}')


In [None]:
#Best Estimator
gs1.best_estimator_

##### max_df is the upper ceiling value of the frequency values, while min_df is just the lower cutoff value of the frequency values. If we want to remove more common words, we set max_df to a lower ceiling value between 0 and 1. If we want to remove more rare words, we set min_df to a higher cutoff value between 0 and 1

In [None]:
#Instantiating TfidfVectorizer (1000 max features, unigrams + bigrams, max_df = 0.9, min_df = 1)
tf = TfidfVectorizer(max_df=0.9, max_features=1000, min_df = 1,
                                 ngram_range=(1, 2))
# NOTE(review): fitted on the full corpus before the split in the next cell, so the
# vocabulary and IDF weights are computed using rows that end up in the test split.
X = tf.fit_transform(corpus).toarray()
y = df['target']

In [None]:
# 70/30 split of the TF-IDF matrix, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, 
                                                    random_state = 42,
                                                   stratify = y)

##### After each boosting step, we can directly get the weights of new features, and `eta` shrinks the feature weights to make the boosting process more conservative. `gamma`: the larger gamma is, the more conservative the algorithm will be (range: [0, ∞]). `subsample` is the subsample ratio of the training instances: setting it to 0.5 means that XGBoost randomly samples half of the training data prior to growing trees.

In [None]:
# XGBoost on the TF-IDF features with hand-picked hyperparameters
xgb = XGBClassifier(n_estimators = 2000, eta = 0.3, gamma = 5, max_depth = 8, subsample = 0.5)
xgb.fit(X_train, y_train)

In [None]:
# Accuracy on the training split, then on the test split
print(round(xgb.score(X_train, y_train),3))
print(round(xgb.score(X_test, y_test),3))

##### The XGBoost Classifier with the TF-IDF vectorizer has a training accuracy of 86% and a test accuracy of 78%; the gap between the two scores shows that the model is overfitting

In [None]:
#Creating a Dataframe to get feature names
X_train_df1 = pd.DataFrame(X_train, 
                          columns= tf.get_feature_names())

In [None]:
df2 = pd.DataFrame({'feature_names':X_train_df1.columns,
                   'feature_importance':xgb.feature_importances_})

#Sort the DataFrame in order decreasing feature importance
df2.sort_values(by=['feature_importance'], ascending=False,inplace=True)
df2.head()

In [None]:
#Setting feature names as index to the dataframe
df2.set_index('feature_names', inplace = True)

In [None]:
# Horizontal bar chart of the ten largest XGBoost importances (ascending sort + tail(10) puts the largest bar on top)
df2.sort_values(by = 'feature_importance', ascending = True).tail(10).plot(kind = 'barh', 
                                                                           edgecolor = 'black',
                                                                          figsize = (7,5))
plt.ylabel('Feature Names')
plt.title("Top 10 Features using XGBoost Classifier");

In [None]:
# Persist the fitted model.
# NOTE(review): the file is named 'xgboost_cv' with no extension, although this model was trained on TF-IDF features.
with open('models/xgboost_cv', 'wb') as f:
    pickle.dump(xgb, f)

In [None]:
#Confusion matrix plot
plot_confusion_matrix(xgb,X_test, y_test, cmap = 'Blues')
plt.title("Confusion Matrix using XGBoost Classifier");

In [None]:
# Rebuild the TF-IDF matrix and the stratified 70/30 split (same calls as in the earlier TF-IDF cells)
X = tf.fit_transform(corpus).toarray()
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, 
                                                    random_state = 42,
                                                   stratify = y)

In [None]:
#Logistic Regression with l2 penalty and the newton-cg solver
lrcv = LogisticRegression(solver = 'newton-cg', penalty = 'l2', C = 1)
lrcv.fit(X_train, y_train)
print(f'Training Score: {round(lrcv.score(X_train, y_train),2)}')
print(f'Testing SCore: {round(lrcv.score(X_test, y_test),2)}')

##### The logistic regression model with the newton-cg solver and an l2 penalty with C = 1 on the TF-IDF features has an accuracy similar to the previous models

In [None]:
#Generating a confusion matrix plot using the logistic regression with l2 penalty (newton-cg solver)
plot_confusion_matrix(lrcv, X_test, y_test, cmap = 'Blues')
plt.title("Confusion Matrix using Logistic Regression with newton-cg");

In [None]:
#Logistic Regression with l1 penalty (liblinear solver, C = 1.5)
lrcv1 = LogisticRegression(solver = 'liblinear', penalty = 'l1', C = 1.5)
lrcv1.fit(X_train, y_train)
print(f'Accuracy Train score using Logistic Regression with L1 penalty is :{round(lrcv1.score(X_train, y_train),2)}')
print('---------------')
print(f'Accuracy Test score using Logistic Regression with L1 penalty is :{round(lrcv1.score(X_test, y_test),2)}')


In [None]:
#Generating a confusion matrix plot using the logisticregression with l1 penalty
plot_confusion_matrix(lrcv1, X_test, y_test, cmap = 'Blues')
plt.title("Confusion Matrix using Logistic Regression with L1 penalty");

##### We can see that there are 109 false positives and 334 false negatives in our model's predictions; we can try to balance these errors by increasing or decreasing the decision threshold

In [None]:
ypreds = lrcv1.predict(X_test)

In [None]:
sns.set_style('white')
fpr,tpr, thresholds = roc_curve(y_test, ypreds)
plt.plot(fpr,tpr, color = 'orange', lw = 2)
plt.plot([0, 1], [0, 1], lw=2, linestyle='--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title("ROC Curve of Real and Fake tweets using Logistic Regression l1 penalty")
plt.xlabel('False Possitive Rate(1-Specificity)')
plt.ylabel('True Possitive Rate(Sensitivity)')
plt.legend()
plt.grid(True)

In [None]:
print(classification_report(y_test, ypreds))

In [None]:
# Apply a custom decision threshold to the predicted probabilities. lrcv1.predict returns
# hard 0/1 labels, so thresholding those at 0.6 left the predictions unchanged.
my_threshold = 0.6
y_preds_tr = (lrcv1.predict_proba(X_test)[:, 1] >= my_threshold).astype(int)
confusion_matrix(y_test, y_preds_tr)