In [None]:
## load packages
import nltk
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import itertools
import collections
import random
import math
import time

import urllib
import urllib.request
import re
import string

%pip install newspaper3k
import newspaper
import os

%pip install tweepy
import tweepy as tw
from bs4 import BeautifulSoup
import tqdm

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB  # maybe compare performance with NLTK's, if I can get it to work
from sklearn.svm import SVC

import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")



# Show up to 120 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 120)

# Show all columns. The fully-qualified key is used because the bare
# 'max_columns' pattern matches more than one option in recent pandas
# releases, which makes set_option raise an OptionError.
pd.set_option('display.max_columns', None)

In [None]:
%pwd

In [None]:
## Reading our initial disaster tweets dataset into the Jupyter notebook environment for analysis ;

disaster_tweets = pd.read_csv("tweets.csv")

In [None]:
## Finding the number of rows / columns within the dataset we just read in ;

disaster_tweets.shape

In [None]:
## Assessing if any of the columns in our dataset has missing / null values as a means of initial 
## data exploration ;

disaster_tweets.isnull().sum()

In [None]:
## Checking out our dataset in dataframe format to visualize and plan for the analysis ;

disaster_tweets

In [None]:
## Closer examination of the keyword column - we observe that two-word keywords are separated 
## by '%20', which we'll have to get rid of in our data cleaning / exploration process ;

disaster_tweets.keyword.unique()

In [None]:
# Further examination of the keyword column within our dataset ;

disaster_tweets.keyword.value_counts()

In [None]:
## Devising a simple function to clean up the keyword column, replacing the '%20' in between
## two-word keywords with a single space ;

def keyword_clean(text):
    """Replace the URL-encoded space ('%20') in a keyword with a real space.

    Parameters
    ----------
    text : str
        Raw keyword value, e.g. 'forest%20fire'.

    Returns
    -------
    str
        The keyword with every '%20' replaced by ' '. Values that are not
        strings (for example NaN for a missing keyword) are returned
        unchanged, so the function can be applied to a column with nulls.
    """
    # A missing keyword arrives as a float NaN; pass it through untouched
    # instead of raising inside .apply().
    if not isinstance(text, str):
        return text

    # '%20' is a literal, so plain str.replace is enough - no regex needed.
    return text.replace('%20', ' ')

In [None]:
## Application of our function created above to the keyword column in our dataset to clean it up ;

disaster_tweets['keyword'] = disaster_tweets['keyword'].apply(lambda x: keyword_clean(x))

In [None]:
## Checking our keyword column cleaning exercise to make sure it worked as planned ;

disaster_tweets.keyword.unique()

In [None]:
## Creating dummy variables from our keyword column which are numerical to be used for 
## our impending modelling analysis ;

df_dummies = pd.get_dummies(disaster_tweets, prefix='', prefix_sep='', columns=['keyword'])

In [None]:
df_dummies.head(10)

In [None]:
## Creating an all-purpose function to be used for getting our tweet text column ready to be 
## applied to CountVectorizer in order to generate even more numerical features for our 
## impending modelling analysis ;

In [None]:
wn = nltk.WordNetLemmatizer()

stopword = nltk.corpus.stopwords.words('english')

# Set copy of the stopword list so the membership test in clean_text is O(1).
_stopword_set = frozenset(stopword)

def clean_text(text):
    """Turn one raw tweet into a list of lemmatized, lower-case word tokens.

    Used as the `analyzer` callable of CountVectorizer, so it receives a
    single document (string) and returns the tokens to be counted.

    Steps, in order:
      1. remove URLs (anything of the form scheme://...);
      2. drop ASCII punctuation and lower-case the remaining characters;
      3. drop every character that is not a letter, digit, space or tab;
      4. split on non-word characters;
      5. keep purely alphabetic tokens that are not English stopwords and
         lemmatize them with the WordNet lemmatizer `wn`.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Cleaned tokens.
    """
    # URLs must be removed before punctuation: once ':' and '/' are stripped
    # the URL pattern can no longer match and the link would survive as a
    # single garbage token such as 'httpstcoabc'.
    text_no_url = re.sub(r'\w+://\S+', '', text)

    text_lc = "".join(ch.lower() for ch in text_no_url if ch not in string.punctuation)

    text_rc = re.sub(r'[^0-9A-Za-z \t]', '', text_lc)

    tokens = re.split(r'\W+', text_rc)

    return [wn.lemmatize(word) for word in tokens
            if word.isalpha() and word not in _stopword_set]

In [None]:
## Applying Countvectorizer ;

countVectorizer = CountVectorizer(analyzer = clean_text) 

countVector = countVectorizer.fit_transform(disaster_tweets['text'])

print('{} Number of tweets has {} words'.format(countVector.shape[0], countVector.shape[1]))

In [None]:
count_vect_df = pd.DataFrame(countVector.toarray(), columns = countVectorizer.get_feature_names())

count_vect_df.head(10)

In [None]:
count_vect_df.shape

In [None]:
## Concatenating our vectorized dataset from the keyword and text columns together to be used
## as features in the modelling analysis ;

df = pd.concat([df_dummies, count_vect_df], axis = 1)

In [None]:
df.head(10)

In [None]:
## split data into train / validation sets ;

train_data, val_data = train_test_split(df, train_size = 0.7, random_state = 0)

In [None]:
train_data.head()

In [None]:
## split the target variable out from the train and validation datasets and drop unneeded columns / variables ;

y_train = train_data.target
x_train = train_data.drop(columns = ["target", "location", "text"])

y_val = val_data.target
x_val = val_data.drop(columns = ["target", "location", "text"])

In [None]:
## DO NOT RUN ..
## standardizing the columns in the train and validation datasets (takes forever to run, likely omitting) ;

#for field in x_train.columns:
    #standard_dev = x_train[field].std()
    #mean = x_train[field].mean()
    
    #x_train[field] = (x_train[field] - mean) / standard_dev
    #x_val[field] = (x_val[field] - mean) / standard_dev
    

In [None]:
## Obtaining the number of rows / columns in our train data ;

train_data.shape

In [None]:
## Obtaining the number of rows / columns in our x_train data ;

x_train.shape

In [None]:
## Obtaining the number of rows / columns in our y_train data - of note, this data has 2 "target" columns 
## because of the concatenation above: the first is the label from the original dataset, the second appears
## to be the CountVectorizer count for the word "target" (each token becomes a column name), so the two are
## not necessarily identical. Therefore, we keep only the first column (the label) below ;

y_train.shape

In [None]:
y_train.head()

In [None]:
## Dropping one of the target columns in our y_train data ;

y_train = y_train.iloc[:,:-1]

In [None]:
y_train.head()

In [None]:
## Doing the same exercise for our y_val data as for our y_train data above: dropping 
## the extra (last) "target" column so that only the label column remains ;

y_val.head()

In [None]:
y_val = y_val.iloc[:,:-1]

In [None]:
y_val.head()

## Fit Logistic Regression Model ;

In [None]:
logit_model = LogisticRegression(random_state = 3).fit(x_train, y_train)

In [None]:
logit_model.predict_proba(x_train)

In [None]:
y_train_pred = logit_model.predict(x_train)

y_train_pred

In [None]:
y_val_pred = logit_model.predict(x_val)

y_val_pred

## Calculation of the Evaluation Metrics for the LogReg model ;

In [None]:
## Accuracy for train ;

metrics.accuracy_score(y_train, y_train_pred)

In [None]:
## Accuracy for validation ;

metrics.accuracy_score(y_val, y_val_pred)

In [None]:
## Precision for train ;

metrics.precision_score(y_train, y_train_pred)

In [None]:
## Precision for validation ;

metrics.precision_score(y_val, y_val_pred)

In [None]:
## Recall for train ;

metrics.recall_score(y_train, y_train_pred)

In [None]:
## Recall for validation ;

metrics.recall_score(y_val, y_val_pred)

In [None]:
## F1-score for train ;

metrics.f1_score(y_train, y_train_pred)

In [None]:
## F1-score for validation ;

metrics.f1_score(y_val, y_val_pred)

In [None]:
## Confusion Matrix for train ;

metrics.confusion_matrix(y_train, y_train_pred)

In [None]:
## Confusion Matrix for validation ;

metrics.confusion_matrix(y_val, y_val_pred)

## Fit Random Forest Model ;

In [None]:
forest_model = RandomForestClassifier(n_estimators = 200, max_depth = 3, 
                                      min_samples_split = 50, min_samples_leaf = 10,
                                      max_samples = 0.2, random_state = 0).fit(x_train, y_train)

In [None]:
## Get feature importances, which is a normalized measure of the average Gini reduction across the trees in the forest ;

forest_model.feature_importances_

In [None]:
## Print a dictionary that maps each feature to its importance level - this is normalized
## The higher the feature importance, the more important the feature is considered in the model

{var : imp for var, imp in zip(x_train.columns, forest_model.feature_importances_)}

In [None]:
## Predicted probabilities on the training data ;

y_train_prob = forest_model.predict_proba(x_train)

y_train_prob

In [None]:
## Test validation data predicted probabilities ;

y_val_prob = forest_model.predict_proba(x_val)

y_val_prob

In [None]:
y_train_pred2 = forest_model.predict(x_train)

y_train_pred2

In [None]:
y_val_pred2 = forest_model.predict(x_val)

y_val_pred2

In [None]:
## Calculation of the Evaluation Metrics for the Random Forest model ;

In [None]:
## Accuracy for train ;

metrics.accuracy_score(y_train, y_train_pred2)

In [None]:
## Accuracy for validation ;

metrics.accuracy_score(y_val, y_val_pred2)

In [None]:
## Precision for train ;

metrics.precision_score(y_train, y_train_pred2)

In [None]:
## Precision for validation ;

metrics.precision_score(y_val, y_val_pred2)

In [None]:
## Recall for train ;

metrics.recall_score(y_train, y_train_pred2)

In [None]:
## Recall for validation ;

metrics.recall_score(y_val, y_val_pred2)

In [None]:
## F1-score for train ;

metrics.f1_score(y_train, y_train_pred2)

In [None]:
## F1-score for validation ;

metrics.f1_score(y_val, y_val_pred2)

In [None]:
## Confusion Matrix for train ;

metrics.confusion_matrix(y_train, y_train_pred2)

In [None]:
## Confusion Matrix for validation ;

metrics.confusion_matrix(y_val, y_val_pred2)

In [None]:
## AUC on train ;

fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_prob[:,1], pos_label = 1)

metrics.auc(fpr, tpr)

In [None]:
# AUC on validation ;

fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_prob[:,1], pos_label = 1)

metrics.auc(fpr, tpr)

## Going an extra step to test both the Logistic Regression and Random Forest models above against actual Twitter data from scraped tweets ;

In [None]:
## Setting our Access keys and tokens obtained from a twitter developer account ;

# Credentials are read from environment variables so that no secret is stored
# in the notebook. A missing variable raises KeyError here, before any API call.
# NOTE: the key / secret / token values that used to be hard-coded in this cell
# were exposed in the file and should be revoked and regenerated.
consumer_key = os.environ['TWITTER_CONSUMER_KEY']

consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']

access_token = os.environ['TWITTER_ACCESS_TOKEN']

access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

In [None]:
## Assigning our authentication for scraping twitter with the API keys and tokens set forth above ;

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

In [None]:
## Scraping tweets from Twitter related to disasters with the search term / hashtag #coronavirus ;

search_term = '#coronavirus -filter:retweets'

tweets = tw.Cursor(api.search,
                   q = search_term,
                   lang = 'en',
                   since = '2019-10-01').items(3000)

tweets_data1 = [tweet.text for tweet in tweets]

tweets_data1[:20]

In [None]:
# Removing URL links and other ancillary elements from our tweets data obtained above using Regular Expressions ;

def remove_url(txt):
    """Strip URLs and non-alphanumeric characters from a tweet.

    Parameters
    ----------
    txt : str
        Raw tweet text.

    Returns
    -------
    str
        The text with every URL (scheme://...) and every character other
        than letters, digits, spaces and tabs removed, and with runs of
        whitespace collapsed to single spaces.
    """
    # Raw string: '\w', '\S' are regex escapes, not Python string escapes.
    stripped = re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt)

    # split()/join collapses the gaps left behind by the removed pieces.
    return " ".join(stripped.split())


tweets_data_no_urls1 = [remove_url(tweet) for tweet in tweets_data1]

tweets_data_no_urls1[:10]

In [None]:
## Scraping tweets from Twitter related to disasters with the search term / hashtag #bushfires ;

search_term = '#bushfires -filter:retweets'

tweets = tw.Cursor(api.search,
                   q = search_term,
                   lang = 'en',
                   since = '2019-10-01').items(3000)

tweets_data2 = [tweet.text for tweet in tweets]

tweets_data2[:20]

In [None]:
# Removing URL links and other ancillary elements from our tweets data obtained above using Regular Expressions ;

def remove_url(txt):
    """Strip URLs and non-alphanumeric characters from a tweet.

    Parameters
    ----------
    txt : str
        Raw tweet text.

    Returns
    -------
    str
        The text without URLs (scheme://...) and without any character
        that is not a letter, digit, space or tab; whitespace runs are
        collapsed to single spaces.
    """
    # Raw string keeps the regex escapes ('\w', '\S') out of Python's own
    # string-escape processing.
    without_noise = re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt)

    return " ".join(without_noise.split())


tweets_data_no_urls2 = [remove_url(tweet) for tweet in tweets_data2]

tweets_data_no_urls2[:10]

In [None]:
## Scraping tweets from Twitter related to disasters with the search term / hashtag #volcanoeruption ;

search_term = '#volcanoeruption -filter:retweets'

tweets = tw.Cursor(api.search,
                   q = search_term,
                   lang = 'en',
                   since = '2019-10-01').items(3000)

tweets_data3 = [tweet.text for tweet in tweets]

tweets_data3[:20]

In [None]:
# Removing URL links and other ancillary elements from our tweets data obtained above using Regular Expressions ;

def remove_url(txt):
    """Strip URLs and non-alphanumeric characters from a tweet.

    Parameters
    ----------
    txt : str
        Raw tweet text.

    Returns
    -------
    str
        `txt` with URLs (scheme://...) and all characters other than
        letters, digits, spaces and tabs removed; remaining words are
        re-joined with single spaces.
    """
    # Raw string so that '\w' and '\S' reach the regex engine unchanged.
    cleaned = re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt)

    return " ".join(cleaned.split())


tweets_data_no_urls3 = [remove_url(tweet) for tweet in tweets_data3]

tweets_data_no_urls3[:10]

In [None]:
## Creating a new dataset from the raw scraped tweets to be used for manual mapping to a target column ;

df5 = pd.DataFrame(data = zip(tweets_data1), 
                   columns = ['tweet'])

df5.shape

In [None]:
## Creating a new column - keyword, which is derived from the searchterm / hashtag from which the tweets 
## were scraped from Twitter ;

df5['keyword'] = "coronavirus"

In [None]:
## Creating a new dataset from the raw scraped tweets to be used for manual mapping to a target column ;

df6 = pd.DataFrame(data = zip(tweets_data2), 
                   columns = ['tweet'])

df6.shape

In [None]:
## Creating a new column - keyword, which is derived from the searchterm / hashtag from which the tweets 
## were scraped from Twitter ;

df6['keyword'] = "bushfires"

In [None]:
## Creating a new dataset from the raw scraped tweets to be used for manual mapping to a target column ;

df7 = pd.DataFrame(data = zip(tweets_data3), 
                   columns = ['tweet'])

df7.shape

In [None]:
## Creating a new column - keyword, which is derived from the searchterm / hashtag from which the tweets 
## were scraped from Twitter ;

df7['keyword'] = "volcanoeruption"

In [None]:
## Concatenating the three (3) sets of tweets data scraped into a single dataframe to be used for the test set ;

df_tweets_url = pd.concat([df5, df6, df7])

In [None]:
df_tweets_url.head(10)

In [None]:
df_tweets_url.tweet.values.tolist()


In [None]:
## Taking a random sample of 120 tweets from our created dataframe to be used for the manual mapping to a 
## target column for analysis ;

df_analysis_url = df_tweets_url.sample(120, random_state = 10)

In [None]:
## Generating a CSV file from our sampled dataframe so that each tweet can be manually mapped to 
## a target column, as either disaster - 1 or non-disaster - 0 ;

df_analysis_url.to_csv("test_tweets_url.csv")

In [None]:
%pwd

In [None]:
test_tweets = pd.read_csv("test_tweets_url_mapped.csv")

In [None]:
test_tweets.head()

In [None]:
## Dropping the ancillary first column 'Unnamed: 0' which was generated from the initial index of the 
## test_tweets_url CSV generated for the mapping ;

test_tweets = test_tweets.drop(columns = ["Unnamed: 0"])

test_tweets.head()

In [None]:


df_dummies_tweets = pd.get_dummies(test_tweets, prefix = '', prefix_sep = '', columns=['keyword'])

In [None]:
df_dummies_tweets.shape

In [None]:
df_dummies_tweets.head()

In [None]:
wn = nltk.WordNetLemmatizer()

stopword = nltk.corpus.stopwords.words('english')

# Set copy of the stopword list so the membership test in clean_text is O(1).
_stopword_set = frozenset(stopword)

def clean_text(text):
    """Turn one raw tweet into a list of lemmatized, lower-case word tokens.

    Used as the `analyzer` callable of CountVectorizer, so it receives a
    single document (string) and returns the tokens to be counted.

    Steps, in order:
      1. remove URLs (anything of the form scheme://...);
      2. drop ASCII punctuation and lower-case the remaining characters;
      3. drop every character that is not a letter, digit, space or tab;
      4. split on non-word characters;
      5. keep purely alphabetic tokens that are not English stopwords and
         lemmatize them with the WordNet lemmatizer `wn`.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Cleaned tokens.
    """
    # URLs must be removed before punctuation: once ':' and '/' are stripped
    # the URL pattern can no longer match and the link would survive as a
    # single garbage token such as 'httpstcoabc'.
    text_no_url = re.sub(r'\w+://\S+', '', text)

    text_lc = "".join(ch.lower() for ch in text_no_url if ch not in string.punctuation)

    text_rc = re.sub(r'[^0-9A-Za-z \t]', '', text_lc)

    tokens = re.split(r'\W+', text_rc)

    return [wn.lemmatize(word) for word in tokens
            if word.isalpha() and word not in _stopword_set]

In [None]:
## Applying Countvectorizer ;

# NOTE(review): this builds a new CountVectorizer and fits it on the scraped
# tweets, rebinding the name `countVectorizer` that was fitted on
# disaster_tweets['text'] earlier. The vocabulary (and therefore the feature
# columns) produced here comes from the test tweets only, so it will not line
# up with the columns the earlier models were trained on - TODO confirm
# whether .transform() with the training vectorizer was intended.
countVectorizer = CountVectorizer(analyzer = clean_text) 

countVector = countVectorizer.fit_transform(test_tweets['tweet'])

# Report the matrix shape: number of tweets (rows) and vocabulary size (columns).
print('{} Number of tweets has {} words'.format(countVector.shape[0], countVector.shape[1]))

In [None]:
tweets_count_vect_df = pd.DataFrame(countVector.toarray(), columns = countVectorizer.get_feature_names())

tweets_count_vect_df.head()

In [None]:
tweets_count_vect_df.shape

In [None]:
tweets_df = pd.concat([df_dummies_tweets, tweets_count_vect_df], axis = 1)

tweets_df.head()

In [None]:
y_test = tweets_df.target

x_test = tweets_df.drop(columns = ["target", "tweet"])

In [None]:
y_test.head()

In [None]:
x_test.head()

## Applying our Logistic Regression model to the scraped tweets test data ;

In [None]:
logit_model.predict_proba(x_test)

In [None]:
# NOTE(review): this fits a *new* logistic regression on x_test / y_test rather
# than applying the already-trained `logit_model`. The predictions and metrics
# computed from logit_model2 in the following cells use the same x_test rows,
# so they measure fit on the training rows, not generalisation to unseen tweets.
logit_model2 = LogisticRegression(random_state = 5).fit(x_test, y_test)

In [None]:
logit_model2.predict_proba(x_test)

In [None]:
y_test_pred = logit_model2.predict(x_test)

y_test_pred

In [None]:
## Accuracy for test ;

metrics.accuracy_score(y_test, y_test_pred)

In [None]:
## Precision for test ;

metrics.precision_score(y_test, y_test_pred)

In [None]:
## Recall for test ;

metrics.recall_score(y_test, y_test_pred)

In [None]:
## F1-score for test ;

metrics.f1_score(y_test, y_test_pred)

In [None]:
## Confusion Matrix for test ;

metrics.confusion_matrix(y_test, y_test_pred)

## Applying our Random Forest model to the scraped tweets test data ;

In [None]:
forest_model2 = RandomForestClassifier(n_estimators = 100, max_depth = 3, 
                                      min_samples_split = 50, min_samples_leaf = 10,
                                      max_samples = 0.2, random_state = 0).fit(x_test, y_test)

In [None]:
## Get feature importances, which is a normalized measure of the average Gini reduction across the trees in the forest ;

forest_model2.feature_importances_

In [None]:
## Print a dictionary that maps each feature to its importance level - this is normalized
## The higher the feature importance, the more important the feature is considered in the model

{var : imp for var, imp in zip(x_test.columns, forest_model2.feature_importances_)}

In [None]:
df_text = pd.DataFrame(disaster_tweets.text)

In [None]:
df_text.head(20)

In [None]:
## Removing Punctuation ;

def remove_punct(text):
    """Remove ASCII punctuation and digits from a string.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        `text` without any character from `string.punctuation` and without
        the digits 0-9. Whitespace is left untouched.
    """
    text = "".join([char for char in text if char not in string.punctuation])

    # The range separator must be an ASCII hyphen. With an en dash the class
    # only contains the three literal characters '0', '–' and '9', so the
    # digits 1-8 were left in the text.
    text = re.sub(r'[0-9]+', '', text)

    return text

df_text['punct'] = df_text['text'].apply(lambda x: remove_punct(x))


## Applying tokenization ;

def tokenization(text):
    """Split a string into word tokens.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The pieces of `text` between runs of non-word characters. Empty
        strings, which re.split produces when the text starts or ends with a
        separator (or is empty), are dropped so they do not flow into the
        stopword and lemmatization steps as tokens.
    """
    # Raw string: '\W' is a regex escape, not a Python string escape.
    return [token for token in re.split(r'\W+', text) if token]

df_text['tokenized'] = df_text['punct'].apply(lambda x: tokenization(x.lower()))

## Removing stopwords ;

stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    """Return the tokens of `text` that do not appear in `stopword`.

    `text` is an iterable of tokens; `stopword` is the module-level list of
    English stopwords. Token order is preserved.
    """
    kept = []

    for token in text:
        if token in stopword:
            continue
        kept.append(token)

    return kept

df_text['nonstopwords'] = df_text['tokenized'].apply(lambda x: remove_stopwords(x))

## Applying Lemmatizer ;

wn = nltk.WordNetLemmatizer()

def lemma(text):
    """Lemmatize every token in `text` with the module-level lemmatizer `wn`.

    `text` is an iterable of tokens; a new list with one lemmatized token per
    input token, in the same order, is returned.
    """
    lemmatized = []

    for token in text:
        lemmatized.append(wn.lemmatize(token))

    return lemmatized

df_text['lemmatized'] = df_text['nonstopwords'].apply(lambda x: lemma(x))


## df_text.head(20)

In [None]:
dtweets = pd.read_csv("tweets.csv", index_col = 0) # id same as index, use id column as index

In [None]:
dtweets.head()

In [None]:
# Null count
dtweets.isnull().sum()

In [None]:
# test code for removing links
sentence = ["The", "weather", "(https:dfsdfsdfs)", "has", "been", "good"]
sentence = [word for word in sentence if not re.search('^.?http.+$', word)]
sentence

In [None]:
dtweets.location.values.tolist()

In [None]:
# remove location column - too many missing and has invalid locations
del dtweets["location"]
dtweets.head()

In [None]:
dtweets['text'].dtype

In [None]:
# test code for including words with trailing '...', but not including links
sent = ["set", "ablaze..."]
[re.sub("(\.)+", "", word) for word in sent]

In [None]:
# write function to clean text column

def clean_text(document):    # document refers to an individual tweet
    """Tokenize and normalise one tweet.

    Steps, in order:
      1. tokenize with nltk.tokenize.word_tokenize;
      2. delete every '.' from each token (e.g. 'ablaze...' -> 'ablaze');
      3. keep only alphanumeric tokens and lower-case them;
      4. drop English stopwords;
      5. drop tokens that look like link fragments (optional leading
         character, then 'http' followed by at least one more character);
      6. lemmatize with the WordNet lemmatizer.

    Parameters
    ----------
    document : str
        Raw text of a single tweet.

    Returns
    -------
    list of str
        Cleaned, lemmatized tokens.
    """
    tokens = nltk.tokenize.word_tokenize(document)

    # remove '.' characters left attached to words by the tokenizer
    # (raw string: '\.' is a regex escape, not a Python string escape)
    clean_tokens = [re.sub(r"(\.)+", "", word) for word in tokens]

    # remove non-alphanum tokens such as emoticons and punctuation
    clean_tokens = [word.lower() for word in clean_tokens if word.isalnum()]

    # a set gives O(1) membership tests instead of scanning the list per token
    stopwords = set(nltk.corpus.stopwords.words('english'))
    clean_tokens = [word for word in clean_tokens if word not in stopwords]

    # remove links
    clean_tokens = [word for word in clean_tokens if not re.search(r'^.?http.+$', word)]

    # perform lemmatization
    wn = nltk.WordNetLemmatizer()
    return [wn.lemmatize(token) for token in clean_tokens]

In [None]:
dtweets['text'] = dtweets['text'].map(clean_text)

In [None]:
dtweets['text'].iloc[11367]

### Data Exploration

In [None]:
# distribution of length of tweets for different targets
dtweet_yes = dtweets[dtweets["target"] == 1]
dtweet_no = dtweets[dtweets["target"] == 0]

fdisty = nltk.FreqDist(len(sent) for sent in dtweet_yes["text"])
fdistn = nltk.FreqDist(len(sent) for sent in dtweet_no["text"])

In [None]:
print(len(dtweet_yes))
fdisty.plot()

In [None]:
print(len(dtweet_no))
# Plot the tweet-length distribution of the non-disaster tweets (fdistn);
# plotting fdisty here would just repeat the disaster-tweet chart.
fdistn.plot()

In [None]:
# Can't see any relationship between tweet length and target variable

In [None]:
# inspecting first fews row to see keyword and text relationship
dtweets['text'].iloc[0]   # where is ablaze?
# keyword is a criteria used in search to retrieve tweets

In [None]:
# check unique keywords
dtweets["keyword"].unique()     #weird "%20" between keywords with two or more words

In [None]:
# write function to sub "%20" with " "

def clean_keyword(document):
    """Replace the URL-encoded space ('%20') in a keyword with a real space.

    Parameters
    ----------
    document : str
        Raw keyword value, e.g. 'body%20bags'.

    Returns
    -------
    str
        The keyword with every '%20' replaced by ' '. Non-string values
        (such as NaN for a missing keyword) are returned unchanged so the
        function can be mapped over a column that contains nulls.
    """
    # Missing keywords are float NaN; leave them as they are.
    if not isinstance(document, str):
        return document

    # Literal replacement - no regex required.
    return document.replace("%20", " ")

In [None]:
# map clean_keyword function to keyword column

dtweets['keyword'] = dtweets['keyword'].map(clean_keyword)

In [None]:
# create dummy variables for keyword column
dtweets = pd.get_dummies(dtweets, columns=["keyword"])

In [None]:
# inspect first few rows to see if combination was successful
dtweets.head(20)

### Vectorization

In [None]:
# Separate features (x) from label (y)
from sklearn import utils
dtweets = utils.shuffle(dtweets, random_state=123)
x = dtweets.drop(columns = ["target"])
y = dtweets["target"]
x

In [None]:
# create training and validation set (70-30)
x_train, x_val, y_train, y_val = train_test_split(x, y, train_size = 0.7, random_state = 123)

In [None]:
x_train

### Count Vectorizer a.k.a Bag of Words

In [None]:
def dummy(doc):
    """Identity function: hand `doc` back unchanged.

    Passed to the vectorizers as both tokenizer and preprocessor so that the
    text column is used exactly as it is given to them.
    """
    unchanged = doc
    return unchanged

count_vec = CountVectorizer(
    analyzer='word', tokenizer=dummy, preprocessor=dummy,
    token_pattern=None, ngram_range = (1,1), lowercase = False) 

# get transformed training data text column
train_count = count_vec.fit_transform(x_train["text"])
bow_train = pd.DataFrame(train_count.toarray(), columns = count_vec.get_feature_names())
bow_train.head()

In [None]:
# create word_count variable to reduce feature size to important words

In [None]:
word_counts = bow_train.sum()
word_counts = word_counts.sort_values(ascending = False)
word_counts.head()

In [None]:
reduced_bow_train = bow_train[word_counts[word_counts>=20].index] #trying 20 
reduced_bow_train

### Feature Set 1

In [None]:
# join reduced_bow_train to x_train (create feature set 1)
# unstandardized data

x_train1 = pd.concat([x_train.reset_index(drop = True), reduced_bow_train.reset_index(drop=True)], axis = 1)
x_train1

In [None]:
len(x_val)

In [None]:
# apply transformation to validation data

val_count = count_vec.transform(x_val['text'])
bow_val = pd.DataFrame(val_count.toarray(), columns = count_vec.get_feature_names())
reduced_bow_val = bow_val[reduced_bow_train.columns]
print(len(bow_val))

x_val1 = pd.concat([x_val.reset_index(drop = True), reduced_bow_val.reset_index(drop=True)], axis = 1)
x_val1

In [None]:
# drop 'text' column in x_train1 and x_val1

del x_train1["text"]
del x_val1["text"]
x_val1

### TF-IDF

In [None]:
# create TF-IDF vectorizer

tf_idf = TfidfVectorizer(
    analyzer='word', tokenizer=dummy, preprocessor=dummy,
    token_pattern=None, ngram_range = (1,1), lowercase = False) 

# get transformed training data text column
train_tf_idf = tf_idf.fit_transform(x_train["text"])
train_tf_idf = pd.DataFrame(train_tf_idf.toarray(), columns = tf_idf.get_feature_names())
train_tf_idf.head()

train_tf_idf

### Feature Set 2

In [None]:
# get reduced feature sets using word_counts
reduced_train_tfidf = train_tf_idf[word_counts[word_counts >= 20].index]
reduced_train_tfidf

In [None]:
x_train2 = pd.concat([x_train.reset_index(drop = True), reduced_train_tfidf.reset_index(drop=True)], axis = 1)
x_train2

In [None]:
# apply tf-idf transformation to validation dataset

val_tf_idf = tf_idf.transform(x_val["text"])
val_tf_idf = pd.DataFrame(val_tf_idf.toarray(), columns = tf_idf.get_feature_names())
reduced_val_tfidf = val_tf_idf[word_counts[word_counts >= 20].index]

x_val2 = pd.concat([x_val.reset_index(drop = True), reduced_val_tfidf.reset_index(drop=True)], axis = 1)
x_val2

In [None]:
# delete text column from x_train2 and x_val2

del x_train2["text"]
del x_val2['text']
x_val2.head()

### Word2Vec (Too many NAs - Don't use)

In [None]:
#from gensim.models import Word2Vec
#import time

In [None]:
#start = time.time()
#w2v = Word2Vec(x_train, vector_size = 300, window = 4)
#end = time.time()

#print(end-start)

In [None]:
# get average embeddings for each document

# create empty list to hold average embeddings each document
#avg_embeddings = []

#from tqdm import tqdm

# loop over each doc
#for index in tqdm(range(x_train.shape[0])):
    #doc = x_train["text"].iloc[index]
    #embeddings = [w2v.wv[word] for word in doc if word in w2v.wv]
    
    #if embeddings == []:
        #avg_embeddings.append([np.nan]*300)
    #else:
        #avg = np.mean(embeddings, axis = 0)
        #avg_embeddings.append(avg)

In [None]:
#avg_embeddings = [array if isinstance(array, list) else array.tolist() for array in avg_embeddings]

In [None]:
#w2v_features = pd.DataFrame(avg_embeddings)
#w2v_features

## Modeling

### Naive Bayes

#### Feature Set 1

In [None]:
# scikit-learn Multinomial Naive Bayes, trained on Feature Set 1
from sklearn.naive_bayes import MultinomialNB

# a fresh estimator per feature set; nb_model1 is the result of fitting it on x_train1
nb_classifier = MultinomialNB()
nb_model1 = nb_classifier.fit(X = x_train1, y = y_train)

In [None]:
# Hard class predictions of nb_model1 on the Feature Set 1 training data
y_pred_train = nb_model1.predict(x_train1)

In [None]:
# Confusion matrix on the training set (true labels first, predictions second)
metrics.confusion_matrix(y_true = y_train, y_pred = y_pred_train)

In [None]:
# Training-set scores for nb_model1: compute all four metrics first, then report them

accuracy_train1 = metrics.accuracy_score(y_true = y_train, y_pred = y_pred_train)
precision_train1 = metrics.precision_score(y_true = y_train, y_pred = y_pred_train)
recall_train1 = metrics.recall_score(y_true = y_train, y_pred = y_pred_train)
f1_train1 = metrics.f1_score(y_true = y_train, y_pred = y_pred_train)

print("Training Set Performance of nb_model1")
print("Accuracy score:", accuracy_train1)
print("Precision score:", precision_train1)
print("Recall score:", recall_train1)
print("F-1 score:", f1_train1)

In [None]:
# ROC AUC of nb_model1 on the training set, using column 1 of predict_proba as the score
y_train_prob = nb_model1.predict_proba(x_train1)
fpr, tpr, thresholds = metrics.roc_curve(
    y_true = y_train,
    y_score = y_train_prob[:, 1],
    pos_label = 1,
)
print("AUC score:", metrics.auc(fpr, tpr))

In [None]:
# Hard class predictions of nb_model1 on the Feature Set 1 validation data
y_pred_val = nb_model1.predict(x_val1)

In [None]:
# Validation-set scores for nb_model1: compute all four metrics first, then report them

accuracy_val1 = metrics.accuracy_score(y_true = y_val, y_pred = y_pred_val)
precision_val1 = metrics.precision_score(y_true = y_val, y_pred = y_pred_val)
recall_val1 = metrics.recall_score(y_true = y_val, y_pred = y_pred_val)
f1_val1 = metrics.f1_score(y_true = y_val, y_pred = y_pred_val)

print("Validation Set Performance of nb_model1:")
print("Accuracy score:", accuracy_val1)
print("Precision score:", precision_val1)
print("Recall score:", recall_val1)
print("F-1 score:", f1_val1)

In [None]:
# ROC AUC of nb_model1 on the validation set, using column 1 of predict_proba as the score
y_val_prob = nb_model1.predict_proba(x_val1)
fpr, tpr, thresholds = metrics.roc_curve(
    y_true = y_val,
    y_score = y_val_prob[:, 1],
    pos_label = 1,
)
print("AUC score for validation set:", metrics.auc(fpr, tpr))

#### Feature Set 2

In [None]:
# Multinomial Naive Bayes trained on Feature Set 2

# nb_classifier is rebound to a brand-new estimator before fitting on x_train2
nb_classifier = MultinomialNB()
nb_model2 = nb_classifier.fit(X = x_train2, y = y_train)

In [None]:
# Training-set predictions for Feature Set 2 must come from nb_model2 (fitted on
# x_train2); the previous version called nb_model1, which was fitted on x_train1.
y_pred_train2 = nb_model2.predict(x_train2)

In [None]:
# Confusion matrix of the training labels against y_pred_train2
metrics.confusion_matrix(y_true = y_train, y_pred = y_pred_train2)

In [None]:
# Training-set scores computed from y_pred_train2: all four metrics first, then the report

accuracy_train2 = metrics.accuracy_score(y_true = y_train, y_pred = y_pred_train2)
precision_train2 = metrics.precision_score(y_true = y_train, y_pred = y_pred_train2)
recall_train2 = metrics.recall_score(y_true = y_train, y_pred = y_pred_train2)
f1_train2 = metrics.f1_score(y_true = y_train, y_pred = y_pred_train2)

print("Training Set Performance of nb_model2")
print("Accuracy score:", accuracy_train2)
print("Precision score:", precision_train2)
print("Recall score:", recall_train2)
print("F-1 score:", f1_train2)

In [None]:
# ROC AUC of nb_model2 on the training set, using column 1 of predict_proba as the score
y_train_prob2 = nb_model2.predict_proba(x_train2)
fpr, tpr, thresholds = metrics.roc_curve(
    y_true = y_train,
    y_score = y_train_prob2[:, 1],
    pos_label = 1,
)
print("AUC score:", metrics.auc(fpr, tpr))

In [None]:
# Hard class predictions of nb_model2 on the Feature Set 2 validation data
y_pred_val2 = nb_model2.predict(x_val2)

In [None]:
# Validation-set scores for nb_model2: compute all four metrics first, then report them

accuracy_val2 = metrics.accuracy_score(y_true = y_val, y_pred = y_pred_val2)
precision_val2 = metrics.precision_score(y_true = y_val, y_pred = y_pred_val2)
recall_val2 = metrics.recall_score(y_true = y_val, y_pred = y_pred_val2)
f1_val2 = metrics.f1_score(y_true = y_val, y_pred = y_pred_val2)

print("Validation Set Performance of nb_model2:")
print("Accuracy score:", accuracy_val2)
print("Precision score:", precision_val2)
print("Recall score:", recall_val2)
print("F-1 score:", f1_val2)

In [None]:
# ROC AUC of nb_model2 on the validation set, using column 1 of predict_proba as the score
y_val_prob2 = nb_model2.predict_proba(x_val2)
fpr, tpr, thresholds = metrics.roc_curve(
    y_true = y_val,
    y_score = y_val_prob2[:, 1],
    pos_label = 1,
)
print("AUC score for validation set:", metrics.auc(fpr, tpr))

In [None]:
# nltk Naive Bayes

# convert dataframe to dictionary
# x_train1_dict = pd.DataFrame.to_dict(x_train1)
# x_train1_dict

In [None]:
# NBC1 = nltk.NaiveBayesClassifier.train(x_train1_dict)

### Extreme Gradient Boosting

In [None]:
# Hyperparameter tuning using Randomized Search CV, run once per feature set.
# NOTE: this is slow (n_iter = 300 sampled candidates per search); the kernel was
# interrupted on an earlier run because it was taking a while.

start = time.time()

# candidate values sampled by the randomized search
parameters = {
        'max_depth': range(2, 6),
        'n_estimators': [50, 100, 150, 200, 250, 300],
        'subsample': [0.6, 0.7, 0.8],
        'colsample_bytree': [0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 1],
        'colsample_bynode': [0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 1],
        'gamma': [0, 5, 10, 15, 20],
        'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
        'lambda': [0.1, 0.25, 0.5, 0.75, 1]
}


def _make_xgb_search():
    """Return a fresh, unfitted RandomizedSearchCV over `parameters`.

    `fit` returns the search object itself, so fitting one shared instance on
    both feature sets would leave `xgb_model1` and `xgb_model2` bound to the
    same object (holding only the most recent fit). Building a separate search
    per feature set keeps the two results independent.
    """
    return RandomizedSearchCV(xgb.XGBClassifier(use_label_encoder = False, eval_metric = "logloss"),
                              parameters, n_jobs=4, scoring = "roc_auc", n_iter = 300,
                              random_state = 123)


# Feature Set 1 gets its own search instance
xgb_model1 = _make_xgb_search().fit(x_train1, y_train)

# `classifier` stays bound to the Feature Set 2 search, matching the state the
# original cell left behind
classifier = _make_xgb_search()
xgb_model2 = classifier.fit(x_train2, y_train)

end = time.time()
print(end-start)

In [None]:
# Best hyperparameter combination recorded on the search object bound to xgb_model1
xgb_model1.best_params_

In [None]:
# Best hyperparameter combination recorded on the search object bound to xgb_model2
xgb_model2.best_params_

In [None]:
# Feature importances of the best estimator found for Feature Set 1,
# paired with the x_train1 column names and ranked from most to least important
feature_imp = (
    pd.DataFrame({
        "feature": x_train1.columns,
        "importance": xgb_model1.best_estimator_.feature_importances_,
    })
    .sort_values("importance", ascending = False)
    .reset_index(drop = True)
)
feature_imp

In [None]:
# Feature importances of the best estimator found for Feature Set 2,
# paired with the x_train2 column names and ranked from most to least important
feature_imp = (
    pd.DataFrame({
        "feature": x_train2.columns,
        "importance": xgb_model2.best_estimator_.feature_importances_,
    })
    .sort_values("importance", ascending = False)
    .reset_index(drop = True)
)
feature_imp

In [None]:
# Feature Set 1: class probabilities from xgb_model1 on both splits

y_train_prob3 = xgb_model1.predict_proba(x_train1)
y_val_prob3 = xgb_model1.predict_proba(x_val1)

# ROC AUC, scored on column 1 of the probabilities: training split, then validation

fpr, tpr, thresholds = metrics.roc_curve(
    y_true = y_train,
    y_score = y_train_prob3[:, 1],
    pos_label = 1,
)
print("AUC for Feature Set 1 train set:", metrics.auc(fpr, tpr))

fpr, tpr, thresholds = metrics.roc_curve(
    y_true = y_val,
    y_score = y_val_prob3[:, 1],
    pos_label = 1,
)
print("AUC for Feature Set 1 validation set:", metrics.auc(fpr, tpr))

In [None]:
# Examine xgb_model1's training-set performance across probability cut-offs and
# show the row that maximizes the F1-score

threshold = np.arange(0.01, 1, .01) # candidate cut-offs in steps of 0.01

# one accumulator per metric, filled in threshold order
precision_values, recall_values, acc_values, f1_values = [], [], [], []

for value in threshold:

    # predict 1 wherever the column-1 probability reaches the cut-off, else 0
    pred = [int(prob >= value) for prob in y_train_prob3[:,1]]

    # score the thresholded predictions against the training labels
    precision = metrics.precision_score(y_train, pred)
    recall = metrics.recall_score(y_train, pred)
    accuracy = metrics.accuracy_score(y_train, pred)
    f1_score = metrics.f1_score(y_train, pred)

    precision_values += [precision]
    recall_values += [recall]
    acc_values += [accuracy]
    f1_values += [f1_score]

# one row per threshold, one column per metric
result = pd.DataFrame({
    "threshold": threshold,
    "precision": precision_values,
    "recall": recall_values,
    "accuracy": acc_values,
    "f1_score": f1_values,
})

result.iloc[result["f1_score"].idxmax()]

In [None]:
# Evaluate xgb_model1 on the validation set at the F1-maximizing threshold.
# The threshold is read from the training sweep stored in `result` rather than
# hand-copied from its printed output, so it stays correct when the model or
# data change and the notebook is re-run.
best_threshold = result.loc[result["f1_score"].idxmax(), "threshold"]

pred = [1 if prob >= best_threshold else 0 for prob in y_val_prob3[:,1]]

# calculate precision, recall, accuracy, and f1-score
precision = metrics.precision_score(y_val, pred)
recall = metrics.recall_score(y_val, pred)
accuracy = metrics.accuracy_score(y_val, pred)
f1_score = metrics.f1_score(y_val, pred)

print("Validation precision: ", precision)
print("Validation recall: ", recall)
print("Validation accuracy: ", accuracy)
print("Validation F1-Score: ", f1_score)

In [None]:
# Optimizing for recall instead: precision, accuracy, etc. come out too low (not wanted either)

result.iloc[result["recall"].idxmax()]

In [None]:
# Plot the relationship between precision and recall as the probability threshold varies
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title("Precision and Recall Scores vs. Probability Threshold")
ax.plot(result["threshold"], result["precision"], "b--", label="Precision")
ax.plot(result["threshold"], result["recall"], "g-", label="Recall")
ax.set_ylabel("Score")
ax.set_xlabel("Probability Threshold")
ax.legend(loc='best')

In [None]:
# Default decision rule: let xgb_model1.predict pick the class instead of a manual threshold

y_train_pred3 = xgb_model1.predict(x_train1)
y_val_pred3 = xgb_model1.predict(x_val1)

# --- training split ---
precision = metrics.precision_score(y_true = y_train, y_pred = y_train_pred3)
recall = metrics.recall_score(y_true = y_train, y_pred = y_train_pred3)
accuracy = metrics.accuracy_score(y_true = y_train, y_pred = y_train_pred3)
f1_score = metrics.f1_score(y_true = y_train, y_pred = y_train_pred3)

print("xgb_model1 Performance on Train Set:")
print("Train precision: ", precision)
print("Train recall: ", recall)
print("Train accuracy: ", accuracy)
print("Train F1-Score: ", f1_score)

# --- validation split (the metric names are reused and now hold validation values) ---
precision = metrics.precision_score(y_true = y_val, y_pred = y_val_pred3)
recall = metrics.recall_score(y_true = y_val, y_pred = y_val_pred3)
accuracy = metrics.accuracy_score(y_true = y_val, y_pred = y_val_pred3)
f1_score = metrics.f1_score(y_true = y_val, y_pred = y_val_pred3)

print("\n\nxgb_model1 Performance on Validation Set:")
print("Validation precision: ", precision)
print("Validation recall: ", recall)
print("Validation accuracy: ", accuracy)
print("Validation F1-Score: ", f1_score)

In [None]:
# Feature Set 2: class probabilities from xgb_model2 on both splits

y_train_prob4 = xgb_model2.predict_proba(x_train2)
y_val_prob4 = xgb_model2.predict_proba(x_val2)

# ROC AUC, scored on column 1 of the probabilities: training split, then validation

fpr, tpr, thresholds = metrics.roc_curve(
    y_true = y_train,
    y_score = y_train_prob4[:, 1],
    pos_label = 1,
)
print("AUC for Feature Set 2 train set:", metrics.auc(fpr, tpr))

fpr, tpr, thresholds = metrics.roc_curve(
    y_true = y_val,
    y_score = y_val_prob4[:, 1],
    pos_label = 1,
)
print("AUC for Feature Set 2 validation set:", metrics.auc(fpr, tpr))

In [None]:
# Examine xgb_model2's training-set performance across probability cut-offs and
# show the row that maximizes the F1-score

threshold = np.arange(0.01, 1, .01) # candidate cut-offs in steps of 0.01

# one accumulator per metric, filled in threshold order
precision_values, recall_values, acc_values, f1_values = [], [], [], []

for value in threshold:

    # predict 1 wherever the column-1 probability reaches the cut-off, else 0
    pred = [int(prob >= value) for prob in y_train_prob4[:,1]]

    # score the thresholded predictions against the training labels
    precision = metrics.precision_score(y_train, pred)
    recall = metrics.recall_score(y_train, pred)
    accuracy = metrics.accuracy_score(y_train, pred)
    f1_score = metrics.f1_score(y_train, pred)

    precision_values += [precision]
    recall_values += [recall]
    acc_values += [accuracy]
    f1_values += [f1_score]

# one row per threshold, one column per metric
result = pd.DataFrame({
    "threshold": threshold,
    "Train precision": precision_values,
    "Train recall": recall_values,
    "Train accuracy": acc_values,
    "Train f1_score": f1_values,
})

result.iloc[result["Train f1_score"].idxmax()]

In [None]:
# Evaluate xgb_model2 on the validation set at its F1-maximizing threshold.
# Two fixes over the previous version of this cell:
#   * it scored y_val_prob3 (xgb_model1's validation probabilities) instead of
#     y_val_prob4, so the numbers did not describe xgb_model2 at all;
#   * the threshold was a constant carried over from the Feature Set 1 cell; it is
#     now read from the Feature Set 2 training sweep stored in `result`.
best_threshold = result.loc[result["Train f1_score"].idxmax(), "threshold"]

pred = [1 if prob >= best_threshold else 0 for prob in y_val_prob4[:,1]]

# calculate precision, recall, accuracy, and f1-score
precision = metrics.precision_score(y_val, pred)
recall = metrics.recall_score(y_val, pred)
accuracy = metrics.accuracy_score(y_val, pred)
f1_score = metrics.f1_score(y_val, pred)

print("Validation precision: ", precision)
print("Validation recall: ", recall)
print("Validation accuracy: ", accuracy)
print("Validation F1-Score: ", f1_score)

In [None]:
# Default decision rule: let xgb_model2.predict pick the class instead of a manual threshold
y_train_pred4 = xgb_model2.predict(x_train2)
y_val_pred4 = xgb_model2.predict(x_val2)

# --- training split ---
precision = metrics.precision_score(y_true = y_train, y_pred = y_train_pred4)
recall = metrics.recall_score(y_true = y_train, y_pred = y_train_pred4)
accuracy = metrics.accuracy_score(y_true = y_train, y_pred = y_train_pred4)
f1_score = metrics.f1_score(y_true = y_train, y_pred = y_train_pred4)

print("xgb_model2 Performance on Train Set:")
print("Train precision: ", precision)
print("Train recall: ", recall)
print("Train accuracy: ", accuracy)
print("Train F1-Score: ", f1_score)

# --- validation split (the metric names are reused and now hold validation values) ---
precision = metrics.precision_score(y_true = y_val, y_pred = y_val_pred4)
recall = metrics.recall_score(y_true = y_val, y_pred = y_val_pred4)
accuracy = metrics.accuracy_score(y_true = y_val, y_pred = y_val_pred4)
f1_score = metrics.f1_score(y_true = y_val, y_pred = y_val_pred4)

print("\n\nxgb_model2 Performance on Validation Set:")
print("Validation precision: ", precision)
print("Validation recall: ", recall)
print("Validation accuracy: ", accuracy)
print("Validation F1-Score: ", f1_score)

In [None]:
# Observation: XGBoost models are overfitting a lot. We could perform dimensionality reduction
# using PCA if we had more time.