# LOADING DEPENDENCIES

## <span style="color:red"> MUST RUN ALWAYS</span>


In [1]:
# Data Manipulation
import pandas as pd
import numpy as np
import random

# Checkpoints
import pickle
import cloudpickle

from tqdm.notebook import trange, tqdm

# Preprocessing
import re    # RegEx for removing non-letter characters
import nltk  #natural language processing
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier


import sklearn.preprocessing as pr


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliorenteria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LOADING DATASET

In [2]:
#file_name = './data/airline_tweets.csv'
file_name = './data/covid_tweets.csv'
#file_name = './data/generic_tweets.csv'

data = pd.read_csv(file_name)


In [3]:
# Asinging a random number to each observation for future sampling
np.random.seed(11)
data['rand'] = np.random.random_sample(size=(data.shape[0]))

In [4]:
data.head(5)

Unnamed: 0,text,sentiment,rand
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,0,0.18027
1,advice Talk to your neighbours family to excha...,1,0.019475
2,Coronavirus Australia: Woolworths to give elde...,1,0.463219
3,My food stock is not the only one which is emp...,1,0.724934
4,"Me, ready to go at supermarket during the #COV...",-1,0.420204


In [5]:
# Sampling the min ammount from the other to to obtain a balanced dataset

samp_neg = min(data.sentiment.value_counts())/data.sentiment.value_counts()[-1]
samp_neu = min(data.sentiment.value_counts())/data.sentiment.value_counts()[0]
samp_pos = min(data.sentiment.value_counts())/data.sentiment.value_counts()[1]

print('NEGATIVA - Total: ',data.sentiment.value_counts()[-1], ', Proportion: ',samp_neg)
print('NEUTRALES - Total: ',data.sentiment.value_counts()[0], ', Proportion: ',samp_neu)
print('POSITIVE - Total: ',data.sentiment.value_counts()[1], ', Proportion: ',samp_pos)

NEGATIVA - Total:  15398 , Proportion:  0.5009092089881803
NEUTRALES - Total:  7713 , Proportion:  1.0
POSITIVE - Total:  18046 , Proportion:  0.42740773578632385


In [6]:
# Concatenating the sampled datasets for Negative, Neutral and Positive
frames = [data.loc[(data['sentiment'] == -1) & (data['rand'] <= samp_neg)],
data.loc[(data['sentiment'] == 0) & (data['rand'] <= samp_neu)],
data.loc[(data['sentiment'] == 1) & (data['rand'] <= samp_pos)]]

data = pd.concat(frames)
data = data.reset_index(drop=True)
print(data.sentiment.value_counts())
data.head()

-1    7755
 0    7713
 1    7710
Name: sentiment, dtype: int64


Unnamed: 0,text,sentiment,rand
0,"Me, ready to go at supermarket during the #COV...",-1,0.420204
1,@10DowningStreet @grantshapps what is being do...,-1,0.111661
2,Do you see malicious price increases in NYC? T...,-1,0.055674
3,@7SealsOfTheEnd Soon with dwindling supplies u...,-1,0.479797
4,There Is of in the Country The more empty she...,-1,0.401676


# TOKENIZING

Separate tweets into lists of word (or components) that carry meanings.

Removing/converting the following elements:
- Uppercase
- Urls
- Simbols
- Numbers
- English Stopwords
- Stemming

In [7]:
# Function to format, tokenize and remove stopwords from tweets.
def tweet_to_words(tweet):
    ''' Convert tweet text into a sequence of words '''
    
    # convert to lowercase
    text = tweet.lower()
    # remove tweeter users and hashtags ( @xxx, #xxx )
    text = re.sub(r"[@#]\w+", " ", text)
    # remove https
    text = re.sub(r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)", " ", text)
    #text = " ".join(filter(lambda w: w.find("https://") == -1, text.split(" ")))
    # remove non letters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    #remove numbers
    text = re.sub(r"[0-9]*", "", text)
    # tokenize
    words = text.split()
    # remove stopwords
    words = [w for w in words if w not in stopwords.words("english")]
    # apply stemming
    words = [PorterStemmer().stem(w) for w in words]
    # return list
    return words


In [8]:
# Testing tweet_to_words Function
sel = 16003
print("\nORIGINAL TWEET ->\n", data['text'][sel])
print("\nPROCESSED TWEET ->", tweet_to_words(data['text'][sel]))
print("\SENTIMENT -> ", data['sentiment'][sel])


ORIGINAL TWEET ->
 One thing about it. Nielsen gonna get their consumer data https://t.co/kGX6wPbZdq

Might dive into this if I have time to help with brainstorming any opportunities here. Because marketing language will shift into these funnels. 

Want that? Text 'yes' to 903-689-1975

PROCESSED TWEET -> ['one', 'thing', 'nielsen', 'gonna', 'get', 'consum', 'data', 'might', 'dive', 'time', 'help', 'brainstorm', 'opportun', 'market', 'languag', 'shift', 'funnel', 'want', 'text', 'ye']
\SENTIMENT ->  1


In [9]:
# Apply tokenization to each tweet and store as X

X = []
for i in trange(len(data['text'])):
    X.append(tweet_to_words(data['text'][i]))

HBox(children=(FloatProgress(value=0.0, max=23178.0), HTML(value='')))




In [9]:
# Encode target labels
le = LabelEncoder()
Y = le.fit_transform(data['sentiment'])


In [10]:
list(le.inverse_transform([0, 1, 2]))

[-1, 0, 1]

In [11]:
# Apply tokenization to each tweet and store as X
# X = list(map(tweet_to_words, data['text']))


## 🚩 CHECKPOINT 

Saving **Tokenized List** as a Pickle File to retrieve latter and save memory and time.

In [346]:
# Saving X to a PICKLE to retrieve back latter:

pickle_dic = {'df_1245.csv':'1245_',
              'airline_tweets.csv':'airlines_',
              'covid_tweets.csv':'covid_',
              'generic_tweets.csv':'generic_'}

# Code to Save PICKLE
#with open('./tokenized_data/'+pickle_dic[file_name]+'X.pkl', 'wb') as f:
#    pickle.dump(X, f)
#with open('./tokenized_data/'+pickle_dic[file_name]+'Y.pkl', 'wb') as f:
#    pickle.dump(Y, f)


In [347]:
# Code to Retrieve PICKLE
#with open('./tokenized_data/'+pickle_dic[file_name]+'X.pkl', 'rb') as f:
#    X = pickle.load(f)
#with open('./tokenized_data/'+pickle_dic[file_name]+'Y.pkl', 'rb') as f:
#    Y = pickle.load(f)


# TRAIN TEST SPLIT

In [12]:
# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=13)

print('Number of tweets in the total set :    {}'.format(len(X)))
print('Number of tweets in the training set : {}'.format(len(X_train)))
print('Number of tweets in the testing set :  {}'.format(len(X_test)))


Number of tweets in the total set :    23178
Number of tweets in the training set : 16224
Number of tweets in the testing set :  6954


In [13]:
# Counting the distribution in training and test datasets
unique_tr, counts_tr = np.unique(y_train, return_counts=True)
print('TRAINING DISTRIBUTION - ',dict(zip(unique_tr, counts_tr)))

unique_te, counts_te = np.unique(y_test, return_counts=True)
print('TESTING DISTRIBUTION - ',dict(zip(unique_te, counts_te)))

TRAINING DISTRIBUTION -  {0: 5469, 1: 5334, 2: 5421}
TESTING DISTRIBUTION -  {0: 2286, 1: 2379, 2: 2289}


# BAG OF WORDS


## Using CountVectorizer

In [52]:
# Convert a collection of text documents to a matrix of token counts
vocabulary_size = 1000    # Rounded up from 17422

# Generating Bag of Words
# Tweets have already been preprocessed hence dummy function will be passed in to preprocessor & tokenizer step
count_vector = CountVectorizer(max_features=vocabulary_size, preprocessor=lambda x: x, tokenizer=lambda x: x)
count_vector.fit(X_train)





CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=1000, min_df=1,
                ngram_range=(1, 1),
                preprocessor=<function <lambda> at 0x7f8cb91f47b8>,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function <lambda> at 0x7f8cb91f4488>,
                vocabulary=None)

In [15]:
# CountVectorizer creates a vocabulary. Checking Vocabulary
print('COUNT VECTORIZER')
print('VOCABULARY SIZE: ', len(count_vector.vocabulary_))
print('VOCABULARY CONTENT: ', count_vector.vocabulary_)


COUNT VECTORIZER
VOCABULARY SIZE:  1000
VOCABULARY CONTENT:  {'food': 344, 'suppli': 858, 'consum': 179, 'depart': 227, 'govern': 384, 'take': 869, 'effect': 271, 'step': 837, 'prevent': 666, 'black': 96, 'market': 528, 'hoard': 420, 'protect': 679, 'mask': 529, 'hand': 398, 'sanit': 752, 'state': 834, 'group': 389, 'want': 956, 'insur': 453, 'compani': 170, 'help': 410, 'restaur': 730, 'industri': 444, 'taken': 870, 'massiv': 531, 'hit': 419, 'covid': 198, 'crisi': 203, 'pandem': 610, 'bank': 76, 'day': 216, 'payment': 623, 'period': 626, 'need': 568, 'due': 261, 'list': 496, 'credit': 202, 'would': 991, 'thought': 886, 'look': 509, 'like': 492, 'groceri': 388, 'store': 843, 'rather': 694, 'go': 374, 'purchas': 683, 'thing': 883, 'wonder': 982, 'today': 894, 'watch': 962, 'supermarket': 857, 'fight': 331, 'old': 590, 'ladi': 472, 'last': 475, 'roll': 743, 'toilet': 896, 'paper': 613, 'decis': 222, 'let': 488, 'peopl': 624, 'self': 768, 'isol': 457, 'get': 368, 'essenti': 294, 'stop': 

## Using TfidfVectorizer

In [16]:
# characters level tf-idf
tfidf_vect_ngram = TfidfVectorizer(max_df=0.90, min_df=2, analyzer='word', preprocessor=lambda x: x, tokenizer=lambda x: x,
                                   ngram_range=(1,2), max_features=1000)
tfidf_vect_ngram.fit(X_train)

X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train) 
X_test_tfidf_ngram =  tfidf_vect_ngram.transform(X_test) 




In [17]:
# TfidfVectorizer also creates a vocabulary. Checking Vocabulary
print('TF IDF VECTORIZER')
print('VOCABULARY SIZE: ', len(tfidf_vect_ngram.vocabulary_))
print('VOCABULARY CONTENT: ', tfidf_vect_ngram.vocabulary_)


TF IDF VECTORIZER
VOCABULARY SIZE:  1000
VOCABULARY CONTENT:  {'food': 329, 'suppli': 864, 'consum': 159, 'depart': 222, 'govern': 381, 'take': 874, 'effect': 260, 'step': 836, 'prevent': 665, 'market': 532, 'hoard': 423, 'protect': 679, 'mask': 533, 'hand': 397, 'sanit': 750, 'state': 831, 'food suppli': 337, 'hand sanit': 398, 'group': 388, 'want': 956, 'insur': 456, 'compani': 150, 'help': 411, 'restaur': 728, 'industri': 447, 'taken': 876, 'massiv': 536, 'hit': 422, 'covid': 188, 'crisi': 198, 'covid crisi': 190, 'pandem': 611, 'bank': 63, 'day': 209, 'payment': 626, 'period': 630, 'need': 569, 'due': 251, 'list': 498, 'credit': 197, 'would': 991, 'thought': 891, 'look': 512, 'like': 494, 'groceri': 385, 'store': 845, 'rather': 694, 'go': 368, 'purchas': 682, 'thing': 888, 'look like': 513, 'groceri store': 387, 'wonder': 982, 'today': 897, 'watch': 963, 'supermarket': 859, 'fight': 315, 'old': 592, 'ladi': 474, 'last': 477, 'roll': 741, 'toilet': 899, 'paper': 616, 'go supermarket

In [18]:
# Transform the training data
X_train_bow = count_vector.transform(X_train).toarray()

# Transform the testing data
X_test_bow = count_vector.transform(X_test).toarray()


# NORMALIZING

In [23]:
# Normalize BoW features in training and test set
bow_normalizer = pr.Normalizer().fit(X_train_bow)
tfidf_normalizer = pr.Normalizer().fit(X_train_tfidf_ngram)

X_train_bow = bow_normalizer.transform(X_train_bow)
X_test_bow  = bow_normalizer.transform(X_test_bow)

X_train_tfidf_ngram = bow_normalizer.transform(X_train_tfidf_ngram)
X_test_tfidf_ngram = bow_normalizer.transform(X_test_tfidf_ngram)


# MODELING

- Gradient Boosting
- Random Forest
- K Nearest Neighbors

In [24]:
# GRADIENT BOOSTING FOR COUNT VECTORIZER
xgboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0).fit(X_train_bow, y_train)

print('GRADIENT BOOSTING FOR COUNT VECTORIZER')
print(xgboost.score(X_test_bow, y_test))

GRADIENT BOOSTING FOR COUNT VECTORIZER
0.7467644521138913


In [34]:
# GRADIENT BOOSTING FOR TF-IDF
xgboost_tf = GradientBoostingClassifier(n_estimators=400, learning_rate=.5,
    max_depth=1, random_state=13).fit(X_train_tfidf_ngram, y_train)

print('GRADIENT BOOSTING FOR TF-IDF')
print(xgboost_tf.score(X_test_tfidf_ngram, y_test))

GRADIENT BOOSTING FOR TF-IDF
0.7391429393155019


In [26]:
# RANDOM FOREST FOR COUNT VECTORIZER
Ran_For = RandomForestClassifier(max_depth=2, n_estimators=100, criterion='entropy')
Ran_For.fit(X_train_bow, y_train)

print('RANDOM FOREST FOR COUNT VECTORIZER')
print(Ran_For.score(X_test_bow, y_test))

RANDOM FOREST FOR COUNT VECTORIZER
0.6199309749784296


In [27]:
# RANDOM FOREST FOR TF-IDF
Ran_For_tf = RandomForestClassifier(max_depth=2, n_estimators=100, criterion='entropy')
Ran_For_tf.fit(X_train_tfidf_ngram, y_train)

print('RANDOM FOREST FOR TF-IDF')
print(Ran_For_tf.score(X_test_tfidf_ngram, y_test))

RANDOM FOREST FOR TF-IDF
0.6084268047167098


In [28]:
# KNN FOR COUNT VECTORIZER
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train_bow, y_train)

print('KNN FOR COUNT VECTORIZER')
print(neigh.score(X_test_bow, y_test))

KNN FOR COUNT VECTORIZER
0.41141788898475695


### Comparizon

In [372]:
sel = 6
print('Test    -',y_test[sel:sel+1],'\n')

print('XGB  CV -',Ran_For.predict(X_test_bow[sel:sel+1]))
print('RaFo CV -',Ran_For.predict(X_test_bow[sel:sel+1]))
print('K NN CV -',neigh.predict(X_test_bow[sel:sel+1]))

print('XGB  TF -',Ran_For.predict(X_test_bow[sel:sel+1]))
print('RaFo TF -',Ran_For.predict(X_test_bow[sel:sel+1]))
print('K NN TF -',neigh.predict(X_test_bow[sel:sel+1]))

Test    - [0] 

XGB  CV - [1]
RaFo CV - [1]
K NN CV - [0]
XGB  TF - [1]
RaFo TF - [1]
K NN TF - [0]


# Code to Save Models

In [54]:
# save the model to disk
model_data = {'model': xgboost, 'filename':'./saved_models/xgboost_covid.pkl'}
CV_data = {'model': count_vector, 'filename': './saved_models/CV_covid.pkl'}
TF_data = {'model': tfidf_vect_ngram, 'filename': './saved_models/TF_covid.pkl'}
CV_normalizer = {'model':bow_normalizer, 'filename': './saved_models/CV_normalizer.pkl'}
TF_normalizer = {'model':tfidf_normalizer, 'filename': './saved_models/TF_normalizer_1-2gram.pkl'}

package = [CV_data,model_data, TF_data, CV_normalizer, TF_normalizer]

for i in package:
    with open(i['filename'], 'wb') as f:
        cloudpickle.dump(i['model'], f)
    print(i['filename'])


./saved_models/CV_covid.pkl
./saved_models/xgboost_covid.pkl
./saved_models/TF_covid.pkl
./saved_models/CV_normalizer.pkl
./saved_models/TF_normalizer_1-2gram.pkl


sklearn.feature_extraction.text.CountVectorizer

In [None]:
# some time later...
 
# load the model from disk
#loaded_model = cloudpickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)