# LOADING DEPENDENCIES

## <span style="color:red"> MUST RUN ALWAYS</span>


In [254]:
# Data Manipulation
import pandas as pd
import numpy as np
import random

# Checkpoints
import pickle
from tqdm.notebook import trange, tqdm

# Preprocessing
import re    # RegEx for removing non-letter characters
import nltk  #natural language processing
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import sklearn.preprocessing as pr


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliorenteria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LOADING DATASET

In [335]:
#file_name = './data/airline_tweets.csv'
#file_name = './data/covid_tweets.csv'
file_name = './data/generic_tweets.csv'

data = pd.read_csv(file_name)


In [336]:
# Asinging a random number to each observation for future sampling
np.random.seed(11)
data['rand'] = np.random.random_sample(size=(data.shape[0]))

In [337]:
data.head(5)

Unnamed: 0,text,sentiment,rand
0,Thanks for pointing out the crucial problems @...,1,0.18027
1,please ignore cheesey music,-1,0.019475
2,just got home from a meeting with the girls......,0,0.463219
3,victory for the bulldogs was celebrated by 3 w...,-1,0.724934
4,http://tinyurl.com/ateltl &lt;-- Rocksteady pt II,0,0.420204


In [338]:
# Sampling the min ammount from the other to to obtain a balanced dataset

samp_neg = min(data.sentiment.value_counts())/data.sentiment.value_counts()[-1]
samp_neu = min(data.sentiment.value_counts())/data.sentiment.value_counts()[0]
samp_pos = min(data.sentiment.value_counts())/data.sentiment.value_counts()[1]

print('NEGATIVA - Total: ',data.sentiment.value_counts()[-1], ', Proportion: ',samp_neg)
print('NEUTRALES - Total: ',data.sentiment.value_counts()[0], ', Proportion: ',samp_neu)
print('POSITIVE - Total: ',data.sentiment.value_counts()[1], ', Proportion: ',samp_pos)

NEGATIVA - Total:  8830 , Proportion:  1.0
NEUTRALES - Total:  12668 , Proportion:  0.6970318913798548
POSITIVE - Total:  9897 , Proportion:  0.8921895523896131


In [339]:
# Concatenating the sampled datasets for Negative, Neutral and Positive
frames = [data.loc[(data['sentiment'] == -1) & (data['rand'] <= samp_neg)],
data.loc[(data['sentiment'] == 0) & (data['rand'] <= samp_neu)],
data.loc[(data['sentiment'] == 1) & (data['rand'] <= samp_pos)]]

data = pd.concat(frames)
data = data.reset_index(drop=True)
print(data.sentiment.value_counts())
data.head()

 0    8865
 1    8855
-1    8830
Name: sentiment, dtype: int64


Unnamed: 0,text,sentiment,rand
0,please ignore cheesey music,-1,0.019475
1,victory for the bulldogs was celebrated by 3 w...,-1,0.724934
2,@MrWize sike sike call it truce???????????????...,-1,0.344624
3,Waiting for an email that will probably never ...,-1,0.318799
4,@chaoscartel That is annoying. What gear is it...,-1,0.055674


# TOKENIZING

Separate tweets into lists of word (or components) that carry meanings.

Removing/converting the following elements:
- Uppercase
- Urls
- Simbols
- Numbers
- English Stopwords
- Stemming

In [340]:
# Function to format, tokenize and remove stopwords from tweets.
def tweet_to_words(tweet):
    ''' Convert tweet text into a sequence of words '''
    
    # convert to lowercase
    text = tweet.lower()
    # remove tweeter users and hashtags ( @xxx, #xxx )
    text = re.sub(r"[@#]\w+", " ", text)
    # remove https
    text = re.sub(r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)", " ", text)
    #text = " ".join(filter(lambda w: w.find("https://") == -1, text.split(" ")))
    # remove non letters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    #remove numbers
    text = re.sub(r"[0-9]*", "", text)
    # tokenize
    words = text.split()
    # remove stopwords
    words = [w for w in words if w not in stopwords.words("english")]
    # apply stemming
    words = [PorterStemmer().stem(w) for w in words]
    # return list
    return words


In [341]:
# Testing tweet_to_words Function
sel = 16003
print("\nORIGINAL TWEET ->\n", data['text'][sel])
print("\nPROCESSED TWEET ->", tweet_to_words(data['text'][sel]))
print("\SENTIMENT -> ", data['sentiment'][sel])


ORIGINAL TWEET ->
 FJGKFLD;'Sdh WHY AM I NOT AT HOMETOWN DAYS WITH MY FRIENDS.

PROCESSED TWEET -> ['fjgkfld', 'sdh', 'hometown', 'day', 'friend']
\SENTIMENT ->  0


In [342]:
# Apply tokenization to each tweet and store as X
X = []
for i in trange(len(data['text'])):
    X.append(tweet_to_words(data['text'][i]))

HBox(children=(FloatProgress(value=0.0, max=26550.0), HTML(value='')))




In [343]:
# Encode target labels
le = LabelEncoder()
Y = le.fit_transform(data['sentiment'])


In [344]:
# Apply tokenization to each tweet and store as X
# X = list(map(tweet_to_words, data['text']))


## 🚩 CHECKPOINT 

Saving **Tokenized List** as a Pickle File to retrieve latter and save memory and time.

In [346]:
# Saving X to a PICKLE to retrieve back latter:

pickle_dic = {'df_1245.csv':'1245_',
              'airline_tweets.csv':'airlines_',
              'covid_tweets.csv':'covid_',
              'generic_tweets.csv':'generic_'}

# Code to Save PICKLE
#with open('./tokenized_data/'+pickle_dic[file_name]+'X.pkl', 'wb') as f:
#    pickle.dump(X, f)
#with open('./tokenized_data/'+pickle_dic[file_name]+'Y.pkl', 'wb') as f:
#    pickle.dump(Y, f)


In [347]:
# Code to Retrieve PICKLE
#with open('./tokenized_data/'+pickle_dic[file_name]+'X.pkl', 'rb') as f:
#    X = pickle.load(f)
#with open('./tokenized_data/'+pickle_dic[file_name]+'Y.pkl', 'rb') as f:
#    Y = pickle.load(f)


# TRAIN TEST SPLIT

In [348]:
# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=13)

print('Number of tweets in the total set :    {}'.format(len(X)))
print('Number of tweets in the training set : {}'.format(len(X_train)))
print('Number of tweets in the testing set :  {}'.format(len(X_test)))


Number of tweets in the total set :    26550
Number of tweets in the training set : 18585
Number of tweets in the testing set :  7965


In [349]:
# Counting the distribution in training and test datasets
unique_tr, counts_tr = np.unique(y_train, return_counts=True)
print('TRAINING DISTRIBUTION - ',dict(zip(unique_tr, counts_tr)))

unique_te, counts_te = np.unique(y_test, return_counts=True)
print('TESTING DISTRIBUTION - ',dict(zip(unique_te, counts_te)))

TRAINING DISTRIBUTION -  {0: 6143, 1: 6208, 2: 6234}
TESTING DISTRIBUTION -  {0: 2687, 1: 2657, 2: 2621}


# BAG OF WORDS


## Using CountVectorizer

In [350]:
# Convert a collection of text documents to a matrix of token counts
vocabulary_size = 1000    # Rounded up from 17422

# Generating Bag of Words
# Tweets have already been preprocessed hence dummy function will be passed in to preprocessor & tokenizer step
count_vector = CountVectorizer(max_features=vocabulary_size, preprocessor=lambda x: x, tokenizer=lambda x: x)
count_vector.fit(X_train)






CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=1000, min_df=1,
                ngram_range=(1, 1),
                preprocessor=<function <lambda> at 0x7f8708316510>,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function <lambda> at 0x7f87083162f0>,
                vocabulary=None)

In [351]:
# CountVectorizer creates a vocabulary. Checking Vocabulary
print('COUNT VECTORIZER')
print('VOCABULARY SIZE: ', len(count_vector.vocabulary_))
print('VOCABULARY CONTENT: ', count_vector.vocabulary_)


COUNT VECTORIZER
VOCABULARY SIZE:  1000
VOCABULARY CONTENT:  {'like': 499, 'person': 638, 'right': 715, 'keep': 465, 'fall': 293, 'asleep': 49, 'suppos': 835, 'awak': 53, 'miss': 559, 'way': 941, 'school': 735, 'back': 61, 'later': 483, 'forget': 331, 'someth': 786, 'morn': 571, 'ran': 692, 'warm': 935, 'shop': 760, 'x': 980, 'happi': 391, 'good': 363, 'day': 214, 'fuck': 342, 'hate': 393, 'run': 722, 'special': 802, 'k': 464, 'garden': 348, 'thank': 857, 'super': 833, 'laptop': 480, 'screen': 736, 'excit': 280, 'week': 947, 'u': 905, 'go': 357, 'away': 54, 'stand': 807, 'next': 590, 'start': 810, 'video': 923, 'edit': 256, 'first': 319, 'movi': 574, 'ad': 6, 'updat': 910, 'know': 475, 'work': 969, 'pay': 631, 'though': 862, 'mother': 572, 'total': 882, 'finish': 317, 'til': 869, 'bad': 62, 'news': 589, 'graduat': 370, 'think': 860, 'im': 437, 'feel': 304, 'well': 951, 'tire': 873, 'scare': 733, 'cuz': 203, 'wake': 930, 'sick': 767, 'lose': 515, 'follow': 328, 'wonder': 965, 'end': 262

## Using TfidfVectorizer

In [352]:
# characters level tf-idf
tfidf_vect_ngram = TfidfVectorizer(max_df=0.90, min_df=2, analyzer='word', preprocessor=lambda x: x, tokenizer=lambda x: x,
                                   ngram_range=(1,2), max_features=1000)
tfidf_vect_ngram.fit(X_train)

X_train_tfidf_ngram =  tfidf_vect_ngram.transform(X_train) 
X_test_tfidf_ngram =  tfidf_vect_ngram.transform(X_test) 




In [353]:
# TfidfVectorizer also creates a vocabulary. Checking Vocabulary
print('TF IDF VECTORIZER')
print('VOCABULARY SIZE: ', len(tfidf_vect_ngram.vocabulary_))
print('VOCABULARY CONTENT: ', tfidf_vect_ngram.vocabulary_)


TF IDF VECTORIZER
VOCABULARY SIZE:  1000
VOCABULARY CONTENT:  {'like': 500, 'person': 646, 'right': 719, 'keep': 464, 'fall': 272, 'asleep': 45, 'suppos': 830, 'awak': 49, 'miss': 565, 'way': 939, 'school': 736, 'back': 57, 'later': 483, 'forget': 306, 'someth': 784, 'morn': 576, 'ran': 694, 'warm': 932, 'shop': 758, 'x': 980, 'happi': 390, 'good': 354, 'day': 201, 'good day': 355, 'fuck': 318, 'hate': 395, 'run': 725, 'special': 799, 'k': 463, 'garden': 324, 'thank': 848, 'super': 828, 'laptop': 478, 'screen': 737, 'excit': 259, 'week': 945, 'u': 897, 'go': 337, 'away': 50, 'stand': 803, 'next': 599, 'start': 807, 'video': 914, 'first': 296, 'movi': 580, 'ad': 4, 'updat': 902, 'know': 473, 'work': 966, 'pay': 641, 'though': 856, 'mother': 577, 'happi mother': 392, 'mother day': 578, 'total': 876, 'finish': 295, 'til': 863, 'bad': 60, 'news': 598, 'graduat': 368, 'think': 854, 'im': 438, 'feel': 279, 'well': 949, 'tire': 867, 'scare': 735, 'cuz': 192, 'wake': 923, 'sick': 765, 'lose': 

In [354]:
# Transform the training data
X_train_bow = count_vector.transform(X_train).toarray()

# Transform the testing data
X_test_bow = count_vector.transform(X_test).toarray()


# NORMALIZING

In [355]:
# Normalize BoW features in training and test set
X_train_bow = pr.normalize(X_train_bow, axis=1)
X_test_bow  = pr.normalize(X_test_bow, axis=1)

X_train_tfidf_ngram = pr.normalize(X_train_tfidf_ngram, axis=1)
X_test_tfidf_ngram  = pr.normalize(X_test_tfidf_ngram, axis=1)

# MODELING

- Gradient Boosting
- Random Forest
- K Nearest Neighbors

In [357]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier



In [358]:
# GRADIENT BOOSTING FOR COUNT VECTORIZER
xgboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0).fit(X_train_bow, y_train)

print('GRADIENT BOOSTING FOR COUNT VECTORIZER')
print(xgboost.score(X_test_bow, y_test))

GRADIENT BOOSTING FOR COUNT VECTORIZER
0.6675455116133082


In [362]:
# GRADIENT BOOSTING FOR TF-IDF
xgboost_tf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=13).fit(X_train_tfidf_ngram, y_train)

print('GRADIENT BOOSTING FOR TF-IDF')
print(xgboost_tf.score(X_test_tfidf_ngram, y_test))

GRADIENT BOOSTING FOR TF-IDF
0.6843691148775894


In [363]:
# RANDOM FOREST FOR COUNT VECTORIZER
Ran_For = RandomForestClassifier(max_depth=2, n_estimators=100, criterion='entropy')
Ran_For.fit(X_train_bow, y_train)

print('RANDOM FOREST FOR COUNT VECTORIZER')
print(Ran_For.score(X_test_bow, y_test))

RANDOM FOREST FOR COUNT VECTORIZER
0.6288763339610797


In [364]:
# RANDOM FOREST FOR TF-IDF
Ran_For_tf = RandomForestClassifier(max_depth=2, n_estimators=100, criterion='entropy')
Ran_For_tf.fit(X_train_tfidf_ngram, y_train)

print('RANDOM FOREST FOR TF-IDF')
print(Ran_For_tf.score(X_test_tfidf_ngram, y_test))

RANDOM FOREST FOR TF-IDF
0.6018832391713748


In [365]:
# KNN FOR COUNT VECTORIZER
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train_bow, y_train)

print('KNN FOR COUNT VECTORIZER')
print(neigh.score(X_test_bow, y_test))

KNN FOR COUNT VECTORIZER
0.5249215317011927


### Comparizon

In [372]:
sel = 6
print('Test    -',y_test[sel:sel+1],'\n')

print('XGB  CV -',Ran_For.predict(X_test_bow[sel:sel+1]))
print('RaFo CV -',Ran_For.predict(X_test_bow[sel:sel+1]))
print('K NN CV -',neigh.predict(X_test_bow[sel:sel+1]))

print('XGB  TF -',Ran_For.predict(X_test_bow[sel:sel+1]))
print('RaFo TF -',Ran_For.predict(X_test_bow[sel:sel+1]))
print('K NN TF -',neigh.predict(X_test_bow[sel:sel+1]))

Test    - [0] 

XGB  CV - [1]
RaFo CV - [1]
K NN CV - [0]
XGB  TF - [1]
RaFo TF - [1]
K NN TF - [0]


{0: 5469, 1: 5334, 2: 5421}

0.5471167369901547

# Code to Save Models

In [373]:
# save the model to disk
filename = './saved_models/XGBoost_tf_Generic.sav'
pickle.dump(xgboost_tf, open(filename, 'wb'))



In [None]:
# some time later...
 
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)