# LOADING DEPENDENCIES

## <span style="color:red"> MUST RUN ALWAYS</span>


In [91]:
# Data Manipulation
import pandas as pd
import numpy as np
import random

# Checkpoints
import pickle
from tqdm.notebook import trange, tqdm

# Preprocessing
import re    # RegEx for removing non-letter characters
import nltk  #natural language processing
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import sklearn.preprocessing as pr


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliorenteria/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# LOADING DATASET

In [89]:
file_name = 'airline_tweets.csv'
#file_name = 'covid_tweets.csv'
#file_name = 'generic_tweets.csv'

data = pd.read_csv(file_name)


In [137]:
# Asinging a random number to each observation for future sampling
np.random.seed(11)
data['rand'] = np.random.random_sample(size=(data.shape[0]))

In [138]:
data.head()

Unnamed: 0,text,sentiment,rand
0,@VirginAmerica What @dhepburn said.,0,0.18027
1,@VirginAmerica plus you've added commercials t...,1,0.019475
2,@VirginAmerica I didn't today... Must mean I n...,0,0.463219
3,@VirginAmerica it's really aggressive to blast...,-1,0.724934
4,@VirginAmerica and it's a really big bad thing...,-1,0.420204


In [139]:
# Sampling the min ammount from the other to to obtain a balanced dataset

samp_neg = min(data.sentiment.value_counts())/data.sentiment.value_counts()[-1]
samp_neu = min(data.sentiment.value_counts())/data.sentiment.value_counts()[0]
samp_pos = min(data.sentiment.value_counts())/data.sentiment.value_counts()[1]

print('POSITIVE - Total: ',data.sentiment.value_counts()[-1], ', Proportion: ',samp_neg)
print('POSITIVE - Total: ',data.sentiment.value_counts()[0], ', Proportion: ',samp_neu)
print('POSITIVE - Total: ',data.sentiment.value_counts()[1], ', Proportion: ',samp_pos)

POSITIVE - Total:  9178 , Proportion:  0.2574634996731314
POSITIVE - Total:  3099 , Proportion:  0.7625040335592127
POSITIVE - Total:  2363 , Proportion:  1.0


In [144]:
# Concatenating the sampled datasets for Negative, Neutral and Positive
frames = [data.loc[(data['sentiment'] == -1) & (data['rand'] <= samp_neg)],
data.loc[(data['sentiment'] == 0) & (data['rand'] <= samp_neu)],
data.loc[(data['sentiment'] == 1) & (data['rand'] <= samp_pos)]]

data = pd.concat(frames)
data = data.reset_index(drop=True)
print(data.sentiment.value_counts())
data.head()

-1    2397
 1    2363
 0    2350
Name: sentiment, dtype: int64


Unnamed: 0,text,sentiment,rand
0,@VirginAmerica I flew from NYC to SFO last we...,-1,0.116737
1,@VirginAmerica you guys messed up my seating.....,-1,0.111661
2,@VirginAmerica status match program. I applie...,-1,0.083953
3,@VirginAmerica amazing to me that we can't get...,-1,0.055674
4,@VirginAmerica hi! i'm so excited about your $...,-1,0.062888


# TOKENIZING

Separate tweets into lists of word (or components) that carry meanings.

Removing/converting the following elements:
- Uppercase
- Urls
- Simbols
- Numbers
- English Stopwords
- Stemming

In [54]:
# Function to format, tokenize and remove stopwords from tweets.
def tweet_to_words(tweet):
    ''' Convert tweet text into a sequence of words '''
    
    # convert to lowercase
    text = tweet.lower()
    # remove tweeter users and hashtags ( @xxx, #xxx )
    text = re.sub(r"[@#]\w+", " ", text)
    # remove https
    text = re.sub(r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)", " ", text)
    #text = " ".join(filter(lambda w: w.find("https://") == -1, text.split(" ")))
    # remove non letters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    #remove numbers
    text = re.sub(r"[0-9]*", "", text)
    # tokenize
    words = text.split()
    # remove stopwords
    words = [w for w in words if w not in stopwords.words("english")]
    # apply stemming
    words = [PorterStemmer().stem(w) for w in words]
    # return list
    return words


In [55]:
# Testing tweet_to_words Function
sel = 4
print("\nORIGINAL TWEET ->\n", data['text'][sel])
print("\nPROCESSED TWEET ->\n", tweet_to_words(data['text'][sel]))



ORIGINAL TWEET ->
 @VirginAmerica and it's a really big bad thing about it

PROCESSED TWEET ->
 ['realli', 'big', 'bad', 'thing']


In [56]:
# Apply tokenization to each tweet and store as X
X = []
for i in trange(len(data['text'])):
    X.append(tweet_to_words(data['text'][i]))

HBox(children=(FloatProgress(value=0.0, max=14640.0), HTML(value='')))




In [57]:
# Encode target labels
le = LabelEncoder()
Y = le.fit_transform(data['sentiment'])


In [58]:
# Apply tokenization to each tweet and store as X
# X = list(map(tweet_to_words, data['text']))


## 🚩 CHECKPOINT 

Saving **Tokenized List** as a Pickle File to retrieve latter and save memory and time.

In [59]:
# Saving X to a PICKLE to retrieve back latter:

pickle_dic = {'df_1245.csv':'1245_',
              'airline_tweets.csv':'airlines_',
              'covid_tweets.csv':'covid_',
              'generic_tweets.csv':'generic_'}

# Code to Save PICKLE
#with open(pickle_dic[file_name]+'X.pkl', 'wb') as f:
#    pickle.dump(X, f)
#with open(pickle_dic[file_name]+'Y.pkl', 'wb') as f:
#    pickle.dump(Y, f)


In [3]:
# Code to Retrieve PICKLE
with open(pickle_dic[file_name]+'X.pkl', 'rb') as f:
    X = pickle.load(f)
with open(pickle_dic[file_name]+'Y.pkl', 'rb') as f:
    Y = pickle.load(f)


# TRAIN TEST SPLIT

In [74]:
# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=13)

print('Number of tweets in the total set :    {}'.format(len(X)))
print('Number of tweets in the training set : {}'.format(len(X_train)))
print('Number of tweets in the testing set :  {}'.format(len(X_test)))


Number of tweets in the total set :    14640
Number of tweets in the training set : 10248
Number of tweets in the testing set :  4392


In [76]:

unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 6411, 1: 2172, 2: 1665}

# BAG OF WORDS


In [68]:
# Convert a collection of text documents to a matrix of token counts
vocabulary_size = 100    # Rounded up from 17422

# Generating Bag of Words
# Tweets have already been preprocessed hence dummy function will be passed in to preprocessor & tokenizer step
count_vector = CountVectorizer(max_features=vocabulary_size, preprocessor=lambda x: x, tokenizer=lambda x: x)
count_vector.fit(X_train)




CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=100, min_df=1,
                ngram_range=(1, 1),
                preprocessor=<function <lambda> at 0x7f86f6c680d0>,
                stop_words=None, strip_accents=None,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<function <lambda> at 0x7f86f6c68158>,
                vocabulary=None)

In [69]:
# CountVectorizer creates a vocabulary. Checking Vocabulary
print('VOCABULARY SIZE: ', len(count_vector.vocabulary_))
print('VOCABULARY CONTENT: ', count_vector.vocabulary_)


VOCABULARY SIZE:  100
VOCABULARY CONTENT:  {'pleas': 65, 'connect': 14, 'plane': 64, 'book': 9, 'problem': 66, 'travel': 85, 'due': 21, 'passeng': 61, 'make': 51, 'check': 13, 'get': 30, 'back': 6, 'delay': 19, 'flight': 27, 'would': 99, 'like': 47, 'email': 22, 'lost': 49, 'today': 82, 'fli': 26, 'even': 23, 'gate': 29, 'us': 90, 'tri': 86, 'guy': 35, 'time': 81, 'want': 94, 'custom': 17, 'servic': 74, 'could': 15, 'bag': 7, 'amp': 4, 'use': 91, 'ticket': 80, 'miss': 54, 'cancel': 11, 'day': 18, 'got': 33, 'thank': 79, 'unit': 89, 'agent': 1, 'help': 36, 'told': 83, 'go': 31, 'never': 56, 'wait': 93, 'know': 43, 'see': 73, 'home': 38, 'aa': 0, 'seat': 72, 'call': 10, 'way': 95, 'chang': 12, 'phone': 63, 'worst': 98, 'sit': 75, 'love': 50, 'board': 8, 'good': 32, 'respons': 70, 'realli': 67, 'take': 78, 'rebook': 68, 'tomorrow': 84, 'airport': 3, 'hour': 40, 'new': 57, 'airlin': 2, 'flightl': 28, 'still': 77, 'number': 59, 'reserv': 69, 'one': 60, 'trip': 87, 'issu': 42, 'crew': 16, 'h

In [70]:
# Transform the training data
X_train_bow = count_vector.transform(X_train).toarray()

# Transform the testing data
X_test_bow = count_vector.transform(X_test).toarray()


## Experiments from here on out


In [71]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier


In [72]:
Ran_For = RandomForestClassifier(max_depth=2, n_estimators=100, criterion='entropy')
Ran_For.fit(X_train_bow, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [73]:
Ran_For.predict(X_test_bow[0:500])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [33]:

unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 43031, 1: 53394, 2: 71051}

In [None]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train_bow, y_train)

In [None]:
neigh.score(X_test_bow, y_test)

In [None]:
neigh.predict(X_test_bow[0:10])

In [None]:
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))
 
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)