In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import nltk
# Fetch the NLTK stop-word corpus; stopwords.words('english') is read from it further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the airline-tweets dataset from its hosted CSV.
tweets = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/Tweets.csv')
# Keep only the tweet text and its sentiment label. The explicit .copy() makes the
# two-column frame independent of the full one, so assigning to its columns later
# in the notebook does not raise SettingWithCopyWarning.
tweets = tweets[['text', 'airline_sentiment']].copy()
tweets.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [None]:
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer

# One shared Porter stemmer instance; stemList() calls its .stem() on every word.
stemmer = PorterStemmer()

In [None]:
def remove_stopwords(input_text, stopwords_list=None, whitelist=("n't", "not", "no")):
    """Drop stop words and one-character tokens from a whitespace-separated text.

    Parameters
    ----------
    input_text : str
        Raw text; it is split on whitespace.
    stopwords_list : iterable of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.
    whitelist : iterable of str, optional
        Words that are always kept even if they appear in ``stopwords_list``;
        the defaults are negations that may carry sentiment.

    Returns
    -------
    str
        The surviving tokens (original casing preserved) joined by single spaces.

    Notes
    -----
    Matching is case-insensitive. The NLTK list is all lowercase, so a
    case-sensitive comparison would let capitalised stop words such as
    "What" or "Any" slip through.
    """
    if stopwords_list is None:
        stopwords_list = stopwords.words('english')
    # A set gives O(1) membership tests; whitelisted words are simply never removable.
    removable = {word.lower() for word in stopwords_list} - {word.lower() for word in whitelist}
    kept = [
        word for word in input_text.split()
        if len(word) > 1 and word.lower() not in removable
    ]
    return " ".join(kept)
    
def remove_mentions(input_text):
    """Return ``input_text`` with every ``@`` + word-characters run deleted.

    Only the mention itself is removed; surrounding whitespace is left as is.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', input_text)
       
# Clean every tweet in one pass: stop words are dropped first, then @mentions are stripped.
tweets['text'] = tweets['text'].apply(lambda raw: remove_mentions(remove_stopwords(raw)))

In [None]:
def stemList(wordList):
    """Return a new list with the stem of each word in ``wordList``, in the same order.

    Stemming is delegated to the module-level ``stemmer`` object.
    """
    return [stemmer.stem(token) for token in wordList]

def normalize(pref):
    """Lower-case, tokenise and stem a text, returning the stems joined by single spaces.

    The text is split on whitespace and common punctuation (brackets, quotes,
    hyphen/en dash, sentence punctuation). Empty pieces produced by adjacent
    delimiters (e.g. "..." or ". ") are discarded *before* joining, so the
    result never has leading, trailing or repeated spaces. A single
    ``replace('  ', ' ')`` pass cannot guarantee that for runs of three or
    more delimiters.

    Parameters
    ----------
    pref : str
        Text to normalise.

    Returns
    -------
    str
        Space-separated stems; an empty string if no token remains.
    """
    word_delimiters = u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013 ]'
    pieces = re.split(word_delimiters, pref.lower())
    # Trim stray whitespace the delimiter class does not cover (e.g. '\r'),
    # then drop the pieces that are now empty.
    tokens = [piece.strip() for piece in pieces]
    tokens = [token for token in tokens if token]
    return ' '.join(stemList(tokens))

In [None]:
# Store the lower-cased, stemmed form of each cleaned tweet alongside the original text.
tweets['normalized'] = tweets['text'].map(normalize)

In [None]:
tweets

Unnamed: 0,text,airline_sentiment,normalized
0,What said.,neutral,what said
1,plus added commercials experience... tacky.,positive,plu ad commerci experi tacki
2,today... Must mean need take another trip!,neutral,today must mean need take anoth trip
3,"really aggressive blast obnoxious ""entertainm...",negative,realli aggress blast obnoxi entertain guest f...
4,really big bad thing,negative,realli big bad thing
...,...,...,...
14635,thank got different flight Chicago.,positive,thank got differ flight chicago
14636,leaving 20 minutes Late Flight. No warnings c...,negative,leav 20 minut late flight no warn commun 15 m...
14637,Please bring American Airlines #BlackBerry10,neutral,pleas bring american airlin #blackberry10
14638,"money, change flight, answer phones! Any sugg...",negative,money chang flight answer phone ani suggest m...


In [None]:
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot

# Number of distinct whitespace-separated tokens across all normalized tweets.
# A set comprehension is linear; Series.sum() over lists concatenates them
# pairwise, which is quadratic in the number of tweets.
vocab_size = len({token for text in tweets.normalized for token in text.split()})

# one_hot() hashes words with Python's built-in hash(), which is salted per
# process, so the integer codes would change on every kernel restart.
# hashing_trick with 'md5' is the same encoding with a stable hash function.
# NOTE: this is still a hashing scheme, so distinct words can share an index.
tweets['encoded'] = tweets.normalized.apply(
    lambda text: hashing_trick(text, vocab_size, hash_function='md5')
)

In [None]:
tweets['encoded']

0                                              [494, 8899]
1                         [1722, 1546, 12518, 2587, 12862]
2              [6031, 9861, 2390, 1290, 1594, 8708, 11329]
3        [11093, 674, 10503, 2902, 4683, 4654, 3161, 11...
4                                [11093, 1310, 6434, 6741]
                               ...                        
14635                     [10858, 9277, 5447, 11638, 4584]
14636    [7409, 7492, 6092, 5155, 11638, 3438, 10084, 6...
14637                         [2349, 468, 205, 6509, 1700]
14638    [12095, 5182, 11638, 4435, 5913, 2026, 11406, ...
14639    [10770, 1290, 9087, 8165, 2090, 11932, 11638, ...
Name: encoded, Length: 14640, dtype: object

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad the variable-length index lists to one common length so they stack into a
# single 2-D integer array (the displayed result shows zeros filled in at the front).
independent_vars = pad_sequences(tweets.encoded.values)
independent_vars

array([[    0,     0,     0, ...,     0,   494,  8899],
       [    0,     0,     0, ..., 12518,  2587, 12862],
       [    0,     0,     0, ...,  1594,  8708, 11329],
       ...,
       [    0,     0,     0, ...,   205,  6509,  1700],
       [    0,     0,     0, ..., 11406, 10089,  7902],
       [    0,     0,     0, ...,  3759, 11932, 11638]], dtype=int32)

In [None]:
independent_vars.shape

(14640, 27)

In [None]:
tweets.head()

Unnamed: 0,text,airline_sentiment,normalized,encoded
0,What said.,neutral,what said,"[494, 8899]"
1,plus added commercials experience... tacky.,positive,plu ad commerci experi tacki,"[1722, 1546, 12518, 2587, 12862]"
2,today... Must mean need take another trip!,neutral,today must mean need take anoth trip,"[6031, 9861, 2390, 1290, 1594, 8708, 11329]"
3,"really aggressive blast obnoxious ""entertainm...",negative,realli aggress blast obnoxi entertain guest f...,"[11093, 674, 10503, 2902, 4683, 4654, 3161, 11..."
4,really big bad thing,negative,realli big bad thing,"[11093, 1310, 6434, 6741]"


In [None]:
from tensorflow.keras.utils import to_categorical
# One indicator column per sentiment class. The dtype is pinned so the labels
# are numeric regardless of the pandas version (pandas >= 2.0 defaults to bool).
target = pd.get_dummies(tweets.airline_sentiment, dtype='float32')

In [None]:
target

Unnamed: 0,negative,neutral,positive
0,0,1,0
1,0,0,1
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
14635,0,0,1
14636,1,0,0
14637,0,1,0
14638,1,0,0


In [None]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    independent_vars,
    target,
    test_size=0.2,
    random_state=21,
)

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU

In [None]:
target.shape

(14640, 3)

In [None]:
# Size of the Embedding lookup table: it must be larger than every index the
# model can receive. Use the whole encoded matrix rather than X_train alone,
# otherwise an index that occurs only in X_test would fall outside the table
# during validation.
max_words = int(np.max(independent_vars)) + 1
# Number of output classes = number of indicator columns in the target frame.
num_class = target.shape[1]
# Padded sequence length (number of columns in the encoded matrix).
max_len = X_train.shape[1]

In [None]:
# Embedding -> LSTM -> dense hidden layer -> softmax over the sentiment classes.
model = Sequential([
    Embedding(max_words, 100, input_length=max_len),  # 100-dimensional vector per index
    LSTM(256),
    Dense(512, activation='relu'),
    Dense(num_class, activation='softmax'),           # one probability per class
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 27, 100)           1304400   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               365568    
_________________________________________________________________
dense (Dense)                (None, 512)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 1539      
Total params: 1,803,091
Trainable params: 1,803,091
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Multi-class setup: the targets are one-hot rows, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

epochs = 50
batch_size = 128

# Keep the returned History object so per-epoch loss/accuracy can be inspected
# afterwards via history.history, instead of leaving only its repr as cell output.
# The held-out split is passed as validation data to track generalisation per epoch.
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=batch_size,
    epochs=epochs,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7efcb8eff748>