# Sentiment Analysis - NLP
Content:\
target: the polarity of the tweet (0 = negative, 4 = positive)\
ids: The id of the tweet ( 2087)\
date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)\
flag: The query (lyx). If there is no query, then this value is NO_QUERY.\
user: the user that tweeted (robotickilldozr)\
text: the text of the tweet (Lyx is cool)

In [48]:
#import libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import matplotlib as plt
%matplotlib inline
import re, time
import gensim
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kparekh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
#import data
data = pd.read_csv('C:/Users/kparekh/Downloads/training.1600000.processed.noemoticon.csv', header= None, names=['target', 'ids', 'date', 'flag', 'user', 'text'], encoding='latin-1')
data = data.sample(n=100000, random_state = 123)
data.head()

Unnamed: 0,target,ids,date,flag,user,text
448282,0,2068921155,Sun Jun 07 14:56:42 PDT 2009,NO_QUERY,smiley_sophie,my arm still hurts from when i pulled it yeste...
1475261,4,2065871668,Sun Jun 07 09:27:21 PDT 2009,NO_QUERY,ImmaChocoholic,I have so much to do outside! Been looking at ...
132529,0,1835774749,Mon May 18 06:43:27 PDT 2009,NO_QUERY,drmomentum,"@AbsolutSara Yes, I knew about the clusterfark..."
182348,0,1967121891,Fri May 29 19:00:46 PDT 2009,NO_QUERY,sweetsheilx,Just woke up and i feel relieved Haha now i ha...
907614,4,1695846172,Mon May 04 07:04:29 PDT 2009,NO_QUERY,monmariej,LOVING the hot weather forecast for the rest o...


In [32]:
#drop columns we don't need
data = data.drop(['ids', 'date', 'flag', 'user'], 1)
data.shape

(100000, 2)

In [33]:
data['length'] = data.text.str.len()
data.length.describe().round()

count    100000.0
mean         74.0
std          36.0
min           7.0
25%          44.0
50%          69.0
75%         104.0
max         270.0
Name: length, dtype: float64

In [34]:
data[data.target==0].text.head()

448282    my arm still hurts from when i pulled it yeste...
132529    @AbsolutSara Yes, I knew about the clusterfark...
182348    Just woke up and i feel relieved Haha now i ha...
12387     morning folks... here i am bored at work again...
279890    @princessofmars I am lost. Please help me find...
Name: text, dtype: object

In [35]:
data[data.target==4].text.head()

1475261    I have so much to do outside! Been looking at ...
907614     LOVING the hot weather forecast for the rest o...
1338189    Having a productive morning - then a 90-minute...
926369     @JimFoss Love it, used to do the same thing at...
1302919    leaving the nest while i prepare din-din. twee...
Name: text, dtype: object

In [36]:
data.target.value_counts(normalize=True)

0    0.50078
4    0.49922
Name: target, dtype: float64

In [37]:
#pre-process text
stop_words = stopwords.words("english")
def preprocess(text):
    text = re.sub('@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+', ' ', str(text).lower().strip())
    tokens = []
    for token in text.split():
        if token not in stop_words:
            tokens.append(token)
    return " ".join(tokens)

In [38]:
%%time
data['text_cleaned'] = data.text.apply(preprocess)

Wall time: 2.37 s


In [39]:
#split data into train and test
data_train, data_test = train_test_split(data, test_size=0.2, random_state=1)
print("Train size:", len(data_train))
print("Test size:", len(data_test))

Train size: 80000
Test size: 20000


Because we have enough volume of data, we can attempt a neural network with word2vec embeddings - words that are related to each other are mapped to points that are closer to each other in a high dimensional space. 

### Word2Vec

In [40]:
#train word2vec embeddings
text_tokens = [text.split() for text in data_train.text_cleaned]
text_tokens

[['oh', 'hollys', 'baby', 'shower', 'actually', 'fun'],
 ['excited', 'see', 'samantha', 'amp', 'denise'],
 ['reality', 'sinking'],
 ['really',
  'craving',
  'cold',
  'fresh',
  'juicy',
  'mixture',
  'fruit',
  'right',
  'abt',
  'still',
  'traumatized',
  'rotten',
  'watermelon'],
 ['im', 'proud', 'well', 'done', 'glomp'],
 ['everyone'],
 ['shall',
  'sad',
  'leave',
  'uncle',
  'cats',
  'though',
  'insha',
  'allah',
  'cute'],
 ['good',
  'morning',
  'wonderful',
  'day',
  'r',
  'feeling',
  'remember',
  'come',
  'season',
  'thankful',
  'amp',
  'smile'],
 ['attempting', 'stat', '200', 'stat', '480', 'homework'],
 ['phoneless', 'next', 'couple', 'hours', 'one', 'blame'],
 ['let',
  'see',
  'done',
  'postage',
  'malaysia',
  'phil',
  'may',
  'cheap',
  'u',
  'gather',
  'number',
  'orders'],
 ['lkhad',
  'first',
  'starbucks',
  'today',
  'caramel',
  'frappuccino',
  'absolutely',
  'loved',
  'letterkenny',
  'needs',
  'get',
  'starbucks',
  'closest',
 

In [41]:
%%time
w2v_model = gensim.models.Word2Vec(text_tokens, size=100, window = 5, seed = 1, min_count = 5, workers = 8)
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 9217
Wall time: 15min 30s


In [42]:
w2v_model.most_similar("love")

  """Entry point for launching an IPython kernel.


[('awesome', 0.8805774450302124),
 ('plz', 0.8576584458351135),
 ('rock', 0.8539444208145142),
 ('miley', 0.8483649492263794),
 ('cute', 0.8483544588088989),
 ('amazing', 0.8472157716751099),
 ('luv', 0.8441596031188965),
 ('tom', 0.8394282460212708),
 ('loved', 0.8328625559806824),
 ('w8', 0.832410454750061)]

In [43]:
#tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_train.text_cleaned)
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 50405


In [44]:
#pad sequences to a specified length
x_train = pad_sequences(tokenizer.texts_to_sequences(data_train.text), maxlen=300)
x_test = pad_sequences(tokenizer.texts_to_sequences(data_test.text), maxlen=300)

In [49]:
#encode target
labels = data_train.target.unique().tolist()
encoder = LabelEncoder()
encoder.fit(data_train.target.tolist())

y_train = encoder.transform(data_train.target.tolist())
y_test = encoder.transform(data_test.target.tolist())

y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

print("x_train", x_train.shape)
print("y_train", y_train.shape)
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (80000, 300)
y_train (80000, 1)
x_test (20000, 300)
y_test (20000, 1)


In [51]:
#prepare an embedding layer to pass into the neural network
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
  if word in w2v_model.wv:
    embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(50405, 100)


In [53]:
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=300, trainable=False)

In [55]:
#build model
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 300, 100)          5040500   
_________________________________________________________________
dropout_2 (Dropout)          (None, 300, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 5,121,001
Trainable params: 80,501
Non-trainable params: 5,040,500
_________________________________________________________________


In [57]:
#compile model
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

In [58]:
#define callbacks for learning rate and early stopping
callbacks = [ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]

In [60]:
#fit model - for now we will only train on one epoch/ the model should be trained for longer for optimal performance
history = model.fit(x_train, y_train,
                    batch_size=512,
                    epochs=1,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

Train on 72000 samples, validate on 8000 samples
Epoch 1/1


In [63]:
#evaluate performance on test set
score = model.evaluate(x_test, y_test, batch_size=512)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])


ACCURACY: 0.66905
LOSS: 0.598604208946228


In [75]:
SENTIMENT_THRESHOLDS = (0.4, 0.7)
def decode_sentiment(score, include_neutral=True):
    if include_neutral:        
        label = 'NEUTRAL'
        if score <= SENTIMENT_THRESHOLDS[0]:
            label = 'NEGATIVE'
        elif score >= SENTIMENT_THRESHOLDS[1]:
            label = 'POSITIVE'

        return label
    else:
        return 'NEGATIVE' if score < 0.5 else 'POSITIVE'

In [76]:
def predict(text, include_neutral=True):
    start_at = time.time()
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=300)
    # Predict
    score = model.predict([x_test])[0]
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=True)

    return {"label": label, "score": float(score),
       "elapsed_time": time.time()-start_at}  

In [77]:
predict("I love cake")

{'label': 'POSITIVE',
 'score': 0.7171138525009155,
 'elapsed_time': 0.03290843963623047}

In [79]:
predict("I don't know anything")

{'label': 'NEGATIVE',
 'score': 0.3009055256843567,
 'elapsed_time': 0.026886701583862305}

**The above performance shows pretty good results with little data compared to what's required for neural networks. The performance can definetely be improved with more data, longer training, and optimized hyperparameters.**