<a href="https://colab.research.google.com/github/lijeshshetty/deep-learning-models/blob/master/TwitterSentimentAnalysis___Lijesh_Shetty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        # One path per line, exactly as individual print() calls would emit them.
        print(*(os.path.join(dirname, name) for name in filenames), sep="\n")

# Any results you write to the current directory are saved as output.

In [0]:
# lets install some package
#!pip install gensim --upgrade

import time
import logging
# Word2vec
import gensim

import matplotlib.pyplot as plt
%matplotlib inline


# Configure logging: INFO level, "timestamp : level : message" line layout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# DATASET
# Encoding and column names used to read the Sentiment140 training CSV;
# the column names are supplied explicitly through `names`.
DATASET_ENCODING = "ISO-8859-1"
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]

df_tweets = pd.read_csv(
    '/kaggle/input//sentiment140/training.1600000.processed.noemoticon.csv',
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)
df_tweets.head()


FileNotFoundError: ignored

In [0]:
# Target encoding: 0 is a negative (sad) tweet and 4 is a positive one.
# Display the number of tweets loaded (length of the `text` column).
len(df_tweets.text)

Use NLTK's `TweetTokenizer` to tokenize the tweets. Let's strip the @handles and shorten runs of repeated characters (`reduce_len=True`).

In [0]:
from nltk.tokenize import TweetTokenizer

# Tokenize every tweet with NLTK's tweet-aware tokenizer.
#   strip_handles=True  -> drop @user mentions
#   reduce_len=True     -> shorten runs of repeated characters
#   preserve_case=False -> lowercase the tokens so they can match NLTK's lowercase
#                          stop-word list and the lowercased words produced by the
#                          Keras Tokenizer that later drives the embedding lookup
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
sentence_words = [tknzr.tokenize(sentence) for sentence in df_tweets.text]



In [0]:
# Peek at the token list produced for the first tweet.
sentence_words[0]

In [0]:
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Fetch the NLTK stop-word corpus, then create an English Snowball stemmer
# and the English stop-word list.
nltk.download('stopwords')
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")

Remove the stop words from the tokenized sentences. (The stemmer created above is not applied in this step.)

In [0]:
# Remove the stop words from every tokenized tweet.
# A new list is built per tweet instead of calling list.remove() while iterating:
# removing an item during iteration shifts the remaining items left, so the token
# right after each removed word is never examined and consecutive stop words survive.
# Building new lists also leaves `sentence_words` untouched, so this cell can be re-run.
# NOTE: no stemming is applied here; `stemmer` is created earlier but not used.
stop_word_set = set(stop_words)  # set membership is O(1); the list scan was O(len(stop_words))
sent_words = [
    [word for word in words_array_in_sentence if word not in stop_word_set]
    for words_array_in_sentence in sentence_words
]

In [0]:
# Peek at the first tweet's tokens after stop-word removal.
sent_words[0]

In [0]:
 # build the sentences again, and put it back in df_tweets.text, and then do train & test split using sklearn api's
# Join each filtered token list back into a single space-separated string and
# write the result into the `text` column of the tweets frame.
new_sentences = [" ".join(word_array) for word_array in sent_words]

df_tweets["text"] = new_sentences

In [0]:
# Show the rebuilt text stored under index label 0.
df_tweets.text[0]

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for testing; random_state pins the shuffle.
train_X, test_X, train_y, test_y = train_test_split(
    df_tweets.text,
    df_tweets.target,
    test_size=0.2,
    random_state=42,
)
# Report the size of the test inputs and of the training labels.
for split_size in (len(test_X), len(train_y)):
    print(split_size)

Let's convert the train/test splits to NumPy arrays. (The text itself is turned into numeric sequences later, by the Keras tokenizer.)

In [0]:
# Re-bind each split to a NumPy array built from it.
train_X, train_y = np.array(train_X), np.array(train_y)
test_X, test_y = np.array(test_X), np.array(test_y)

In [0]:
# Next we train our own Word2Vec model on the tweets and use it to generate embeddings for our dataset.

import gensim

# WORD2VEC hyper-parameters
W2V_SIZE = 300  # `size` passed to Word2Vec; also the second dimension of embedding_matrix
W2V_WINDOW = 7  # `window` passed to Word2Vec
W2V_EPOCH = 32  # `epochs` passed to w2v_model.train
W2V_MIN_COUNT = 10  # `min_count` passed to Word2Vec

# KERAS training settings
SEQUENCE_LENGTH = 300  # `maxlen` for pad_sequences and `input_length` of the Embedding layer
EPOCHS = 8  # `epochs` passed to model.fit
BATCH_SIZE = 1024  # `batch_size` for model.fit and model.evaluate

In [0]:
# Build the word2vec model 
# use the words to build the vocab in the word2vec model.
%time

w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=8)
w2v_model.build_vocab(sent_words)

In [0]:
# Grab the Word2Vec vocabulary and report how many words it contains.
w2v_vocab = w2v_model.wv.vocab
words = w2v_vocab.keys()
vocab_size = len(w2v_vocab)
print("Vocab size", vocab_size)

In [0]:
# Preview a handful of vocabulary words instead of dumping the entire vocabulary
# into the cell output.
list(words)[:20]

In [0]:
%%time
# Train the Word2Vec model on the tokenized tweets for W2V_EPOCH epochs;
# total_examples is the number of token lists in sent_words.
w2v_model.train(sent_words, total_examples=len(sent_words), epochs=W2V_EPOCH)

In [0]:
# Save the Word2Vec model to 'my_word2vec.bin' (a relative path, so it lands in
# the current working directory).
w2v_model.save('my_word2vec.bin')

In [0]:
# load the model
#loaded_model = gensim.models.word2vec.Word2Vec.load('my_word2vec.bin')

In [0]:
# Query nearest neighbours through the KeyedVectors (`.wv`): calling most_similar
# on the model object itself is deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar('love')

In [0]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping


In [0]:
%%time
# Fit a Keras Tokenizer (default settings) on the training texts only; its
# word_index is used below for the vocabulary size and the embedding matrix.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_X)

In [0]:
# Vocabulary size for the embedding: one row per tokenizer word plus one extra row
# (presumably because Keras word indices start at 1, leaving row 0 for padding — confirm).
# Note: this re-binds `vocab_size`, which earlier held the Word2Vec vocabulary size.
num_indexed_words = len(tokenizer.word_index)
vocab_size = num_indexed_words + 1
print("Total words", vocab_size)

In [0]:
%%time
def _encode_and_pad(texts):
    """Turn texts into integer sequences and pad them to SEQUENCE_LENGTH."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=SEQUENCE_LENGTH)


x_train = _encode_and_pad(train_X)
x_test = _encode_and_pad(test_X)

In [0]:
# Convert the dataset's labels (0 = negative, 4 = positive) into 0/1 column vectors.
# The model ends in a single sigmoid unit trained with binary_crossentropy, which
# expects targets in {0, 1}; feeding the raw value 4 makes loss and accuracy meaningless.
y_train = (train_y.reshape(-1, 1) == 4).astype("float32")
y_test = (test_y.reshape(-1, 1) == 4).astype("float32")
# Show both shapes together (a cell only displays its last expression).
y_train.shape, x_train.shape

In [0]:
# Row i of the matrix receives the Word2Vec vector of the tokenizer word whose
# index is i; words missing from the Word2Vec model keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
known_vectors = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token not in known_vectors:
        continue
    embedding_matrix[row] = known_vectors[token]
print(embedding_matrix.shape)

In [0]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False).
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

In [0]:
# Network: frozen embeddings -> dropout -> dense(64, relu) -> LSTM(100) -> sigmoid output.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    Dense(64, activation='relu'),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [0]:
# Compile with the Adam optimizer, binary cross-entropy loss and accuracy tracking.
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Training callbacks: reduce the learning rate when val_loss stops improving, and
# stop early when the monitored validation accuracy has not improved by 1e-4 for 5 epochs.
# NOTE(review): newer Keras releases appear to log this metric as 'val_accuracy' when the
# model is compiled with metrics=['accuracy'] — confirm 'val_acc' exists in the installed version.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0)
early_stopper = EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)
callbacks = [lr_scheduler, early_stopper]

In [0]:
# Sanity-check the label array's shape before training.
y_train.shape

In [0]:
%%time
# Train the model, holding back 10% of the training data for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

In [0]:
%%time
# Score the model on the held-out test split (x_test / y_test). Evaluating on
# x_train / y_train would only measure the fit on data the model was trained on.
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])