In [2]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import re


In [3]:
# Load the Sentiment140 dump: no header row, decoded with the 'latin' codec.
df = pd.read_csv(
    '../input/sentiment140/training.1600000.processed.noemoticon.csv',
    header=None,
    encoding='latin',
)
df.head()

In [5]:
# Give the six positional columns readable names.
df.columns = [
    'sentiment',
    'id',
    'date',
    'query',
    'user_id',
    'text',
]
df.head(n=10)


In [6]:
# Keep only the label and the tweet text.
df = df.drop(columns=['id', 'date', 'query', 'user_id'])
df.head()


In [8]:
# Raw label value -> readable class name.
to_sentiment = {
    0: 'Negative',
    4: 'Positive',
}


def label_decoder(label):
    """Return the class name stored in ``to_sentiment`` for ``label``.

    Raises ``KeyError`` when ``label`` is not a key of ``to_sentiment``.
    """
    name = to_sentiment[label]
    return name
# Replace the numeric labels with their class names.
df.sentiment = df.sentiment.apply(label_decoder)
df.head()


In [11]:
import seaborn as sns
# Bar chart of how many rows carry each sentiment label.
plt.figure(dpi=100)
# Pass the column as `x=`: since seaborn 0.12 the only positional argument of
# countplot is `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x=df['sentiment'])

In [15]:
# A set makes the per-token `not in stop_words` check in preprocess() a hash
# lookup instead of a scan over the whole stop-word list.
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
# Raw string so `\S` reaches the regex engine as written; in a plain string it
# is an invalid escape sequence that newer Python versions warn about.
# Alternatives: "@..." tokens, "http:"/"https:" links, runs of non-alphanumerics.
text_cleaning_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [19]:
#text preprocessing
def preprocess(text, stem=False):
    """Clean one piece of text and return its kept tokens joined by spaces.

    ``text`` is stringified and lower-cased, every match of
    ``text_cleaning_re`` is replaced by a space, and the result is stripped.
    Tokens found in ``stop_words`` are dropped; the remaining tokens are
    passed through ``stemmer.stem`` when ``stem`` is true.
    """
    cleaned = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
    kept = (word for word in cleaned.split() if word not in stop_words)
    if stem:
        kept = map(stemmer.stem, kept)
    return ' '.join(kept)


In [21]:
# Clean every tweet (default stem=False).
df.text = df.text.apply(preprocess)
df.head()


In [22]:
#split the data
TRAIN_SIZE = 0.8         # fraction kept for training; the split uses 1 - TRAIN_SIZE as test_size
MAX_NB_WORDS = 100_000   # not referenced by the cells shown in this notebook
MAX_SEQ_LENGTH = 30      # maxlen for pad_sequences and input length of the model

In [25]:
# Fixed random_state keeps the split identical across runs.
train_data, test_data = train_test_split(
    df,
    test_size=1 - TRAIN_SIZE,
    random_state=5,
)
for split_name, split in (('training', train_data), ('testing', test_data)):
    print('Size of {} data'.format(split_name), len(split))

In [26]:
# Peek at the first 20 training rows.
train_data.head(n=20)

In [27]:
#tokenization
from keras.preprocessing.text import Tokenizer

# Build the vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data.text)

word_index = tokenizer.word_index
# Tokenizer numbers words from 1 (0 is never assigned to a word), so the
# largest index equals len(word_index). Any table indexed by these ids needs
# len(word_index) + 1 rows; without the +1 the last word falls outside both
# the embedding matrix and the Embedding layer.
vocab_size = len(word_index) + 1
print('VOCAB SIZE', vocab_size)

In [32]:
#padding and sequencing
from keras.preprocessing.sequence import pad_sequences

# Texts -> integer id sequences -> arrays padded/truncated to MAX_SEQ_LENGTH.
x_train = pad_sequences(
    tokenizer.texts_to_sequences(train_data.text),
    maxlen=MAX_SEQ_LENGTH,
)
x_test = pad_sequences(
    tokenizer.texts_to_sequences(test_data.text),
    maxlen=MAX_SEQ_LENGTH,
)

print('Training X shape', x_train.shape)
print('Testing X shape', x_test.shape)

In [35]:
# Fit the label encoder on the training labels, then encode both splits.
encoder = LabelEncoder().fit(train_data.sentiment.tolist())

y_train = encoder.transform(train_data.sentiment.tolist())
y_test = encoder.transform(test_data.sentiment.tolist())

for labels in (y_train, y_test):
    print(labels.shape)

In [39]:
# Reshape the label vectors into single-column 2-D arrays.
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

for labels in (y_train, y_test):
    print(labels.shape)

In [40]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


In [41]:
# Embedding file and training hyper-parameters.
GLOVE_EMB = './glove.6B.300d.txt'   # vector file read into embeddings_index
EMBEDDING_DIM = 300                 # column count of embedding_matrix
LR = 0.001                          # initial Adam learning rate
BATCH_SIZE = 1024
EPOCHS = 15
# NOTE(review): the leading '...' is not a parent-directory reference and this
# path is not used by the cells shown here; confirm the intended location.
MODEL_PATH = '.../output/kaggle/working/best_model.hdf5'


In [44]:
# Parse the GloVe text file into {word: float32 vector}.
embeddings_index = {}
# `with` closes the file even if parsing fails part-way. The encoding is
# pinned because the platform default may not decode the file.
# NOTE(review): assumes the GloVe file is UTF-8 encoded - confirm.
with open(GLOVE_EMB, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to index.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print("Found {} word vectors".format(len(embeddings_index)))

In [45]:
# Worked example: split one embedding-style line into its word and values.
s = 'the 4.6560e-02  2.1318e-01 -7.4364e-03 -4.5854e-01 -3.5639e-02  2.3643e-01 -2.8836e-01  2.1521e-01 -1.3486e-01 -1.6413e+00 -2.6091e-01  3.2434e-02'
l = s.split()
word, values = l[0], l[1:]
print(word)
print(values)
d1 = {word: values}
d1


In [46]:
# Show the first five (word, vector, length) entries of the index.
count = 1
for word, embedding in embeddings_index.items():
    if count <= 5:
        print(word, embedding, len(embedding), sep=' : ', end='\n\n\n')
        count += 1
    else:
        break


In [47]:
# Row i holds the pre-trained vector of the word with tokenizer index i;
# rows of words missing from embeddings_index stay all-zero.
embedding_matrix = np.zeros(shape=(vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector


In [48]:
# Frozen embedding layer initialised from embedding_matrix.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQ_LENGTH,
    trainable=False,
)


In [49]:
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint


In [50]:
# Network: embedding -> spatial dropout -> Conv1D -> BiLSTM -> two dense
# layers (dropout in between) -> single sigmoid output.
sequence_input = Input(shape=(MAX_SEQ_LENGTH,), dtype='int32')
embedding_sequences = embedding_layer(sequence_input)

x = SpatialDropout1D(rate=0.2)(embedding_sequences)
x = Conv1D(filters=64, kernel_size=5, activation='relu')(x)
x = Bidirectional(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))(x)
x = Dense(units=512, activation='relu')(x)
x = Dropout(rate=0.5)(x)
x = Dense(units=512, activation='relu')(x)
outputs = Dense(units=1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=sequence_input, outputs=outputs)


In [51]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Binary cross-entropy with Adam, tracking accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=LR),
    metrics=['accuracy'],
)

# Multiply the learning rate by 0.1 when val_loss plateaus, never going below 1e-4.
LR_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    min_lr=0.0001,
    verbose=1,
)


In [52]:
# Train the model; `history` keeps the per-epoch metrics for the plots below.
# NOTE(review): validation uses (x_test, y_test), the same split the later
# cells report metrics on, and ReduceLROnPlateau reacts to its val_loss -
# confirm that this overlap is intended.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[LR_reduction],
    validation_data=(x_test, y_test),
)


In [55]:
# Two stacked panels: accuracy on top, loss below, training vs validation.
s, (at, al) = plt.subplots(nrows=2, ncols=1)

for metric, colour in (('accuracy', 'b'), ('val_accuracy', 'r')):
    at.plot(history.history[metric], c=colour)
at.set_title('model accuracy')
at.set_ylabel('accuracy')
at.set_xlabel('epoch')
at.legend(['LSTM_train', 'LSTM_val'], loc='upper left')

for metric, colour in (('loss', 'm'), ('val_loss', 'c')):
    al.plot(history.history[metric], c=colour)
al.set_title('model loss')
al.set_ylabel('loss')
al.set_xlabel('epoch')
al.legend(['train', 'val'], loc='upper left')


In [62]:
import itertools
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square matrix of counts, rows = true labels, columns = predictions.
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    title : str
        Title drawn above the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.
    normalize : bool, default True
        When true, every row is divided by its sum so that cells show
        per-class rates; when false, the values of ``cm`` are shown unchanged.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no samples would divide by zero and fill its row with
        # NaN; dividing by 1 instead keeps that row at zero.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=20)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=13)
    plt.yticks(tick_marks, classes, fontsize=13)

    # Rates (or any float matrix) get two decimals, integer counts print as-is.
    fmt = '.2f' if normalize or cm.dtype.kind == 'f' else 'd'
    # Cells darker than the midpoint get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=17)
    plt.xlabel('Predicted label', fontsize=17)




In [65]:
def decode_sentiment(score):
    """Return "Positive" for a score strictly above 0.5, otherwise "Negative"."""
    if score > 0.5:
        return "Positive"
    return "Negative"
# Model outputs for every test row, then thresholded into class names.
scores = model.predict(x_test, batch_size=10000, verbose=1)
y_pred_1d = list(map(decode_sentiment, scores))


In [66]:
# Fix the label order once and use it for both the matrix and the tick labels.
# confusion_matrix orders rows/columns by sorted label, whereas unique()
# returns labels in order of first appearance, so the axes could be swapped.
class_names = sorted(test_data.sentiment.unique())
cnf_matrix = confusion_matrix(test_data.sentiment.to_list(), y_pred_1d, labels=class_names)
plt.figure(figsize=(6,6))
plot_confusion_matrix(cnf_matrix, classes=class_names, title="Confusion matrix")
plt.show()



In [67]:
# Per-class precision / recall / F1 on the test split.
report = classification_report(test_data.sentiment.tolist(), y_pred_1d)
print(report)
