Configurations:
* install tensorflow 2.1
* install matplotlib
* install pandas
* install scikit-learn
* install nltk

In [13]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import re

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

from keras.layers.embeddings import Embedding
from keras.layers.core import SpatialDropout1D
from keras.layers import LSTM
from keras.callbacks import EarlyStopping

from numpy.random import seed

In [15]:
# Load data
# Both CSV headers are replaced with short, uniform column names so that the
# tweet body is called "text" in the training frame and in the test frame.
train_csv_path = '../data/deep-learning-datasets/twitter-sentiment-analysis/train_E6oV3lV.csv'
test_csv_path = '../data/deep-learning-datasets/twitter-sentiment-analysis/test_tweets_anuFYb8.csv'

# Labelled tweets: id, sentiment label, tweet text.
df_train = pd.read_csv(train_csv_path)
df_train.columns = ["id", "label", "text"]

# Unlabelled tweets: id and tweet text only.
df_test = pd.read_csv(test_csv_path)
df_test.columns = ["id","text"]

df_train

Unnamed: 0,id,label,text
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [18]:
# clean data
# Raw string: the regex escapes (\[ \] \|) must reach the regex engine as-is.
# In a normal string literal they are invalid Python escapes and trigger a
# DeprecationWarning / SyntaxWarning on recent interpreters.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, a lowercase ASCII letter, a space or one of # + _
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

def clean_text(text):
    """
    Normalise a single tweet into a lowercase, space-separated token string.

        text: a string
        return: modified initial string

    Steps, in order:
      1. lowercase the text;
      2. turn the separator characters matched by REPLACE_BY_SPACE_RE into spaces;
      3. delete every character matched by BAD_SYMBOLS_RE;
      4. drop tokens of one or two characters and re-join with single spaces.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # Length filter, not a stop-word list: every token with fewer than three
    # characters is removed regardless of what it is.
    text = ' '.join(word for word in text.split() if len(word) > 2)
    return text

def preprocess_text(df):
    """
    Clean the ``text`` column of a tweet DataFrame.

        df: DataFrame with a string column named ``text``
        return: a new DataFrame with a fresh 0..n-1 index whose ``text``
                column has been passed through clean_text and stripped of digits

    The caller's frame is left untouched: reset_index returns a new DataFrame
    and all assignments below are made on that new object.
    """
    df = df.reset_index(drop=True)
    df['text'] = df['text'].apply(clean_text)
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would leave digits in place.
    # The raw string avoids the invalid "\d" Python escape.
    df['text'] = df['text'].str.replace(r'\d+', '', regex=True)
    return df

# Run the training and the test frame through the same cleaning pipeline,
# then display the cleaned training frame.
df_train, df_test = [preprocess_text(frame) for frame in (df_train, df_test)]

df_train

Unnamed: 0,id,label,text
0,1,0,user when father dysfunctional and selfish dra...
1,2,0,user user thanks for #lyft credit cant use cau...
2,3,0,bihday your majesty
3,4,0,#model love take with all the time
4,5,0,factsguide society now #motivation
...,...,...,...
31957,31958,0,ate user isz that youuu
31958,31959,0,see nina turner the airwaves trying wrap herse...
31959,31960,0,listening sad songs monday morning otw work sad
31960,31961,1,user #sikh #temple vandalised #calgary #wso co...


In [19]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 30000

# Number of tokens kept per tweet; sequences are padded/truncated to this length.
MAX_SEQUENCE_LENGTH = 50

# Size of each word-embedding vector.
EMBEDDING_DIM = 100

# Raw string: the filter set contains a literal backslash before "]", which is
# an invalid escape sequence in a normal string literal. The value is unchanged.
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS, filters=r'#!"$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, split=' ')

# Build the vocabulary from the training and test tweets together.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
tokenizer.fit_on_texts(pd.concat([df_train['text'], df_test['text']]).values)
word_index = tokenizer.word_index
# Spot check: vocabulary index assigned to one word.
word_index['study']


1659

In [22]:
def fromTextToFeatures(df_text):
    """
    Encode a Series of cleaned tweets as a fixed-width integer matrix.

        df_text: pandas Series of strings (its ``.values`` are tokenised)
        return: 2-D array with one row per tweet and MAX_SEQUENCE_LENGTH columns
    """
    # One list of vocabulary indexes per tweet, produced by the fitted tokenizer.
    token_ids = tokenizer.texts_to_sequences(df_text.values)
    # Force every list to exactly MAX_SEQUENCE_LENGTH entries; with the
    # pad_sequences defaults, shorter tweets are filled with zeros on the left.
    return pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH)
# Encode the cleaned training tweets; X has one row per tweet and
# MAX_SEQUENCE_LENGTH columns of vocabulary indexes.
X = fromTextToFeatures(df_train['text'])
print('Shape of data tensor:', X.shape)
# Uncomment the next line to display the encoded matrix itself.
#X

array([[    0,     0,     0, ...,    75, 10010,   427],
       [    0,     0,     0, ...,  5105, 20034, 12887],
       [    0,     0,     0, ...,    36,    13,  2909],
       ...,
       [    0,     0,     0, ...,  7997,    54,    90],
       [    0,     0,     0, ...,  2005,  1802,   662],
       [    0,     0,     0, ...,     5,     3,   132]])

In [23]:
# Encode the unlabelled test tweets with the same tokenizer and padding as the
# training data. Named X_test_ex to keep it distinct from the X_test hold-out
# split created by train_test_split.
X_test_ex = fromTextToFeatures(df_test['text'])

print('Shape of data tensor:', X_test_ex.shape)

Shape of data tensor: (17197, 50)


In [26]:
# One-hot encode the label column; get_dummies orders the columns by label value:
#   label 0 -> [1, 0]
#   label 1 -> [0, 1]
# dtype is pinned to uint8 (the historical default) because pandas >= 2.0
# returns booleans by default, and the model expects a numeric target.
Y = pd.get_dummies(df_train['label'], dtype=np.uint8).values

print('Shape of label tensor:', Y.shape)

Shape of label tensor: (31962, 2)


In [27]:
# Hold out 10% of the labelled tweets for the final evaluation; the fixed
# random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=42,
)

# Report (features, labels) shapes for the training part, then the held-out part.
for split_X, split_Y in ((X_train, Y_train), (X_test, Y_test)):
    print(split_X.shape, split_Y.shape)

(28765, 50) (28765, 2)
(3197, 50) (3197, 2)


In [28]:
# Seed NumPy and TensorFlow. seed() alone only fixes NumPy's generator;
# TensorFlow's own random ops (weight initialisation, dropout masks) need
# tf.random.set_seed for runs to be repeatable.
seed(100)
tf.random.set_seed(100)

model = Sequential()
# The Embedding layer is used to create word vectors for incoming words.
# It sits between the input and the LSTM layer, i.e.
# the output of the Embedding layer is the input to the LSTM layer.
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
# Single LSTM layer with dropout on both the inputs and the recurrent state.
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Two softmax outputs, matching the two one-hot label columns in Y.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 3
batch_size = 64

# 10% of the training split is set aside for validation during fitting.
# NOTE(review): patience (3) equals epochs (3), so EarlyStopping can never
# interrupt this run; raise epochs or lower patience if early stopping is wanted.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)


Train on 25888 samples, validate on 2877 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [29]:
# Score the model on the held-out split. evaluate() returns the loss followed
# by the metrics given to compile(), so accr[0] is the loss and accr[1] the accuracy.
# NOTE(review): accuracy alone can be misleading if the two labels are heavily
# imbalanced — confirm the class balance before relying on this number.
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

# Predicted outputs of the two-unit softmax layer, one row per held-out tweet.
pred_y = model.predict(X_test)


Test set
  Loss: 0.127
  Accuracy: 0.962
