In [1]:
# reference: https://towardsdatascience.com/an-easy-tutorial-about-sentiment-analysis-with-deep-learning-and-keras-2bf52b9cba91

In [26]:
import numpy as np
import pandas as pd
import gensim
import matplotlib.pyplot as plt
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import tensorflow as tf

In [16]:
from utilities.preprocessing import load_modeling_data

In [35]:
# Read the two CSV files: the first is bound to X_train (its 'text' column is
# used later), the second to y_train (its 'target' column is used later).
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_data.csv', 'data/train_results.csv')
)

In [36]:
def depure_data(data):
    """Strip URLs, '@' tokens and single quotes from one text string.

    Steps, in order:
      1. remove ``http(s)://...`` and ``www....`` URLs;
      2. remove every whitespace-delimited token that contains ``@``
         (e-mail addresses, and therefore also ``@mention``-style tokens);
      3. collapse each run of whitespace (newlines included) to one space;
      4. delete single-quote characters.

    Args:
        data: the text to clean (``str``).

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    # All patterns are raw strings: '\S' and '\s' inside a plain literal are
    # invalid string escapes and emit a warning on recent Python versions.

    # Remove URLs
    data = re.sub(r'https?://\S+|www\.\S+', '', data)

    # Remove e-mails (and any other token containing '@')
    data = re.sub(r'\S*@\S*\s?', '', data)

    # Collapse whitespace runs, including new line characters, to one space
    data = re.sub(r'\s+', ' ', data)

    # Remove distracting single quotes
    data = re.sub(r"'", "", data)

    return data

In [37]:
# Pull the raw tweets out of the 'text' column as a plain Python list,
# then clean each one with depure_data.
data_to_list = X_train['text'].values.tolist()
temp = [depure_data(raw_tweet) for raw_tweet in data_to_list]
# Show the first five cleaned tweets.
temp[:5]

['Anyway Im getting of for a while ',
 'My red, Apache isnt feelin too well this morning.. ',
 'you should be its great. friday will be great tooooooo ))))',
 'its 11:30pm and i dont wanna sleep; so i debated with myself, and in the end i decided what a perfect time to BAKE! no kidding. ',
 'Why does twitter eat my DMs? Not happy ']

In [38]:
def sent_to_words(sentences):
    """Lazily tokenize sentences with ``gensim.utils.simple_preprocess``.

    Args:
        sentences: iterable of items; each one is converted with ``str()``
            before tokenization.

    Yields:
        One result of ``simple_preprocess(..., deacc=True)`` per input item,
        in input order.
    """
    for raw_sentence in sentences:
        as_text = str(raw_sentence)
        # deacc=True is forwarded to gensim (accent removal per gensim's
        # documentation -- confirm against the installed version).
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens
        

# Exhaust the generator so the per-tweet token lists can be indexed and counted.
data_words = [*sent_to_words(temp)]

# Peek at the first ten tokenized tweets.
print(data_words[:10])

[['anyway', 'im', 'getting', 'of', 'for', 'while'], ['my', 'red', 'apache', 'isnt', 'feelin', 'too', 'well', 'this', 'morning'], ['you', 'should', 'be', 'its', 'great', 'friday', 'will', 'be', 'great', 'tooooooo'], ['its', 'pm', 'and', 'dont', 'wanna', 'sleep', 'so', 'debated', 'with', 'myself', 'and', 'in', 'the', 'end', 'decided', 'what', 'perfect', 'time', 'to', 'bake', 'no', 'kidding'], ['why', 'does', 'twitter', 'eat', 'my', 'dms', 'not', 'happy'], ['hey', 'there', 'drivin', 'north', 'guess', 'we', 'will', 'miss', 'tonite'], ['is', 'making', 'cheese', 'today', 'in', 'biology'], ['cant', 'sleep', 'its', 'already', 'am'], ['what', 'rainy', 'gloomy', 'week', 'cant', 'even', 'get', 'into', 'our', 'new', 'pool'], ['some', 'bitch', 'stole', 'my', 'blackberry', 'the', 'other', 'night', 'in', 'santa', 'monica', 'still', 'pissed', 'why', 'she', 'gotta', 'take', 'ma', 'baby', 'away']]


In [39]:
# Number of tokenized tweets (one token list per cleaned tweet).
len(data_words)


1040323

In [40]:
def detokenize(text):
    """Join a token sequence back into one string.

    Args:
        text: the tokens to join (despite the name, this is the token list
            that is handed to ``TreebankWordDetokenizer.detokenize``).

    Returns:
        Whatever ``TreebankWordDetokenizer().detokenize(text)`` returns.
    """
    detokenizer = TreebankWordDetokenizer()
    joined = detokenizer.detokenize(text)
    return joined

In [41]:
# Rebuild one plain-text string per tweet from its token list.
data = [detokenize(tokens) for tokens in data_words]
# Peek at the first five rebuilt tweets.
print(data[:5])


['anyway im getting of for while', 'my red apache isnt feelin too well this morning', 'you should be its great friday will be great tooooooo', 'its pm and dont wanna sleep so debated with myself and in the end decided what perfect time to bake no kidding', 'why does twitter eat my dms not happy']


In [42]:
# Rebind `data` from a Python list of cleaned tweet strings to a NumPy array.
data = np.array(data)


In [43]:
# Integer class id for each sentiment string (neutral=0, negative=1, positive=2).
label_to_id = {'neutral': 0, 'negative': 1, 'positive': 2}
raw_labels = y_train['target']

# Series.map leaves NaN wherever a label is missing from the mapping.
y = raw_labels.map(label_to_id)

# Fail loudly on unknown labels: silently skipping them would make the target
# array shorter than the feature array and shift every later label by one row.
if y.isna().any():
    unexpected = raw_labels[y.isna()].unique().tolist()
    raise ValueError(f"Unexpected sentiment labels in 'target': {unexpected}")

y = y.to_numpy(dtype='int64')
# One-hot encode the class ids into three float32 columns.
labels = tf.keras.utils.to_categorical(y, 3, dtype="float32")
del y

In [44]:
# Number of encoded targets; should equal the number of tweets in `data_words`.
len(labels)


1040323

In [45]:
from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop,Adam
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint
# Vocabulary size: passed to Tokenizer(num_words=...) here and used as the
# first argument of the Embedding layers in the model cells.
max_words = 5000
# Length every sequence is padded to (pad_sequences maxlen).
max_len = 200

# Fit the Keras tokenizer on the cleaned tweets, then turn each tweet into a
# sequence of integer word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
# Pad to a fixed length; the printed array below shows the zeros at the
# start of each row.
tweets = pad_sequences(sequences, maxlen=max_len)
print(tweets)

[[  0   0   0 ...  10   9 248]
 [  0   0   0 ...  63  25  89]
 [  0   0   0 ...  55  20  93]
 ...
 [  0   0   0 ...  65  43 599]
 [  0   0   0 ...  32 586  48]
 [  0   0   0 ...   0   0 106]]


In [46]:
# One-hot targets; columns are in class-id order: neutral, negative, positive.
print(labels)


[[0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 ...
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]]


In [49]:
from sklearn.model_selection import train_test_split

In [50]:
#Splitting the data
X_train, X_test, y_train, y_test = train_test_split(tweets,labels, random_state=0)
print (len(X_train),len(X_test),len(y_train),len(y_test))

780242 260081 780242 260081


In [51]:
# Model 1: embedding -> single LSTM layer -> softmax over the 3 sentiment classes.
model1 = Sequential()
model1.add(layers.Embedding(max_words, 20))
model1.add(layers.LSTM(15,dropout=0.5))
model1.add(layers.Dense(3,activation='softmax'))


model1.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])
# Model checkpoint: keep the model with the best validation accuracy so it is
# not lost during training. The deprecated `period=1` argument is dropped; the
# default save_freq='epoch' already checks after every epoch.
checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto',save_weights_only=False)
# NOTE(review): validation_data is the test split, so the checkpoint is selected
# on the same data that is later used for the final evaluation.
history = model1.fit(X_train, y_train, epochs=70,validation_data=(X_test, y_test),callbacks=[checkpoint1])

Metal device set to: Apple M1


2022-11-26 22:45:06.014881: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-11-26 22:45:06.015738: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/70


2022-11-26 22:45:06.457515: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-11-26 22:45:07.012920: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-26 22:45:07.343406: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2022-11-26 22:45:09.035393: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




KeyboardInterrupt: 

In [None]:
# Model 2: embedding -> bidirectional LSTM -> softmax over the 3 sentiment classes.
model2 = Sequential()
model2.add(layers.Embedding(max_words, 40, input_length=max_len))
model2.add(layers.Bidirectional(layers.LSTM(20,dropout=0.6)))
model2.add(layers.Dense(3,activation='softmax'))
model2.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])
# Model checkpoint: keep the model with the best validation accuracy so it is
# not lost during training. The deprecated `period=1` argument is dropped; the
# default save_freq='epoch' already checks after every epoch.
checkpoint2 = ModelCheckpoint("best_model2.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto',save_weights_only=False)
# NOTE(review): validation_data is the test split, so the checkpoint is selected
# on the same data that is later used for the final evaluation.
history = model2.fit(X_train, y_train, epochs=70,validation_data=(X_test, y_test),callbacks=[checkpoint2])


In [None]:
from keras import regularizers
# Model 3: embedding -> two regularized 1D convolutions with pooling -> softmax.
model3 = Sequential()
model3.add(layers.Embedding(max_words, 40, input_length=max_len))
model3.add(layers.Conv1D(20, 6, activation='relu',kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),bias_regularizer=regularizers.l2(2e-3)))
model3.add(layers.MaxPooling1D(5))
model3.add(layers.Conv1D(20, 6, activation='relu',kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),bias_regularizer=regularizers.l2(2e-3)))
model3.add(layers.GlobalMaxPooling1D())
model3.add(layers.Dense(3,activation='softmax'))
# The metric must be named 'accuracy' (not 'acc'): the checkpoint below monitors
# 'val_accuracy', and with 'acc' the logged key would be 'val_acc', so the best
# model would never be written.
model3.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
# The deprecated `period=1` argument is dropped; the default save_freq='epoch'
# already checks after every epoch.
checkpoint3 = ModelCheckpoint("best_model3.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto',save_weights_only=False)
# NOTE(review): validation_data is the test split, so the checkpoint is selected
# on the same data that is later used for the final evaluation.
history = model3.fit(X_train, y_train, epochs=70,validation_data=(X_test, y_test),callbacks=[checkpoint3])

In [None]:
# Reload the checkpoint written while training model2. The bare name `keras`
# is never imported in this notebook (only `from keras ... import ...` forms),
# so go through the already-imported `tf` module instead.
best_model = tf.keras.models.load_model("best_model2.hdf5")

In [None]:
# Loss and accuracy of the reloaded model on the test split.
# NOTE(review): the same split was passed as validation_data while the
# checkpoint was being selected, so this figure is likely optimistic.
test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

In [None]:
# Model outputs for every test row (presumably one softmax score per class,
# given the Dense(3, softmax) output layer of the saved model -- confirm).
predictions = best_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Predicted class = index of the largest score in each row. Do not round first:
# a row such as [0.2, 0.4, 0.4] rounds to all zeros and argmax would then
# always report class 0. `labels` pins the matrix to 3x3 even if a class is
# absent from the predictions.
matrix = confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1), labels=[0, 1, 2])


In [None]:
import seaborn as sns
# Label rows (true class) and columns (predicted class) in class-id order.
conf_matrix = pd.DataFrame(matrix, index = ['Neutral','Negative','Positive'],columns = ['Neutral','Negative','Positive'])
# Normalizing: divide each row by its own total so every row sums to 1.
# DataFrame.div(..., axis=0) replaces `Series[:, np.newaxis]`, which relies on
# multi-dimensional Series indexing that current pandas no longer supports.
conf_matrix = conf_matrix.astype('float').div(conf_matrix.sum(axis=1), axis=0)
plt.figure(figsize = (15,15))
sns.heatmap(conf_matrix, annot=True, annot_kws={"size": 15})