# Tweet sentiment analysis with deep learning models

In this notebook we test several deep learning model configurations to find the best-performing one on our set of 40,000 positive and 40,000 negative tweets.

We will be attempting to guess positive (1) or negative (0) sentiment in a tweet.

In [None]:
# install necessary libraries
# NOTE(review): versions are not pinned, so a later run may resolve different releases.

!pip install emoji
!pip install nltk
!pip install tweet-preprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.1.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 13.9 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.1.0-py3-none-any.whl size=212392 sha256=e011892c448ac71618de47efe5f0e945d1275c90f9ab8dfb6f2ebfab55e124ca
  Stored in directory: /root/.cache/pip/wheels/77/75/99/51c2a119f4cfd3af7b49cc57e4f737bed7e40b348a85d82804
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected pack

In [None]:
# import necessary libraries

import numpy as np
import pandas as pd
import nltk
import emoji
import preprocessor as p

from sklearn.model_selection import train_test_split

import time
import pickle

from keras.utils import pad_sequences
from keras.layers import Flatten, LSTM, Embedding, Dense, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import Sequential

# download important nltk packages
# 'stopwords' and 'words' back the nltk.corpus.stopwords / nltk.corpus.words lookups
# in the text-cleaning cell; 'wordnet' and 'omw-1.4' are presumably the data the
# WordNetLemmatizer needs -- TODO confirm.

nltk.download('stopwords')
nltk.download('words')
nltk.download('wordnet')
nltk.download('omw-1.4')

# save requirements
# NOTE(review): this bare `pip ...` line is not plain Python; it appears to rely on
# IPython automagic treating it as the %pip magic -- confirm before running outside a notebook.

pip freeze > requirements.txt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Shared resources for text_cleaner, built once at module level.
stop_words = set(nltk.corpus.stopwords.words('english'))  # set for fast membership tests
english_words = set(nltk.corpus.words.words())  # vocabulary used to drop non-English tokens
lem = nltk.stem.WordNetLemmatizer()
tokenizer = nltk.RegexpTokenizer(r'[a-zA-Z]+')  # keeps only runs of ASCII letters

def text_cleaner(text):
  """Turn a raw tweet into a list of cleaned, lemmatized tokens.

  Steps: emojis are replaced by their textual names (no delimiters), the text is
  lowercased and split into alphabetic tokens, English stopwords and tokens that
  are not in the English word list are dropped, the survivors are lemmatized,
  and lemmas of 2 characters or fewer are discarded.

  Returns a list of strings (possibly empty), in their original order.
  """
  demojized = emoji.demojize(text, delimiters=("", ""))
  raw_tokens = tokenizer.tokenize(demojized.lower())

  cleaned = []
  for token in raw_tokens:
    # skip stopwords and anything outside the English vocabulary
    if token in stop_words or token not in english_words:
      continue
    lemma = lem.lemmatize(token)
    # the length filter applies to the lemma, not the surface form
    if len(lemma) > 2:
      cleaned.append(lemma)

  return cleaned

In [None]:
# Load the pretrained Wiki2Vec vectors into a {word: float32 vector} lookup.
embeddings_dictionary_w2vec = dict()

# `with` guarantees the file handle is closed even if a line fails to parse.
# NOTE(review): if this file starts with a word2vec-style "count dim" header line,
# that line is stored as a bogus short vector -- confirm and skip it if so.
with open('/content/drive/MyDrive/Colab Notebooks/enwiki_20180420_100d.txt', encoding='utf-8') as w2vec_file:
    for line in w2vec_file:
        records = line.strip().split(' ')
        word = records[0]  # first field is the token, the rest are its coordinates
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary_w2vec[word] = vector_dimensions

In [None]:
# Load the pretrained GloVe Twitter vectors into a {word: float32 vector} lookup.
embeddings_dictionary_glove = dict()

# `with` guarantees the file handle is closed even if a line fails to parse.
with open('/content/drive/MyDrive/Colab Notebooks/glove.twitter.27B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]  # first field is the token, the rest are its coordinates
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary_glove[word] = vector_dimensions

In [None]:
# Load the sample of labelled tweets; later cells read its `tweet` and `target` columns.
df = pd.read_csv("https://raw.githubusercontent.com/katrinmisel/sentiment_analysis/master/sample_df.csv")

In [None]:
def deep_model_test(data, text_prep_function, embedding, model_type, output_filename):
  """Train and evaluate one preprocessing / embedding / architecture combination.

  Parameters
  ----------
  data : DataFrame
      Must expose a `tweet` column (raw text) and a `target` column (labels).
  text_prep_function : str
      'homemade' cleans tweets with text_cleaner, 'tweet_preprocessor' uses
      p.clean, any other value leaves the tweets untouched.
  embedding : str
      'glove' uses embeddings_dictionary_glove; any other value uses
      embeddings_dictionary_w2vec.
  model_type : str
      'simple' builds Embedding -> Flatten -> Dense; any other value builds
      Embedding -> Bidirectional LSTM -> Dense.
  output_filename : str
      Path where the best checkpoint is written; also used as the run's name.

  Returns
  -------
  list
      [output_filename, test loss, test accuracy, training time in seconds].
  """

  print(output_filename)

  max_len = 100        # every sequence is padded / truncated to this many tokens
  embedding_dim = 100  # dimensionality of the pretrained vectors

  # split in X and y (target)

  if text_prep_function == 'homemade':  # our handmade cleaning function
    X = np.array(data.tweet.apply(lambda x: text_cleaner(x)))
  elif text_prep_function == 'tweet_preprocessor':  # the tweet preprocessor python library
    X = np.array(data.tweet.apply(lambda x: p.clean(x)))
  else:
    X = np.array(data.tweet)  # no preprocessing at all

  y = np.array(data.target)

  # train test split with 30% test size

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

  # tokenize with keras; the vocabulary is fitted on the training split only

  keras_tokenizer = Tokenizer(num_words=5000)
  keras_tokenizer.fit_on_texts(X_train)

  # transform texts to sequences and pad sequences to the same length

  X_train = keras_tokenizer.texts_to_sequences(X_train)
  X_test = keras_tokenizer.texts_to_sequences(X_test)

  X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
  X_test = pad_sequences(X_test, padding='post', maxlen=max_len)

  vocab_size = len(keras_tokenizer.word_index) + 1  # +1 because index 0 is the padding value

  # pick the pretrained lookup, then fill one row per known word;
  # words without a pretrained vector keep an all-zero row

  if embedding == 'glove':
    embeddings_dictionary = embeddings_dictionary_glove
  else:
    embeddings_dictionary = embeddings_dictionary_w2vec

  embedding_matrix = np.zeros((vocab_size, embedding_dim))
  for word, index in keras_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
      embedding_matrix[index] = embedding_vector

  # both models are sequential keras models whose first layer is a frozen
  # embedding layer initialised with the matrix built above

  model = Sequential()
  model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))

  if model_type == 'simple':
    # simple model: flatten the embedded sequence
    model.add(Flatten())
  else:
    # advanced model: bidirectional LSTM. Dropout is only active during training,
    # so training accuracy can come out lower than validation accuracy.
    model.add(Bidirectional(LSTM(64, dropout=0.5, recurrent_dropout=0.5)))

  model.add(Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  # callbacks: stop after 3 epochs without val_accuracy improvement, and keep on
  # disk only the epoch with the best val_accuracy (save_best_only; `patience`
  # is an EarlyStopping option, not a ModelCheckpoint one)

  es = EarlyStopping(monitor='val_accuracy', patience=3, verbose=0)
  mc = ModelCheckpoint(output_filename, monitor='val_accuracy', save_best_only=True, verbose=0)

  # fit our model (30% of the training split is held out for validation) and time the training

  start = time.time()
  model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_split=0.3, callbacks=[es, mc])
  end = time.time()

  # evaluate on the held-out test split
  # NOTE: this scores the in-memory model from the last epoch run, which is not
  # necessarily the best checkpoint written to output_filename

  loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
  training_time = end - start

  return [output_filename, loss, accuracy, training_time]

We will check the performance of several models with our function.


*   Simple neural network vs. bidirectional LSTM
*   GloVe vs. Wiki2Vec encoding
*   Tweet preprocessor library vs. our own text cleaning function

In [None]:
# Simple (Flatten + Dense) model: every preprocessing x embedding combination.
# Each call returns [name, loss, accuracy, training time]; the variable names are collected in a later cell.
perf_simple_homemade_wiki2vec = deep_model_test(data=df, text_prep_function='homemade', embedding='wiki2vec', model_type='simple', output_filename='perf_simple_homemade_wiki2vec.h5')
perf_simple_homemade_glove = deep_model_test(data=df, text_prep_function='homemade', embedding='glove', model_type='simple', output_filename='perf_simple_homemade_glove.h5')
perf_simple_tweetprep_wiki2vec = deep_model_test(data=df, text_prep_function='tweet_preprocessor', embedding='wiki2vec', model_type='simple', output_filename='perf_simple_tweetprep_wiki2vec.h5')
perf_simple_tweetprep_glove = deep_model_test(data=df, text_prep_function='tweet_preprocessor', embedding='glove', model_type='simple', output_filename='perf_simple_tweetprep_glove.h5')

# Advanced (bidirectional LSTM) model: the same four combinations.
perf_advanced_homemade_wiki2vec = deep_model_test(data=df, text_prep_function='homemade', embedding='wiki2vec', model_type='advanced', output_filename='perf_advanced_homemade_wiki2vec.h5')
perf_advanced_homemade_glove = deep_model_test(data=df, text_prep_function='homemade', embedding='glove', model_type='advanced', output_filename='perf_advanced_homemade_glove.h5')
perf_advanced_tweetprep_wiki2vec = deep_model_test(data=df, text_prep_function='tweet_preprocessor', embedding='wiki2vec', model_type='advanced', output_filename='perf_advanced_tweetprep_wiki2vec.h5')
perf_advanced_tweetprep_glove = deep_model_test(data=df, text_prep_function='tweet_preprocessor', embedding='glove', model_type='advanced', output_filename='perf_advanced_tweetprep_glove.h5')

perf_simple_homemade_wiki2vec.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
perf_simple_homemade_glove.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
perf_simple_tweetprep_wiki2vec.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
perf_simple_tweetprep_glove.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
perf_advanced_homemade_wiki2vec.h5




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
perf_advanced_homemade_glove.h5




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
perf_advanced_tweetprep_wiki2vec.h5




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
perf_advanced_tweetprep_glove.h5




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Since we see that the GloVe embedding works better than Wiki2Vec and the tweet preprocessor works better than the function we wrote, let's see whether not cleaning the tweets at all could actually help our model. We therefore test the GloVe embedding on uncleaned tweets with both our simple and LSTM models.

In [None]:
# 'none' matches neither preprocessing option, so deep_model_test feeds the raw tweets to the tokenizer.
perf_simple_noclean_glove = deep_model_test(data=df, text_prep_function='none', embedding='glove', model_type='simple', output_filename='perf_simple_noclean_glove.h5')
perf_advanced_noclean_glove = deep_model_test(data=df, text_prep_function='none', embedding='glove', model_type='advanced', output_filename='perf_advanced_noclean_glove.h5')

perf_simple_noclean_glove.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
perf_advanced_noclean_glove.h5




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Collect every run's [name, loss, accuracy, training time] list, simple models first, then advanced.
model_perfs = [perf_simple_homemade_wiki2vec,
               perf_simple_homemade_glove,
               perf_simple_tweetprep_wiki2vec,
               perf_simple_tweetprep_glove,
               perf_simple_noclean_glove,
               perf_advanced_homemade_wiki2vec,
               perf_advanced_homemade_glove,
               perf_advanced_tweetprep_wiki2vec,
               perf_advanced_tweetprep_glove,
               perf_advanced_noclean_glove]

In [None]:
# Build the comparison table in one step: each entry of model_perfs is
# [name, loss, accuracy, training time], matching the column order below.
# Constructing the frame directly avoids chained assignment (frame[col][i] = value),
# which raises SettingWithCopyWarning and does not write through under pandas
# copy-on-write, and avoids a list.index lookup per row.
model_comparison = pd.DataFrame(model_perfs, columns=['Name', 'Loss', 'Accuracy', 'Training time'])

model_comparison

Unnamed: 0,Name,Loss,Accuracy,Training time
0,perf_simple_homemade_wiki2vec.h5,0.624562,0.643833,10.64547
1,perf_simple_homemade_glove.h5,0.579164,0.69425,10.884079
2,perf_simple_tweetprep_wiki2vec.h5,0.60659,0.674917,6.712278
3,perf_simple_tweetprep_glove.h5,0.557438,0.718833,5.305058
4,perf_simple_noclean_glove.h5,0.568705,0.71325,10.646015
5,perf_advanced_homemade_wiki2vec.h5,0.582028,0.682833,1107.097661
6,perf_advanced_homemade_glove.h5,0.539268,0.71925,1106.423211
7,perf_advanced_tweetprep_wiki2vec.h5,0.550751,0.721667,1106.755139
8,perf_advanced_tweetprep_glove.h5,0.484914,0.761417,1124.828516
9,perf_advanced_noclean_glove.h5,0.489464,0.760417,1287.484096


In [None]:
# Persist the comparison table next to the notebook (the row index is written too, as to_csv's default).
model_comparison.to_csv('model_comparison.csv')