# ELMo for Sentiment Analysis

# Imports

In [9]:
import numpy as np
import pandas as pd
import string
import re
import keras
import nltk
import time
import tensorflow_hub as hub
import tensorflow as tf
import pickle

from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Activation
from tensorflow.keras import activations

from sklearn import metrics
from sklearn.linear_model import LogisticRegression

from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split

# Read files
Only execute one of these

## Apple Sentiment

In [10]:
data = pd.read_csv("data/sentiment/datasets_652925_1154930_apple-twitter-sentiment-texts.csv")

## US Airline Sentiment

In [None]:
data = pd.read_csv("data/sentiment/Tweets.csv")

data = data[['text', 'airline_sentiment']]
data.rename(columns={'airline_sentiment' : 'sentiment'}, inplace=True)

thisdict =	{
  "negative": -1,
  "neutral": 0,
  "positive": 1
}
data.sentiment = data.sentiment.apply(lambda x: thisdict[x])


## T4SA

In [14]:
tweets = pd.read_csv("data/sentiment/raw_tweets_text.csv")
sentiments = pd.read_csv("data/sentiment/t4sa_text_sentiment.csv",delimiter = "\t")

tweets.set_index(tweets.id, inplace=True)
sentiments.set_index(sentiments.TWID, inplace=True)
data=tweets.join(sentiments)
data.dropna(inplace=True)
data.drop(columns=['id', 'TWID'], inplace=True)
data["sentiment"] = data[['NEU', 'NEG', 'POS']].idxmax(axis=1)

data = data[['text', 'sentiment']]

thisdict =	{
  "NEG": -1,
  "NEU": 0,
  "POS": 1
}
data.sentiment = data.sentiment.apply(lambda x: thisdict[x])



General Text Cleaning

In [15]:
data.text = data.text.str.lower()

data.text = data.text.apply(lambda x:re.sub(r'http\S+', '', x))

tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
data.text = data.text.apply(lambda x: tokenizer.tokenize(x))

data.text = data.text.apply(lambda x: ' '.join(x))

data.text = data.text.map(lambda x : x.translate(str.maketrans('', '', string.punctuation)))

data.text = data.text.str.replace("[0-9]", " ")

data.text = data.text.str.strip(string.whitespace)

df_train, df_test = train_test_split(data, test_size=0.33, random_state=42)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [16]:
df_train

Unnamed: 0,text,sentiment
0,for all who served and fought for our country ...,1
1,rt florida teen only fourth person in last ...,0
2,wireless digital lcd color baby monitor camera...,0
3,found another one and i hope this isnt coming ...,-1
4,happy thanksgiving im thankful you all cant draw,1
...,...,...
790566,rt ok but can we talk about how kelsi from hi...,0
790567,rt and this time with link thanks writing...,1
790568,living for music focus on europe news news ...,0
790569,world of final fantasy cinematic anime openin...,0


# Word Embeddings

In [17]:
elmo = hub.load("https://tfhub.dev/google/elmo/3")

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Only one vector for an input sequence

In [18]:
@tf.function
def elmo_vectors(x):
  embeddings = elmo.signatures["default"](x)["elmo"]

  # return average of ELMo features
  return tf.reduce_mean(embeddings,1)

Create Batches

In [19]:
list_train = [df_train.text[i:i+100] for i in range(0,df_train.shape[0],100)]
list_test = [df_test.text[i:i+100] for i in range(0,df_test.shape[0],100)]


In [20]:
len(list_train)

7906

In [21]:
start = time.time()
elmo_train = [elmo_vectors(tf.constant(x.tolist())) for x in list_train]
elmo_test = [elmo_vectors(tf.constant(x.tolist())) for x in list_test]

print("Time elapsed: ", (time.time() - start) / 60)

Time elapsed:  34.75061337550481


In [22]:
elmo_train_new = np.concatenate(elmo_train, axis = 0)
elmo_test_new = np.concatenate(elmo_test, axis = 0)

In [23]:
elmo_train_new.shape

(790571, 1024)

Save Word vectors

In [None]:
# save elmo_train_new
pickle_out = open("data/elmo_train_01092020_1.pickle","wb")
pickle.dump(elmo_train_new[:len(elmo_train_new) // 2], pickle_out)
pickle_out.close()
pickle_out = open("data/elmo_train_01092020_2.pickle","wb")
pickle.dump(elmo_train_new[len(elmo_train_new) // 2 + 1 :], pickle_out)
pickle_out.close()

# save elmo_test_new (too big, cut in smaller pieces)
pickle_out = open("data/elmo_test_30082020.pickle","wb")
pickle.dump(elmo_test_new, pickle_out)
pickle_out.close()

Load word vectors

In [None]:
# load elmo_train_new
pickle_in = open("data/elmo_train_29082020.pickle", "rb")
elmo_train_new = pickle.load(pickle_in)

# load elmo_train_new
pickle_in = open("data/elmo_test_29082020.pickle", "rb")
elmo_test_new = pickle.load(pickle_in)

# Logistic Regression

In [25]:
lreg = LogisticRegression(max_iter=1000)
lreg.fit(elmo_train_new, df_train.sentiment)

LogisticRegression(max_iter=1000)

## Evaluation

In [26]:
start = time.time()
preds_test = lreg.predict(elmo_test_new)
print("Time elapsed: ", (time.time() - start) / 60)

Time elapsed:  0.02665832042694092


In [28]:

print(metrics.f1_score(df_test.sentiment, preds_test, average=None))
print(metrics.accuracy_score(df_test.sentiment, preds_test))

[0.8549808  0.95028347 0.92538152]
0.9283564380845741


# LSTM Modell

In [29]:
model1 = Sequential()
model1.add(LSTM(512, input_shape=[1,1024]))
model1.add(Dense(3))
model1.add(Activation(activations.sigmoid))

opt = Adam(lr=0.0002, beta_1=0.5)
model1.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 512)               3147776   
_________________________________________________________________
dense (Dense)                (None, 3)                 1539      
_________________________________________________________________
activation (Activation)      (None, 3)                 0         
Total params: 3,149,315
Trainable params: 3,149,315
Non-trainable params: 0
_________________________________________________________________


In [30]:
train_text_shaped = elmo_train_new.reshape(-1, 1, 1024)
test_text_shaped = elmo_test_new.reshape(-1, 1, 1024)

train_encoded_sentiment = pd.get_dummies(df_train, columns=["sentiment"]).drop(labels='text', axis=1)
test_encoded_sentiment = pd.get_dummies(df_test, columns=["sentiment"]).drop(labels='text', axis=1)


In [31]:
history = model1.fit(train_text_shaped,train_encoded_sentiment,epochs=5, validation_data=(test_text_shaped,test_encoded_sentiment))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Evaluation

In [32]:
y_pred1 = model.predict(test_text_shaped)
y_pred = np.argmax(y_pred1, axis=1)

In [33]:
from sklearn.metrics import f1_score, accuracy_score

print(f1_score(df_test.sentiment.apply(lambda x: x + 1), y_pred, average=None))
print(accuracy_score(df_test.sentiment.apply(lambda x: x + 1), y_pred))

[0.90360031 0.96852898 0.95263917]
0.9536115833645791
