# ELMo for Sentiment Analysis

# Imports

In [None]:
import pickle
import re
import string
import time

import keras
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

from keras.layers import Activation
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Sequential
from keras.optimizers import Adam
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras import activations

# Read files
Execute only one of the dataset cells below; each one assigns the `data` DataFrame that the rest of the notebook works on.

## Apple Sentiment

In [None]:
# Read the Apple Twitter sentiment CSV (path is relative to the notebook's
# working directory) into `data`.
data = pd.read_csv("data/sentiment/datasets_652925_1154930_apple-twitter-sentiment-texts.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## US Airline Sentiment

In [None]:
# Read the US Airline tweets; only the tweet text and its label are needed.
tweets_raw = pd.read_csv("data/sentiment/Tweets.csv")

# Integer encoding of the string labels.
thisdict = {
    "negative": -1,
    "neutral": 0,
    "positive": 1,
}

# `rename` must be given `columns=`: a bare mapping is applied to the row
# index, which left the column called `airline_sentiment` and made the later
# `data.sentiment` access fail. Building the frame in one chain also avoids
# assigning into a slice of the raw frame (SettingWithCopyWarning).
data = (
    tweets_raw[['text', 'airline_sentiment']]
    .rename(columns={'airline_sentiment': 'sentiment'})
    # Indexing the dict (instead of passing it to `.map`) keeps an unknown
    # label a loud KeyError rather than a silent NaN.
    .assign(sentiment=lambda df: df['sentiment'].apply(lambda label: thisdict[label]))
)


## T4SA

In [None]:
# Raw tweet texts (keyed by `id`) and their per-class sentiment scores
# (keyed by `TWID`) come from two separate files.
tweets = pd.read_csv("data/sentiment/raw_tweets_text.csv")
# Path fixed from ".data/sentiment/..." to match the "data/..." directory used
# by every other read in this notebook.
# NOTE(review): confirm the file really lives next to raw_tweets_text.csv.
sentiments = pd.read_csv("data/sentiment/t4sa_text_sentiment.csv", delimiter="\t")

# Integer encoding of the class-score column names.
thisdict = {
    "NEG": -1,
    "NEU": 0,
    "POS": 1,
}

# Align both tables on the tweet id and drop rows with any missing field.
# Using the id columns as the index removes them from the columns, so no
# separate `drop` is needed afterwards.
scored = (
    tweets.set_index("id")
    .join(sentiments.set_index("TWID"))
    .dropna()
)

# The label is the column holding the largest of the three scores.
# Building a fresh frame (rather than assigning into a slice) avoids
# SettingWithCopyWarning.
data = scored[['text']].assign(
    sentiment=scored[['NEU', 'NEG', 'POS']]
    .idxmax(axis=1)
    .map(lambda label: thisdict[label])
)



# General Text Cleaning

In [None]:
# Tweet-aware tokenizer: removes @handles and shortens exaggerated character
# repetitions ("soooooo" -> "sooo").
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

# Built once instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)

data["text"] = (
    data["text"]
    .str.lower()
    # Remove URLs.
    .apply(lambda x: re.sub(r'http\S+', '', x))
    # Tokenize (this is where handles are stripped) and re-join with spaces.
    .apply(lambda x: ' '.join(tokenizer.tokenize(x)))
    # Remove punctuation characters.
    .map(lambda x: x.translate(punctuation_table))
    # Replace digits by a space. `regex=True` is required: since pandas 2.0 the
    # default is a literal match, which would look for the text "[0-9]".
    .str.replace("[0-9]", " ", regex=True)
    .str.strip(string.whitespace)
)

# Fixed seed so the train/test split is reproducible.
df_train, df_test = train_test_split(data, test_size=0.33, random_state=42)

# Positional 0..n-1 index so rows line up with the embedding arrays built later.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# Inspect the cleaned training split.
df_train

Unnamed: 0,text,sentiment
0,we need more products like companies like ...,0
1,legit thought that was you in this pic,0
2,aapl rt alex gauna flips his apple bit sets s...,0
3,here are the main differences between apple ca...,0
4,trade aapl free nightly updates are posted he...,0
...,...,...
1087,i kinda feel sorry for tho goodjob with your ...,-1
1088,your ipad game just got shut way down like ...,-1
1089,been waiting days so far for to approve the ...,-1
1090,my tmobile apple cellular hell iphone iphon...,-1


# Word Embeddings

In [None]:
# Load the ELMo module (version 3) from its TensorFlow Hub URL.
elmo = hub.load("https://tfhub.dev/google/elmo/3")

Produce a single vector per input sequence by averaging the ELMo embeddings over the sequence.

In [None]:
@tf.function
def elmo_vectors(x):
  """Return one averaged ELMo vector per input sequence.

  Args:
    x: batch passed unchanged to the module's "default" signature
      (callers in this notebook pass a tensor built from a list of strings).

  Returns:
    The mean of the signature's "elmo" output along axis 1.
  """
  elmo_output = elmo.signatures["default"](x)
  token_embeddings = elmo_output["elmo"]

  # NOTE(review): the mean covers every position on axis 1; if shorter
  # sequences are padded inside a batch, the padding is averaged in too — confirm.
  sentence_vectors = tf.reduce_mean(token_embeddings, axis=1)
  return sentence_vectors

Create Batches

In [None]:
# Cut the text column of each split into consecutive slices of at most
# `batch_size` rows so the ELMo module is fed one slice at a time.
batch_size = 100

train_offsets = range(0, df_train.shape[0], batch_size)
test_offsets = range(0, df_test.shape[0], batch_size)

list_train = [df_train.text[lo:lo + batch_size] for lo in train_offsets]
list_test = [df_test.text[lo:lo + batch_size] for lo in test_offsets]


In [None]:
# Number of training batches produced above.
len(list_train)

10720

In [None]:
def _embed_batches(batches):
    """Run `elmo_vectors` on every batch and collect the results in a list."""
    return [elmo_vectors(tf.constant(batch.tolist())) for batch in batches]


# Embed both splits and report the elapsed wall-clock time in minutes.
start = time.time()
elmo_train = _embed_batches(list_train)
elmo_test = _embed_batches(list_test)

elapsed_minutes = (time.time() - start) / 60
print("Time elapsed: ", elapsed_minutes)

Time elapsed:  0.1433647592862447


In [None]:
# Join the per-batch results along axis 0 into one array per split, so that
# row i corresponds to row i of df_train / df_test.
elmo_train_new = np.concatenate(elmo_train, axis = 0)
elmo_test_new = np.concatenate(elmo_test, axis = 0)

In [None]:
# Sanity check: shape of the concatenated training embeddings.
elmo_train_new.shape

NameError: ignored

Save Word vectors

In [None]:
# save elmo_train_new
pickle_out = open("data/elmo_train_01092020_1.pickle","wb")
pickle.dump(elmo_train_new[:len(elmo_train_new) // 2], pickle_out)
pickle_out.close()
pickle_out = open("data/elmo_train_01092020_2.pickle","wb")
pickle.dump(elmo_train_new[len(elmo_train_new) // 2 + 1 :], pickle_out)
pickle_out.close()

# save elmo_test_new (too big, cut in smaller pieces)
pickle_out = open("data/elmo_test_30082020.pickle","wb")
pickle.dump(elmo_test_new, pickle_out)
pickle_out.close()

Load word vectors

In [None]:
# load elmo_train_new
pickle_in = open("data/elmo_train_29082020.pickle", "rb")
elmo_train_new = pickle.load(pickle_in)

# load elmo_train_new
pickle_in = open("data/elmo_test_29082020.pickle", "rb")
elmo_test_new = pickle.load(pickle_in)

# Logistic Regression

In [None]:
# Multiclass logistic regression on the averaged ELMo vectors.
# max_iter is raised so the solver has room to converge.
lreg = LogisticRegression(max_iter=1000)

# Train features must be paired with TRAIN labels; the previous call passed
# df_test.sentiment, i.e. labels belonging to different rows.
lreg.fit(elmo_train_new, df_train.sentiment)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Evaluation

In [None]:
# Predict the test split and report how long inference took, in minutes.
start = time.time()
preds_test = lreg.predict(elmo_test_new)

elapsed_minutes = (time.time() - start) / 60
print("Time elapsed: ", elapsed_minutes)

Time elapsed:  0.00015242894490559897


In [None]:

# `test_sentiment_column` is not defined anywhere in this notebook, so this
# cell failed on a fresh kernel; the true test labels are df_test.sentiment.
# average=None reports one F1 score per class, in sorted label order.
print(metrics.f1_score(df_test.sentiment, preds_test, average=None))
print(metrics.accuracy_score(df_test.sentiment, preds_test))

[0.85651214 0.86225403 0.5       ]
0.8382899628252788


# LSTM Model

In [None]:
# LSTM classifier over the averaged ELMo vector. Each sample is fed as a
# sequence of length 1 with 1024 features (see input_shape).
model = Sequential()
model.add(LSTM(512, input_shape=[1,1024]))
model.add(Dense(3))
# Each tweet has exactly one of three classes, so the outputs must form a
# probability distribution: softmax, not independent sigmoids.
model.add(Activation('softmax'))

# `learning_rate` replaces the deprecated `lr` argument.
opt = Adam(learning_rate=0.0002, beta_1=0.5)
# categorical_crossentropy matches the one-hot targets; with
# binary_crossentropy the 'accuracy' metric is computed per output unit and
# overstates the real classification accuracy.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 512)               3147776   
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 1539      
_________________________________________________________________
activation_1 (Activation)    (None, 3)                 0         
Total params: 3,149,315
Trainable params: 3,149,315
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The LSTM expects (samples, timesteps, features); each averaged ELMo vector
# becomes a sequence of length 1.
train_text_shaped = elmo_train_new.reshape(-1, 1, 1024)
test_text_shaped = elmo_test_new.reshape(-1, 1, 1024)

# Fixed class order, so column k means the same class in both splits and
# matches the `sentiment + 1` mapping used when evaluating argmax predictions.
sentiment_columns = ["sentiment_-1", "sentiment_0", "sentiment_1"]


def _one_hot_sentiment(frame):
    """One-hot encode `frame.sentiment` into exactly the three class columns.

    Encoding only the label column keeps unrelated columns out of the targets.
    `reindex` adds an all-zero column for a class that is missing from a split,
    and the float cast avoids boolean targets from newer pandas versions.
    """
    encoded = pd.get_dummies(frame["sentiment"], prefix="sentiment")
    return encoded.reindex(columns=sentiment_columns, fill_value=0).astype("float32")


train_encoded_sentiment = _one_hot_sentiment(df_train)
test_encoded_sentiment = _one_hot_sentiment(df_test)


In [None]:
# Train for 20 epochs and keep the per-epoch metrics in `history`.
# NOTE(review): the test split is used as validation data here, so the test
# scores reported afterwards are not from fully unseen data — confirm this is intended.
history = model.fit(train_text_shaped,train_encoded_sentiment,epochs=20, validation_data=(test_text_shaped,test_encoded_sentiment))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Evaluation

In [None]:
# Per-class scores for every test sample, then the index of the highest score.
# NOTE(review): the index is assumed to follow the one-hot column order
# (0 -> -1, 1 -> 0, 2 -> 1), as implied by the `+ 1` shift in the evaluation cell — confirm.
y_pred1 = model.predict(test_text_shaped)
y_pred = np.argmax(y_pred1, axis=1)

In [None]:
# Shift the labels from {-1, 0, 1} to {0, 1, 2} once, so they are comparable
# with the argmax indices stored in y_pred.
y_true = df_test.sentiment + 1

# Uses the `metrics` module imported at the top of the notebook (same as the
# logistic-regression evaluation) instead of a second import in this cell.
# average=None reports one F1 score per class.
print(metrics.f1_score(y_true, y_pred, average=None))
print(metrics.accuracy_score(y_true, y_pred))

[0.84322034 0.84501845 0.41935484]
0.8197026022304833
