<a href="https://colab.research.google.com/github/hydradon/modelling-with-pythhon/blob/master/ELMo_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pin the TensorFlow 1.x stack: later cells use the TF1-style APIs hub.Module and tf.Session.
!pip install tensorflow==1.15
!pip install "tensorflow_hub>=0.6.0"
# NOTE(review): this line calls pip3 while the two above call pip; presumably both
# target the kernel's interpreter — confirm. tensorflow_text is not imported in the
# cells visible here.
!pip3 install tensorflow_text==1.15

In [12]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle

# Show up to 200 characters per column so tweet text in DataFrame previews is less likely to be cut off
pd.set_option('display.max_colwidth', 200)

In [13]:
# Load the tweet-sentiment train/test CSVs straight from the GitHub repository
TRAIN_CSV_URL = "https://raw.githubusercontent.com/hydradon/modelling-with-pythhon/master/dataset/tweet_sentiments/train.csv"
TEST_CSV_URL = "https://raw.githubusercontent.com/hydradon/modelling-with-pythhon/master/dataset/tweet_sentiments/test.csv"

train = pd.read_csv(TRAIN_CSV_URL)
test = pd.read_csv(TEST_CSV_URL)

# Peek at ten randomly chosen training rows
train.sample(10)

Unnamed: 0,id,label,tweet
5458,5459,0,P I N K M O N D A Y. #monday #work #truefruits #evian #pink #fruits #pinkmonday #apple… https://www.instagram.com/p/BBzVaFzQaAT/
2084,2085,1,@Vivo_India @UnboxTherapy Scamsters! Advertised a sale. Site went down 10 min's before the sale. Came back up & all gone. http://www.isitdownrightnow.com reported site down for 10 mins. #vivofraud...
3624,3625,0,iTunes sucks. So does Apple. I wish I could switch my fricken phone. #itunes #apple
5002,5003,0,Thought this was #dope #tattoo #art #amazing #fffound #color #nice #body #life #iphone #filter http://instagr.am/p/L_cgq/
3513,3514,0,Happy Thanks Giving #phone #blackberry #samsung #apple #thanks #giving #share #family #muchl http://instagr.am/p/SXdIqjGspz/
4068,4069,0,#gadget #apple #ipad #mini #tablet #new #instatech #friday #2013 #january #ios gonna love this thing! http://instagr.am/p/UoMdCkD5WK/
924,925,1,"@blackeysbeats mine didn't even have the decency to split, it just concertina'd and died pic.twitter.com/W2WZ5S0k7Z"
7577,7578,0,my response to ios11 :) #apple #iphone #iphonex #losingfaith #ios11 #samsung #note8 #win #finallymadetheswitchpic.twitter.com/8OKfAOdEvS
6209,6210,0,My phone #samsung #blackberry #instagram #dailyphoto #instagrammers #new #now #doubletap… http://instagram.com/p/Zt_ZDhikEl/
5233,5234,1,Need to get a new phone *sighhh* my like millionth one :/


In [14]:
# Check the distribution of labels
# normalize=True reports each label's share of the rows (fractions summing to 1) rather than raw counts.
train["label"].value_counts(normalize = True)

0    0.744192
1    0.255808
Name: label, dtype: float64

## Cleaning data

In [15]:
# Miscellaneous text clean-up applied to every tweet
def clean_text(s: str) -> str:
    """Normalize a raw tweet into lower-case, whitespace-collapsed text.

    Steps, in order: drop every ``http...`` run up to the next whitespace,
    delete the punctuation characters listed below (apostrophes, commas and
    periods are not in the list and are therefore kept), turn each digit into
    a space, squeeze whitespace runs into single spaces, and lower-case.
    """
    without_urls = re.sub(r'http\S+', '', s)

    # A deletion-only translation table removes exactly the listed characters
    drop_table = str.maketrans('', '', '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~')
    without_punct = without_urls.translate(drop_table)

    # Digits become spaces (not empty strings) so neighbouring words stay separated
    without_digits = re.sub(r'\d', ' ', without_punct)

    # split() with no argument also trims leading/trailing whitespace
    return ' '.join(without_digits.split()).lower()

# Store the cleaned text next to the raw tweet in both splits
train["processed_tweet"] = train["tweet"].map(clean_text)
test["processed_tweet"] = test["tweet"].map(clean_text)

In [16]:
# Lemmatization
# Load spaCy's English pipeline with the parser and NER components disabled;
# only token lemmas are read from it below.
# NOTE(review): 'en' is a shortcut model name; spaCy v3 dropped shortcut links, so this
# presumably needs 'en_core_web_sm' there — confirm against the installed spaCy version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Lemmatize every text in an iterable with the module-level spaCy pipeline
def lemmatization(texts):
    """Return a list holding one space-joined lemma string per input text.

    Each text is run through the module-level ``nlp`` pipeline and the
    ``lemma_`` of every token is joined with single spaces; the output list
    follows the input order.
    """
    return [' '.join(token.lemma_ for token in nlp(text)) for text in texts]

# Replace the cleaned text with its lemmatized form in both splits.
# This overwrites "processed_tweet", so re-running the cell lemmatizes already-lemmatized text.
for frame in (train, test):
    frame["processed_tweet"] = lemmatization(frame["processed_tweet"])

In [17]:
# View sample: ten random rows, to compare each raw tweet with its processed_tweet
train.sample(10)

Unnamed: 0,id,label,tweet,processed_tweet
2108,2109,1,"Thank you for the unnecessary headache, Apple. You and iCloud suck. #annoyed #wasteoftime Apple","thank -PRON- for the unnecessary headache , apple . -PRON- and icloud suck . annoyed wasteoftime apple"
762,763,1,why does my Itunes think its acceptable to be a $&@*# at this time #itsnot #apple #prick,why do -PRON- itune think -PRON- acceptable to be a at this time itsnot apple prick
1504,1505,0,"#selfie pose,see yourself & feel happy..it is important to be and yourself #Samsung #dubai pic.twitter.com/CnaUW3IqF4","selfie pose , see -PRON- feel happy .. -PRON- be important to be and -PRON- samsung dubai pic.twitter.comcnauw iqf"
4257,4258,1,#fuckYou #apple I'll #buy another though #lightningcable #fail 6'ter isn't #cheap https://www.instagram.com/p/BCT4r-fCgJZ/,fuckyou apple -PRON- will buy another though lightningcable fail ' ter be not cheap
7784,7785,0,"Crazy kid, always getting into something #instagram #iphone #4s #kittie #can #main$&@*# #meow #b http://instagr.am/p/IwhVL9A5Zl/","crazy kid , always get into something instagram iphone s kittie can main meow b"
6296,6297,0,"RT: @alibakes Okay, I'm just going to reestablish how much I love my iPhone. Srsly. #iPhone #AT&T #geek: R.. http://bit.ly/v4kfP","rt alibake okay , -PRON- be just go to reestablish how much i love -PRON- iphone . srsly . iphone att geek r .."
2408,2409,0,Just for fun #me #iPhone #new #instagram #white #instalove #instamood #instagood http://instagr.am/p/ORX6tIGUP5/,just for fun -PRON- iphone new instagram white instalove instamood instagood
4657,4658,1,Great I got to wait a whole hour to talk to someone about my iPhone !!!,great i get to wait a whole hour to talk to someone about -PRON- iphone
3954,3955,0,"Finally ! My first very own iPhone ""iPhone X"". Imma proud momma ! #worthit #smartphone #iphone #iphonex #workhard #payoff #happiness #technology #future #gadget #cool #legit #fire #onfire #awesome...",finally -PRON- first very own iphone iphone x. imma proud momma worthit smartphone iphone iphonex workhard payoff happiness technology future gadget cool legit fire onfire awesome awesomeness ly p...
782,783,1,I would like to give a big old #fuckyou to #Apple due to the fact my ipad and iPhone will not connect,i would like to give a big old fuckyou to apple due to the fact -PRON- ipad and iphone will not connect


## Building model

In [18]:
import tensorflow_hub as hub
import tensorflow as tf

# Load the ELMo v3 module from TensorFlow Hub.
# NOTE(review): trainable=True is requested, but the cells visible here only extract
# features and never run a training op — confirm it is needed.
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

In [19]:
# just a random sentence
x = ["Roasted ants are a popular snack in Columbia"]

# Extract ELMo features
# This only adds the ELMo ops to the graph: `embeddings` is a symbolic tensor and
# no session is run in this cell, so no values are computed here.
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


### Building ELMo vectors for all Tweets

In [23]:
# Graph ops for mean-pooled ELMo features, keyed by tf.Graph so they are built
# once per graph and reused for every batch instead of being re-added per call.
_ELMO_OPS = {}


def _elmo_ops():
    """Return ``(sentences, pooled, init_ops)`` for the current default graph, building them on first use."""
    graph = tf.get_default_graph()
    if graph not in _ELMO_OPS:
        # A string placeholder lets one copy of the ELMo ops serve every batch.
        sentences = tf.placeholder(tf.string, shape=[None])
        token_vectors = elmo(sentences, signature="default", as_dict=True)["elmo"]
        _ELMO_OPS[graph] = (
            sentences,
            # average over axis 1 — presumably the token axis of a
            # (batch, tokens, dim) tensor; confirm against the module docs
            tf.reduce_mean(token_vectors, 1),
            [tf.global_variables_initializer(), tf.tables_initializer()],
        )
    return _ELMO_OPS[graph]


def get_elmo_vectors(x: pd.Series):
    """Compute one averaged ELMo vector per text in ``x``.

    Parameters
    ----------
    x : pd.Series
        Texts to embed; converted with ``x.tolist()`` and fed as one batch.

    Returns
    -------
    The evaluated result of ``tf.reduce_mean(<"elmo" output>, 1)`` for the
    batch, as returned by ``Session.run``.

    Notes
    -----
    A fresh ``tf.Session`` is opened (and variables/tables re-initialized) on
    every call, but the graph ops themselves are created only once, so the
    default graph no longer grows with each batch.
    """
    sentences, pooled, init_ops = _elmo_ops()

    with tf.Session() as sess:
        sess.run(init_ops)

        # return average of ELMo features
        return sess.run(pooled, feed_dict={sentences: x.tolist()})

# Cut both frames into consecutive row chunks; the final chunk may be shorter
BATCH_SIZE = 100
train_batches = [train[start:start + BATCH_SIZE] for start in range(0, len(train), BATCH_SIZE)]
test_batches = [test[start:start + BATCH_SIZE] for start in range(0, len(test), BATCH_SIZE)]

In [None]:
# Get embedding: one array of averaged ELMo vectors per batch of processed tweets
elmo_train = [get_elmo_vectors(batch['processed_tweet']) for batch in train_batches]
elmo_test = [get_elmo_vectors(batch['processed_tweet']) for batch in test_batches]

In [33]:
# Join the per-batch arrays along the first axis (np.concatenate's default) into one array per split
elmo_train_new = np.concatenate(elmo_train)
elmo_test_new = np.concatenate(elmo_test)

## Save ELMo embeddings to file

In [34]:
import time
# Date stamp (e.g. 20190303) appended to the output file names
timestr = time.strftime("%Y%m%d")

# save elmo_train_new
# `with` closes the file even if pickle.dump raises part-way through
with open("elmo_train_{}.pickle".format(timestr), "wb") as pickle_out:
    pickle.dump(elmo_train_new, pickle_out)

# save elmo_test_new
with open("elmo_test_{}.pickle".format(timestr), "wb") as pickle_out:
    pickle.dump(elmo_test_new, pickle_out)

In [None]:
# load elmo_train_new
# NOTE(review): these hard-coded date stamps do not look like the "%Y%m%d" stamp the
# save cell writes — confirm the file names before running.
# pickle.load can execute arbitrary code: only open files you created yourself.
with open("elmo_train_03032019.pickle", "rb") as pickle_in:
    elmo_train_new = pickle.load(pickle_in)

# load elmo_test_new
with open("elmo_test_03032019.pickle", "rb") as pickle_in:
    elmo_test_new = pickle.load(pickle_in)

## Build model with ELMo embeddings

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable
x_train, x_valid, y_train, y_valid = train_test_split(
    elmo_train_new,
    train["label"],
    test_size=0.2,
    random_state=42,
)

In [36]:
# Baseline score using LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# NOTE(review): the recorded run of this cell ends with a convergence warning
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"). Raising max_iter or scaling the
# features would address it but would also change the score, so the default is kept.
lr = LogisticRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_valid)

# Ground truth goes first: f1_score's signature is (y_true, y_pred).
# ".6f" prints six decimal places.
print("F1 score of Logistic Regression model: {:.6f}".format(f1_score(y_valid, y_pred)))

F1 score of Logistic Regression model: 0.762590


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Prediction using ELMo
