In [1]:
import pandas as pd
import numpy as np
import spacy
from tqdm import tqdm
import re
import time
import pickle

pd.set_option('display.max_colwidth', 200)

In [2]:
train = pd.read_csv('dataset/train_2kmZucJ.csv')
test = pd.read_csv('dataset/test_oJQbWVk.csv')

train.shape, test.shape

((7920, 3), (1953, 2))

In [3]:
train['label'].value_counts(normalize= True)

0    0.744192
1    0.255808
Name: label, dtype: float64

In [4]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [5]:
train['clean_tweet'] = train['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))
test['clean_tweet'] = test['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))

In [6]:
train.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,#fingerprint #Pregnancy Test #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias…
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect...
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [7]:
# remove punctuation marks
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'

train['clean_tweet'] = train['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ''.join(ch for ch in x if ch not in set(punctuation)))

# convert text to lowercase
train['clean_tweet'] = train['clean_tweet'].str.lower()
test['clean_tweet'] = test['clean_tweet'].str.lower()

# remove numbers
train['clean_tweet'] = train['clean_tweet'].str.replace("[0-9]", " ")
test['clean_tweet'] = test['clean_tweet'].str.replace("[0-9]", " ")

# remove whitespaces
train['clean_tweet'] = train['clean_tweet'].apply(lambda x:' '.join(x.split()))
test['clean_tweet'] = test['clean_tweet'].apply(lambda x: ' '.join(x.split()))

In [8]:
train.head()

Unnamed: 0,id,label,tweet,clean_tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone,fingerprint pregnancy test android apps beautiful cute health igers iphoneonly iphonesia iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/,finally a transparant silicon case thanks to my uncle yay sony xperia s sonyexperias…
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu,we love this would you go talk makememories unplug relax iphone smartphone wifi connect...
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/,i'm wired i know i'm george i was made that way iphone cute daventry home
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!,what amazing service apple won't even talk to me about a question i have unless i pay them . for their stupid support


In [9]:
# load spaCy's language model
nlp = spacy.load('en', disable=['parser', 'ner'])

# function to lemmatize text
def lemmatization(texts):
    output = []
    for i in texts:
        s = [token.lemma_ for token in nlp(i)]
        output.append(' '.join(s))
    return output

In [10]:
train['clean_tweet'] = lemmatization(train['clean_tweet'])
test['clean_tweet'] = lemmatization(test['clean_tweet'])

In [11]:
train.sample(10)

Unnamed: 0,id,label,tweet,clean_tweet
883,884,0,Got that 10 sub goal! Unbelievable thank you everyone for the support http://www.twitch.tv/jdkong905 #twitch #twitchtv #twitchstreamer #twitchaffiliate #prime #subscribe #follow #hype #ps4 #gaming...,get that sub goal unbelievable thank -PRON- everyone for the support twitch twitchtv twitchstreamer twitchaffiliate prime subscribe follow hype ps gaming blessed game sony thankyou smallstep
6396,6397,1,So I've updated my iPod. But I can't see anything new or amazing that justifies the arduous amount of time I took doing so.,so -PRON- have update -PRON- ipod . but i can not see anything new or amazing that justify the arduous amount of time i take do so .
689,690,0,New phone :-) me #me #thakyousomuchdad #mypic #minenow #myphone #samsung #samsunggalaxy… http://instagram.com/p/lhqPw6lK-M/,new phone -PRON- -PRON- thakyousomuchdad mypic minenow myphone samsung samsunggalaxy …
1610,1611,0,Shine wherever you go #light #shine #glow #newyear #newchapter #girlpower #iphonex… https://www.instagram.com/p/BdqPAtZlKY9/,shine wherever -PRON- go light shine glow newyear newchapter girlpower iphonex …
7913,7914,1,Ok so my galaxy crashed after one day now I have to wait til Monday for my skyrocket but using iPhone for now...,ok so -PRON- galaxy crash after one day now i have to wait til monday for -PRON- skyrocket but use iphone for now ...
6190,6191,0,The Grab (Next in #naruto) . #kids #cute #nex5n #sony #saudi #ksa @ DQ Grill & Chill http://instagr.am/p/QE5UbnDZ5Y/,the grab next in naruto . kid cute nex n sony saudi ksa dq grill chill
5665,5666,0,"Enjoy and thanks... #apple team #thanks (with Aditya, utit, and Debbie at Lor In New Kuta Hotel) [pic] — https://path.com/p/4v7khK","enjoy and thank ... apple team thank with aditya , utit , and debbie at lor in new kuta hotel pic —"
1683,1684,1,How Apple can patent pinch to zoom when seing DiamondTouch made in 2001 by Mitsubishi http://bit.ly/PjbJCG #apple,how apple can patent pinch to zoom when see diamondtouch make in by mitsubishi apple
810,811,0,Track of the Day: ‘Ultraviolet (Light My Way)’ #news #photography #fashion #health #fail #tech #ipad #iphone … pic.twitter.com/ff4OC2tG4u,track of the day ' ultraviolet light -PRON- way ' news photography fashion health fail tech ipad iphone … pic.twitter.comff oc tg u
2899,2900,0,Happy weekend everyone #fridaypic a new project! #weekend #ice #fun #fridge #a7r #maxgennel #sony... http://fb.me/1akjuzPQH,happy weekend everyone fridaypic a new project weekend ice fun fridge a r maxgennel sony ...


In [12]:
import tensorflow_hub as hub
import tensorflow as tf

elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)