In [176]:
import pandas as pd
import re
import numpy as np
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize, word_tokenize 

from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
#nltk.download()

In [177]:
from keras.layers import LSTM, Convolution1D, Flatten, Dropout, Dense, MaxPool1D
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [178]:
df = pd.read_csv('./train_twitter.csv')
df_test = pd.read_csv('./test_tweets.csv')

In [179]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [180]:
df_test.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


## Pre-processing

In [181]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt  

In [182]:
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*") # remove all @user
df_test['tidy_tweet'] = np.vectorize(remove_pattern)(df_test['tweet'], "@[\w]*")

In [183]:
# remove special characters, numbers, punctuations
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z]", " ")
df_test['tidy_tweet'] = df_test['tidy_tweet'].str.replace("[^a-zA-Z]", " ")

In [184]:
# keep tweets having more than 3 words in train set
df['tidy_tweet'] = df['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3])) 

In [185]:
df.head(10)

Unnamed: 0,id,label,tweet,tidy_tweet
0,1,0,@user when a father is dysfunctional and is s...,when father dysfunctional selfish drags kids i...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cause they offer wheelchair...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model love take with time
4,5,0,factsguide: society now #motivation,factsguide society motivation
5,6,0,[2/2] huge fan fare and big talking before the...,huge fare talking before they leave chaos disp...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,next school year year exams think about that s...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,love land allin cavs champions cleveland cleve...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here


In [186]:
df_test.head(10)

Unnamed: 0,id,tweet,tidy_tweet
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedic...
1,31964,@user #white #supremacists want everyone to s...,white supremacists want everyone to see th...
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystohe...
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",rd bihday to my amazing hilarious nephew...
5,31968,choose to be :) #momtips,choose to be momtips
6,31969,something inside me dies ð¦ð¿â¨ eyes nes...,something inside me dies eyes nes...
7,31970,#finished#tattoo#inked#ink#loveitâ¤ï¸ #â¤ï¸...,finished tattoo inked ink loveit ...
8,31971,@user @user @user i will never understand why...,i will never understand why my dad left me...
9,31972,#delicious #food #lovelife #capetown mannaep...,delicious food lovelife capetown mannaep...


In [187]:
 # tokenizing
tokenized_tweet = df['tidy_tweet'].apply(lambda x:  word_tokenize(x))
tokenized_tweet_test = df_test['tidy_tweet'].apply(lambda x:  word_tokenize(x))

In [188]:
# removing stop words
stop = set(stopwords.words('english'))
tokenized_tweet = tokenized_tweet.apply(lambda x : [i for i in x if i not in stop])
tokenized_tweet_test = tokenized_tweet_test.apply(lambda x : [i for i in x if i not in stop])

In [189]:
tokenized_tweet.head()

0    [father, dysfunctional, selfish, drags, kids, ...
1    [thanks, lyft, credit, cause, offer, wheelchai...
2                                    [bihday, majesty]
3                            [model, love, take, time]
4                    [factsguide, society, motivation]
Name: tidy_tweet, dtype: object

In [190]:
# Lemmetizing
lmtzr = WordNetLemmatizer()
tokenized_tweet = tokenized_tweet.apply(lambda x: [lmtzr.lemmatize(i) for i in x]) # stemming
tokenized_tweet_test = tokenized_tweet_test.apply(lambda x: [lmtzr.lemmatize(i) for i in x])
tokenized_tweet.head()

0    [father, dysfunctional, selfish, drag, kid, dy...
1    [thanks, lyft, credit, cause, offer, wheelchai...
2                                    [bihday, majesty]
3                            [model, love, take, time]
4                    [factsguide, society, motivation]
Name: tidy_tweet, dtype: object

In [191]:
# Stemming
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet_test = tokenized_tweet_test.apply(lambda x: [stemmer.stem(i) for i in x])
tokenized_tweet.head()


0     [father, dysfunct, selfish, drag, kid, dysfunct]
1    [thank, lyft, credit, caus, offer, wheelchair,...
2                                    [bihday, majesti]
3                            [model, love, take, time]
4                          [factsguid, societi, motiv]
Name: tidy_tweet, dtype: object

In [192]:
tokenized_tweet_test

0        [studiolif, aislif, requir, passion, dedic, wi...
1        [white, supremacist, want, everyon, see, new, ...
2        [safe, way, heal, acn, altwaystoh, healthi, heal]
3        [hp, curs, child, book, reserv, alreadi, ye, h...
4        [rd, bihday, amaz, hilari, nephew, eli, ahmir,...
5                                          [choos, momtip]
6        [someth, insid, dy, eye, ness, smokeyey, tire,...
7         [finish, tattoo, ink, ink, loveit, thank, aleee]
8        [never, understand, dad, left, young, deep, in...
9        [delici, food, lovelif, capetown, mannaepicur,...
10       [dayswast, narcosi, infinit, ep, make, awar, g...
11       [one, world, greatest, spo, event, leman, team...
12                        [half, way, websit, allgoingwel]
13       [good, food, good, life, enjoy, call, garlic, ...
14       [stand, behind, guncontrolpleas, senselessshoo...
15       [ate, ate, ate, jamaisasthi, fish, curri, praw...
16             [got, limit, edit, rain, shine, set, toda

In [193]:
df['tidy_tweet'] = tokenized_tweet
df_test['tidy_tweet'] = tokenized_tweet_test

In [194]:
token = Tokenizer()

In [195]:
# using all train and test set examples to cover whole vocabulary
token.fit_on_texts(pd.concat([df['tidy_tweet'],df_test['tidy_tweet']])) 

In [196]:
# index tokens
train_tokens = token.texts_to_sequences(df['tidy_tweet'])
test_tokens = token.texts_to_sequences(df_test['tidy_tweet'])

In [197]:
# pad each tweet so that all tweets have equal words
X_train = pad_sequences(train_tokens, maxlen=30)
X_test = pad_sequences(test_tokens, maxlen=30)

In [199]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,   11, 5690, 2208, 1776,  124, 5690], dtype=int32)

In [200]:
vocab = len(token.word_counts)

In [201]:
y = df['label']

# Train model and predict

In [202]:
# sample train and validation sets
X_train1 = X_train[:25000,:] 
X_val = X_train[25000:,:]

y_train1 = y[:25000]
y_val = y[25000:]

In [215]:
# Training Embedding and LSTM layers
embedding_vector_length = 300 
model = Sequential() 
model.add(Embedding(vocab + 1, embedding_vector_length,input_length=30)) 
model.add(LSTM(100))

model.add(Dense(1, activation='sigmoid')) 

model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
model.fit(X_train1, y_train1, validation_data=(X_val, y_val), nb_epoch=3, batch_size=512) 

Train on 25000 samples, validate on 6962 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1b198c05c0>

In [216]:
sum(model.predict_classes(X_test))

array([1015], dtype=int32)

In [217]:
f1_score(y_val, model.predict_classes(X_val))

0.6589595375722542

In [218]:
pd.DataFrame(model.predict_classes(X_test)).to_csv('pred_twitter_lstm.csv') # Final predictions on test set