# Neural Networks with Word2Vec

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from nltk import word_tokenize
from gensim.models import word2vec

In [2]:
# Count/length columns that are discarded from both frames right after loading.
dropped_feature_columns = [
    'hashtag_count',
    'mention_count',
    'weblink_count',
    'tweet_length',
    'punct_count',
]

# Read each CSV and drop those columns in a single chained step.
df_tri = pd.read_csv('branded_tweets_clean.csv').drop(columns=dropped_feature_columns)
df_bin = pd.read_csv('binary_tweets.csv').drop(columns=dropped_feature_columns)

def clean_string(text):
    """Return ``text`` with every ``[``, ``]``, ``'`` and ``,`` character removed.

    All other characters (including whitespace) are left exactly as they are.
    """
    # A translation table that maps each unwanted character to None deletes it.
    unwanted_chars = str.maketrans('', '', "[]',")
    return text.translate(unwanted_chars)

# Strip the list-literal punctuation from the token column of both frames.
for tweets_frame in (df_bin, df_tri):
    tweets_frame['tweet_tokens'] = tweets_frame['tweet_tokens'].map(clean_string)

In [3]:
# Preview the first rows of df_tri.
df_tri.head()

Unnamed: 0,tweet,brand_product,sentiment,tweet_tokens
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,0,wesley83 3g iphone 3 hr tweeting riseaustin de...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,1,jessedee know fludapp awesome ipadiphone app y...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,1,swonderlin wait ipad 2 also sale sxsw
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,0,sxsw hope year festival isnt crashy year iphon...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,1,sxtxstate great stuff fri sxsw marissa mayer g...


In [4]:
# Preview the first rows of df_bin.
df_bin.head()

Unnamed: 0,tweet,brand_product,sentiment,tweet_tokens
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,0,wesley83 3g iphone 3 hr tweeting riseaustin de...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,1,jessedee know fludapp awesome ipadiphone app y...
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,1,swonderlin wait ipad 2 also sale sxsw
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,0,sxsw hope year festival isnt crashy year iphon...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,1,sxtxstate great stuff fri sxsw marissa mayer g...


In [5]:
# One cleaned token string per tweet.
bag_of_words = df_bin['tweet_tokens']

# Flatten the whitespace-separated tokens of every tweet into one list,
# keeping duplicates and the original order.
words_list = [token for tweet_text in bag_of_words for token in tweet_text.split()]

# Distinct tokens seen across all tweets.
total_vocabulary = set(words_list)

In [6]:
# Total number of tokens collected, duplicates included.
len(words_list)

101802

In [7]:
# Number of distinct tokens.
len(total_vocabulary)

8929

In [39]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras import regularizers
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

In [9]:
# Target for the binary models: the raw 'sentiment' column of df_bin.
y_bin = df_bin.loc[:, 'sentiment']

# Target for df_tri: one indicator column per distinct 'sentiment' value, as a plain array.
sentiment_indicators = pd.get_dummies(df_tri.loc[:, 'sentiment'])
y_tri = sentiment_indicators.values

In [51]:
# Fit a word-index tokenizer on the binary tweets, capped with num_words=6500
# (the same value is used as the Embedding input dimension of the models).
tokenizer_bin = text.Tokenizer(num_words=6500)
tweet_texts_bin = df_bin['tweet_tokens']
tokenizer_bin.fit_on_texts(tweet_texts_bin.tolist())

# Turn each tweet into a sequence of integer word indices, then pad the
# sequences into a single array that can be fed to the models.
list_tokenized_tweets_bin = tokenizer_bin.texts_to_sequences(tweet_texts_bin)
X_bin = sequence.pad_sequences(list_tokenized_tweets_bin)

In [52]:
# Create the (initially empty) Sequential container for the first binary model.
model_bin = Sequential()

In [53]:
# Width of the learned word vectors for the first binary model.
embedding_size = 100

# Build the whole stack in one constructor call and rebind model_bin, so that
# re-running this cell always yields a fresh model instead of appending a
# second copy of these layers to the already-populated one.
model_bin = Sequential([
    Embedding(6500, embedding_size),   # word index (< 6500) -> dense vector
    LSTM(30),                          # sequence -> single 30-unit state
    Dropout(0.5),
    Dense(20, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),    # single probability for the binary label
])

In [54]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported during training.
model_bin.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [55]:
# Print the layer-by-layer output shapes and parameter counts of model_bin.
model_bin.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, None, 100)         650000    
_________________________________________________________________
lstm_13 (LSTM)               (None, 30)                15720     
_________________________________________________________________
dropout_28 (Dropout)         (None, 30)                0         
_________________________________________________________________
dense_27 (Dense)             (None, 20)                620       
_________________________________________________________________
dropout_29 (Dropout)         (None, 20)                0         
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 21        
Total params: 666,361
Trainable params: 666,361
Non-trainable params: 0
_________________________________________________________________


In [56]:
# Train model_bin for 40 epochs, holding out 30% of the rows for validation.
# NOTE(review): Keras documents validation_split as taking the *last* rows of
# x/y before any shuffling — confirm the CSV rows are not ordered by label.
model_bin.fit(
    x=X_bin,
    y=y_bin,
    batch_size=256,
    epochs=40,
    validation_split=0.3,
)

Train on 5831 samples, validate on 2499 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fb2b69c9438>

In [57]:
# Second binary model: same layer layout as model_bin with smaller sizes
# (32-d embeddings, 15 LSTM units, 10 hidden units).
model_bin2 = Sequential([
    Embedding(6500, 32),
    LSTM(15),
    Dropout(0.5),
    Dense(10, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [58]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported during training.
model_bin2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [59]:
# Print the layer-by-layer output shapes and parameter counts of model_bin2.
model_bin2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, None, 32)          208000    
_________________________________________________________________
lstm_14 (LSTM)               (None, 15)                2880      
_________________________________________________________________
dropout_30 (Dropout)         (None, 15)                0         
_________________________________________________________________
dense_29 (Dense)             (None, 10)                160       
_________________________________________________________________
dropout_31 (Dropout)         (None, 10)                0         
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 11        
Total params: 211,051
Trainable params: 211,051
Non-trainable params: 0
_________________________________________________________________


In [60]:
# Train model_bin2 for 30 epochs, holding out 30% of the rows for validation.
# NOTE(review): Keras documents validation_split as taking the *last* rows of
# x/y before any shuffling — confirm the CSV rows are not ordered by label.
model_bin2.fit(
    x=X_bin,
    y=y_bin,
    batch_size=256,
    epochs=30,
    validation_split=0.3,
)

Train on 5831 samples, validate on 2499 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fb2b98f2ac8>

In [64]:
# Third binary model: the same layout again, shrunk further
# (16-d embeddings, 10 LSTM units, 5 hidden units).
model_bin3 = Sequential([
    Embedding(6500, 16),
    LSTM(10),
    Dropout(0.5),
    Dense(5, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

In [65]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported during training.
model_bin3.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer output shapes and parameter counts.
model_bin3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, None, 16)          104000    
_________________________________________________________________
lstm_16 (LSTM)               (None, 10)                1080      
_________________________________________________________________
dropout_34 (Dropout)         (None, 10)                0         
_________________________________________________________________
dense_33 (Dense)             (None, 5)                 55        
_________________________________________________________________
dropout_35 (Dropout)         (None, 5)                 0         
_________________________________________________________________
dense_34 (Dense)             (None, 1)                 6         
Total params: 105,141
Trainable params: 105,141
Non-trainable params: 0
_________________________________________________________________


In [66]:
# Train model_bin3 for 30 epochs, holding out 30% of the rows for validation.
# NOTE(review): Keras documents validation_split as taking the *last* rows of
# x/y before any shuffling — confirm the CSV rows are not ordered by label.
model_bin3.fit(
    x=X_bin,
    y=y_bin,
    batch_size=256,
    epochs=30,
    validation_split=0.3,
)

Train on 5831 samples, validate on 2499 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fb2b98bc860>

In [85]:
# Fourth binary model: adds an L1-regularised Dense layer between the
# embedding and the LSTM (the summary shows it keeps the sequence axis,
# output shape (None, None, 8)) and L1-regularises the hidden Dense layer.
model_bin4 = Sequential([
    Embedding(6500, 16),
    Dense(8, activation='relu', kernel_regularizer=regularizers.l1(0.005)),
    Dropout(0.5),
    LSTM(5),
    Dropout(0.3),
    Dense(4, activation='relu', kernel_regularizer=regularizers.l1(0.005)),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])

In [86]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported during training.
model_bin4.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer output shapes and parameter counts.
model_bin4.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, None, 16)          104000    
_________________________________________________________________
dense_53 (Dense)             (None, None, 8)           136       
_________________________________________________________________
dropout_53 (Dropout)         (None, None, 8)           0         
_________________________________________________________________
lstm_21 (LSTM)               (None, 5)                 280       
_________________________________________________________________
dropout_54 (Dropout)         (None, 5)                 0         
_________________________________________________________________
dense_54 (Dense)             (None, 4)                 24        
_________________________________________________________________
dropout_55 (Dropout)         (None, 4)                 0         
__________

In [92]:
# Train model_bin4 for 25 epochs, holding out 25% of the rows for validation.
# NOTE(review): Keras documents validation_split as taking the *last* rows of
# x/y before any shuffling — confirm the CSV rows are not ordered by label.
model_bin4.fit(
    x=X_bin,
    y=y_bin,
    batch_size=150,
    epochs=25,
    validation_split=0.25,
)

Train on 6247 samples, validate on 2083 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fb2a95a99e8>