### Left to Do:
#### Map the IMDB set to the GloVe embeddings
#### Split the Twitter set into 10 folds (k-fold cross-validation, k = 10)
#### Fine-tune the IMDB model with each split as an epoch, storing the predictions on the test fold (reverting to the BaseModel for each fold)
#### Calculate roc_auc_score for the methodology

In [26]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model
from keras.datasets import imdb
from keras.layers.embeddings import Embedding
import pandas as pd
import numpy as np

In [27]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup.
# Each line of the file is a token followed by its whitespace-separated coefficients.
embeddings_index = {}
# `with` guarantees the handle is closed even if a malformed line makes the
# float conversion below raise part-way through the file.
with open('glove.6B.50d.txt') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [28]:
# Read the tab-separated tweet sentiment file. It is read without a header row,
# so the two columns are named explicitly.
ds = pd.read_csv(
    'sentiment.tsv',
    sep='\t',
    header=None,
    names=['label', 'text'],
)
# Preview the first rows as the cell output.
ds.head()

Unnamed: 0,label,text
0,neg,"@jamielewislewis i cant believe it, it really ..."
1,pos,having a vodka tonic and looking forward to go...
2,pos,@ddlovatofans1neg1 Could you follow me please....
3,pos,@jordanknight for once.................. PLEAS...
4,neg,Had a dream about a walk in fast food resturau...


In [29]:
# Prepare the targets and raw texts used in the cells below.
# The label column is dummy-encoded and the first dummy column is dropped,
# so Y is a 2-D array of indicator columns rather than a flat vector.
label_dummies = pd.get_dummies(ds['label'], drop_first=True)
Y = label_dummies.values
# Raw tweet strings, in the same row order as Y.
texts = ds['text'].values

In [30]:
# Gather simple corpus statistics from whitespace tokenisation of each tweet:
#   all_text         -- every token of every tweet, in order
#   sequence_lengths -- number of tokens per tweet
all_text = []
sequence_lengths = []
for tweet in texts:
    tokens = tweet.split()
    sequence_lengths.append(len(tokens))
    all_text.extend(tokens)
# Single-argument print() with explicit formatting produces the same text on
# Python 2 and Python 3 (a multi-argument print() would print a tuple on Python 2).
print("Unique 'Words': %d" % len(np.unique(all_text)))

Unique 'Words': 8639


In [31]:
# Set the parameters shared by the tokenising / embedding cells below.
# Passed to Tokenizer(num_words=...) when the tweets are tokenised.
MAX_NB_WORDS = 5000
# Passed to pad_sequences(maxlen=...); becomes the second dimension of X.
MAX_SEQUENCE_LENGTH = 500
# Number of columns in embedding_matrix; presumably chosen to match the
# 50-dimensional vectors of glove.6B.50d.txt -- keep in sync if the file changes.
EMBEDDING_DIM = 50

In [32]:
# Convert every tweet to a sequence of integer word ids, then bring all
# sequences to a common length of MAX_SEQUENCE_LENGTH.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word -> integer id mapping learned by the tokenizer; used later to build
# the embedding matrix.
word_index = tokenizer.word_index
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Single-argument print() with explicit formatting produces the same text on
# Python 2 and Python 3 (a multi-argument print() would print a tuple on Python 2).
# The shape is wrapped in a 1-tuple so that % formats it as one value.
print('Shape of data tensor: %s' % (X.shape,))
print('Shape of label tensor: %s' % (Y.shape,))

Shape of data tensor: (2001, 500)
Shape of label tensor: (2001, 1)


In [33]:
# Shuffle the samples (for original testing only).
# NOTE: this cell only reorders X and Y with one shared permutation; it does
# not actually split them into train and test sets.
# A dedicated, seeded generator makes the order reproducible across kernel
# restarts without touching numpy's global random state.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)
indices = shuffle_rng.permutation(X.shape[0])
# Re-running this cell permutes the already-shuffled arrays again.
X = X[indices]
Y = Y[indices]

In [34]:
# Create the Twitter embedding matrix: row i holds the GloVe vector of the
# word whose tokenizer id is i.
# The matrix starts as all zeros, so any row that is never assigned below
# (words missing from the GloVe index) stays all-zero.
# The extra row (+1) presumably covers id 0, which word_index does not use --
# confirm against the Tokenizer documentation.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# print() call form works on both Python 2 and Python 3.
print(len(word_index))

6105


In [35]:
# Load the IMDB dataset that the BaseModel is trained on.
# top_words is handed to imdb.load_data(num_words=...); max_words is the
# common review length (in words) that every sequence is padded to.
top_words = 5000
max_words = 500
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# Pad both splits to the same maximum review length in one step.
X_train, X_test = (
    pad_sequences(reviews, maxlen=max_words) for reviews in (X_train, X_test)
)

In [36]:
# Create the BaseModel: embedding -> 1-D convolution -> max pooling ->
# flatten -> dense hidden layer -> single sigmoid output for the binary label.
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

# Fit the BaseModel.
# NOTE: the test split is also used as validation data here, so the accuracy
# reported below is measured on the same data that was monitored during training.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=128, verbose=2)
# Final evaluation of the model. scores[0] is the loss and scores[1] is the
# 'accuracy' metric requested in compile().
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               2000250   
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 251       
Total params: 2,163,605
Trainable params: 2,163,605
Non-trainable params: 0
_________________________________________________________________
