In [51]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, Dropout
from keras.models import Model
from keras.datasets import imdb
from keras.layers.embeddings import Embedding
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from keras.models import model_from_json
from keras.optimizers import Adam, Adamax
import pandas as pd
import numpy as np
import os
import sys
import re

In [21]:
ds = pd.read_csv('sentiment.tsv', delimiter='\t', header=None, names=['label', 'text'])
ds.head()

Unnamed: 0,label,text
0,neg,"@jamielewislewis i cant believe it, it really ..."
1,pos,having a vodka tonic and looking forward to go...
2,pos,@ddlovatofans1neg1 Could you follow me please....
3,pos,@jordanknight for once.................. PLEAS...
4,neg,Had a dream about a walk in fast food resturau...


In [22]:
#Prepare the data to evaluate
score = pd.get_dummies(ds['label'], drop_first=True).values
Y = np.ravel(score)
texts = ds['text'].values
#Convert twitter call-signs to the same word
#I was going to use 'fish' because it is singular and plural
#decided to use person because I thought it would match the embedding vectors better
CleanText = []
for z in texts:
    T = re.sub(r'[^@\s]*@\S*', 'person', z)
    CleanText.append(T)
CleanText = np.asarray(CleanText)

In [23]:
#Set Params
MAX_NB_WORDS = 6105
MAX_SEQUENCE_LENGTH = 500
EMBEDDING_DIM = 50
IMBD_TRAINING_DIR = "./test"
IMBD_TESTING_DIR = "./train"

In [24]:
def get_text_samples(TEXT_DATA_DIR):
    texts = []  # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []
    for name in sorted(os.listdir(TEXT_DATA_DIR)):
        path = os.path.join(TEXT_DATA_DIR, name)
        if os.path.isdir(path):
            label_id = len(labels_index)
            labels_index[name] = label_id
            for fname in sorted(os.listdir(path)):
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)
       
    return np.asarray(texts), np.asarray(labels) 

In [25]:
Ximbd_test, Yimbd_test = get_text_samples(IMBD_TESTING_DIR)
Ximbd_train, Yimbd_train = get_text_samples(IMBD_TRAINING_DIR)

In [26]:
#Create a word_index for all words in both datasets
ALLTEXTS = np.concatenate((Ximbd_test, Ximbd_train, CleanText))
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(ALLTEXTS)

#Create numerical representation of text
def get_sequences(texts, tokenizer):
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    X = pad_sequences(sequences, maxlen=500)
    return X
XTwitter = get_sequences(CleanText, tokenizer)
Ximbd_test = get_sequences(Ximbd_test, tokenizer)
Ximbd_train = get_sequences(Ximbd_train, tokenizer)

In [27]:
#Check to make sure that your tensors are shaped properly
print 'Shape of data tensor:', XTwitter.shape
print 'Shape of label tensor:', Y.shape
print 'Shape of IMBD tensor:', Ximbd_test.shape
print 'Shape of IMBD label tensor:', Yimbd_test.shape

#Get the embedding size
print 'Max', XTwitter.max()
print 'Max', Ximbd_test.max()
print 'Max', Ximbd_train.max()

Shape of data tensor: (2001, 500)
Shape of label tensor: (2001,)
Shape of IMBD tensor: (25000, 500)
Shape of IMBD label tensor: (25000,)
Max 125257
Max 125285
Max 125289


In [61]:
## create the BaseModel
'''
model = Sequential()
model.add(Embedding(125290, 32, input_length=500))
model.add(Conv1D(64, 3, padding='same'))
model.add(Conv1D(32, 3, padding='same'))
model.add(Conv1D(16, 3, padding='same'))
model.add(Flatten())
model.add(Dropout(0.1))
model.add(Dense(180,activation='sigmoid'))
model.add(Dropout(0.1))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
'''

model = Sequential()
model.add(Embedding(125290, 50, input_length=500))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


'''
model = Sequential()
model.add(Embedding(125290, 100, input_length=500))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
'''

print model.summary()
# Fit the BaseModel
model.fit(Ximbd_train, Yimbd_train, validation_data=(Ximbd_test, Yimbd_test), epochs=2, batch_size=64, verbose=1)
# Final evaluation of the model
scores = model.evaluate(Ximbd_test, Yimbd_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 500, 50)           6264500   
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 500, 32)           4832      
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 250)               2000250   
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 251       
Total params: 8,269,833
Trainable params: 8,269,833
Non-trainable params: 0
_________________________________________________________________


In [62]:
#save the model since you can't deep copy a NN while using Keras+Tensorflow
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model.h5")
print"Saved model to disk"

Saved model to disk


In [63]:
#Run 1 epoch on each fold. Store Predictions. Evaluate AUC ROC
kf = KFold(n_splits=10, shuffle=True)
testPreds = np.zeros_like(Y, dtype=float)
counter = 1
for train_set, test_set in kf.split(XTwitter, Y):
    # load json and create model
    json_file = open('model.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)
    model.load_weights("model.h5")
    print "Loaded model from disk"
    print "Training Fold %s of 10" %str(counter)
    ADAM = Adamax(lr=.005)
    model.compile(loss='binary_crossentropy', optimizer=ADAM, metrics=['accuracy'])
    #run fit model to fold
    model.fit(XTwitter[train_set], Y[train_set], verbose=1, epochs=1, batch_size=4)
    testPreds[test_set] = model.predict(XTwitter[test_set])
    counter += 1
print roc_auc_score(Y, testPreds)


Loaded model from disk
Training Fold 1 of 10
Epoch 1/1




Loaded model from disk
Training Fold 2 of 10
Epoch 1/1
Loaded model from disk
Training Fold 3 of 10
Epoch 1/1
Loaded model from disk
Training Fold 4 of 10
Epoch 1/1
Loaded model from disk
Training Fold 5 of 10
Epoch 1/1
Loaded model from disk
Training Fold 6 of 10
Epoch 1/1
Loaded model from disk
Training Fold 7 of 10
Epoch 1/1
Loaded model from disk
Training Fold 8 of 10
Epoch 1/1
Loaded model from disk
Training Fold 9 of 10
Epoch 1/1
Loaded model from disk
Training Fold 10 of 10
Epoch 1/1
0.752469123466


In [None]:
#If you would rather use a different embedding you could use the wiki-set
'''
#Use large dataset embeddings
embeddings_index = {}
f = open('glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
'''