In [5]:
import keras

In [6]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# Seed NumPy's global random generator so NumPy-driven randomness repeats across runs.
# NOTE(review): no other library's RNG is seeded in the visible cells — confirm
# whether the Keras backend needs its own seed for full reproducibility.
np.random.seed(seed=7)

In [7]:
# load the dataset 
import pandas as pd

# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('short_train.csv', 'short_test.csv')
)


In [8]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,index,label,alpha,text
0,0,1,a,i wanna smoke a blunt w / her
1,1,1,a,NOT THE TANK OVER A WHITE SHIRT
2,2,1,a,Bruh ! ! She recorded her baby’s reaction to h...
3,3,1,a,i miss softball so much best time of my life
4,4,1,a,I will never forgive America for what they did...


In [9]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0,index,label,alpha,text
0,0,1,a,Kawhi became suspicious . Cleared to play but ...
1,1,1,a,"Shi * , I amma sleep good . Rather he stays or..."
2,2,1,a,"To get to an athlete , appeal to his toughness..."
3,3,1,a,Should give you guys an anatomy chart . So you...
4,4,1,a,"My only point is , KD should have gotten a 2nd..."


In [10]:
# Split Data / Labels
def _split_text_and_labels(frame):
    """Return the tokenised texts and the label array of one data frame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a 'text' column and a 'label' column.

    Returns
    -------
    tuple
        ``(tokens, labels)``: ``tokens`` is a list holding one list of
        whitespace-separated tokens per row; ``labels`` is a NumPy array of
        the 'label' column in the same row order.
    """
    # Work on the column values directly instead of looking rows up with
    # frame['text'][i]: that is a label lookup and breaks as soon as the index
    # is not exactly 0..n-1 (e.g. after filtering or shuffling the frame).
    # A missing text cell becomes an empty token list instead of raising
    # AttributeError when .split() is called on a float NaN.
    tokens = frame['text'].fillna('').astype(str).str.split().tolist()
    labels = np.asarray(frame['label'])
    return tokens, labels


X_train, Y_train = _split_text_and_labels(train_data)
X_test, Y_test = _split_text_and_labels(test_data)


In [11]:
from keras.preprocessing.text import Tokenizer
# Build the word index from the training tokens only, then encode both splits
# with that same fitted tokenizer.
t = Tokenizer()
t.fit_on_texts(X_train)
X_train, X_test = (
    t.texts_to_sequences(token_lists) for token_lists in (X_train, X_test)
)

In [17]:
# Input dimension for the embedding layer: one slot per entry in the
# tokenizer's word index, plus one because index 0 is never assigned to a word.
# word_index is used rather than word_counts so the size still covers every
# index if the tokenizer is ever given an oov_token (that token appears in
# word_index but not in word_counts).
vocab_size = len(t.word_index) + 1

In [13]:
# Show the raw text of the row labelled 0 in the training frame.
train_data.loc[0, 'text']

'i wanna smoke a blunt w / her '

In [14]:
# truncate and pad input sequences
# Bring every encoded sequence in both splits to one common length.
max_review_length = 500
X_train, X_test = [
    sequence.pad_sequences(encoded, maxlen=max_review_length)
    for encoded in (X_train, X_test)
]

In [18]:
# create the model: embedding -> LSTM -> single sigmoid unit
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()
# NOTE(review): the test split doubles as validation data here and is also the
# set scored in the final evaluation cell — confirm that no tuning decisions
# are made from these validation numbers.
# Keep the History object so the per-epoch metrics stay available and the cell
# does not end on a bare object repr.
history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=3, batch_size=64)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 32)           944320    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 997,621
Trainable params: 997,621
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 24960 samples, validate on 5362 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f3d02f8cad0>

In [19]:
# Score the trained model on the test split and report accuracy as a percentage.
scores = model.evaluate(X_test, Y_test, verbose=0)
# scores[1] is the first metric after the loss, i.e. the 'accuracy' metric
# requested in compile().
print("Accuracy: %.2f%%" % (100 * scores[1]))

Accuracy: 87.22%
