In [1]:
#!pip install spacy
#!pip install nltk
#!python3 -m spacy download en_core_web_sm

In [49]:
import argparse
import numpy as np
import os
from tensorflow.keras.callbacks import EarlyStopping

In [43]:
# Name of the data set; it selects which pair of .npy files is read below.
data_set = 'imdb'

print('Load %s data set ...' % data_set)
# Texts (X) and labels (y) are stored as two separate .npy files that share
# the data-set name as a prefix.
data_X, y = (
    np.load(path_template % data_set)
    for path_template in ('./../data/datasets/%s_X.npy',
                          './../data/datasets/%s_y.npy')
)

Load imdb data set ...


In [44]:
# Show the first raw text sample exactly as loaded.
data_X[0]

'Soul Calibur is more solid than it ever was... with the new character creation, and the bad-ass chronicle of the sword mode on the home version.The arcade version is more complete, even though the character roster is smaller than the home version, this version is definitely the more pretty of the two, eliminating all of the "goofy/unrealistic" fighting styles found in the home version. If you were in any way disappointed with the home version, or perhaps thought it was "too much," you might find a much more likable and straight forward game of Soul Calibur in the arcade. Think you have what it takes to become a Legend?'

In [45]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import spacy
from spacy.lang.en import English
# Load spaCy's "en_core_web_sm" pipeline (the model named in the commented-out
# download command in the first cell); tokenizeText uses it for lemmatisation.
nlp = spacy.load("en_core_web_sm")

In [46]:
# NLTK's English stop words as a set; tokenizeText drops every token found in it.
STOPLIST = set(stopwords.words('english'))

In [47]:
import string

# Tokens treated as pure punctuation by tokenizeText: every ASCII punctuation
# character plus a few multi-character / typographic extras.
# The original list contained the closing curly quote twice and never the
# opening one; both curly quotes are listed now.
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”", "''"]

def tokenizeText(text):
    """Normalise one document into a space-separated string of lemmas.

    The text is stripped, has line breaks replaced by spaces and is
    lower-cased before being run through the module-level spaCy pipeline
    ``nlp``. Each token is replaced by its lower-cased, stripped lemma; when
    the lemma is the placeholder ``"-PRON-"`` the token's lower-cased surface
    form is used instead.

    A lemma is kept only if it is not in ``STOPLIST``, not in ``SYMBOLS``,
    has at least three characters and is purely alphabetic. Order and
    duplicates are preserved.

    Args:
        text: Raw document string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    normalised = text.strip().replace("\n", " ").replace("\r", " ").lower()

    lemmas = []
    for token in nlp(normalised):
        if token.lemma_ == "-PRON-":
            lemmas.append(token.lower_)
        else:
            lemmas.append(token.lemma_.lower().strip())

    def _keep(word):
        # Same four filters as before, evaluated in the same order.
        return (
            word.lower() not in STOPLIST
            and word not in SYMBOLS
            and len(word) >= 3
            and word.isalpha()
        )

    return ' '.join(word for word in lemmas if _keep(word))

In [48]:
from tqdm import notebook
# Clean every raw text with tokenizeText while showing a notebook progress bar.
X_clean = list(map(tokenizeText, notebook.tqdm(data_X)))

  0%|          | 0/50000 [00:00<?, ?it/s]

In [50]:
# Persist the cleaned texts to disk; np.save appends ".npy" to the given name
# because it has no extension.
np.save('X_clean_imdb', X_clean)

In [78]:
# Show the first cleaned text, for comparison with the raw sample shown earlier.
X_clean[0]

'soul calibur solid ever new character creation bad ass chronicle sword mode home arcade version complete even though character roster small home version version definitely pretty two eliminate goofy unrealistic fight style find home version way disappointed home version perhaps think much might find much likable straight forward game soul calibur arcade think take become legend'

In [79]:
## Tokeniser

In [80]:
# Vocabulary cap handed to the Keras Tokenizer below.
num_words = 5000

from tensorflow.keras.preprocessing.text import Tokenizer
# Shared Tokenizer instance; get_data() fits it on the training texts and uses
# it to turn texts into word-index sequences.
tokenizer = Tokenizer(num_words=num_words)

In [81]:
from sklearn.model_selection import train_test_split
def get_data():
    """Split the cleaned texts 50/50 and encode them as word-index sequences.

    Reads the module-level ``X_clean``, ``y`` and ``tokenizer``. The split is
    stratified on ``y`` with a fixed ``random_state`` of 42. The tokenizer is
    fitted on the training half only and then applied to both halves.

    Returns:
        Tuple ``(X_pool, y_pool, X_test, y_test)`` where the ``X_*`` entries
        are lists of integer sequences and the ``y_*`` entries are the labels
        returned by ``train_test_split``.
    """
    # Split indices rather than the texts so the texts can be gathered by position.
    all_indices = list(range(len(X_clean)))
    train_idx, test_idx, y_train, y_test = train_test_split(
        all_indices, y, test_size=0.5, random_state=42, stratify=y
    )

    train_texts = np.array([X_clean[i] for i in train_idx])
    test_texts = np.array([X_clean[i] for i in test_idx])

    # Build the vocabulary from the training texts only.
    tokenizer.fit_on_texts(train_texts)
    X_pool = tokenizer.texts_to_sequences(train_texts)
    X_test = tokenizer.texts_to_sequences(test_texts)

    return X_pool, y_train, X_test, y_test

In [82]:
# Build the training ("pool") and test splits as word-index sequences with their labels.
X_pool, y_pool, X_test, y_test = get_data()

In [120]:
# Hyper-parameters for sequence padding, the FastText model and training.
ngram_range = 1
# Embedding vocabulary size. Tied to the Tokenizer's num_words (previously a
# separate literal 5000) so the Embedding input size cannot drift away from the
# largest word index the tokenizer can emit.
max_features = num_words
# Every sequence is padded/truncated to this many time steps.
maxlen = 400
batch_size = 6
embedding_dims = 50
epochs = 10

In [121]:
from tensorflow.keras.preprocessing import sequence

print('Pad sequences (samples x time)...')
# Bring both splits to a fixed width of `maxlen` time steps.
X_pool, X_test = (
    sequence.pad_sequences(seqs, maxlen=maxlen)
    for seqs in (X_pool, X_test)
)
print('x_train shape:', X_pool.shape)
print('x_test shape:', X_test.shape)

Pad sequences (samples x time)...
x_train shape: (25000, 400)
x_test shape: (25000, 400)


In [122]:
## Model
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

class FastText(Model):
    """FastText-style classifier: Embedding -> GlobalAveragePooling1D -> Dense.

    Args:
        maxlen: Required size of the second axis of the inputs; enforced in
            ``call`` and passed to the Embedding layer as ``input_length``.
        max_features: ``input_dim`` of the Embedding layer.
        embedding_dims: ``output_dim`` of the Embedding layer.
        class_num: Number of units of the final Dense layer.
        last_activation: Activation of the final Dense layer.
    """

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=2,
                 last_activation='softmax'):
        super().__init__()
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.avg_pooling = GlobalAveragePooling1D()
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Rank-2 tensor whose second axis has size ``maxlen``.

        Returns:
            Output of the final Dense layer.

        Raises:
            ValueError: If ``inputs`` is not rank 2 or its second axis is not
                ``maxlen`` (including when that axis is unknown).
        """
        shape = inputs.get_shape()
        rank = len(shape)
        if rank != 2:
            raise ValueError('The rank of inputs of FastText must be 2, but now is %d' % rank)
        seq_len = shape[1]
        if seq_len != self.maxlen:
            # %s rather than %d: an unknown dimension is reported as None, and
            # '%d' % None would raise TypeError instead of this ValueError.
            raise ValueError('The maxlen of inputs of FastText must be %d, but now is %s' % (self.maxlen, seq_len))
        embedded = self.embedding(inputs)
        pooled = self.avg_pooling(embedded)
        return self.classifier(pooled)

In [87]:
# Make sure both label containers are NumPy arrays before one-hot encoding.
y_pool, y_test = np.array(y_pool), np.array(y_test)

In [88]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the labels. This overwrites y_pool / y_test with their encoded
# form, so the cell is meant to be run once per kernel session.
y_pool, y_test = to_categorical(y_pool), to_categorical(y_test)

In [130]:
# Build and compile the two-class FastText classifier.
model = FastText(maxlen, max_features, embedding_dims, class_num=2)
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

# Only the first n samples of each split are used for training / validation.
n = 500

# Stop once validation accuracy has not improved for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
# Use the `epochs` constant from the hyper-parameter cell instead of repeating
# the literal 10, so changing the configuration actually changes the run.
model.fit(X_pool[:n], y_pool[:n],
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(X_test[:n], y_test[:n]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.History at 0x7f0634088880>