In [67]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
import json

from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, LSTM, Embedding, SpatialDropout1D
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import BatchNormalization, Activation, ZeroPadding2D

In [180]:
# Raw review corpora: the Amazon reviews CSV and the Rent the Runway reviews,
# the latter stored as JSON Lines (one record per line).
df_idf = pd.read_csv(filepath_or_buffer='amazon/reviews.csv')
df_dataset = pd.read_json(
    path_or_buf='clothing_dataset/renttherunway_final_data.json',
    lines=True,
)

In [181]:
def pre_process(text):
    """Normalise a raw review string before tokenisation.

    Steps, in order:
      1. lowercase the text;
      2. drop markup tags, whether literal (``<br/>``) or HTML-escaped
         (``&lt;br/&gt;``);
      3. collapse every run of digits / non-word characters into one space.

    Args:
        text: the review text as a ``str``.

    Returns:
        The cleaned string. Leading/trailing single spaces may remain where the
        input started or ended with digits or punctuation.
    """
    text = text.lower()

    # Tags are replaced by a plain space. Substituting an entity-based placeholder
    # here would survive the next step as the stray tokens "lt" and "gt".
    text = re.sub(r"(?:<|&lt;)/?.*?(?:>|&gt;)", " ", text)

    # Digits and special characters become a single separator.
    text = re.sub(r"(\d|\W)+", " ", text)

    return text

def get_stop_words(stop_file_path):
    """Read a stop-word list (one entry per line) into an immutable set.

    Args:
        stop_file_path: path to a UTF-8 text file.

    Returns:
        A ``frozenset`` of the lines with surrounding whitespace stripped.
    """
    with open(stop_file_path, 'r', encoding='utf-8') as handle:
        return frozenset(line.strip() for line in handle)

In [182]:
# Build one free-text field per review from its headline and its body, then clean it.
# Missing parts are treated as empty strings: without fillna, NaN + " " + body is NaN,
# and str(NaN) would replace the whole review with the literal token "nan".
df_idf['text'] = (
    df_idf['title'].fillna('') + " " + df_idf['body'].fillna('')
).apply(lambda x: pre_process(str(x)))

df_dataset['text'] = (
    df_dataset['review_summary'].fillna('') + " " + df_dataset['review_text'].fillna('')
).apply(lambda x: pre_process(str(x)))

# Keep only the cleaned text and the rating from the Rent the Runway frame.
sub_dataset = df_dataset[['text', 'rating']]

In [183]:
# Stack both corpora: df_idf rows first, then sub_dataset rows. The per-frame indexes
# are kept (not reset), exactly as Series.append did, so the result is split
# positionally at len(df_idf) further down.
# pd.concat is used because Series.append was removed in pandas 2.0.
all_data = pd.concat([df_idf['text'], sub_dataset['text']])
all_rating = pd.concat([df_idf['rating'], sub_dataset['rating']])

In [184]:
# Binarise the ratings into sentiment labels (1 = positive, 0 = otherwise).
# The first len(df_idf) entries come from df_idf and use the cut-off > 3.5; the
# remaining entries come from sub_dataset and use the cut-off > 5.
# NaN ratings compare False and therefore become 0.
#
# The labels are computed on a NumPy copy instead of assigning into slices of
# `all_rating`: that assignment rewrote `all_rating` itself, so re-running the cell
# thresholded already-binarised values and silently produced all-zero labels.
_ratings = all_rating.to_numpy()
_n_first = len(df_idf)
y = np.concatenate(
    [_ratings[:_n_first] > 3.5, _ratings[_n_first:] > 5]
).astype(_ratings.dtype)  # keep the numeric dtype the ratings already had

In [185]:
# Change this to use both datasets
#X_train, X_test, y_train, y_test = train_test_split(all_data, y, test_size=0.1, random_state=37)
# X_train = df_dataset['text']
# y_train = y[len(df_idf):]

# X_test = df_idf['text'].to_numpy()
# y_test = y[:len(df_idf)]

# print('# Train data samples:', X_train.shape)
# print('# Test data samples:', X_test.shape)

# print('Sample train', X_train[0])
# print('\nSample test', X_test[0])
# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)
# assert X_train.shape[0] == y_train.shape[0]
# assert X_test.shape[0] == y_test.shape[0]

In [186]:
# Length every token sequence is padded to (passed as `maxlen` to pad_sequences and as
# `input_length` to the Embedding layers). 5260 is the maximum whitespace-split review
# length reported by seq_lengths.describe() below.
MAX_LEN = 5260
# Width of one word vector: second dimension of emb_matrix and of the Embedding layers.
GLOVE_DIM = 300
# Vocabulary cap: passed to the Tokenizer as `num_words`; also the number of rows in
# emb_matrix and the input dimension of the Embedding layers.
NB_WORDS = 49781

In [187]:
from keras.preprocessing.text import Tokenizer

# Fit one tokenizer on the combined corpus so both datasets share a single word index,
# then encode every review as a list of integer word ids.
tk = Tokenizer(
    num_words=NB_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    split=" ",
)
tk.fit_on_texts(all_data)
all_data_seq = tk.texts_to_sequences(all_data)

In [188]:
# Positional split of the stacked corpus: the first len(df_idf) entries (the df_idf
# reviews) become the "test" side, everything after them (sub_dataset) the "train" side.
X_test_seq, X_train_seq = all_data_seq[:len(df_idf)], all_data_seq[len(df_idf):]
y_test, y_train = y[:len(df_idf)], y[len(df_idf):]

In [189]:
# Number of space-separated pieces per cleaned review; the summary statistics show
# how long the padded sequences need to be.
seq_lengths = all_data.map(lambda text: len(text.split(' ')))
print(seq_lengths.describe())

count    275359.000000
mean         63.517557
std          69.966379
min           1.000000
25%          26.000000
50%          50.000000
75%          82.000000
max        5260.000000
Name: text, dtype: float64


In [190]:
# Pad (or truncate) every id sequence to exactly MAX_LEN entries so each split becomes
# a rectangular 2-D array.
X_train_seq_trunc, X_test_seq_trunc = (
    pad_sequences(seqs, maxlen=MAX_LEN) for seqs in (X_train_seq, X_test_seq)
)

In [191]:
# One padded example, followed by the shapes of the feature and label arrays.
print(X_train_seq_trunc[10])
for arr in (X_train_seq_trunc, X_test_seq_trunc, y_train, y_test):
    print(arr.shape)

[  0   0   0 ...  61  14 316]
(192544, 5260)
(82815, 5260)
(192544,)
(82815,)


In [192]:
# Hold out 10% of the padded training sequences for validation (fixed seed).
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(
    X_train_seq_trunc,
    y_train,
    test_size=0.1,
    random_state=37,
)

# Features and labels must stay aligned after the split.
assert X_train_emb.shape[0] == y_train_emb.shape[0]
assert X_valid_emb.shape[0] == y_valid_emb.shape[0]

print('Shape of validation set:',X_valid_emb.shape)

Shape of validation set: (19255, 5260)


In [193]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup.
# This cell must run: `emb_dict` is read by the cells below, and with the loader
# commented out a fresh kernel fails there with a NameError.
# Each line of the file is "<word> <v1> <v2> ... <vN>".
glove_file = 'glove.42B.' + str(GLOVE_DIM) + 'd.txt'
emb_dict = {}
with open(glove_file, encoding='utf-8') as glove:  # context manager closes the file on error too
    for line in glove:
        values = line.split()
        word = values[0]
        emb_dict[word] = np.asarray(values[1:], dtype='float32')

In [194]:
# Spot check: confirm a few everyday words have a pre-trained vector.
airline_words = ['car', 'nice', 'flight', 'luggage']
for w in airline_words:
    if w not in emb_dict:
        continue
    print('Found the word {} in the dictionary'.format(w))

Found the word car in the dictionary
Found the word nice in the dictionary
Found the word flight in the dictionary
Found the word luggage in the dictionary


In [195]:
# Here we build a matrix that represent words and it corresponding emdg
emb_matrix = np.zeros((NB_WORDS, GLOVE_DIM))

for w, i in tk.word_index.items():
    if i < NB_WORDS:
        vect = emb_dict.get(w)
        if vect is not None:
            emb_matrix[i] = vect
    else:
        break

In [196]:
# Classifier: embedding lookup -> flatten -> single sigmoid unit.
glove_model = Sequential([
    Embedding(NB_WORDS, GLOVE_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
glove_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 5260, 300)         14934300  
_________________________________________________________________
flatten_7 (Flatten)          (None, 1578000)           0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 1578001   
Total params: 16,512,301
Trainable params: 16,512,301
Non-trainable params: 0
_________________________________________________________________


In [197]:
# Layer 0 is the Embedding layer: load the GloVe matrix into it and freeze it so
# training only updates the Dense layer.
glove_model.layers[0].set_weights([emb_matrix])
glove_model.layers[0].trainable = False

# The trainable flag is set before compile(); keep this order.
glove_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [198]:
# Single pass over the training split, scored on the held-out validation split.
history = glove_model.fit(
    X_train_emb,
    y_train_emb,
    batch_size=32,
    epochs=1,
    verbose=1,
    validation_data=(X_valid_emb, y_valid_emb),
)

Train on 173289 samples, validate on 19255 samples
Epoch 1/1


In [199]:
# Predicted probability for the first training row, and the shape of one input row.
print(glove_model.predict(X_train_emb[:1]))
print(X_train_emb[0].shape)

[[0.9998597]]
(5260,)


# Adversarial Neural Network

In this section we build the GAN. The model takes the word embeddings and generates new embeddings that are similar to the given ones.

In [128]:
def build_generator(shape):
    """Build the GAN generator: 100-d noise vector -> sample of the given shape.

    Three Dense blocks (256, 512, 1024 units), each followed by LeakyReLU(0.2) and
    BatchNormalization(momentum=0.8), then a tanh Dense layer of ``np.prod(shape)``
    units reshaped to ``shape``. The model summary is printed as a side effect.

    Args:
        shape: shape tuple of one generated sample.

    Returns:
        A Keras ``Model`` mapping a ``(100,)`` noise input to a ``shape`` output.
    """
    noise_shape = (100,)

    model = Sequential()
    for position, units in enumerate((256, 512, 1024)):
        if position == 0:
            # Only the first layer declares the input shape.
            model.add(Dense(units, input_shape=noise_shape))
        else:
            model.add(Dense(units))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

    # tanh bounds every generated value to [-1, 1].
    model.add(Dense(np.prod(shape), activation='tanh'))
    model.add(Reshape(shape))

    model.summary()

    noise = Input(shape=noise_shape)
    return Model(noise, model(noise))

def build_discriminator(shape):
    """Build the GAN discriminator: sample of the given shape -> validity score.

    Dense(512) -> LeakyReLU(0.2) -> Dense(256) -> LeakyReLU(0.2) -> Dense(1, sigmoid).
    No Flatten layer is used; the first Dense layer takes ``shape`` directly.
    The model summary is printed as a side effect.

    Args:
        shape: shape tuple of one input sample.

    Returns:
        A Keras ``Model`` mapping a ``shape`` input to a single sigmoid output.
    """
    model = Sequential([
        Dense(512, input_shape=shape),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid'),
    ])
    model.summary()

    img = Input(shape=shape)
    return Model(img, model(img))


In [253]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
def results(pred, actual):
    """Print the confusion matrix, accuracy and classification report.

    Args:
        pred: predicted labels.
        actual: reference labels, aligned with ``pred``.
    """
    matrix = confusion_matrix(actual, pred)
    accuracy = accuracy_score(actual, pred)
    report = classification_report(actual, pred)

    print('Confusion Matrix :')
    print(matrix)
    print ('Accuracy Score :',accuracy)
    print ('Report : ')
    print(report)
    print()

In [132]:
# Shape of one GAN sample = shape of one padded training row.
# NOTE: the parentheses in `(img_cols)` do not create a new tuple; img_shape is simply
# X_train_emb[0].shape. `img_rows` is not used in the cells shown here.
img_rows = 1
img_cols = X_train_emb[0].shape
img_shape = (img_cols)

# A single optimizer instance is shared by all three compiled models below.
# NOTE(review): the positional arguments are presumably learning rate and beta_1 —
# confirm against the installed Keras version.
optimizer = Adam(0.0002, 0.5)

# Build and compile the discriminator (compiled while still trainable, so direct
# discriminator.train_on_batch calls update its weights)
discriminator = build_discriminator(img_shape)
discriminator.compile(loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'])

# Build and compile the generator
generator = build_generator(img_shape)
generator.compile(loss='binary_crossentropy', optimizer=optimizer)

# The generator takes noise as input and produces generated samples
z = Input(shape=(100,))
img = generator(z)

# For the combined model we will only train the generator.
# This flag must be set after the discriminator was compiled above and before
# `combined` is compiled below; do not reorder these statements.
discriminator.trainable = False

# The discriminator takes the generated samples as input and determines validity
valid = discriminator(img)

# The combined model  (stacked generator and discriminator) takes
# noise as input => generates samples => determines validity
combined = Model(z, valid)
combined.compile(loss='binary_crossentropy', optimizer=optimizer)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 512)               2693632   
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 512)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 256)               131328    
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 257       
Total params: 2,825,217
Trainable params: 2,825,217
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   


In [214]:
def train(epochs, data, batch_size=128):
    """Run the adversarial training loop on ``data``.

    Each iteration (called an "epoch" here) performs one discriminator update on half
    a batch of real rows plus half a batch of generated rows, then one generator
    update through ``combined``. A progress line is printed per iteration.

    Relies on the module-level ``generator``, ``discriminator`` and ``combined`` models.

    Args:
        epochs: number of training iterations.
        data: 2-D array of real samples, one sample per row.
        batch_size: generator batch size; the discriminator sees ``batch_size // 2``
            real and ``batch_size // 2`` generated samples per iteration.
    """
    X_train = data
    half_batch = int(batch_size / 2)

    for epoch in range(epochs):

        # ---------------------
        #  Train Discriminator
        # ---------------------

        # Draw a random half batch of real rows. The upper bound is the number of
        # rows (shape[0]); shape[1] is the row length and would restrict sampling
        # to the first few thousand rows.
        idx = np.random.randint(0, X_train.shape[0], half_batch)
        imgs = X_train[idx]

        noise = np.random.normal(0, 1, (half_batch, 100))

        # Generate a half batch of new samples, rounded to whole numbers
        gen_imgs = np.round(generator.predict(noise))

        # Real rows are labelled 1, generated rows 0
        d_loss_real = discriminator.train_on_batch(imgs, np.ones((half_batch, 1)))
        d_loss_fake = discriminator.train_on_batch(gen_imgs, np.zeros((half_batch, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # ---------------------
        #  Train Generator
        # ---------------------

        noise = np.random.normal(0, 1, (batch_size, 100))

        # The generator wants the discriminator to label the generated samples
        # as valid (ones)
        valid_y = np.array([1] * batch_size)

        g_loss = combined.train_on_batch(noise, valid_y)

        # Report progress
        print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))



In [215]:
# Train the GAN on X_test_seq_trunc, i.e. the padded sequences of the df_idf reviews
# (the first len(df_idf) rows of the stacked corpus), so the generator learns to
# produce samples similar to that dataset.
train(epochs=1000, data=X_test_seq_trunc)

  'Discrepancy between trainable weights and collected trainable'


0 [D loss: 0.477784, acc.: 82.81%] [G loss: 2.574787]
1 [D loss: 0.678478, acc.: 80.47%] [G loss: 3.574183]
2 [D loss: 0.463642, acc.: 82.03%] [G loss: 3.972436]
3 [D loss: 0.130374, acc.: 94.53%] [G loss: 4.129066]
4 [D loss: 0.125625, acc.: 96.09%] [G loss: 3.625206]
5 [D loss: 0.077234, acc.: 99.22%] [G loss: 3.083598]
6 [D loss: 0.072303, acc.: 98.44%] [G loss: 2.182201]
7 [D loss: 0.192370, acc.: 91.41%] [G loss: 2.053828]
8 [D loss: 0.291957, acc.: 84.38%] [G loss: 2.562545]
9 [D loss: 0.878608, acc.: 73.44%] [G loss: 3.637248]
10 [D loss: 0.954615, acc.: 78.91%] [G loss: 4.576241]
11 [D loss: 0.277697, acc.: 92.19%] [G loss: 5.083704]
12 [D loss: 0.178564, acc.: 92.19%] [G loss: 4.452594]
13 [D loss: 0.032658, acc.: 100.00%] [G loss: 3.631996]
14 [D loss: 0.082488, acc.: 97.66%] [G loss: 2.667324]
15 [D loss: 0.399816, acc.: 93.75%] [G loss: 2.266572]
16 [D loss: 0.146342, acc.: 95.31%] [G loss: 2.305400]
17 [D loss: 0.261202, acc.: 93.75%] [G loss: 2.228607]
18 [D loss: 0.17396

149 [D loss: 0.304097, acc.: 89.84%] [G loss: 2.373237]
150 [D loss: 0.545925, acc.: 90.62%] [G loss: 2.103318]
151 [D loss: 0.261496, acc.: 97.66%] [G loss: 1.915032]
152 [D loss: 0.624142, acc.: 83.59%] [G loss: 2.041608]
153 [D loss: 0.307638, acc.: 91.41%] [G loss: 2.048768]
154 [D loss: 0.175036, acc.: 91.41%] [G loss: 2.168442]
155 [D loss: 0.119751, acc.: 96.09%] [G loss: 1.873957]
156 [D loss: 0.184084, acc.: 93.75%] [G loss: 1.900365]
157 [D loss: 0.505717, acc.: 86.72%] [G loss: 2.445364]
158 [D loss: 0.114467, acc.: 96.88%] [G loss: 2.246426]
159 [D loss: 0.196836, acc.: 92.97%] [G loss: 2.550131]
160 [D loss: 0.109289, acc.: 96.88%] [G loss: 2.593992]
161 [D loss: 0.122813, acc.: 95.31%] [G loss: 2.254257]
162 [D loss: 0.237607, acc.: 90.62%] [G loss: 2.534153]
163 [D loss: 0.423726, acc.: 92.19%] [G loss: 2.542685]
164 [D loss: 0.285438, acc.: 82.03%] [G loss: 2.851692]
165 [D loss: 0.535031, acc.: 72.66%] [G loss: 4.077628]
166 [D loss: 0.898252, acc.: 77.34%] [G loss: 4.

296 [D loss: 0.233318, acc.: 90.62%] [G loss: 2.696869]
297 [D loss: 0.488544, acc.: 80.47%] [G loss: 3.029508]
298 [D loss: 0.304855, acc.: 90.62%] [G loss: 3.038410]
299 [D loss: 0.243255, acc.: 89.84%] [G loss: 3.007699]
300 [D loss: 0.298765, acc.: 82.81%] [G loss: 2.413016]
301 [D loss: 0.266468, acc.: 85.16%] [G loss: 2.483364]
302 [D loss: 0.619581, acc.: 85.16%] [G loss: 2.549067]
303 [D loss: 0.148164, acc.: 93.75%] [G loss: 2.310328]
304 [D loss: 0.679864, acc.: 77.34%] [G loss: 3.085741]
305 [D loss: 0.259553, acc.: 88.28%] [G loss: 3.411243]
306 [D loss: 1.010815, acc.: 75.78%] [G loss: 3.603214]
307 [D loss: 0.560024, acc.: 82.81%] [G loss: 3.783799]
308 [D loss: 0.250909, acc.: 94.53%] [G loss: 3.459198]
309 [D loss: 0.683203, acc.: 82.03%] [G loss: 1.729077]
310 [D loss: 0.543766, acc.: 79.69%] [G loss: 2.459729]
311 [D loss: 0.280957, acc.: 94.53%] [G loss: 2.777164]
312 [D loss: 0.346759, acc.: 87.50%] [G loss: 2.938316]
313 [D loss: 0.520501, acc.: 82.03%] [G loss: 3.

443 [D loss: 0.309423, acc.: 98.44%] [G loss: 2.699585]
444 [D loss: 0.174925, acc.: 99.22%] [G loss: 2.549950]
445 [D loss: 0.292203, acc.: 98.44%] [G loss: 2.584862]
446 [D loss: 0.178544, acc.: 99.22%] [G loss: 2.627130]
447 [D loss: 0.294543, acc.: 98.44%] [G loss: 2.443122]
448 [D loss: 0.313947, acc.: 98.44%] [G loss: 2.455536]
449 [D loss: 0.308894, acc.: 96.88%] [G loss: 2.320000]
450 [D loss: 0.336272, acc.: 97.66%] [G loss: 2.507941]
451 [D loss: 0.294615, acc.: 98.44%] [G loss: 2.702210]
452 [D loss: 0.152754, acc.: 99.22%] [G loss: 2.542645]
453 [D loss: 0.414723, acc.: 97.66%] [G loss: 2.784241]
454 [D loss: 0.413032, acc.: 97.66%] [G loss: 3.149680]
455 [D loss: 0.797377, acc.: 95.31%] [G loss: 3.220088]
456 [D loss: 0.558517, acc.: 96.09%] [G loss: 2.643678]
457 [D loss: 0.635619, acc.: 93.75%] [G loss: 1.984150]
458 [D loss: 0.338425, acc.: 98.44%] [G loss: 2.428814]
459 [D loss: 0.305279, acc.: 97.66%] [G loss: 2.671984]
460 [D loss: 0.434089, acc.: 96.88%] [G loss: 2.

589 [D loss: 0.172099, acc.: 99.22%] [G loss: 2.468400]
590 [D loss: 0.292647, acc.: 97.66%] [G loss: 2.381739]
591 [D loss: 0.170002, acc.: 99.22%] [G loss: 2.214268]
592 [D loss: 0.305724, acc.: 98.44%] [G loss: 2.410449]
593 [D loss: 0.408136, acc.: 97.66%] [G loss: 2.667385]
594 [D loss: 0.157453, acc.: 99.22%] [G loss: 2.777940]
595 [D loss: 0.420683, acc.: 97.66%] [G loss: 2.644997]
596 [D loss: 0.225868, acc.: 98.44%] [G loss: 2.116907]
597 [D loss: 0.082047, acc.: 100.00%] [G loss: 2.089635]
598 [D loss: 0.312327, acc.: 97.66%] [G loss: 2.126152]
599 [D loss: 0.329679, acc.: 96.09%] [G loss: 2.193193]
600 [D loss: 0.132760, acc.: 96.09%] [G loss: 2.693415]
601 [D loss: 0.857749, acc.: 78.12%] [G loss: 3.025905]
602 [D loss: 1.875918, acc.: 77.34%] [G loss: 3.435950]
603 [D loss: 0.766726, acc.: 81.25%] [G loss: 5.240262]
604 [D loss: 0.555618, acc.: 82.81%] [G loss: 5.927012]
605 [D loss: 0.116163, acc.: 94.53%] [G loss: 5.305459]
606 [D loss: 0.136158, acc.: 92.97%] [G loss: 4

736 [D loss: 0.294067, acc.: 98.44%] [G loss: 2.602633]
737 [D loss: 0.179680, acc.: 99.22%] [G loss: 2.659523]
738 [D loss: 0.415332, acc.: 97.66%] [G loss: 2.734788]
739 [D loss: 0.437124, acc.: 96.09%] [G loss: 3.012193]
740 [D loss: 0.702173, acc.: 85.16%] [G loss: 3.789816]
741 [D loss: 0.656577, acc.: 82.81%] [G loss: 4.797854]
742 [D loss: 0.023750, acc.: 100.00%] [G loss: 5.060510]
743 [D loss: 0.577953, acc.: 96.88%] [G loss: 5.082690]
744 [D loss: 0.274759, acc.: 98.44%] [G loss: 5.152535]
745 [D loss: 0.288162, acc.: 98.44%] [G loss: 4.205429]
746 [D loss: 0.519040, acc.: 96.88%] [G loss: 3.807043]
747 [D loss: 0.271566, acc.: 98.44%] [G loss: 3.268416]
748 [D loss: 0.407209, acc.: 97.66%] [G loss: 3.029082]
749 [D loss: 0.034847, acc.: 100.00%] [G loss: 2.994687]
750 [D loss: 0.282380, acc.: 98.44%] [G loss: 2.781832]
751 [D loss: 0.035643, acc.: 100.00%] [G loss: 3.018216]
752 [D loss: 0.549118, acc.: 96.88%] [G loss: 2.949282]
753 [D loss: 0.193419, acc.: 96.88%] [G loss:

882 [D loss: 0.144013, acc.: 92.97%] [G loss: 3.897709]
883 [D loss: 0.569002, acc.: 94.53%] [G loss: 3.971955]
884 [D loss: 0.509371, acc.: 96.88%] [G loss: 3.678460]
885 [D loss: 0.319913, acc.: 97.66%] [G loss: 3.780095]
886 [D loss: 0.046251, acc.: 100.00%] [G loss: 3.431382]
887 [D loss: 0.558218, acc.: 96.88%] [G loss: 3.210096]
888 [D loss: 0.339138, acc.: 96.88%] [G loss: 3.110608]
889 [D loss: 0.436654, acc.: 97.66%] [G loss: 3.294143]
890 [D loss: 0.286949, acc.: 98.44%] [G loss: 3.439409]
891 [D loss: 0.706659, acc.: 93.75%] [G loss: 3.361472]
892 [D loss: 0.204999, acc.: 99.22%] [G loss: 3.790366]
893 [D loss: 0.665570, acc.: 95.31%] [G loss: 3.612642]
894 [D loss: 0.205533, acc.: 95.31%] [G loss: 3.216388]
895 [D loss: 1.241030, acc.: 71.09%] [G loss: 3.550195]
896 [D loss: 1.048190, acc.: 81.25%] [G loss: 5.485571]
897 [D loss: 0.729389, acc.: 85.94%] [G loss: 6.251943]
898 [D loss: 0.037449, acc.: 100.00%] [G loss: 5.639144]
899 [D loss: 0.191133, acc.: 99.22%] [G loss: 

# Model Evaluation

Second model trained on the real data

In [160]:
# Features and labels of the df_idf reviews; the row counts must match.
print(X_test_seq_trunc.shape, y_test.shape)

(82815, 5260) (82815,)


In [281]:
# Hold out 10% of the df_idf sequences for validation of the second model (fixed seed).
X_train_emb2, X_valid_emb2, y_train_emb2, y_valid_emb2 = train_test_split(
    X_test_seq_trunc,
    y_test,
    test_size=0.1,
    random_state=37,
)

# Features and labels must stay aligned after the split.
assert X_train_emb2.shape[0] == y_train_emb2.shape[0]
assert X_valid_emb2.shape[0] == y_valid_emb2.shape[0]

print('Shape of validation set:',X_valid_emb2.shape)

Shape of validation set: (8282, 5260)


In [162]:
# Second classifier, same architecture: embedding lookup -> flatten -> sigmoid unit.
glove_model2 = Sequential([
    Embedding(NB_WORDS, GLOVE_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
glove_model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 5260, 300)         14934300  
_________________________________________________________________
flatten_4 (Flatten)          (None, 1578000)           0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 1578001   
Total params: 16,512,301
Trainable params: 16,512,301
Non-trainable params: 0
_________________________________________________________________


In [165]:
# Layer 0 is the Embedding layer: load the GloVe matrix into it and freeze it so
# training only updates the Dense layer.
glove_model2.layers[0].set_weights([emb_matrix])
glove_model2.layers[0].trainable = False

# The trainable flag is set before compile(); keep this order.
glove_model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [166]:
# Single pass over the df_idf training split, scored on its held-out validation split.
history2 = glove_model2.fit(
    X_train_emb2,
    y_train_emb2,
    batch_size=32,
    epochs=1,
    verbose=1,
    validation_data=(X_valid_emb2, y_valid_emb2),
)

Train on 74533 samples, validate on 8282 samples
Epoch 1/1


In [168]:
# print(glove_model2.predict(X_train_emb[0:1]))
# print(X_train_emb[0].shape)

[[0.9684406]]
(5260,)


Third model trained on a generated dataset

In [297]:
# Draw `gen` noise vectors and pass them through the trained generator; the absolute
# value makes every generated entry non-negative.
# NOTE(review): the generator ends in tanh, so these values lie in [0, 1] rather than
# in the token-id range of the real sequences — confirm this is intended.
gen = 10000
noise = np.random.normal(loc=0, scale=1, size=(gen, 100))
gen_samp = np.abs(generator.predict(noise))

In [298]:
# Score the generated samples with the first classifier; these scores become their labels.
prediction = glove_model.predict(gen_samp)

In [299]:
# Turn the sigmoid scores of the generated samples into hard 0/1 labels.
prediction = np.round(prediction)

# Sanity check on 100 training rows: number of predicted positives vs. number of
# actual positives. The labels must come from y_train_emb, which train_test_split
# shuffled together with X_train_emb; y_train is still in pre-split order and does
# not line up row-for-row with X_train_emb.
print(np.sum(np.round(glove_model.predict(X_train_emb[0:100]))))
print(np.sum(y_train_emb[0:100]))


98.0
99.0


In [300]:
# Split the generated samples and their model-assigned labels 70/30 (fixed seed).
X_train_emb3, X_valid_emb3, y_train_emb3, y_valid_emb3 = train_test_split(
    gen_samp,
    prediction,
    test_size=0.3,
    random_state=37,
)

# Features and labels must stay aligned after the split.
assert X_train_emb3.shape[0] == y_train_emb3.shape[0]
assert X_valid_emb3.shape[0] == y_valid_emb3.shape[0]

print('Shape of validation set:',X_valid_emb3.shape)

Shape of validation set: (3000, 5260)


In [301]:
# Third classifier, same architecture: embedding lookup -> flatten -> sigmoid unit.
glove_model3 = Sequential([
    Embedding(NB_WORDS, GLOVE_DIM, input_length=MAX_LEN),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
glove_model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 5260, 300)         14934300  
_________________________________________________________________
flatten_14 (Flatten)         (None, 1578000)           0         
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 1578001   
Total params: 16,512,301
Trainable params: 16,512,301
Non-trainable params: 0
_________________________________________________________________


In [302]:
# Layer 0 is the Embedding layer: load the GloVe matrix into it and freeze it so
# training only updates the Dense layer.
glove_model3.layers[0].set_weights([emb_matrix])
glove_model3.layers[0].trainable = False

# The trainable flag is set before compile(); keep this order.
glove_model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [303]:
# Single pass over the generated training split, scored on the generated validation split.
history3 = glove_model3.fit(
    X_train_emb3,
    y_train_emb3,
    batch_size=32,
    epochs=1,
    verbose=1,
    validation_data=(X_valid_emb3, y_valid_emb3),
)

Train on 7000 samples, validate on 3000 samples
Epoch 1/1


In this section I compare the results of the models.

In [311]:
# Evaluation on the held-out GENERATED samples (X_valid_emb3).
# `actual` holds the labels assigned to those samples by glove_model (rounded
# predictions), not human ratings.
# pred  : glove_model2, trained on the real df_idf split
# pred2 : glove_model3, trained on the generated samples
actual = y_valid_emb3
pred = np.round(glove_model2.predict(X_valid_emb3))
pred2 = np.round(glove_model3.predict(X_valid_emb3))

In [312]:
results(pred, actual) # glove_model2 (trained on real reviews), scored on the generated validation set
results(pred2, actual) # glove_model3 (trained on generated samples), scored on the generated validation set

Confusion Matrix :
[[  57    1]
 [   3 2939]]
Accuracy Score : 0.9986666666666667
Report : 
              precision    recall  f1-score   support

         0.0       0.95      0.98      0.97        58
         1.0       1.00      1.00      1.00      2942

   micro avg       1.00      1.00      1.00      3000
   macro avg       0.97      0.99      0.98      3000
weighted avg       1.00      1.00      1.00      3000


Confusion Matrix :
[[  55    3]
 [  10 2932]]
Accuracy Score : 0.9956666666666667
Report : 
              precision    recall  f1-score   support

         0.0       0.85      0.95      0.89        58
         1.0       1.00      1.00      1.00      2942

   micro avg       1.00      1.00      1.00      3000
   macro avg       0.92      0.97      0.95      3000
weighted avg       1.00      1.00      1.00      3000




In [309]:
# Evaluation on the held-out REAL reviews (X_valid_emb2, labels derived from ratings).
# pred  : glove_model2, trained on the real df_idf split
# pred2 : glove_model3, trained on the generated samples
actual = y_valid_emb2
pred = np.round(glove_model2.predict(X_valid_emb2))
pred2 = np.round(glove_model3.predict(X_valid_emb2))

In [310]:
results(pred, actual) # glove_model2 (trained on real reviews), scored on the real validation set
results(pred2, actual) # glove_model3 (trained on generated samples), scored on the real validation set

Confusion Matrix :
[[1246 1462]
 [ 170 5404]]
Accuracy Score : 0.802946148273364
Report : 
              precision    recall  f1-score   support

         0.0       0.88      0.46      0.60      2708
         1.0       0.79      0.97      0.87      5574

   micro avg       0.80      0.80      0.80      8282
   macro avg       0.83      0.71      0.74      8282
weighted avg       0.82      0.80      0.78      8282


Confusion Matrix :
[[  53 2655]
 [  83 5491]]
Accuracy Score : 0.6694035257184255
Report : 
              precision    recall  f1-score   support

         0.0       0.39      0.02      0.04      2708
         1.0       0.67      0.99      0.80      5574

   micro avg       0.67      0.67      0.67      8282
   macro avg       0.53      0.50      0.42      8282
weighted avg       0.58      0.67      0.55      8282


