In [0]:
!wget -q https://l1nna.com/372/Assignment/A2-3/train.csv
!wget -q https://l1nna.com/372/Assignment/A2-3/test.csv

In [4]:
# import libraries
import pandas as pd
import csv

# read train data from file and save in variable
xy_train_df = pd.read_csv('train.csv')

# read test data from file and save in variable; use id as row label
x_test_df  = pd.read_csv('test.csv', index_col='id')

# calculate length of review description (how many characters) and sort the table by the length(ascending)
xy_train_df['length'] = xy_train_df.apply(lambda x: len(x.review), axis=1)
xy_train_df = xy_train_df.sort_values('length')
xy_train_df

Unnamed: 0,id,rating,review,length
6037,2596,1,Five Stars_GOOD,15
5353,4643,1,Love it_Love it,15
2545,8791,1,Five Stars_Good,15
3902,6098,1,Five Stars_love!,16
2850,4609,1,love these_so cute!,19
...,...,...,...,...
5651,518,1,"So far, it's awesome_Ok, so I'll say up front ...",5765
1615,124,1,It Works (Read Tips For Potential Effectivenes...,6740
5046,7257,1,An exquisitely effective product with an astou...,8082
4859,7555,1,Gorgeous professional looking manicure at home...,8134


In [6]:
#import libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

#define vocabulary size and max_len
vocab_size = 10000
max_len = 256

#split training set into training and validation set; 80% training, 20% validation
xy_train, xy_validation = train_test_split(xy_train_df, test_size=0.2)

# print length to be sure about split
print(len(xy_train))
print(len(xy_validation))

# build vocabulary from training set
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(xy_train.review)

# padding is done inside; convert review text to numpy matrix for max length 256 for training, validation and testing set (so that network can deal with it)
x_train = tokenizer.texts_to_matrix(xy_train.review, mode='binary')[:, :max_len]
y_train = xy_train.rating

x_valid = tokenizer.texts_to_matrix(xy_validation.review, mode='binary')[:, :max_len]
y_valid = xy_validation.rating

x_test = tokenizer.texts_to_matrix(x_test_df.review, mode='binary')[:, :max_len]

print(x_train.shape)
print(x_valid.shape)
print(x_test.shape)

4978
1245
(4978, 256)
(1245, 256)
(2667, 256)


In [8]:
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

#building model using keras sequential 
model = keras.Sequential()
# Add an Embedding layer expecting input vocab of size vocab_size, and
# output embedding dimension of size 20
model.add(keras.layers.Embedding(vocab_size, 20))
# Add a LSTM layer with 128 internal units
model.add(keras.layers.LSTM(100))
# Add a Dense layer with 1 unit
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

# compile the model with "binary_crossentropy" as loss function
# set the metrics to accuracy
# Use the Adam optimizer
model.compile(
    optimizer=Adam(clipnorm=4.),
    loss='binary_crossentropy',
    metrics=['accuracy'])

# train network in 15 epochs with batch_size of 64
history = model.fit(x_train,
                    y_train,
                    epochs=15,
                    batch_size=64,
                    validation_data=(x_valid, y_valid),
                    verbose=1)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 20)          200000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               48400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 248,501
Trainable params: 248,501
Non-trainable params: 0
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [0]:
# evaluate model
model.evaluate(x_valid, y_valid)



[0.4110211133956909, 0.8594377636909485]

In [0]:
y_predict = np.squeeze(model.predict_classes(x_valid))

from sklearn.metrics import  f1_score
from sklearn.metrics import confusion_matrix

print(f1_score(y_valid, y_predict, average='micro'))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
0.8594377510040161


In [0]:
# run on testing set:
y_predict = np.squeeze(model.predict_classes(x_test))

pd.DataFrame(
    {'id': x_test_df.index,
     'rating':y_predict}).to_csv('sample_submission.csv', index=False)