In [0]:
!wget -q https://l1nna.com/372/Assignment/A2-3/train.csv
!wget -q https://l1nna.com/372/Assignment/A2-3/test.csv

In [6]:
# import libraries
import pandas as pd
import csv
import re

# read train data from file and save in variable
xy_train_df = pd.read_csv('train.csv')

# read test data from file and save in variable; use id as row label
x_test_df  = pd.read_csv('test.csv', index_col='id')

# calculate length of review description (how many characters) and sort the table by the length(ascending)
xy_train_df['length'] = xy_train_df.apply(lambda x: len(x.review), axis=1)
#filtering text
xy_train_df['review'] = xy_train_df['review'].apply(lambda x: x.lower())
xy_train_df['review'] = xy_train_df['review'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

xy_train_df = xy_train_df.sort_values('length')
xy_train_df

Unnamed: 0,id,rating,review,length
6037,2596,1,five stars_good,15
5353,4643,1,love it_love it,15
2545,8791,1,five stars_good,15
3902,6098,1,five stars_love,16
2850,4609,1,love these_so cute,19
...,...,...,...,...
5651,518,1,so far its awesome_ok so ill say up front ive ...,5765
1615,124,1,it works read tips for potential effectiveness...,6740
5046,7257,1,an exquisitely effective product with an astou...,8082
4859,7555,1,gorgeous professional looking manicure at home...,8134


# New Section

In [7]:
#import libraries
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

#define vocabulary size and max_len
vocab_size = 1000
max_len = 256

#split training set into training and validation set; 80% training, 20% validation
xy_train, xy_validation = train_test_split(xy_train_df, test_size=0.2)

# print length to be sure about split
print(len(xy_train))
print(len(xy_validation))

# build vocabulary from training set
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(xy_train.review)

# padding is done inside; convert review text to numpy matrix for max length 256 for training, validation and testing set
x_train = tokenizer.texts_to_matrix(xy_train.review, mode='binary')[:, :max_len]
y_train = xy_train.rating

x_valid = tokenizer.texts_to_matrix(xy_validation.review, mode='binary')[:, :max_len]
y_valid = xy_validation.rating

x_test = tokenizer.texts_to_matrix(x_test_df.review, mode='binary')[:, :max_len]

print(x_train.shape[1])
print(x_valid.shape)
print(x_test.shape)

4978
1245
256
(1245, 256)
(2667, 256)


In [22]:
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

embedding_dim = 100

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(keras.layers.Conv1D(128, 5, activation='relu'))
model.add(keras.layers.GlobalMaxPooling1D())
model.add(keras.layers.Dense(10, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

# compile the model with "binary_crossentropy" as loss function
# set the metrics to accuracy
# Use the Adam optimizer
model.compile(
    optimizer=Adam(clipnorm=4.),
    loss='binary_crossentropy',
    metrics=['accuracy'])


history = model.fit(x_train,
                    y_train,
                    epochs=10,
                    batch_size=100,
                    validation_data=(x_valid, y_valid),
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# evaluate model
model.evaluate(x_valid, y_valid)



[0.36454322934150696, 0.8811244964599609]

In [24]:
y_predict = np.squeeze(model.predict_classes(x_valid))

from sklearn.metrics import  f1_score
from sklearn.metrics import confusion_matrix

print(f1_score(y_valid, y_predict, average='micro'))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
0.8811244979919679


In [0]:
# run on testing set:
y_predict = np.squeeze(model.predict_classes(x_test))

pd.DataFrame(
    {'id': x_test_df.index,
     'rating':y_predict}).to_csv('sample_submission.csv', index=False)