In [1]:
import pandas as pd
import matplotlib
import tensorflow as tf

from sklearn.model_selection import train_test_split

In [2]:
# Load the semicolon-delimited training file into a DataFrame.
dataset = pd.read_csv(
    "./data/Train.csv",
    sep=";",
)

In [3]:
# Normalise the 'opinion' text column, then drop rows whose cleaned text
# duplicates an earlier row.
#
# Substitutions are applied in the order listed. Each one is a literal
# (regex=False) replacement, so characters such as '.' are never treated as
# regular-expression metacharacters regardless of the pandas default.
opinion_replacements = [
    # Punctuation that is removed outright.
    ('"', ''),
    (',', ''),
    ('.', ''),
    ('!', ''),
    (':', ''),
    # HTML-escaped apostrophe (&#039;) plus a contraction suffix, expanded to
    # a separate word.
    # NOTE(review): '&#039;t' -> ' not' turns "don&#039;t" into "don not", and
    # '&#039;s' -> ' is' also rewrites possessives; kept as-is so the
    # vocabulary does not change, but worth revisiting.
    ('&#039;ve', ' have'),
    ('&#039;s', ' is'),
    ('&#039;t', ' not'),
    ('&#039;m', ' am'),
    ('&#039;ll', ' will'),
]

opinion_clean = dataset['opinion'].str.lower()
for pattern, replacement in opinion_replacements:
    opinion_clean = opinion_clean.str.replace(pattern, replacement, regex=False)
dataset['opinion'] = opinion_clean

# Keep the first row for each distinct cleaned opinion and renumber the rows.
# drop=True prevents the old index from being inserted as an 'index' column.
dataset = dataset.drop_duplicates('opinion').reset_index(drop=True)

AttributeError: 'DataFrame' object has no attribute 'ignore_index'

In [7]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_df, val_df = train_test_split(
    dataset,
    test_size=0.1,
    random_state=2137,
)

# Raw opinion strings of each split.
X_train = train_df['opinion'].values
X_val = val_df['opinion'].values

# LSTM

In [8]:
from gensim.models import KeyedVectors
import numpy as np

In [9]:
# Pretrained word vectors, read from a file in the binary word2vec format.
EMBEDDING_FILE = './GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(
    EMBEDDING_FILE,
    binary=True,
)

In [10]:
# Dimensionality of each word vector (columns of the embedding matrix).
embed_size = 300
# Vocabulary cap: number of rows in the embedding matrix and the tokenizer's
# `num_words` limit.
max_features = 25000
# Number of tokens kept per opinion; sequences are padded to this length.
maxlen = 75

In [12]:
# Start again from the raw opinion strings of each split, so the tokenisation
# cells below receive text rather than already-encoded sequences.
X_train, X_val = train_df.opinion.values, val_df.opinion.values

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Word-level tokenizer whose encoding vocabulary is capped by `max_features`.
tokenizer = Tokenizer(num_words=max_features)

In [14]:
# Learn the vocabulary from the training texts only, then encode both splits
# with that vocabulary. X_train / X_val are rebound from text to sequences.
tokenizer.fit_on_texts(list(X_train))

train_sequences = tokenizer.texts_to_sequences(X_train)
val_sequences = tokenizer.texts_to_sequences(X_val)
X_train, X_val = train_sequences, val_sequences

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every encoded opinion to exactly `maxlen` entries, which is the input
# length the model expects.
X_train, X_val = (
    pad_sequences(X_train, maxlen=maxlen),
    pad_sequences(X_val, maxlen=maxlen),
)

In [16]:
# Regression targets: the 'rate' column of each split as NumPy arrays.
y_train, y_val = (split['rate'].values for split in (train_df, val_df))

print(y_train)

[ 8  9  8 ...  6  3 10]


In [17]:
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Dropout, GlobalMaxPool1D, Bidirectional, GRU, Embedding, Input, LSTM, BatchNormalization
from tensorflow.keras.models import Model
import tensorflow as tf
from tensorflow.keras import backend as K

In [47]:
# --- Pretrained embedding matrix -------------------------------------------
# Row i holds the word2vec vector of the token whose tokenizer index is i.
# Rows for tokens without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in tokenizer.word_index.items():
    # word_index is not truncated to `num_words`, so it can contain indices
    # beyond the matrix height; skip them explicitly instead of relying on a
    # swallowed IndexError.
    if i >= max_features:
        continue
    try:
        embedding_matrix[i, :] = word2vec.get_vector(word)
    except KeyError:
        # No pretrained vector for this token: leave its row as zeros.
        continue

# --- Network ---------------------------------------------------------------
# Embedding (initialised from word2vec, fine-tuned) -> BatchNorm -> BiLSTM over
# the whole sequence -> max over time -> BatchNorm -> two L2-regularised dense
# layers -> dropout -> one linear output unit (the rating as a regression).
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(inp)
x = BatchNormalization()(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = BatchNormalization()(x)
x = Dense(64, activation="relu", kernel_regularizer=regularizers.l2(0.1))(x)
x = Dense(32, activation="tanh", kernel_regularizer=regularizers.l2(0.1))(x)
x = Dropout(0.1)(x)
x = Dense(1, kernel_regularizer=regularizers.l2(0.1))(x)
model = Model(inputs=inp, outputs=x)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)

# The training loss also includes the L2 penalties, so the separate 'mse'
# metric reports the un-regularised error.
model.compile(loss='mse',
              optimizer=adam,
              metrics=['mse'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print().
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 75)]              0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 75, 300)           7500000   
_________________________________________________________________
batch_normalization_18 (Batc (None, 75, 300)           1200      
_________________________________________________________________
bidirectional_6 (Bidirection (None, 75, 256)           439296    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_________________________________________________________________
batch_normalization_19 (Batc (None, 256)               1024      
_________________________________________________________________
dense_12 (Dense)             (None, 64)                1644

In [48]:
# Train for 5 epochs in mini-batches of 64, evaluating on the held-out split
# after every epoch.
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_val, y_val),
)

Train on 96733 samples, validate on 10749 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f83b85ed050>

In [49]:
# Continuous rating predictions for the validation inputs.
y_hat = model.predict(X_val)

In [50]:
# Inspect the raw (unrounded) model outputs.
y_hat

array([[8.4701395],
       [9.48248  ],
       [8.765604 ],
       ...,
       [8.560794 ],
       [9.01858  ],
       [8.604201 ]], dtype=float32)

In [51]:
# Turn the continuous predictions into integer ratings: flatten to 1-D, round
# to the nearest integer, then clamp into the 1..10 range.
rounded_predictions = np.rint(y_hat.flatten())
y_hat_cat = np.clip(rounded_predictions, 1, 10).astype(int)

In [52]:
# Inspect the rounded, clipped integer predictions.
y_hat_cat

array([8, 9, 9, ..., 9, 9, 9])

In [53]:
from sklearn.metrics import mean_squared_error 

In [54]:
# MSE of the rounded predictions against the true validation ratings.
# scikit-learn metrics take (y_true, y_pred). MSE is symmetric, so the value is
# unchanged, but the conventional order keeps this correct if the metric is
# later replaced by an asymmetric one.
mean_squared_error(y_val, y_hat_cat)

4.771792724904643