In [1]:
#####################
# Model: NLP + ANN
#####################

import numpy as np
import pandas as pd
import os
import re
import math

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import tensorflow as tf
# from tensorflow import keras
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, LSTM,Input, Dropout, Embedding
from tensorflow.keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer

pd.options.display.max_colwidth = 100

SEED = 42
TEST_SIZE = 0.2

callback = EarlyStopping(monitor='loss', patience=3)

p_data_raw       = os.path.abspath('../data/raw') # raw data path
p_data_processed = os.path.abspath('../data/processed') # processed data path
p_data_cleaned   = os.path.abspath('../data/cleaned') # clearned data path
p_data_src       = os.path.abspath('../data/src') # src data path

k_ratings_f = pd.read_csv(os.path.abspath(p_data_processed+'/k_ratings_f.csv'), dtype={'label_c': object})

In [38]:
# # get rid of "Other" genre
# k_ratings_f = (k_ratings_f[k_ratings_f['genre1']!='Other'])
# vote_count=k_ratings_f.groupby('movieId')['userId'].count().sort_values(ascending=False)
# # get rid of movies with <10 ratings
# k_ratings_f = k_ratings_f[k_ratings_f.movieId.isin(vote_count[vote_count>=10].index.to_list())]  # 245 movies
# save data
# k_ratings_f.to_csv(os.path.abspath(os.path.abspath(p_data_processed+'/k_ratings_f.csv')),index=False,encoding='utf-8-sig')

In [11]:
# # MovieLens datasets
# ml_links   = pd.read_csv(p_data_raw+'/ml-25m/links.csv') # movie links
# ml_ratings = pd.read_csv(p_data_raw+'/ml-25m/ratings.csv') # user ratings
# ml_movies  = pd.read_csv(p_data_raw+'/ml-25m/movies.csv') # movie list

# # IMDB Kmovie list
# imdb_mat_orig = pd.read_pickle(p_data_raw+'/imdb_kr.pickle')

# # IMDB Kmovie list join MovieLens
# imdb_mat = pd.merge(imdb_mat_orig,ml_links,on='imdbId')

# # IMDB Kmovie list join MovieLens movie list
# imdb_mat = pd.merge(imdb_mat,ml_movies,on='movieId')

# imdb_mat_f = imdb_mat[imdb_mat.imdb_id.isin(k_ratings_f.imdb_id.to_list())]
# imdb_mat_f['genre1'] = imdb_mat_f['genres'].apply(lambda s:s.split('|')[0])
# imdb_mat_f['year'] = imdb_mat_f['title'].apply(lambda s:s.split()[-1].replace('(','').replace(')',''))
# imdb_mat_f.to_csv(os.path.abspath(p_data_processed+'/imdb_mat_f.csv'),index=False,encoding='utf-8-sig')

In [3]:
# prepare X
X = pd.concat([k_ratings_f.drop(['userId','movieId','rating_x','timestamp','name','year','rating_y',
                                 # 'metascore',
                                 'imdb_id','poster','genre',
                                 'certificate','runtime','director_actor', 
                                 # 'story', 
                                 'imdbId','tmdbId',
                                 'title','genres','genre1',
                                 'label','label_c',
                                 'y_5','y',
                                 # 'timestamp','vote',
                                 'director','director1','director2','actor1'
                                 ], axis = 1),
               pd.get_dummies(k_ratings_f[[
                   #'certificate',
                   #'genre1',
                   'label_c','director1','actor1']], 
                              drop_first = True, dummy_na = True)],axis = 1)

# fill metascore null values with mean
X['metascore'].fillna(X['metascore'].mean(), inplace=True)
X.fillna(0,inplace=True)

In [4]:
# prepare y
# y = k_ratings_f['y']
y = k_ratings_f['y_5']
# y = k_ratings_f['y'].cat.codes

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Encode target labels
y = label_encoder.fit_transform(y)
y = to_categorical(y)

In [5]:
# prepare X1 for NLP
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)            
    # Removing "See full summary »"
    sentence = sentence.replace('See full summary', ' ')
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [6]:
X1 = []
sentences = list(X["story"])
for sen in sentences:
    X1.append(preprocess_text(sen))
    
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X1)

X1 = tokenizer.texts_to_sequences(X1)

vocab_size = len(tokenizer.word_index) + 1
maxlen = 50 # 200

X1 = pad_sequences(X1, padding='post', maxlen=maxlen)

# data scale
# scaler1 = MinMaxScaler()
# scaler1 = StandardScaler()
# X1 = scaler1.fit_transform(X1)

# embedding
embeddings_dictionary = dict()

glove_file_link = p_data_src+'/glove.6B/glove.6B.50d.txt' # glove file link

with open(glove_file_link, 'r') as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

glove_file.close()

embedding_matrix = np.zeros((vocab_size, 50))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [7]:
# prepare X2 for ANN
X2 = X.drop('story', axis=1).copy().values

# data scale
# scaler2 = MinMaxScaler()
scaler2 = StandardScaler()
X2 = scaler2.fit_transform(X2)

In [8]:
################
# model
################

input_1 = Input(shape=(maxlen,))
input_2 = Input(shape=(X.shape[1]-1,))

embedding_layer = Embedding(vocab_size, 50, weights=[embedding_matrix], trainable=False)(input_1)
LSTM_Layer_1 = LSTM(128)(embedding_layer)

def create_model():
    
    tf.keras.backend.clear_session()
    
    dense_layer_1 = Dense(972, activation='relu')(input_2)
    dense_layer_2 = Dense(1944, activation='relu')(dense_layer_1)
    dense_layer_3 = Dense(972, activation='relu')(dense_layer_2)
    dense_layer_4 = Dense(486, activation='relu')(dense_layer_3)
    dense_layer_5 = Dense(243, activation='relu')(dense_layer_4)    
    
    concat_layer = Concatenate()([LSTM_Layer_1, dense_layer_5])
    dense_layer_6 = Dense(10, activation='relu')(concat_layer)
    output = Dense(5, activation='softmax')(dense_layer_6)
    # output = Dense(3, activation='softmax')(dense_layer_6)
    
    model = Model(inputs=[input_1, input_2], outputs=output)
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    #plot_model(model, to_file='figures/model_plot_final.png', show_shapes=True, show_layer_names=True)

    return model

model=create_model()
# history = model.fit(x=[X1_train, X2_train], y=y_train, validation_data=([X1_test, X2_test],y_test),
#                     callbacks=[callback],
#                     batch_size=128, epochs=100, verbose=1)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 385)]        0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 972)          375192      input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1944)         1891512     dense[0][0]                      
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 50)]         0                                            
______________________________________________________________________________________________

In [9]:
model.fit(x=[X1, X2], y=y, epochs=10)

Train on 31378 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x146e072b0>

In [10]:
# save model
import joblib
from tensorflow.keras.models import load_model

model.save("model_final.h5")
#joblib.dump(scaler1,'scaler1_final.pkl')
joblib.dump(scaler2,'scaler2_final.pkl')

['scaler2_final.pkl']

In [12]:
# load model
import joblib
from tensorflow.keras.models import load_model

ratings_model = load_model('model_final.h5')
#ratings_scaler1 = joblib.load('scaler1_final.pkl')
ratings_scaler2 = joblib.load('scaler2_final.pkl')

In [13]:
#####################################
# prepare X for megamat
#####################################
reviewers_mat = k_ratings_f[['userId','label','label_c']].copy()
movies_mat = k_ratings_f.drop(['userId','rating_x','timestamp','label','label_c','y','y_5'], axis=1).copy()
# movies_mat = k_ratings_f.drop(['userId','rating_x','timestamp','label','label_c','story'], axis=1).copy()
# storys_mat = k_ratings_f[['movieId','story']].copy()

reviewers_mat = reviewers_mat.drop_duplicates(subset='userId')
movies_mat = movies_mat.drop_duplicates(subset='movieId')
# storys_mat = storys_mat.drop_duplicates(subset='movieId')

print('# reviewers: {}'.format(len(reviewers_mat)))
print('# movies: {}'.format(len(movies_mat)))

# create megamat
reviewers_mat['joincol'] = 1
movies_mat['joincol'] = 1
megamat = pd.merge(reviewers_mat,movies_mat,how='outer',on='joincol')
print('# movies x reviewers: {}'.format(len(megamat)))

# reviewers: 9765
# movies: 245
# movies x reviewers: 2392425


In [14]:
# exclude ratings with real "y"
k_ratings_f['joinId'] = k_ratings_f['userId'].astype(str) + '_' + k_ratings_f['movieId'].astype(str)
# take long time!
megamat['joinId'] = megamat['userId'].astype(str) + '_' + megamat['movieId'].astype(str)
megamat = megamat[~megamat.joinId.isin(k_ratings_f['joinId'].tolist())]
print('# movies x reviewers exclude real reviews: {}'.format(len(megamat)))

# movies x reviewers exclude real reviews: 2361047


In [15]:
X_final = pd.concat([megamat.drop(['joincol','joinId','userId','movieId','name','year','rating_y',
                                 # 'metascore',
                                 'imdb_id','poster','genre',
                                 'certificate','runtime','director_actor', 
                                 # 'story', 
                                 'imdbId','tmdbId',
                                 'title','genres','genre1',
                                 'label','label_c',
                                 # 'timestamp','vote',
                                 'director','director1','director2','actor1'
                                 ], axis = 1),
               pd.get_dummies(megamat[[
                   #'certificate',
                   #'genre1',
                   'label_c','director1','actor1']], 
                              drop_first = True, dummy_na = True)],axis = 1)

# fill metascore null values with mean
X_final['metascore'].fillna(X_final['metascore'].mean(), inplace=True)
X_final.fillna(0,inplace=True)

In [16]:
# X1_final
X1_final = []
sentences = list(X_final["story"])
for sen in sentences:
    X1_final.append(preprocess_text(sen))
    
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X1_final)

X1_final = tokenizer.texts_to_sequences(X1_final)

vocab_size = len(tokenizer.word_index) + 1
maxlen = 50 # 200

X1_final = pad_sequences(X1_final, padding='post', maxlen=maxlen)

# data scale
# scaler1 = MinMaxScaler()
# scaler1 = StandardScaler()
# X1_final = scaler1.fit_transform(X1_final)

# embedding
embeddings_dictionary = dict()

glove_file_link = p_data_src+'/glove.6B/glove.6B.50d.txt' # glove file link

with open(glove_file_link, 'r') as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

glove_file.close()

embedding_matrix = np.zeros((vocab_size, 50))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [17]:
# X2_final
X2_final = X_final.drop('story', axis=1).copy().values

# data scale
X2_final = ratings_scaler2.fit_transform(X2_final)

In [37]:
# def return_prediction(model,scaler1,scaler2,sample_json):
# def return_prediction(model, sample_json):
#     story = [[]]
#     # story = scaler1_final.transform(story)
#     reviewer = [[]]
#     # reviewer = scaler2_final.transform(reviewer)
#     rating_ind = model.predict_classes([story, reviewer])

# rating_ind = ratings_model.predict([X1_final, X2_final])

cutoff = 1000
rating_ind = ratings_model.predict([X1_final[:cutoff], X2_final[:cutoff]])

predictions = np.argmax(rating_ind,axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4,
       4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 3,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,