In [57]:
import pandas as pd
import numpy as np
from collections import deque
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model
from sklearn.metrics import mean_squared_error

We have borrowed the data cleaning process from https://www.kaggle.com/morrisb/how-to-recommend-anything-deep-recommender   
We will use this data cleaning process to build collaborative-filtering baseline models for future evaluation.   
Consulted the following paper for hyperparameter tuning for ALS:   
https://github.com/tolleiv/thesis/blob/master/Research/Papers/__Zhou08%20-%20Large-scale%20Parallel%20Collaborative%20Filtering%20for%20the%20Netflix%20Prize.pdf   
### Future Plans:  
1. Use PySpark to improve time-performance on large dataset
2. Add more sophisticated models

# Data Cleaning

## 1. Create User-Rating dataframe

In [40]:
# All four raw rating files share one layout: no header row, and only the
# first three comma-separated fields are needed.
rating_columns = ['User', 'Rating', 'Date']


def _load_ratings_file(path):
    """Read one raw rating file into a DataFrame with User/Rating/Date columns."""
    return pd.read_csv(path, header=None, names=rating_columns, usecols=[0, 1, 2])


df_1 = _load_ratings_file('combined_data_1.txt')
df_2 = _load_ratings_file('combined_data_2.txt')
df_3 = _load_ratings_file('combined_data_3.txt')
df_4 = _load_ratings_file('combined_data_4.txt')

# Stack the four parts in file order; each part keeps its own row labels here
frames = [df_1, df_2, df_3, df_4]
df_raw = pd.concat(frames)

In [41]:
# The concatenated frame still carries each file's own row labels, so labels repeat
# across files. reset_index() gives every row a unique, continuous 0..N-1 label (the
# label-based slicing in the next cell needs this) and keeps the old per-file labels
# in a new 'index' column.
df_raw = df_raw.reset_index()

In [42]:
# Rows with a missing Rating are the per-movie header lines: their 'User' field holds
# the movie id followed by one trailing character, which is stripped before parsing.
header_rows = df_raw.loc[df_raw['Rating'].isna(), 'User'].reset_index()
movie_starts = [(row_label, int(label[:-1])) for row_label, label in header_rows.values]

# For every header, the row label of the following header (None after the last one)
next_starts = [row_label for row_label, _ in movie_starts[1:]] + [None]

# Collect one dataframe of ratings per movie
per_movie_frames = []
for (start, movie), next_start in zip(movie_starts, next_starts):

    if next_start is None:
        # Last movie: its ratings run to the end of the data
        ratings_block = df_raw.loc[start + 1:]
    else:
        # .loc slices are inclusive, so stop one row before the next header
        ratings_block = df_raw.loc[start + 1:next_start - 1]
    ratings_block = ratings_block.copy()

    # Tag every rating in the block with the movie it belongs to
    ratings_block['Movie'] = movie
    per_movie_frames.append(ratings_block)

# Stitch the per-movie blocks back together and free the intermediates
df = pd.concat(per_movie_frames)
del per_movie_frames, df_raw, header_rows, ratings_block, next_starts, movie_starts, start, movie, next_start
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)

Shape User-Ratings:	(100480507, 5)


Unnamed: 0,index,User,Rating,Date,Movie
65514544,14473979,2012438,3.0,2004-07-25,12014
63797405,12756840,596550,4.0,2005-08-23,11647
64096263,13055698,2345678,2.0,2004-12-07,11677
10957208,10957208,40757,5.0,2003-01-26,2128
18407714,18407714,2073981,4.0,2003-05-03,3522


## 2. Filter Sparse Movies and Users

In [43]:
# Thresholds: a movie / user is kept only when it has strictly more ratings than this
min_movie_ratings = 10000
min_user_ratings = 200

# Movies that are rated often enough
ratings_per_movie = df['Movie'].value_counts()
filter_movies = ratings_per_movie[ratings_per_movie > min_movie_ratings].index.tolist()

# Users that rate often enough
ratings_per_user = df['User'].value_counts()
filter_users = ratings_per_user[ratings_per_user > min_user_ratings].index.tolist()

# Keep only the ratings where both the movie and the user passed their threshold
keep_movie = df['Movie'].isin(filter_movies)
keep_user = df['User'].isin(filter_users)
df_filterd = df[keep_movie & keep_user]
del filter_movies, filter_users, min_movie_ratings, min_user_ratings
del ratings_per_movie, ratings_per_user, keep_movie, keep_user
print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filterd.shape))

Shape User-Ratings unfiltered:	(100480507, 5)
Shape User-Ratings filtered:	(60546559, 5)


## 3. Create Training and Testing Sets

In [48]:
# Drop the Date column and shuffle all rows, then rebuild a clean 0..N-1 index.
# The fixed random_state makes the shuffle -- and therefore the positional
# train/test split taken from this frame -- identical after a kernel restart.
df_final = df_filterd.drop('Date', axis=1).sample(frac=1, random_state=42).reset_index(drop=True)

In [49]:
# Test-set size: the number of shuffled ratings held out from training
n = 100000

In [50]:
# Split train- & testset: the last n shuffled rows form the test set,
# everything before them is used for training.
df_train = df_final[:-n]
df_test = df_final[-n:]

In [70]:
# Export the cleaned ratings as headerless "User,Movie,Rating" lines.
# Selecting exactly these three columns already leaves out the leftover 'index' column.
df_download = df_final[['User', 'Movie', 'Rating']]
# mode='w' (not 'a'): re-running this cell must overwrite the file, not append a second copy
df_download.to_csv(r'clean.txt', header=False, index=False, sep=',', mode='w')

In [72]:
# Write a 1,000,000-row random sample in the same headerless CSV layout.
# random_state keeps the sample identical between runs; mode='w' overwrites the
# file so that re-running the cell does not append a second sample to it.
df_download.sample(1000000, random_state=42).to_csv(r'sample.txt', header=False, index=False, sep=',', mode='w')

The cleaned ratings are written to a text file (`clean.txt`) for use in PySpark. A random sample of 1,000,000 ratings is also written to `sample.txt` as a smaller test case.

## 4. Transform User-Rating Dataframe to User-Movie Matrix

In [51]:
# Pivot the long training ratings into a wide table: one row per user, one column
# per movie, the rating as cell value. User/movie pairs without a rating stay empty (NaN).
df_p = pd.pivot_table(df_train, values='Rating', index='User', columns='Movie')
print('Shape User-Movie-Matrix:\t{}'.format(df_p.shape))
df_p.sample(3)

Shape User-Movie-Matrix:	(150245, 2042)


Movie,8,18,28,30,58,77,83,97,108,111,...,17627,17671,17672,17682,17692,17697,17703,17709,17762,17764
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1376053,,3.0,4.0,3.0,,,,,,,...,4.0,3.0,,,,,,3.0,3.0,4.0
1908407,,,,,,,,,,,...,,,,,,,,,,
2582569,,,,,,,,,,,...,5.0,,,,3.0,,,,,


-------

# Baseline Models

### Matrix Factorization Model using Alternating Least Squares in PySpark

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.ml.recommendation import ALS
from itertools import islice
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row

In [2]:
# Start Spark with the default configuration.
# getOrCreate() reuses an already running SparkContext instead of raising when this
# cell is executed again in the same kernel.
conf = SparkConf()
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

The code raises an exception when run on the full dataset because the dataset is too large, so the model below is trained on `sample.txt` instead. We will look for solutions in the future.

In [3]:
# Parse the headerless "user,movie,rating" lines of the sample file into typed Rows
data = sc.textFile('sample.txt')
ratings_raw = data.map(lambda l: l.split(','))
ratingsRDD = ratings_raw.map(lambda p: Row(user=int(p[0]), item=int(p[1]),
                                     rating=float(p[2])))
ratings = spark.createDataFrame(ratingsRDD)

# 80/20 train/test split; the seed keeps the split the same from run to run
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

# ALS matrix factorisation. coldStartStrategy="drop" removes test rows that get a NaN
# prediction so that they cannot turn the RMSE into NaN. The seed fixes ALS's own
# randomness so that the reported score can be reproduced.
als = ALS(rank=20, maxIter=15, regParam=0.3, userCol="user", itemCol="item", ratingCol="rating",
          nonnegative=True, coldStartStrategy="drop", seed=42)
model = als.fit(training)

# Score the held-out ratings and report the root-mean-square error
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.0409918885082154


In [5]:
# Stop the SparkContext now that the ALS baseline has been evaluated
sc.stop()

### Matrix Factorization Model Using Keras

In [56]:
# Translate raw user / movie ids into consecutive 0-based integers, in order of first
# appearance, so they can be used as row indices into the embedding tables
user_id_mapping = {raw_id: position for position, raw_id in enumerate(df_filterd['User'].unique())}
movie_id_mapping = {raw_id: position for position, raw_id in enumerate(df_filterd['Movie'].unique())}

# Sizes that define the embedding tables
users = len(user_id_mapping)
movies = len(movie_id_mapping)
embedding_size = 10

# Apply the mappings to the train- & testset
train_user_data = df_train['User'].map(user_id_mapping)
train_movie_data = df_train['Movie'].map(movie_id_mapping)

test_user_data = df_test['User'].map(user_id_mapping)
test_movie_data = df_test['Movie'].map(movie_id_mapping)


##### Build the model
# One integer id per sample for each of the two inputs
user_id_input = Input(shape=(1,), name='user')
movie_id_input = Input(shape=(1,), name='movie')

# Look up an embedding_size-dimensional vector for every user and every movie
user_embedding = Embedding(
    input_dim=users,
    output_dim=embedding_size,
    input_length=1,
    name='user_embedding',
)(user_id_input)
movie_embedding = Embedding(
    input_dim=movies,
    output_dim=embedding_size,
    input_length=1,
    name='item_embedding',
)(movie_id_input)

# Drop the length-1 sequence axis so each sample is a flat vector
user_vector = Reshape((embedding_size,))(user_embedding)
movie_vector = Reshape((embedding_size,))(movie_embedding)

# The predicted rating is the dot product of the user vector and the movie vector
y = Dot(axes=1, normalize=False)([user_vector, movie_vector])

# Assemble and compile: mean squared error loss, Adam optimizer
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(optimizer='adam', loss='mse')


# Train for a single epoch, holding out 10% of the training data for validation
model.fit(
    x=[train_user_data, train_movie_data],
    y=df_train['Rating'],
    batch_size=256,
    epochs=1,
    validation_split=0.1,
    shuffle=True,
)

# Predict the held-out test ratings and keep the true ratings for scoring
y_pred = model.predict([test_user_data, test_movie_data])
y_true = df_test['Rating'].values

In [58]:
# RMSE of the Keras predictions on the held-out test ratings:
# mean squared error first, then its square root
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} RMSE'.format(rmse))



Testing Result With Keras Matrix-Factorization: 0.8783 RMSE
