In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import json
import re
import cPickle as pickle
import time
from IPython.display import display
from collections import OrderedDict
from tqdm import tqdm_notebook

import load_problems
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.datasets import dump_svmlight_file
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import train_test_split
from fastFM.als import FMClassification, FMRegression

%matplotlib inline
sns.set_context('notebook', font_scale=1.6)

# Read in Movielens Dataset

In [2]:
path = '/Users/scottcronin/Dropbox/data/movielens-small/'
fn_ratings = 'ratings.csv'
fn_tags = 'tags.csv'
fn_movies = 'movies.csv'
fn_links = 'links.csv'

format_columns = lambda x: re.sub(r'(?<=[a-z])(?=[A-Z])', '_', x).lower()

ratings = pd.read_csv(path + fn_ratings)
ratings.columns = ratings.columns.map(format_columns)
ratings = ratings.sort_values(['user_id', 'movie_id', 'timestamp'], ascending=[1,1,1])

movies = pd.read_csv(path + fn_movies)
movies.columns = movies.columns.map(format_columns)

tags = pd.read_csv(path + fn_tags)
tags.columns = tags.columns.map(format_columns)

links = pd.read_csv(path + fn_links)
links.columns = links.columns.map(format_columns)

In [3]:
def show_df(df):
    print df.info(null_counts=30)
    display(df.head(3))
    print '------------------\n\n'
    
show_df(ratings)
show_df(movies)
show_df(tags)
show_df(links)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100004 entries, 0 to 100003
Data columns (total 4 columns):
user_id      100004 non-null int64
movie_id     100004 non-null int64
rating       100004 non-null float64
timestamp    100004 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.8 MB
None


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


------------------


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
movie_id    9125 non-null int64
title       9125 non-null object
genres      9125 non-null object
dtypes: int64(1), object(2)
memory usage: 213.9+ KB
None


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


------------------


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 4 columns):
user_id      1296 non-null int64
movie_id     1296 non-null int64
tag          1296 non-null object
timestamp    1296 non-null int64
dtypes: int64(3), object(1)
memory usage: 40.6+ KB
None


Unnamed: 0,user_id,movie_id,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997


------------------


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
movie_id    9125 non-null int64
imdb_id     9125 non-null int64
tmdb_id     9112 non-null float64
dtypes: float64(1), int64(2)
memory usage: 213.9 KB
None


Unnamed: 0,movie_id,imdb_id,tmdb_id
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0


------------------




## Testing One Hot Encoding

### Dummy data

In [4]:
# encoder = OneHotEncoder(categorical_features=[0,2], handle_unknown='ignore')
# dummy = np.array([[0, 40, 3], [1, 50, 0], [0, 45, 1], [1, 30, 2]])

encoder = OneHotEncoder(handle_unknown='ignore')
dummy = np.array([[0, 800], [1, 0], [0, 1], [1, 2]])

print dummy, '\n'
print encoder.fit_transform(dummy), '\n'
print encoder.fit_transform(dummy).todense()

[[  0 800]
 [  1   0]
 [  0   1]
 [  1   2]] 

  (0, 5)	1.0
  (0, 0)	1.0
  (1, 2)	1.0
  (1, 1)	1.0
  (2, 3)	1.0
  (2, 0)	1.0
  (3, 4)	1.0
  (3, 1)	1.0 

[[ 1.  0.  0.  0.  0.  1.]
 [ 0.  1.  1.  0.  0.  0.]
 [ 1.  0.  0.  1.  0.  0.]
 [ 0.  1.  0.  0.  1.  0.]]


### Clean Users and Movies

In [5]:
movie_mapping = (
    ratings
    .loc[:, ['movie_id']]
    .sort_values('movie_id')
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .set_index('movie_id')
    ['index']
    )

user_mapping = (
    ratings
    .loc[:, ['user_id']]
    .sort_values('user_id')
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .set_index('user_id')
    ['index']
    )

ratings2 = ratings.sort_values(['user_id', 'movie_id', 'timestamp'], ascending=[1,1,1])
ratings2.movie_id = ratings.movie_id.map(movie_mapping)
ratings2.user_id = ratings.user_id.map(user_mapping)

In [6]:
display((
    ratings2
    .assign(user_id=lambda df: df.user_id)
    .assign(movie_id=lambda df: df.movie_id + len(df.user_id.unique()))
).head())

encoder = OneHotEncoder(categorical_features=[0,1], handle_unknown='ignore')
ratings_encoded = encoder.fit_transform(ratings.loc[:, ['user_id', 'movie_id', 'rating']])
print ratings_encoded.__repr__(), '\n'

for row in xrange(5):
    print ratings_encoded.getrow(row)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,701,2.5,1260759144
1,0,1504,3.0,1260759179
2,0,1530,3.0,1260759182
3,0,1577,2.0,1260759185
4,0,1602,4.0,1260759205


<100004x9738 sparse matrix of type '<type 'numpy.float64'>'
	with 300012 stored elements in COOrdinate format> 

  (0, 9737)	2.5
  (0, 701)	1.0
  (0, 0)	1.0
  (0, 9737)	3.0
  (0, 1504)	1.0
  (0, 0)	1.0
  (0, 9737)	3.0
  (0, 1530)	1.0
  (0, 0)	1.0
  (0, 9737)	2.0
  (0, 1577)	1.0
  (0, 0)	1.0
  (0, 9737)	4.0
  (0, 1602)	1.0
  (0, 0)	1.0


Formatting checks out when done manually

# Benchmark Testing on Movielens Data

The following code was adapted from:  
http://arogozhnikov.github.io/2016/02/15/TestingLibFM.html

In [28]:
def fitpredict_logistic(trainX, trainY, testX, classification=True, **params):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    if classification:
        clf = LogisticRegression(**params)
        clf.fit(trainX, trainY)
        return clf.predict_proba(testX)[:, 1]
    else:
        clf = Ridge(**params)
        clf.fit(trainX, trainY)
        return clf.predict(testX)

def fitpredict_fastfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    if classification:
        clf = FMClassification(rank=rank, n_iter=n_iter)
        return clf.fit_predict_proba(trainX, trainY, testX)
    else:
        clf = FMRegression(rank=rank, n_iter=n_iter)
        return clf.fit_predict(trainX, trainY, testX)
    
def fitpredict_libfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    train_file = 'libfm_train.txt'
    test_file = 'libfm_test.txt'
    with open(train_file, 'w') as f:
        dump_svmlight_file(trainX, trainY, f=f)
    with open(test_file, 'w') as f:
        dump_svmlight_file(testX, np.zeros(testX.shape[0]), f=f)
    task = 'c' if classification else 'r'
    terminal_output = !{os.environ['LIBFM']} -task $task -method mcmc -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -out output.libfm
    
    libfm_pred = pd.read_csv('output.libfm', header=None).values.flatten()
    return libfm_pred

def test_on_dataset(trainX, testX, trainY, testY, task_name, classification=True, use_pylibfm=True):
    algorithms = OrderedDict()
    algorithms['logistic'] = fitpredict_logistic
    algorithms['fastFM']   = fitpredict_fastfm
    algorithms['libFM']    = fitpredict_libfm
    results = pd.DataFrame()

    for name, fit_predict in tqdm_notebook(algorithms.items()):
        if name == 'fastFM':
            start = time.time()
            predictions = fit_predict(trainX, trainY, testX, classification=classification)
            spent_time = time.time() - start
            results.ix[name, 'time'] = spent_time
            if classification:
                results.ix[name, 'ROC AUC'] = roc_auc_score(testY, predictions)
            else:
                rmse = np.mean((testY - predictions) ** 2) ** 0.5
                print rmse
                results.ix[name, 'RMSE'] = rmse
            
    all_results[task_name] = results
    with open('saved_results.pkl', 'w') as f:
        pickle.dump(all_results, f)
        
    return results

## Movielens 100k - No Side Data

In [19]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=False)

all_results = OrderedDict()
try:
    with open('./saved_results.pkl') as f:
        all_results = pickle.load(f)
except:
    pass

display(trainX.head()), '\n'
display(test_on_dataset(trainX, testX, trainY, testY, task_name='ml100k, ids', classification=False))

Unnamed: 0,user,movie
98980,810,900
69824,803,754
9928,51,286
75599,734,180
95621,896,95


0.915183767885



Unnamed: 0,time,RMSE
fastFM,3.071642,0.915184


In [37]:
a, b, c = trainX, trainY, testX

In [43]:
%store a
%store b
%store c

Stored 'a' (DataFrame)
Stored 'b' (ndarray)
Stored 'c' (DataFrame)


In [47]:
def fitpredict_fastfm(trainX, trainY, testX, classification=False, rank=8, n_iter=100):
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    if classification:
        clf = FMClassification(rank=rank, n_iter=n_iter)
        return clf.fit_predict_proba(trainX, trainY, testX)
    else:
        clf = FMRegression(rank=rank, n_iter=n_iter)
        clf.fit(trainX, trainY)
        return clf.predict(testX)
    
y_pred = fitpredict_fastfm(trainX, trainY, testX, classification=False)

In [49]:
def rmse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2) ** 0.5

rmse(testY, y_pred)

1.2265090491889807

In [33]:
y_pred

array([ 3.68386468,  3.60472251,  3.51373203, ...,  3.91739083,
        2.2853354 ,  3.95714795])

## Movielens 100k - With Side Data

In [50]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=True)
display(trainX.head()), '\n'

display(test_on_dataset(trainX, testX, trainY, testY, task_name='ml100k', classification=False))

Unnamed: 0,user,movie,age,gender,occupation,zip,released,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
98980,692,1310,33,0,7,615,68,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69824,931,528,48,1,3,59,57,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9928,216,553,12,1,13,110,67,0,1,1,...,0,0,0,0,0,0,0,0,0,0
75599,798,498,39,0,0,166,30,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95621,910,547,27,0,20,397,68,0,0,0,...,1,0,0,0,0,0,0,0,0,0


1.18657921322



Unnamed: 0,time,RMSE
fastFM,23.884366,1.186579


## Movielens 1M - No Side Data

In [None]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_1m(all_features=False)
trainX.head(), '\n'

display(test_on_dataset(trainX, testX, trainY, testY, task_name='ml-1m,ids', classification=False))

## Movielens 1M - With Side Data

In [None]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_1m(all_features=True)
trainX.head(), '\n'
display(test_on_dataset(trainX, testX, trainY, testY, task_name='ml-1m', classification=False))

# Train Test Split Data

In [181]:
X = ratings.loc[:, ['user_id', 'movie_id']].values
y = ratings.rating.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [182]:
def save_split_data(X, y, fn):
    df = (
        pd.concat(
            [pd.DataFrame(X, columns=['user_id', 'movie_id']),
             pd.DataFrame(y, columns=['target'])], axis=1)
    )

    df.to_csv('../data/' + fn, index=False)
    
save_split_data(X_train, y_train, 'train.csv')
save_split_data(X_test, y_test, 'test.csv')

# Dump to libfm format

In [183]:
print (
    'Unique Users:      {:}\n'
    'Unique Movies:     {:}\n'
    'Unique categories: {:}'
    .format(ratings.user_id.unique().shape[0],
            ratings.movie_id.unique().shape[0],
            ratings.user_id.unique().shape[0] + ratings.movie_id.unique().shape[0])
    )

Unique Users:      671
Unique Movies:     9066
Unique categories: 9737


In [184]:
encoder = OneHotEncoder(categorical_features=[0,1], handle_unknown='ignore')
encoder.fit(X)
X_train_encoded = encoder.transform(X_train)
X_test_encoded = encoder.transform(X_test)

print 'Training Data:\n', X_train_encoded.__repr__(), '\n\n'
print 'Test Data:\n', X_test_encoded.__repr__()

Training Data:
<75003x9737 sparse matrix of type '<type 'numpy.float64'>'
	with 150006 stored elements in Compressed Sparse Row format> 


Test Data:
<25001x9737 sparse matrix of type '<type 'numpy.float64'>'
	with 50002 stored elements in Compressed Sparse Row format>


In [185]:
dump_svmlight_file(X_train_encoded, y_train, '../data/train.libfm')
dump_svmlight_file(X_test_encoded, y_test, '../data/test.libfm')

### Confirm data looks good

In [176]:
print X_train_encoded[:5]

  (0, 2825)	1.0
  (0, 455)	1.0
  (1, 7720)	1.0
  (1, 663)	1.0
  (2, 1649)	1.0
  (2, 22)	1.0
  (3, 2572)	1.0
  (3, 118)	1.0
  (4, 4540)	1.0
  (4, 227)	1.0


In [212]:
t = pd.DataFrame(X_train[:5]).copy()
t

Unnamed: 0,0,1
0,456,2692
1,664,63062
2,23,1222
3,119,2391
4,228,4993


In [213]:
new_movie_mapping = movie_mapping + 1 + user_mapping.max()
t[0] = t[0].map(user_mapping)
t[1] = t[1].map(new_movie_mapping)
t

Unnamed: 0,0,1
0,455,2825
1,663,7720
2,22,1649
3,118,2572
4,227,4540


# Run Data in LibFM

In [233]:
def print_terminal_output(terminal_output):
    for i, line in enumerate(terminal_output[:25] + terminal_output[-5:]):
        if i == 25:
            print '\n...\n'
        print line

In [246]:
terminal_output = !{os.environ['LIBFM']} -task r -method mcmc -train ../data/train.libfm -test ../data/test.libfm -iter 1000 -dim '1,1,25' -out ../data/test_preds.libfm

print_terminal_output(terminal_output)

----------------------------------------------------------------------------
libFM
  Version: 1.4.2
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=75003	num_values=150006	num_features=9736	min_target=0.5	max_target=5
Loading test... 	
has x = 0
has xt = 1
num_rows=25001	num_values=50002	num_features=9737	min_target=0.5	max_target=5
#relations: 0
Loading meta data...	
#Iter=  0	Train=1.02772	Test=1.0378
#Iter=  1	Train=0.936432	Test=0.985164
#Iter=  2	Train=0.918953	Test=0.965476
#Iter=  3	Train=0.907734	Test=0.952903
#Iter=  4	Train=0.895337	Test=0.943468
#Iter=  5	Train=0.885787	Test=0.935869

...

#Iter=995	Train=0.533883	Test=0.866048
#Ite

# Read in Predictions from libfm output.libfm

In [247]:
scores = pd.read_csv('../data/test_preds.libfm', header=None, names=['score'])

In [248]:
rss = (
    pd.concat([pd.DataFrame(X_test, columns=['user_id', 'movie_id']),
               pd.DataFrame(y_test, columns=['rating']),
               scores], axis=1)
    .assign(rss=lambda df: (df.score - df.rating)**2)
)

display(rss.head())

print 'RMSE Score: {:0.6}'.format(np.sqrt(rss.rss.sum() / len(rss)))

Unnamed: 0,user_id,movie_id,rating,score,rss
0,23,1625,4.5,3.8296,0.449436
1,564,2801,3.0,3.70466,0.496546
2,665,1541,3.0,2.59937,0.160504
3,574,49530,4.0,3.97654,0.00055
4,472,841,4.0,3.73423,0.070634


RMSE Score: 0.866077


# Read in Execution and Scores of various models / platforms

In [280]:
with open('./saved_results.pkl', 'r') as f:
    summary = pickle.load(f)
dfs = []
for key in summary:
    mi = pd.MultiIndex.from_tuples(zip(*[[key]*len(level2), level2]))
    df = summary[key].T.set_index(mi).T
    dfs.append(df)

display(pd.concat(dfs, axis=1))

Unnamed: 0_level_0,"ml100k, ids","ml100k, ids",ml100k,ml100k
Unnamed: 0_level_1,time,RMSE,time,RMSE
logistic,0.325394,0.942662,2.338653,0.942347
fastFM,3.047696,0.915184,26.342513,0.896543
libFM,4.251008,0.914273,31.676842,0.895503
