In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import json
import re
import cPickle as pickle
import time
from IPython.display import display
from collections import OrderedDict
from tqdm import tqdm_notebook

import load_problems
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.datasets import dump_svmlight_file
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.cross_validation import train_test_split
from fastFM.mcmc import FMClassification, FMRegression

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context with fonts scaled by 1.6.
sns.set_context('notebook', font_scale=1.6)

# Read in Movielens Dataset

In [12]:
# Directory holding the MovieLens "small" CSV files. The author's Dropbox
# folder remains the default, but the location can be overridden with the
# MOVIELENS_PATH environment variable so the notebook runs on other machines.
# Joining with '' guarantees a trailing separator, so `path + filename` still works.
path = os.path.join(
    os.environ.get('MOVIELENS_PATH', '/Users/scottcronin/Dropbox/data/movielens-small/'),
    '')
fn_ratings = 'ratings.csv'
fn_tags = 'tags.csv'
fn_movies = 'movies.csv'
fn_links = 'links.csv'


def format_columns(x):
    """Convert a camelCase column name to snake_case (e.g. 'userId' -> 'user_id')."""
    return re.sub(r'(?<=[a-z])(?=[A-Z])', '_', x).lower()


def _read_movielens_csv(filename):
    """Read `filename` from `path` and rename its columns with `format_columns`."""
    frame = pd.read_csv(os.path.join(path, filename))
    frame.columns = frame.columns.map(format_columns)
    return frame


# Ratings are kept sorted by user, then movie, then timestamp (all ascending).
ratings = _read_movielens_csv(fn_ratings).sort_values(
    ['user_id', 'movie_id', 'timestamp'], ascending=True)
movies = _read_movielens_csv(fn_movies)
tags = _read_movielens_csv(fn_tags)
links = _read_movielens_csv(fn_links)

In [13]:
def show_df(df):
    """Print a summary of `df`, display its first three rows, then a separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    """
    # DataFrame.info() writes the summary to stdout itself and returns None, so
    # wrapping it in `print` only appended a stray "None" line to the output.
    # null_counts is an on/off switch; pass True rather than the truthy 30.
    df.info(null_counts=True)
    display(df.head(3))
    print('------------------\n\n')
    
# Summarise each of the four MovieLens tables in turn.
for frame in (ratings, movies, tags, links):
    show_df(frame)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100004 entries, 0 to 100003
Data columns (total 4 columns):
user_id      100004 non-null int64
movie_id     100004 non-null int64
rating       100004 non-null float64
timestamp    100004 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.8 MB
None


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182


------------------


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
movie_id    9125 non-null int64
title       9125 non-null object
genres      9125 non-null object
dtypes: int64(1), object(2)
memory usage: 213.9+ KB
None


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


------------------


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296 entries, 0 to 1295
Data columns (total 4 columns):
user_id      1296 non-null int64
movie_id     1296 non-null int64
tag          1296 non-null object
timestamp    1296 non-null int64
dtypes: int64(3), object(1)
memory usage: 40.6+ KB
None


Unnamed: 0,user_id,movie_id,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997


------------------


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
movie_id    9125 non-null int64
imdb_id     9125 non-null int64
tmdb_id     9112 non-null float64
dtypes: float64(1), int64(2)
memory usage: 213.9 KB
None


Unnamed: 0,movie_id,imdb_id,tmdb_id
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0


------------------




## Testing One Hot Encoding

### Dummy data

In [None]:
encoder = OneHotEncoder(categorical_features=[0,2], handle_unknown='ignore')
dummy = np.array([[0, 40, 3], [1, 50, 0], [0, 45, 1], [1, 30, 2]])
# dummy = np.array([[0, 3], [1, 0], [0, 1], [1, 2]])

print dummy, '\n'
print encoder.fit_transform(dummy), '\n'
print encoder.fit_transform(dummy).todense()

### Clean Users and Movies

In [None]:
def _contiguous_id_mapping(frame, id_column):
    """Map every distinct value of `id_column` to its 0-based rank in sorted order.

    Returns a Series named 'index' whose index holds the original ids.
    """
    distinct_ids = (
        frame[[id_column]]
        .drop_duplicates()
        .sort_values(id_column)
        .reset_index(drop=True)
    )
    # The second reset_index turns the fresh 0..n-1 positions into an 'index' column.
    return distinct_ids.reset_index().set_index(id_column)['index']


movie_mapping = _contiguous_id_mapping(ratings, 'movie_id')
user_mapping = _contiguous_id_mapping(ratings, 'user_id')

# Copy of the ratings with both id columns replaced by their contiguous codes.
ratings2 = (
    ratings
    .sort_values(['user_id', 'movie_id', 'timestamp'], ascending=[1,1,1])
    .assign(movie_id=ratings.movie_id.map(movie_mapping),
            user_id=ratings.user_id.map(user_mapping))
)

In [None]:
display((
    ratings2
    .assign(user_id=lambda df: df.user_id)
    .assign(movie_id=lambda df: df.movie_id + len(df.user_id.unique()))
).head())

encoder = OneHotEncoder(categorical_features=[0,1], handle_unknown='ignore')
ratings_encoded = encoder.fit_transform(ratings.loc[:, ['user_id', 'movie_id', 'rating']])
print ratings_encoded.__repr__(), '\n'

for row in xrange(5):
    print ratings_encoded.getrow(row)

Formatting checks out when done manually

# Read in Benchmark Movielens Data

In [95]:
def fitpredict_logistic(trainX, trainY, testX, classification=True, **params):
    """Fit a linear baseline on one-hot encoded features and predict for testX.

    The OneHotEncoder (handle_unknown='ignore') is fitted on trainX only and
    applied to both splits. Extra keyword arguments are forwarded to the model
    constructor.

    Returns the second column of LogisticRegression.predict_proba when
    `classification` is true, otherwise Ridge.predict.
    """
    one_hot = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    train_features = one_hot.transform(trainX)
    test_features = one_hot.transform(testX)

    if not classification:
        regressor = Ridge(**params)
        regressor.fit(train_features, trainY)
        return regressor.predict(test_features)

    classifier = LogisticRegression(**params)
    classifier.fit(train_features, trainY)
    return classifier.predict_proba(test_features)[:, 1]

def fitpredict_fastfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    """Fit a fastFM MCMC factorization machine and predict for testX.

    Features are one-hot encoded with an encoder fitted on trainX only
    (handle_unknown='ignore'). `rank` and `n_iter` are passed straight to the
    fastFM model.

    Returns FMClassification.fit_predict_proba when `classification` is true,
    otherwise FMRegression.fit_predict.
    """
    one_hot = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    train_features = one_hot.transform(trainX)
    test_features = one_hot.transform(testX)

    if not classification:
        regressor = FMRegression(rank=rank, n_iter=n_iter)
        return regressor.fit_predict(train_features, trainY, test_features)

    classifier = FMClassification(rank=rank, n_iter=n_iter)
    return classifier.fit_predict_proba(train_features, trainY, test_features)
    
def fitpredict_libfm(trainX, trainY, testX, classification=True, rank=8, n_iter=100):
    """Train libFM (MCMC) through its command-line binary and return test predictions.

    Features are one-hot encoded with an encoder fitted on trainX only, written
    to 'libfm_train.txt' / 'libfm_test.txt' in svmlight format in the current
    working directory, and the executable named by the LIBFM environment
    variable is run on them. Predictions are read back from 'output.libfm' and
    returned as a flat array. None of these files is removed afterwards.

    The `!` shell escape below means this function only runs under IPython/Jupyter.
    """
    encoder = OneHotEncoder(handle_unknown='ignore').fit(trainX)
    trainX = encoder.transform(trainX)
    testX = encoder.transform(testX)
    train_file = 'libfm_train.txt'
    test_file = 'libfm_test.txt'
    with open(train_file, 'w') as f:
        dump_svmlight_file(trainX, trainY, f=f)
    with open(test_file, 'w') as f:
        # The test targets are not passed to this function, so zeros are
        # written as placeholder labels for the test file.
        dump_svmlight_file(testX, np.zeros(testX.shape[0]), f=f)
    # libFM task flag: 'c' when classification is requested, 'r' otherwise.
    task = 'c' if classification else 'r'
    # NOTE(review): -dim '1,1,$rank' presumably means bias on, linear terms on,
    # `rank` pairwise factors - confirm against the libFM manual.
    !{os.environ['LIBFM']} -task $task -method mcmc -train $train_file -test $test_file -iter $n_iter -dim '1,1,$rank' -out output.libfm
    
    # output.libfm is read without a header row and flattened to 1-D.
    libfm_pred = pd.read_csv('output.libfm', header=None).values.flatten()
    return libfm_pred

def test_on_dataset(trainX, testX, trainY, testY, task_name, classification=True, use_pylibfm=True):
    """Benchmark the logistic/ridge, fastFM and libFM wrappers on one dataset.

    Parameters
    ----------
    trainX, testX : feature tables handed unchanged to each fitpredict_* wrapper.
    trainY, testY : training targets and held-out targets used for scoring.
    task_name : key under which the result table is stored in `all_results`.
    classification : if true, score with ROC AUC; otherwise with RMSE.
    use_pylibfm : unused; kept so existing callers do not break.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with a 'time' column (seconds spent in the
        wrapper) and a 'ROC AUC' or 'RMSE' column.

    Side effects
    ------------
    Stores the table in the module-level `all_results` dict (which must already
    exist) and rewrites 'saved_results.pkl' with the whole dict.
    """
    algorithms = OrderedDict()
    algorithms['logistic'] = fitpredict_logistic
    algorithms['fastFM']   = fitpredict_fastfm
    algorithms['libFM']    = fitpredict_libfm
    results = pd.DataFrame()

    for name, fit_predict in tqdm_notebook(algorithms.items()):
        start = time.time()
        predictions = fit_predict(trainX, trainY, testX, classification=classification)
        spent_time = time.time() - start
        # .loc replaces the deprecated .ix indexer; the labels here are strings,
        # so the label-based lookup (and row/column enlargement) is the same.
        results.loc[name, 'time'] = spent_time
        if classification:
            results.loc[name, 'ROC AUC'] = roc_auc_score(testY, predictions)
        else:
            results.loc[name, 'RMSE'] = np.mean((testY - predictions) ** 2) ** 0.5

    all_results[task_name] = results
    # Pickle streams are bytes: write in binary mode so the file is portable.
    with open('saved_results.pkl', 'wb') as f:
        pickle.dump(all_results, f)

    return results

## Movielens 100k - No Side Data

In [99]:
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k()

# Resume from previously saved benchmark tables when a results file exists;
# otherwise start with an empty collection.
all_results = OrderedDict()
if os.path.exists('./saved_results.pkl'):
    try:
        # Pickle streams are bytes, so read in binary mode.
        # NOTE: unpickling can execute arbitrary code - only load a file this
        # notebook wrote itself.
        with open('./saved_results.pkl', 'rb') as f:
            all_results = pickle.load(f)
    except Exception as err:
        # Still best-effort (fall back to empty results), but report the
        # problem instead of hiding it, and let KeyboardInterrupt through.
        print('Could not load ./saved_results.pkl ({!r}); starting with empty results'.format(err))

test_on_dataset(trainX, testX, trainY, testY, task_name='ml100k, ids', classification=False)

----------------------------------------------------------------------------
libFM
  Version: 1.4.2
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=75000	num_values=150000	num_features=2585	min_target=1	max_target=5
Loading test... 	
has x = 0
has xt = 1
num_rows=25000	num_values=49953	num_features=2570	min_target=0	max_target=0
#relations: 0
Loading meta data...	
#Iter=  0	Train=1.09686	Test=3.52148
#Iter=  1	Train=0.991496	Test=3.54666
#Iter=  2	Train=0.952497	Test=3.55908
#Iter=  3	Train=0.933834	Test=3.5653
#Iter=  4	Train=0.927805	Test=3.56799
#Iter=  5	Train=0.925	Test=3.5698
#Iter=  6	Train=0.924065	Test=3.57231
#Iter=  7	Train=0.924506

Unnamed: 0,time,RMSE
logistic,0.311443,0.942664
fastFM,3.043929,0.915184
libFM,4.34659,0.913785


In [102]:
# Show the table that test_on_dataset stored under the task name 'ml100k, ids'.
display(all_results['ml100k, ids'])

Unnamed: 0,time,RMSE
logistic,0.311443,0.942664
fastFM,3.043929,0.915184
libFM,4.34659,0.913785


## Movielens 100k - With Side Data

In [104]:
# Reload MovieLens 100k, this time requesting all features rather than ids only.
trainX, testX, trainY, testY = load_problems.load_problem_movielens_100k(all_features=True)
# Preview the feature columns before benchmarking.
display(trainX.head())
# Regression benchmark; results are stored under the task name 'ml100k'.
test_on_dataset(trainX, testX, trainY, testY, task_name='ml100k', classification=False)

Unnamed: 0,user,movie,age,gender,occupation,zip,released,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
98980,692,1310,33,0,7,615,68,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69824,931,528,48,1,3,59,57,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9928,216,553,12,1,13,110,67,0,1,1,...,0,0,0,0,0,0,0,0,0,0
75599,798,498,39,0,0,166,30,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95621,910,547,27,0,20,397,68,0,0,0,...,1,0,0,0,0,0,0,0,0,0


----------------------------------------------------------------------------
libFM
  Version: 1.4.2
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=75000	num_values=1950000	num_features=3587	min_target=1	max_target=5
Loading test... 	
has x = 0
has xt = 1
num_rows=25000	num_values=649958	num_features=3587	min_target=0	max_target=0
#relations: 0
Loading meta data...	
#Iter=  0	Train=1.06404	Test=3.55201
#Iter=  1	Train=0.946712	Test=3.55643
#Iter=  2	Train=0.926629	Test=3.56272
#Iter=  3	Train=0.916591	Test=3.56678
#Iter=  4	Train=0.909914	Test=3.56898
#Iter=  5	Train=0.905324	Test=3.57067
#Iter=  6	Train=0.900602	Test=3.57208
#Iter=  7	Train=0

Unnamed: 0,time,RMSE
logistic,3.343506,0.942348
fastFM,26.306622,0.896543
libFM,31.071487,0.896075


In [105]:
# Show the table that test_on_dataset stored under the task name 'ml100k'.
display(all_results['ml100k'])

Unnamed: 0,time,RMSE
logistic,3.343506,0.942348
fastFM,26.306622,0.896543
libFM,31.071487,0.896075


## Movielens 1M - No Side Data

In [108]:
# MovieLens 1M with all_features=False.
trainX, testX, trainY, testY = load_problems.load_problem_movielens_1m(all_features=False)
# A bare expression in the middle of a cell is discarded; display() is needed
# for the preview to appear, as in the 100k side-data cell.
display(trainX.head())
# Regression benchmark; results are stored under the task name 'ml-1m,ids'.
test_on_dataset(trainX, testX, trainY, testY, task_name='ml-1m,ids', classification=False)

Unnamed: 0,user,movie
610738,3703,3541
324752,1923,756
808217,4836,1288
133807,866,1106
431857,2630,2857


In [None]:
# Show the table that test_on_dataset stored under the task name 'ml-1m,ids'.
display(all_results['ml-1m,ids'])

## Movielens 1M - With Side Data

In [None]:
# MovieLens 1M with all_features=True.
trainX, testX, trainY, testY = load_problems.load_problem_movielens_1m(all_features=True)
# A bare expression in the middle of a cell is discarded; display() is needed
# for the preview to appear, as in the 100k side-data cell.
display(trainX.head())
# Regression benchmark; results are stored under the task name 'ml-1m'.
test_on_dataset(trainX, testX, trainY, testY, task_name='ml-1m', classification=False)

In [None]:
# Show the table that test_on_dataset stored under the task name 'ml-1m'.
display(all_results['ml-1m'])

# Train Test Split Data

In [38]:
# Features are the raw (user_id, movie_id) pairs; the target is the rating.
X = ratings[['user_id', 'movie_id']].values
y = ratings['rating'].values

# A fixed seed makes the shuffle repeatable; the split size is left at its default.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

NameError: name 'ratings' is not defined

In [15]:
def save_split_data(X, y, fn):
    """Write one data split to ./data/<fn> as a CSV file.

    Parameters
    ----------
    X : two-column array of (user_id, movie_id) pairs.
    y : one target value per row of X.
    fn : file name (not path) to create inside ./data/.

    The file has the columns user_id, movie_id, target and no index column.
    """
    # Build the frame from the arguments. The original read the globals
    # X_train / y_train here, so every call wrote the training split
    # regardless of what was passed in.
    df = (
        pd.concat(
            [pd.DataFrame(X, columns=['user_id', 'movie_id']),
             pd.DataFrame(y, columns=['target'])], axis=1)
    )

    # Create the output directory on first use instead of failing in to_csv.
    out_dir = './data/'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    df.to_csv(out_dir + fn, index=False)

In [16]:
# Write the training split first, then the test split.
for features, target, filename in ((X_train, y_train, 'train.csv'),
                                   (X_test, y_test, 'test.csv')):
    save_split_data(features, target, filename)

# Dump to libfm format

In [17]:
# Count distinct users and movies; their sum is reported as the category count.
n_users = len(ratings.user_id.unique())
n_movies = len(ratings.movie_id.unique())

summary = (
    'Unique Users:      {:}\n'
    'Unique Movies:     {:}\n'
    'Unique categories: {:}'
)
print(summary.format(n_users, n_movies, n_users + n_movies))

Unique Users:      671
Unique Movies:     9066
Unique categories: 9737


In [18]:
# Fit the encoder on the complete feature matrix X rather than on a single
# split; both columns (user_id, movie_id) are declared categorical.
# NOTE: the following cell repeats this exact fit before transforming the splits.
encoder = OneHotEncoder(categorical_features=[0,1], handle_unknown='ignore')
encoder.fit(X)

OneHotEncoder(categorical_features=[0, 1], dtype=<type 'float'>,
       handle_unknown='ignore', n_values='auto', sparse=True)

In [23]:
encoder = OneHotEncoder(categorical_features=[0,1], handle_unknown='ignore')
encoder.fit(X)
X_train_encoded = encoder.transform(X_train)
X_test_encoded = encoder.transform(X_test)

print 'Training Data:\n', X_train_encoded.__repr__(), '\n\n'
print 'Test Data:\n', X_test_encoded.__repr__()

Training Data:
<75003x9737 sparse matrix of type '<type 'numpy.float64'>'
	with 150006 stored elements in Compressed Sparse Row format> 


Test Data:
<25001x9737 sparse matrix of type '<type 'numpy.float64'>'
	with 50002 stored elements in Compressed Sparse Row format>


In [24]:
# Write each encoded split with its targets in svmlight format, train first.
for features, target, out_path in ((X_train_encoded, y_train, './data/train.libfm'),
                                   (X_test_encoded, y_test, './data/test.libfm')):
    dump_svmlight_file(features, target, out_path)

# Read in Predictions from libfm output.libfm

In [None]:
# Load the predictions into a single 'score' column; the file has no header row.
# NOTE(review): presumably produced by running libFM on ./data/test.libfm
# outside this notebook - confirm.
scores = pd.read_csv('./data/test_output.libfm', header=None, names=['score'])

In [None]:
# NOTE(review): assumes ./data/test_output.libfm holds one prediction per row
# of ./data/test.libfm (i.e. of X_test / y_test), in the same order - confirm.
#
# The original index-joined the scores onto the full, unshuffled `ratings`
# frame, which pairs each prediction with an unrelated rating and divides the
# squared-error sum by the full row count rather than the number of predictions.
if len(scores) != len(y_test):
    raise ValueError(
        'Expected {} predictions (one per test row), found {}'.format(len(y_test), len(scores)))

rmse = (
    pd.DataFrame(X_test, columns=['user_id', 'movie_id'])
    .assign(rating=y_test, score=scores.score.values)
    .assign(rss=lambda df: (df.score - df.rating)**2)
)
display(rmse.head())

# Root of the mean squared error over the test rows only.
print('RMSE Score: {:0.6}'.format(np.sqrt(rmse.rss.mean())))

In [11]:
# Peek at the first five (user_id, movie_id) rows of the training split.
X_train[:5]

array([[  456,  2692],
       [  664, 63062],
       [   23,  1222],
       [  119,  2391],
       [  228,  4993]])