In [1]:
import h2o
import time
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator

import os 
from os.path import join, split

from io import StringIO
import pandas as pd

# Data directory: the "data" folder located in the parent of the current
# working directory (i.e. <parent of cwd>/data).
data_path = join(os.path.dirname(os.getcwd()), 'data')

In [2]:
# Connect to an already-running H2O cluster. The address can be overridden
# through the H2O_URL environment variable so the notebook is not tied to one
# machine; when the variable is unset the original cluster address is used.
h2o_url = os.environ.get('H2O_URL', 'http://192.168.1.100:54321')
h2o.init(url=h2o_url)

Checking whether there is an H2O instance running at http://192.168.1.100:54321. connected.


0,1
H2O cluster uptime:,2 mins 37 secs
H2O cluster version:,3.10.5.4
H2O cluster version age:,22 days
H2O cluster name:,kuba
H2O cluster total nodes:,1
H2O cluster free memory:,7.101 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://192.168.1.100:54321


In [3]:
# Load the raw ratings table (one row per user/movie rating).
# NOTE(review): the file is parsed with '%' as the field separator; this
# presumably reflects a pre-converted copy of ratings.dat -- confirm against
# the file actually stored under data/ml-1m.
filename = join(data_path, 'ml-1m', 'ratings.dat')
ratings_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
pd_ratings_frame = pd.read_csv(filename, names=ratings_columns, sep='%')

# Summary statistics as a quick sanity check of the parse.
pd_ratings_frame.describe()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
count,1000209.0,1000209.0,1000209.0,1000209.0
mean,3024.512,1865.54,3.581564,972243700.0
std,1728.413,1096.041,1.117102,12152560.0
min,1.0,1.0,1.0,956703900.0
25%,1506.0,1030.0,3.0,965302600.0
50%,3070.0,1835.0,4.0,973018000.0
75%,4476.0,2770.0,4.0,975220900.0
max,6040.0,3952.0,5.0,1046455000.0


In [4]:
# Movie metadata (MovieID, Title, Genres), read with latin-1 encoding.
movies_file = join(data_path, 'ml-1m', 'movies.dat')
pd_movies_frame = pd.read_csv(
    movies_file,
    sep='%',
    names=['MovieID', 'Title', 'Genres'],
    encoding='latin-1',
)

# Attach the movie title to every rating by joining on MovieID.
movie_titles = pd_movies_frame[['MovieID', 'Title']]
pd_all_frame = pd_ratings_frame.merge(movie_titles, on='MovieID')
pd_all_frame.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975)
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975)
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975)
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975)
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975)


In [5]:
# Reshape the long ratings table into a user x movie matrix: one row per
# UserID, one column per Title, cells holding the Rating (missing where the
# user has not rated the movie).
# NOTE: this rebinds pd_ratings_frame from the long ratings table to the wide
# matrix, so the earlier cells must be re-run before re-running the merge.
pd_ratings_frame = pd_all_frame.pivot(
    index='UserID',
    columns='Title',
    values='Rating',
)
pd_ratings_frame.head()

Title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Upload the user x movie matrix to H2O, declaring every column as 'float'
# and treating the literal string 'nan' as a missing value.
n_titles = pd_ratings_frame.shape[1]
ratings_frame = h2o.H2OFrame(
    pd_ratings_frame,
    column_types=['float'] * n_titles,
    na_strings=['nan'],
)

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Your ratings

In [41]:
# Personal ratings as (title, rating) pairs. Titles are written exactly as
# they are expected to appear in the dataset's Title column.
rated_titles = [
    ('Matrix, The (1999)', 5),
    ('Godfather, The (1972)', 4),
    ('Fear and Loathing in Las Vegas (1998)', 5),
    ('Trainspotting (1996)', 4),
    ('Clockwork Orange, A (1971)', 5),
    ('Full Metal Jacket (1987)', 5),
    ('Total Recall (1990)', 4),
    ('Alien (1979)', 5),
    ('GoodFellas (1990)', 4),
    ('American Psycho (2000)', 5),
    ('Home Alone 2: Lost in New York (1992)', 3),
    ('2001: A Space Odyssey (1968)', 5),
    ('Star Wars: Episode I - The Phantom Menace (1999)', 1),
    ('Conan the Barbarian (1982)', 2),
]

# Transpose the pairs into two parallel tuples: (all titles, all ratings).
my_ratings = list(zip(*rated_titles))
titles, scores = my_ratings

# Long-format frame; every row belongs to the synthetic user with id 0.
my_ratings_df = pd.DataFrame(data={
    'UserId': [0] * len(titles),
    'Title': titles,
    'Rating': scores,
})
my_ratings_df

Unnamed: 0,Rating,Title,UserId
0,5,"Matrix, The (1999)",0
1,4,"Godfather, The (1972)",0
2,5,Fear and Loathing in Las Vegas (1998),0
3,4,Trainspotting (1996),0
4,5,"Clockwork Orange, A (1971)",0
5,5,Full Metal Jacket (1987),0
6,4,Total Recall (1990),0
7,5,Alien (1979),0
8,4,GoodFellas (1990),0
9,5,American Psycho (2000),0


In [42]:
# Turn the long (UserId, Title, Rating) table into a wide frame with one row
# per UserId and one column per rated Title.
pivoted_df = my_ratings_df.pivot(index='UserId', columns='Title', values='Rating')

In [43]:
# The personal titles are typed by hand, and reindex() below silently drops
# any column that is not present in the dataset. Fail loudly instead, so a
# misspelled title cannot quietly vanish from the training data.
unknown_titles = pivoted_df.columns.difference(pd_ratings_frame.columns)
assert unknown_titles.empty, (
    'Titles not found in the ratings data: {}'.format(list(unknown_titles)))

# Expand the personal row to the full set of movie columns (same order as the
# user x movie matrix); movies without a personal rating become missing values.
full_my_df = pivoted_df.reindex(columns=pd_ratings_frame.columns)
full_my_df.shape

(1, 3706)

## Your recommendations H2O Frame

In [44]:
# Stack the personal row on top of the user x movie matrix; because it is
# listed first, the personal ratings end up in row 0 of the combined frame.
pd_full_frame = pd.concat([full_my_df, pd_ratings_frame], axis=0)

# Upload to H2O with every column declared numeric and 'nan' read as missing.
# NOTE(review): header=-1 is passed through unchanged; its effect when the
# source is a pandas DataFrame should be confirmed against the H2O docs.
n_columns = pd_full_frame.shape[1]
full_h2o_frame = h2o.H2OFrame(
    pd_full_frame,
    header=-1,
    column_types=['numeric'] * n_columns,
    na_strings=['nan'],
)

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Training GLRM model

In [45]:
from h2o.estimators import H2OGeneralizedLowRankEstimator

In [46]:
glrm = H2OGeneralizedLowRankEstimator(
  k=10,
  regularization_x='L1',
  regularization_y='L1',
  transform='Demean',
  max_iterations=50)

%time glrm.train(training_frame=full_h2o_frame)

glrm Model Build progress: |██████████████████████████████████████████████| 100%
CPU times: user 1.72 s, sys: 84 ms, total: 1.8 s
Wall time: 2min 36s


Unfortunately loading GLRM models seems to crash H2O...

model_path = h2o.save_model(model=glrm, path=join(data_path, 'glrm_model'))

model_path

h2o.load_model(model_path)

In [53]:
%time reconstructed_frame = glrm.reconstruct(full_h2o_frame, reverse_transform=True)
reconstructed_pd_frame = reconstructed_frame.head().as_data_frame()
reconstructed_pd_frame.columns = reconstructed_pd_frame.columns.str.replace('reconstr_', '')


CPU times: user 532 ms, sys: 8 ms, total: 540 ms
Wall time: 5.68 s


In [54]:
# Persist the reconstructed ratings with IPython's %store magic so they can be
# restored in another kernel/session via `%store -r reconstructed_pd_frame`.
%store reconstructed_pd_frame

Stored 'reconstructed_pd_frame' (DataFrame)


## Print your top 20 movies according to the model

In [60]:
# Row 0 holds the personal ratings (that row was concatenated first), so rank
# its predicted ratings from highest to lowest and show the best 20.
reconstructed_pd_frame.iloc[0].sort_values(ascending=False).head(20)

American History X (1998)               5.375186
13th Warrior, The (1999)                5.301784
American Beauty (1999)                  5.239200
American Pie (1999)                     5.211827
American Graffiti (1973)                5.180299
8MM (1999)                              5.176998
Amityville Horror, The (1979)           5.127135
Abyss, The (1989)                       5.105018
Amistad (1997)                          5.051575
Any Given Sunday (1999)                 5.023965
Apocalypse Now (1979)                   5.017250
187 (1997)                              5.012982
Aliens (1986)                           5.004416
Anatomy of a Murder (1959)              4.972820
39 Steps, The (1935)                    4.967766
Angel and the Badman (1947)             4.959243
Albino Alligator (1996)                 4.945949
Amityville II: The Possession (1982)    4.937399
Anchors Aweigh (1945)                   4.932135
Alien (1979)                            4.901682
Name: 0, dtype: float64

Whoa! I don't think I'd like American Pie, but I really do think that American History X, American Beauty and Apocalypse Now are great movies.