## Factorization Machines
Factorization Machines have been described as state of the art for many
recommendation systems. Yet, experience has shown these models to suffer from slow
training and local minima. Use a large(ish) dataset and characterize where FMs are easy
to fit and accurate and where they are not.
1. Start with models that have no side information, and are only user and item ratings.
Specifically, subsample datasets from small to large, and subsample users/items
from sparsely-populated to well-populated, and train and test FMs. Where do they
work the best? Where do they fail? Can you set good rules of thumbs for their
training and use?
2. Next use side information about users or items. Answer the same questions as
above.

In [145]:
import pandas as pd
import numpy as np
import pandasql as ps
import math

pd.set_option('display.max_rows', 10)

In [146]:
anime = pd.read_csv('anime.csv')
rating = pd.read_csv('rating.csv')

In [147]:
q = '''SELECT anime_id, genre FROM anime LIMIT 10'''
print(ps.sqldf(q, locals()))

   anime_id                                              genre
0     32281               Drama, Romance, School, Supernatural
1      5114  Action, Adventure, Drama, Fantasy, Magic, Mili...
2     28977  Action, Comedy, Historical, Parody, Samurai, S...
3      9253                                   Sci-Fi, Thriller
4      9969  Action, Comedy, Historical, Parody, Samurai, S...
5     32935             Comedy, Drama, School, Shounen, Sports
6     11061            Action, Adventure, Shounen, Super Power
7       820                     Drama, Military, Sci-Fi, Space
8     15335  Action, Comedy, Historical, Parody, Samurai, S...
9     15417  Action, Comedy, Historical, Parody, Samurai, S...


In [189]:
# the function to deal with the genre
def genre(data):
    subset = data[['anime_id', 'genre']].copy()
    subset.drop('genre', 1).join(subset['genre'].str.get_dummies(sep=', '))
    return subset

In [190]:
ret = genre(anime)
print(ret)

       anime_id                                               name  \
0         32281                                     Kimi no Na wa.   
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama°   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
...         ...                                                ...   
12289      9316       Toushindai My Lover: Minami tai Mecha-Minami   
12290      5543                                        Under World   
12291      5621                     Violence Gekiga David no Hoshi   
12292      6133  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293     26081                   Yasuji no Pornorama: Yacchimae!!   

                                                   genre   type episodes  \
0                   Drama, Romance, School, Supernatural  Movie        1   
1      

In [166]:
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [174]:
anime.loc[:]['anime_id']

0        32281
1         5114
2        28977
3         9253
4         9969
         ...  
12289     9316
12290     5543
12291     5621
12292     6133
12293    26081
Name: anime_id, Length: 12294, dtype: int64

In [199]:
anime_copy = anime.copy()
anime_copy.drop('genre', 1).join(anime_copy['genre'].str.get_dummies(sep=', '))

subset = anime.copy()
subset.drop('genre',1).join(subset['genre'].str.get_dummies(sep=', '))

print(anime_copy)

       anime_id                                               name  \
0         32281                                     Kimi no Na wa.   
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama°   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
...         ...                                                ...   
12289      9316       Toushindai My Lover: Minami tai Mecha-Minami   
12290      5543                                        Under World   
12291      5621                     Violence Gekiga David no Hoshi   
12292      6133  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293     26081                   Yasuji no Pornorama: Yacchimae!!   

                                                   genre   type episodes  \
0                   Drama, Romance, School, Supernatural  Movie        1   
1      

In [201]:
df = anime[['anime_id', 'genre']].copy()
df.drop('genre', 1).join(df['genre'].str.get_dummies(sep=', '))

Unnamed: 0,anime_id,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,32281,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,28977,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,5543,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,5621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,6133,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [207]:
subset = anime[['anime_id', 'genre']].copy()
subset.drop('genre', 1).join(subset['genre'].str.get_dummies(sep=', '))


Unnamed: 0,anime_id,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,32281,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,28977,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,5543,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,5621,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,6133,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [211]:
subset = anime.copy()
subset.drop('genre',1).join(subset['genre'].str.get_dummies(sep=', '))
print(subset)

       anime_id                                               name  \
0         32281                                     Kimi no Na wa.   
1          5114                   Fullmetal Alchemist: Brotherhood   
2         28977                                           Gintama°   
3          9253                                        Steins;Gate   
4          9969                                      Gintama&#039;   
...         ...                                                ...   
12289      9316       Toushindai My Lover: Minami tai Mecha-Minami   
12290      5543                                        Under World   
12291      5621                     Violence Gekiga David no Hoshi   
12292      6133  Violence Gekiga Shin David no Hoshi: Inma Dens...   
12293     26081                   Yasuji no Pornorama: Yacchimae!!   

                                                   genre   type episodes  \
0                   Drama, Romance, School, Supernatural  Movie        1   
1      

In [213]:
print(rating.head(10))

   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       241      -1
5        1       355      -1
6        1       356      -1
7        1       442      -1
8        1       487      -1
9        1       846      -1


In [214]:
q = '''SELECT COUNT(DISTINCT user_id) FROM rating'''
print(ps.sqldf(q, locals()))

   COUNT(DISTINCT user_id)
0                    73515


In [None]:
MAX_ITEM = 12294
MAX_USER = 73515