---

## Content Based Recommenders

In [56]:
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the movie catalogue and the user ratings from CSV files located
# relative to the notebook's working directory.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
# Restrict both tables to the 1000 movies that received the most ratings.
rating_counts = ratings['movieId'].value_counts()
select_movies = rating_counts.index[:1000].to_list()
movies = movies[movies['movieId'].isin(select_movies)]
ratings = ratings[ratings['movieId'].isin(select_movies)]

In [4]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [5]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807


In [6]:
# Build a movie x genre indicator matrix (1 = movie carries that genre).
# One row per (movie, genre) pair first, then count the pairs per cell.
genre_long = movies[['movieId', 'genres']].copy()
genre_long['genres'] = genre_long['genres'].str.split('|')
genre_long = genre_long.explode('genres')

# crosstab counts occurrences directly, so the result does not depend on the
# `title` column being non-null; clip() collapses a genre listed twice for the
# same movie back to a 0/1 indicator instead of raising.
m = pd.crosstab(genre_long['movieId'], genre_long['genres']).clip(upper=1).astype(int)

In [7]:
# Genre indicator vectors of the first two movies in `m`, as NumPy arrays.
# NOTE(review): presumably sample inputs for trying out the distance function;
# they are not referenced by the cells visible here.
a, b = m.iloc[0].to_numpy(), m.iloc[1].to_numpy()

In [8]:
def hamming_distance(x, y):
    """Return the sum of absolute element-wise differences between x and y.

    For 0/1 indicator vectors this is the Hamming distance, i.e. the number
    of positions in which the two vectors disagree.

    Parameters
    ----------
    x, y : array-like of numbers with the same length
        Lists, NumPy arrays or pandas Series. Elements are compared by
        position.

    Returns
    -------
    NumPy scalar
        The summed absolute difference.
    """
    # Reduce inside NumPy rather than with the builtin sum(), which iterates
    # element by element in the interpreter; asarray also lets plain lists in.
    return np.abs(np.asarray(x) - np.asarray(y)).sum()

In [9]:
# Distance between every ordered pair of distinct movies, based on their
# genre indicator vectors in `m`.
movie_ids = m.index.to_numpy()
genre_matrix = m.to_numpy()

# Row i holds the distance from movie i to every movie: the sum of absolute
# differences, the same quantity hamming_distance() returns for a single pair.
# One vectorised pass per movie replaces the pair-by-pair loop with .loc lookups.
pairwise = np.vstack([
    np.abs(genre_matrix - genre_row).sum(axis=1) for genre_row in genre_matrix
])

# Drop the diagonal (a movie paired with itself). Rows stay in query-major
# order with candidates in index order.
n_movies = len(movie_ids)
off_diagonal = ~np.eye(n_movies, dtype=bool)
ranks = pd.DataFrame({
    'query': np.repeat(movie_ids, n_movies)[off_diagonal.ravel()],
    'candidate': np.tile(movie_ids, n_movies)[off_diagonal.ravel()],
    'distance': pairwise[off_diagonal],
})

# Attach readable titles for both sides of each pair, then order each query's
# candidates from most to least similar.
# The 'tittle' spelling of the column names is kept as-is so that existing
# outputs and any later cells that reference these columns keep working.
titles = movies[['movieId', 'title']]
ranks = (
    ranks
    .merge(titles, left_on='query', right_on='movieId')
    .rename(columns={'title': 'query_tittle'})
    .drop(columns=['movieId'])
    .merge(titles, left_on='candidate', right_on='movieId')
    .rename(columns={'title': 'candidate_tittle'})
    .drop(columns=['movieId'])
    .sort_values(by=['query', 'distance'])
)
ranks.head()

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
539460,1,2294,0,Toy Story (1995),Antz (1998)
665334,1,3114,0,Toy Story (1995),Toy Story 2 (1999)
792207,1,4886,0,Toy Story (1995),"Monsters, Inc. (2001)"
187812,1,673,1,Toy Story (1995),Space Jam (1996)
549450,1,2355,1,Toy Story (1995),"Bug's Life, A (1998)"


In [10]:
ranks.distance.describe()

count    999000.000000
mean          4.146761
std           1.807754
min           0.000000
25%           3.000000
50%           4.000000
75%           5.000000
max          13.000000
Name: distance, dtype: float64

### item-item similarity based rec sys

In [11]:
ranks.loc[ranks['query']==1].head()

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
539460,1,2294,0,Toy Story (1995),Antz (1998)
665334,1,3114,0,Toy Story (1995),Toy Story 2 (1999)
792207,1,4886,0,Toy Story (1995),"Monsters, Inc. (2001)"
187812,1,673,1,Toy Story (1995),Space Jam (1996)
549450,1,2355,1,Toy Story (1995),"Bug's Life, A (1998)"


**ratings data**

In [12]:
# Work on a copy so that the column added to `r` in the next cell does not
# modify `ratings`.
r = ratings.copy()

In [13]:
# Hour of day (0-23) at which each rating was submitted.
# NOTE(review): datetime.fromtimestamp() without a tz argument converts to the
# local timezone of the machine running the notebook, so these hours (and every
# feature derived from them) change with the host's timezone setting.
r['hour'] = r['timestamp'].map(lambda ts: datetime.fromtimestamp(ts).hour)
r.head()

Unnamed: 0,userId,movieId,rating,timestamp,hour
0,1,16,4.0,1217897793,6
1,1,24,1.5,1217895807,5
2,1,32,4.0,1217896246,6
3,1,47,4.0,1217896556,6
4,1,50,4.0,1217896523,6


**users data**

In [14]:
# Load the per-user attributes from a CSV file in the working directory.
users = pd.read_csv('users.csv')

In [15]:
users.head()

Unnamed: 0,userId,age,time_spent_per_day
0,1,16,3.976315
1,2,24,1.891303
2,3,20,4.521478
3,4,23,2.095284
4,5,35,1.75986


In [16]:
# Per-user mean rating and mean rating hour, computed in one grouped pass.
user_means = r.groupby('userId')[['rating', 'hour']].mean().reset_index()

# Remove 'rating'/'hour' columns left behind by an earlier run of this cell
# before merging; otherwise a re-run produces rating_x/rating_y style
# duplicates and the later column handling breaks.
users = (
    users
    .drop(columns=['rating', 'hour'], errors='ignore')
    .merge(user_means, on='userId')
)

In [17]:
users.head()

Unnamed: 0,userId,age,time_spent_per_day,rating,hour
0,1,16,3.976315,3.691589,5.616822
1,2,24,1.891303,3.923077,21.0
2,3,20,4.521478,3.806452,14.370968
3,4,23,2.095284,4.15942,8.0
4,5,35,1.75986,2.864865,0.513514


In [18]:
# User feature table indexed by userId.
# Columns are selected and renamed by name rather than by assigning a list to
# `.columns`, so a change in column order upstream cannot silently mislabel a
# feature; a missing column raises a KeyError instead.
feature_columns = ['age', 'time_spent_per_day', 'rating', 'hour']
u = (
    users
    .set_index('userId')[feature_columns]
    .rename(columns={'rating': 'u_avg_rating'})
)

In [19]:
u.columns

Index(['age', 'time_spent_per_day', 'u_avg_rating', 'hour'], dtype='object')

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardise every user feature so that no single column dominates the
# distance computation purely because of its units.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(u)
u = pd.DataFrame(scaled_values, index=u.index, columns=u.columns)

In [21]:
u.head()

Unnamed: 0_level_0,age,time_spent_per_day,u_avg_rating,hour
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-1.470292,0.341073,-0.073572,-0.882006
2,-0.135616,-1.079947,0.426461,1.477906
3,-0.802954,0.712624,0.174541,0.460955
4,-0.30245,-0.940926,0.936982,-0.516406
5,1.699565,-1.169532,-1.859363,-1.664898


In [22]:
def euclidean_dist(x, y):
    """Return the Euclidean (L2) norm of the difference ``x - y``.

    ``x`` and ``y`` must support subtraction with each other, e.g. two NumPy
    arrays or two pandas Series of equal length.
    """
    difference = x - y
    return np.linalg.norm(difference)

### Make recommendation for any user

In [23]:
# Target user whose nearest neighbours are ranked below; it is looked up with
# u.loc[userid], so it must be a userId present in u's index.
userid = 5

In [26]:
# Distance from the target user to every user in `u` (the target included for
# now). The target's feature vector is looked up once instead of on every
# iteration, and rows are walked as NumPy arrays rather than via u.loc.
target_vector = u.loc[userid].to_numpy()
dist = [euclidean_dist(target_vector, user_vector) for user_vector in u.to_numpy()]

# Rank the other users from nearest to farthest, excluding the target itself.
u_rank = pd.DataFrame({'id': u.index, 'dist': dist})
u_rank = u_rank[u_rank['id'] != userid].sort_values(by='dist')
u_rank.head()

Unnamed: 0,id,dist
213,214,1.400996
124,125,1.559669
301,302,1.641682
409,410,1.657114
25,26,1.676895


### user-user sim based rec-sys

In [27]:
# Top-rated movies of the nearest neighbour. The neighbour's id is taken from
# the first row of `u_rank` (sorted by distance) instead of being hard-coded,
# so this cell stays correct when `userid` changes.
nearest_user = u_rank['id'].iloc[0]
ratings.loc[ratings.userId == nearest_user].sort_values(by='rating', ascending=False).head(10)

Unnamed: 0,userId,movieId,rating,timestamp
29659,214,1242,5.0,1059599552
29668,214,2804,5.0,1059599624
29660,214,1302,4.0,1059599706
29666,214,2617,4.0,1059599605
29656,214,543,4.0,1059599577
29665,214,2423,4.0,1059599960
29663,214,1777,4.0,1059599948
29667,214,2770,3.5,1059599695
29651,214,44,3.5,1059599710
29661,214,1372,3.0,1059599647


Other ideas:
- take top M movies from closest K neighbours
- multiply each rating by 1/dist and re-sort.