# ML: Recommender Systems-1

### Content(Text) Based Similarity

In [4]:
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [5]:
movies = pd.read_csv('movies.csv') # Imdb 
ratings = pd.read_csv('ratings.csv')

In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


For Toy Story, if i pick a similar movie, it should be very close to the genre of this movie.

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


- rating: the rating the user with the user id in the "userId" column has given to the movie in the movieId.
- timestamp: at what timestamp did the user give the rating.
- if someone hasn't watched an old movie like shawshank redemption up until now and now he did and rated it, it tells something about that person. Maybe he's started watching these kind of old movies or he's following some list provided by someone.

In [8]:
movies.shape # imdb 

(10329, 3)

The data is too big for RAM to handle, so reducing it.

In [15]:
# ratings.movieId.value_counts()

296     325
356     311
318     308
480     294
593     290
       ... 
3360     26
2186     26
3735     26
65       26
4641     26
Name: movieId, Length: 1000, dtype: int64

In [10]:
# ratings.movieId.value_counts().head(1000).index

Int64Index([  296,   356,   318,   480,   593,   260,  2571,   589,   110,
              527,
            ...
              909,   933,  2991,  4641,  2410, 55276,   546,  5872,  2528,
             7502],
           dtype='int64', length=1000)

In [9]:
select_movies = ratings.movieId.value_counts().head(1000).index.to_list() # movies with more ratings
# filtering so we only have data of those 1000 movies (using index)
movies = movies.loc[movies.movieId.isin(select_movies)]       
ratings = ratings.loc[ratings.movieId.isin(select_movies)]

In [13]:
movies.shape

(1000, 3)

In [14]:
ratings.shape

(63250, 4)

In [17]:
# movies['genres'].str.split('|')

0        [Adventure, Animation, Children, Comedy, Fantasy]
1                           [Adventure, Children, Fantasy]
2                                        [Comedy, Romance]
4                                                 [Comedy]
5                                [Action, Crime, Thriller]
                               ...                        
9908                                       [Comedy, Drama]
9914                                        [Sci-Fi, IMAX]
9975                                [Action, Sci-Fi, IMAX]
10005                          [Action, Adventure, Sci-Fi]
10089                               [Drama, Thriller, War]
Name: genres, Length: 1000, dtype: object

In [18]:
# movies['genres'].str.split('|').explode('genres')

0       Adventure
1       Animation
2        Children
3          Comedy
4         Fantasy
          ...    
2784    Adventure
2785       Sci-Fi
2786        Drama
2787     Thriller
2788          War
Name: genres, Length: 2789, dtype: object

In [20]:
# movies['genres'].str.split('|').explode('genres')

0       Adventure
1       Animation
2        Children
3          Comedy
4         Fantasy
          ...    
2784    Adventure
2785       Sci-Fi
2786        Drama
2787     Thriller
2788          War
Name: genres, Length: 2789, dtype: object

In [35]:
# m = movies.copy()
# m['genres'] = m['genres'].str.split('|')
# # exploed: transform the list of genres into separate rows, 
# # duplicating the movie information for each genre associated with it.
# m = m.explode('genres')
# m.head(8)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy
1,2,Jumanji (1995),Adventure
1,2,Jumanji (1995),Children
1,2,Jumanji (1995),Fantasy


In [31]:
# m = movies.copy()
# m['genres'] = m['genres'].str.split('|')
# # exploed: transform the list of genres into separate rows, 
# # duplicating the movie information for each genre associated with it.
# m = m.explode('genres')
# m = m.pivot(index='movieId', columns='genres', values='title')   # title is showing as binary values.
# m.head(2)

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,,Toy Story (1995),Toy Story (1995),Toy Story (1995),Toy Story (1995),,,,Toy Story (1995),,,,,,,,,,
2,,Jumanji (1995),,Jumanji (1995),,,,,Jumanji (1995),,,,,,,,,,


In [32]:
# m = movies.copy()
# m['genres'] = m['genres'].str.split('|')
# # exploed: transform the list of genres into separate rows, 
# # duplicating the movie information for each genre associated with it.
# m = m.explode('genres')
# m = m.pivot(index='movieId', columns='genres', values='title')   # title is showing as binary values.
# m = ~m.isna()
# m.head(2)

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,False,True,True,True,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False
2,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False


In [30]:
# m = movies.copy()
# m['genres'] = m['genres'].str.split('|')
# # exploed: transform the list of genres into separate rows, 
# # duplicating the movie information for each genre associated with it.
# m = m.explode('genres')
# m = m.pivot(index='movieId', columns='genres', values='title')   # title is showing as values.
# m = ~m.isna() # this will make title True/False
# m = m.astype(int) # this will make title binary
# m.head()

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [10]:
m = movies.copy()
m['genres'] = m['genres'].str.split('|')
# exploed: transform the list of genres into separate rows, 
# duplicating the movie information for each genre associated with it.
m = m.explode('genres')
m = m.pivot(index='movieId', columns='genres', values='title')   # title is showing as values.
m = ~m.isna() # this will make title True/False
m = m.astype(int) # this will make title binary

In [11]:
m.head()

genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0


In [12]:
def hamming_distance(x, y):
    return sum(abs(x-y))

In [13]:
ranks = []

for query in m.index:    # index is the movieId.
    for candidate in m.index:    # so it's like m x m operation
        if candidate == query:     # if the same movie id in both
            continue
        ranks.append([query, candidate, hamming_distance(m.loc[query], m.loc[candidate])])
        
ranks = pd.DataFrame(ranks, columns=['query', 'candidate', 'distance'])
ranks = ranks.merge(movies[['movieId', 'title']], left_on='query', right_on='movieId').rename(columns={'title': 'query_tittle'}).drop(columns=['movieId'])
ranks = ranks.merge(movies[['movieId', 'title']], left_on='candidate', right_on='movieId').rename(columns={'title': 'candidate_tittle'}).drop(columns=['movieId'])
ranks = ranks.sort_values(by=['query', 'distance'])   # distance in ascending order
ranks.head()

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
539460,1,2294,0,Toy Story (1995),Antz (1998)
665334,1,3114,0,Toy Story (1995),Toy Story 2 (1999)
792207,1,4886,0,Toy Story (1995),"Monsters, Inc. (2001)"
187812,1,673,1,Toy Story (1995),Space Jam (1996)
549450,1,2355,1,Toy Story (1995),"Bug's Life, A (1998)"


In [14]:
ranks.loc[ranks['query']==1].head()           # querying movie with id 1

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
539460,1,2294,0,Toy Story (1995),Antz (1998)
666333,1,3114,0,Toy Story (1995),Toy Story 2 (1999)
791208,1,4886,0,Toy Story (1995),"Monsters, Inc. (2001)"
184815,1,673,1,Toy Story (1995),Space Jam (1996)
549450,1,2355,1,Toy Story (1995),"Bug's Life, A (1998)"


For Toy Story, Bug's Life is in the top 5, but is Toy Story going to be in the top 5 for Bug's Life too? Let's find out:

In [17]:
ranks.loc[ranks['query']==2355].head(10)           # not really. It's in the top 10 though. That too is Toy Story 2

Unnamed: 0,query,candidate,distance,query_tittle,candidate_tittle
804746,2355,5218,0,"Bug's Life, A (1998)",Ice Age (2002)
844706,2355,6377,0,"Bug's Life, A (1998)",Finding Nemo (2003)
167383,2355,588,1,"Bug's Life, A (1998)",Aladdin (1992)
188362,2355,720,1,"Bug's Life, A (1998)",Wallace & Gromit: The Best of Aardman Animatio...
194356,2355,745,1,"Bug's Life, A (1998)",Wallace & Gromit: A Close Shave (1995)
308242,2355,1223,1,"Bug's Life, A (1998)","Grand Day Out with Wallace and Gromit, A (1989)"
371179,2355,1367,1,"Bug's Life, A (1998)",101 Dalmatians (1996)
504046,2355,2085,1,"Bug's Life, A (1998)",101 Dalmatians (One Hundred and One Dalmatians...
540010,2355,2294,1,"Bug's Life, A (1998)",Antz (1998)
666884,2355,3114,1,"Bug's Life, A (1998)",Toy Story 2 (1999)


This was content(Text) based similarity. Text here is Genres.

### User Based Similarity: based on demographics (This is also Content Based)

In [24]:
r = ratings.copy()

In [25]:
r.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [26]:
# converts the timestamp to a datetime object using datetime.fromtimestamp(x) and then extracts the hour component using .hour
r['hour'] = r['timestamp'].apply(lambda x: datetime.fromtimestamp(x).hour)
r.head()

Unnamed: 0,userId,movieId,rating,timestamp,hour
0,1,16,4.0,1217897793,6
1,1,24,1.5,1217895807,5
2,1,32,4.0,1217896246,6
3,1,47,4.0,1217896556,6
4,1,50,4.0,1217896523,6


In [27]:
users = pd.read_csv('users.csv')

In [28]:
users.head()

Unnamed: 0,userId,age,time_spent_per_day
0,1,16,3.976315
1,2,24,1.891303
2,3,20,4.521478
3,4,23,2.095284
4,5,35,1.75986


Our hypoythesis here: 
- if we add more columns to this(more information), can we find out similar users based on that information.
- Based on the time(from the "r" df) at which they have rated the movie, can we find out what time zone they live in? 
    - For eg. 6 PM UTC might be evening for someone in Australia and morning for someone else in Europe.

In [30]:
# r.groupby('userId').rating.mean()

userId
1      3.691589
2      3.923077
3      3.806452
4      4.159420
5      2.864865
         ...   
664    3.964286
665    3.553763
666    3.642857
667    3.807692
668    3.044785
Name: rating, Length: 668, dtype: float64

In [22]:
# r.groupby('userId').rating.mean().reset_index()

Unnamed: 0,userId,rating
0,1,3.691589
1,2,3.923077
2,3,3.806452
3,4,4.159420
4,5,2.864865
...,...,...
663,664,3.964286
664,665,3.553763
665,666,3.642857
666,667,3.807692


In [31]:
users = users.merge(r.groupby('userId').rating.mean().reset_index(), on='userId')
users = users.merge(r.groupby('userId').hour.mean().reset_index(), on='userId')

In [32]:
users.head()

Unnamed: 0,userId,age,time_spent_per_day,rating,hour
0,1,16,3.976315,3.691589,5.616822
1,2,24,1.891303,3.923077,21.0
2,3,20,4.521478,3.806452,14.370968
3,4,23,2.095284,4.15942,8.0
4,5,35,1.75986,2.864865,0.513514


In [33]:
u = users.copy()
u = u.set_index('userId')      # so it doesn't get calculated while scaling the data. 
u.columns = ['age', 'time_spent_per_day', 'u_avg_rating', 'hour']

In [34]:
from sklearn.preprocessing import StandardScaler
# To compute euclidean distance, we need to scale the data.
scaler = StandardScaler()
u = pd.DataFrame(scaler.fit_transform(u), columns=u.columns, index=u.index)

In [35]:
u.head()

Unnamed: 0_level_0,age,time_spent_per_day,u_avg_rating,hour
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-1.470292,0.341073,-0.073572,-0.882006
2,-0.135616,-1.079947,0.426461,1.477906
3,-0.802954,0.712624,0.174541,0.460955
4,-0.30245,-0.940926,0.936982,-0.516406
5,1.699565,-1.169532,-1.859363,-1.664898


In [36]:
def euclidean_dist(x, y):
    return np.linalg.norm(x-y)

In [37]:
userid = 5     # since we have too much data, we're taking only user 5

In [40]:
# u.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            659, 660, 661, 662, 663, 664, 665, 666, 667, 668],
           dtype='int64', name='userId', length=668)

In [38]:
dist = []
for user in u.index:
    dist.append(euclidean_dist(u.loc[userid], u.loc[user]))   # find euclidean distance of one user with all others.

u_rank = pd.DataFrame()
u_rank['id'] = u.index
u_rank['dist'] = dist
u_rank = u_rank.loc[u_rank.id != userid]
u_rank = u_rank.sort_values(by='dist')
u_rank.head()

Unnamed: 0,id,dist
213,214,1.400996
124,125,1.559669
301,302,1.641682
409,410,1.657114
25,26,1.676895


User 5 and user 214 are the closest neighbors cuz they are having the least distance between them.

We have found the nearest neighbors. Now, how to use it for recommendation.

In [43]:
ratings.loc[ratings.userId==214].sort_values(by='rating', ascending=False).head(10)

Unnamed: 0,userId,movieId,rating,timestamp
29659,214,1242,5.0,1059599552
29668,214,2804,5.0,1059599624
29660,214,1302,4.0,1059599706
29666,214,2617,4.0,1059599605
29656,214,543,4.0,1059599577
29665,214,2423,4.0,1059599960
29663,214,1777,4.0,1059599948
29667,214,2770,3.5,1059599695
29651,214,44,3.5,1059599710
29661,214,1372,3.0,1059599647


We sort all the movies user 214 has watched by rating. We can give this movie id 214 as recommendation if user 5 hasn't watched it yet.

If you have any questions, get in touch with me [**here**](https://linktr.ee/khushalkumar31)