## Recommender System using kNN (Preprocessing)

### - Ishita Kapur



### Dataset

MovieLens https://grouplens.org/datasets/movielens/20m/

#### Import the required libraries

In [1]:
import numpy as np
import pandas as pd

#### Read the ratings data into a dataframe

In [2]:
ratings = pd.read_csv('ml-20m/ml-20m/ratings.csv', header=0, usecols=range(3))
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


#### Read the movies data into a dataframe

In [3]:
movies = pd.read_csv('ml-20m/ml-20m/movies.csv', header=0, usecols=range(3))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Filter the movies data for movies with more than 10000 ratings

In [4]:
countByMovie = ratings.groupby('movieId').size().reset_index(name='countOfUsers')
frequent_movies = list(set(countByMovie.query('countOfUsers >= 10000').index))
movies_filter = ratings.movieId.isin(frequent_movies).values
print(movies_filter)

[ True False False ... False False False]


#### Filter the users data for users that have rated more than 120 movies

In [5]:
countByUser = ratings.groupby('userId').size().reset_index(name='countOfMovies')
active_users = list(set(countByUser.query('countOfMovies >= 120').index))
users_filter = ratings.userId.isin(active_users).values
print(users_filter)

[False False False ... False False False]


#### Filter the data and store it into a dataframe

In [6]:
filtered = ratings[movies_filter & users_filter]
print(filtered)

          userId  movieId  rating
178            2      110     4.0
185            2      589     5.0
186            2      891     2.0
190            2     1196     5.0
192            2     1214     5.0
...          ...      ...     ...
19999811  138492      163     3.5
19999820  138492     1196     4.5
19999825  138492     1234     4.0
19999829  138492     1304     4.5
19999845  138492     2571     5.0

[665875 rows x 3 columns]


#### Write the dataframe into a csv file which can be read into the notebook for building the recommender

In [7]:
filtered.to_csv('Dataset/preprocessedMovieLens.csv',header=True,index=False)

#### Convert categorical variable (genre) into dummy/indicator variables and update the dataframe

In [8]:
interestingMovies = filtered.movieId.unique()
filteredMovies = movies.loc[movies['movieId'].isin(interestingMovies)]
filteredMovies = pd.concat([filteredMovies, filteredMovies['genres'].str.get_dummies(sep='|')], axis=1)
del filteredMovies['genres']
print(filteredMovies)

      movieId                                              title  Action  \
0           1                                   Toy Story (1995)       0   
1           2                                     Jumanji (1995)       0   
3           4                           Waiting to Exhale (1995)       0   
4           5                 Father of the Bride Part II (1995)       0   
5           6                                        Heat (1995)       1   
...       ...                                                ...     ...   
7458     7761              Soft Skin, The (La peau douce) (1964)       0   
7466     7769  Legend of the Village Warriors (Bangrajan) (2000)       1   
7588     7953    Rabid Dogs (Kidnapped) (Cani arrabbiati) (1974)       0   
7596     7982  Tale of Two Sisters, A (Janghwa, Hongryeon) (2...       0   
7690     8191                   Anne of the Thousand Days (1969)       0   

      Adventure  Animation  Children  Comedy  Crime  Documentary  Drama  ...  \
0      

#### Write the dataframe into a csv file which can be read into the notebook for building the recommender

In [9]:
filteredMovies.to_csv('Dataset/preprocessedMovies.csv',header=True,index=False)