## Data Preparation

In [1]:
# Import libraries
import numpy as np
import pandas as pd

### Ratings File

In [2]:
# Read the Ratings File
path_to_rating = '../../ml-1m/ratings.dat'
ratings = pd.read_csv(path_to_rating, sep='::', engine='python', encoding='latin-1', 
                      names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Set max_userid to the maximum user_id in the ratings
max_userid = ratings['user_id'].drop_duplicates().max()
# Set max_movieid to the maximum movie_id in the ratings
max_movieid = ratings['movie_id'].drop_duplicates().max()

print(len(ratings), 'ratings loaded')

1000209 ratings loaded


In [3]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


### Users File

In [4]:
# Specify User's Age and Occupation Column
AGES = { 1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44", 45: "45-49", 50: "50-55", 56: "56+" }
OCCUPATIONS = { 0: "other or not specified", 1: "academic/educator", 2: "artist", 3: "clerical/admin",
                4: "college/grad student", 5: "customer service", 6: "doctor/health care",
                7: "executive/managerial", 8: "farmer", 9: "homemaker", 10: "K-12 student", 11: "lawyer",
                12: "programmer", 13: "retired", 14: "sales/marketing", 15: "scientist", 16: "self-employed",
                17: "technician/engineer", 18: "tradesman/craftsman", 19: "unemployed", 20: "writer" }

In [5]:
# Read the Users File
path_to_users = '../../ml-1m/users.dat'
users = pd.read_csv(path_to_users, sep='::', engine='python', encoding='latin-1',
                    names=['user_id', 'gender', 'age', 'occupation', 'zipcode'])

# Add description for age and occupation columns
users['age_desc'] = users['age'].apply(lambda x: AGES[x])
users['occ_desc'] = users['occupation'].apply(lambda x: OCCUPATIONS[x])

print(len(users), 'descriptions of', max_userid, 'users loaded.')

6040 descriptions of 6040 users loaded.


In [6]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_desc,occ_desc
0,1,F,1,10,48067,Under 18,K-12 student
1,2,M,56,16,70072,56+,self-employed
2,3,M,25,15,55117,25-34,scientist
3,4,M,45,7,2460,45-49,executive/managerial
4,5,M,25,20,55455,25-34,writer


### Movies File

In [7]:
# Read the Movies File
path_to_movies = '../../ml-1m/movies.dat'
movies = pd.read_csv(path_to_movies, sep='::', engine='python', encoding='latin-1',
                     names=['movie_id', 'title', 'genres'])
print(len(movies), 'descriptions of', max_movieid, 'movies loaded.')

3883 descriptions of 3952 movies loaded.


In [8]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Content-Based RecSys

In [9]:
# Break up the big genre string into a string array
movies['genres'] = movies['genres'].str.split('|')
# Convert genres to string value
movies['genres'] = movies['genres'].fillna("").astype('str')

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

print("Shape of TF-IDF Matrix: ", tfidf_matrix.shape)

Shape of TF-IDF Matrix:  (3883, 127)


In [13]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

print("Shape of Cosine Similarity Matrix: ", cosine_sim.shape)

Shape of Cosine Similarity Matrix:  (3883, 3883)


In [14]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

In [15]:
# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [16]:
genre_recommendations('American Beauty (1999)').head(10)

44                              To Die For (1995)
71                   Kicking and Screaming (1995)
74                               Big Bully (1996)
83             Last Summer in the Hamptons (1995)
104    Nobody Loves Me (Keiner liebt mich) (1994)
131                              Nueba Yol (1995)
164                   Doom Generation, The (1995)
203                        Unstrung Heroes (1995)
216                       Boys on the Side (1995)
229                    Eat Drink Man Woman (1994)
Name: title, dtype: object

In [17]:
genre_recommendations('Shawshank Redemption, The (1994)').head(10)

25                                       Othello (1995)
26                                  Now and Then (1995)
29    Shanghai Triad (Yao a yao yao dao waipo qiao) ...
30                               Dangerous Minds (1995)
35                              Dead Man Walking (1995)
39                      Cry, the Beloved Country (1995)
42                                   Restoration (1995)
52                                      Lamerica (1994)
54                                       Georgia (1995)
56                         Home for the Holidays (1995)
Name: title, dtype: object

In [18]:
genre_recommendations('Aladdin and the King of Thieves (1996)').head(10)

1050            Aladdin and the King of Thieves (1996)
2072                          American Tail, An (1986)
2073        American Tail: Fievel Goes West, An (1991)
2285                         Rugrats Movie, The (1998)
2286                              Bug's Life, A (1998)
3045                                Toy Story 2 (1999)
3542                             Saludos Amigos (1943)
3682                                Chicken Run (2000)
3685    Adventures of Rocky and Bullwinkle, The (2000)
236                              Goofy Movie, A (1995)
Name: title, dtype: object

In [20]:
import joblib

# Save cosine_sim object
joblib.dump(cosine_sim, "./cosine_sim.joblib", compress=True)

['./cosine_sim.joblib']