In [2]:
#Import all dependencies
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import tensorflow as tf

import hvplot.pandas
from sklearn.cluster import KMeans
import joblib

In [3]:
#Read in our movie data
movies = pd.read_csv(r"ml-25m (2)\ml-25m\movies.csv")
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#The following code will create columns for each genre which specify whether each movie is in that genre or not
#Create a dictionary to store our genres
genres = {}


In [5]:
#Split the genre column from the movie data into lists and store in a dictionary
for index, row in movies.iterrows():
    genres[row["movieId"]] = row["genres"].split("|")

In [6]:
#Create our list of genres for our genre dictionary
genre_list = ["Action",
"Adventure",
"Animation",
"Children's",
"Comedy",
"Crime",
"Documentary",
"Drama",
"Fantasy",
"Film-Noir",
"Horror",
"Musical",
"Mystery",
"Romance",
"Sci-Fi",
"Thriller",
"War",
"Western",
"(no genres listed)"]

In [7]:
#Create a list that will store each genre column
my_list = []

In [8]:
#Fill all of the genre columns with 1 for "yes" or 0 for "no"
for genre in genre_list:
    my_list = []
    for index in movies.index:
        movie = movies["movieId"][index]
        if genre in genres[movie]:
            my_list.append(1)
        else:
            my_list.append(0)
    movies[genre] = my_list

In [9]:
#Clean up the movies DataFrame
movies = movies.drop(columns = ["genres", "title"])
movies.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
#The following csv is not used in the current iteration of our model, but could be used to increase its complexity
#Read in tag movie metadata
tag_names = pd.read_csv(r"ml-25m (2)/ml-25m/genome-tags.csv")
tag_list = tag_names["tagId"]
tag_scores = pd.read_csv(r"ml-25m (2)\ml-25m\genome-scores.csv")
tag_scores.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [11]:
#Add tag data to the movie data through merge, thereby assigning tag_scores to their movies
merged_df = pd.merge(movies,tag_scores, on = ["movieId"], how = "left")

In [12]:
#Invert rows and columns so that tags are columns, preserving only movieId, tagId, and relevance
pivot_df = merged_df.pivot(index = "movieId", columns = "tagId", values = "relevance")

In [13]:
#Put this inverted data into the movie DataFrame
movies_with_tags = pd.merge(movies, pivot_df, left_on = "movieId", right_index = True, how = "left")

In [14]:
#Drop fully null columns
movies_with_tags = movies_with_tags.dropna(axis=1,how = 'all')
movies_with_tags.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,...,1119.0,1120.0,1121.0,1122.0,1123.0,1124.0,1125.0,1126.0,1127.0,1128.0
0,1,0,1,1,0,1,0,0,0,1,...,0.0405,0.01425,0.0305,0.035,0.14125,0.05775,0.039,0.02975,0.08475,0.022
1,2,0,1,0,0,0,0,0,0,1,...,0.0525,0.01575,0.0125,0.02,0.12225,0.03275,0.021,0.011,0.10525,0.01975
2,3,0,0,0,0,1,0,0,0,0,...,0.06275,0.0195,0.02225,0.023,0.122,0.03475,0.017,0.018,0.091,0.01775
3,4,0,0,0,0,1,0,0,1,0,...,0.05325,0.028,0.01675,0.03875,0.182,0.0705,0.01625,0.01425,0.0885,0.015
4,5,0,0,0,0,1,0,0,0,0,...,0.0535,0.0205,0.01425,0.0255,0.19225,0.02675,0.01625,0.013,0.087,0.016


In [15]:
#Save the movies_with_tags DataFrame to csv for future use
movies_with_tags.to_csv(r"ml-25m (2)\ml-25m\movies_with_tags.csv")

In [16]:
#Read in customer rating data
ratings = pd.read_csv(r"ml-25m (2)\ml-25m\ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [17]:
#Join movie and ratings data so that each rating is paired with the movie metadata
ratings = ratings.join(movies.set_index("movieId"), on = "movieId", how = "left")
ratings = ratings.drop(columns=["Children's"])
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,1,296,5.0,1147880044,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
1,1,306,3.5,1147868817,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,307,5.0,1147868828,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,665,5.0,1147878820,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,899,3.5,1147868510,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


In [18]:
#Replace zeroes in the ratings DataFrame with null values so that they will not affect our aggregate calculations
genre_columns = ['Action', 'Adventure',
       'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western', '(no genres listed)']

for genre in genre_columns:
    ratings[genre] = ratings[genre]*ratings["rating"]
    ratings[genre] = ratings[genre].replace(0,float('nan'))

In [19]:
#Create a DataFrame that contains users' average ratings by genre
weighted_ratings = ratings.groupby("userId").mean().drop(columns=["movieId", "timestamp", "rating"])
weighted_ratings = weighted_ratings.fillna(0)
weighted_ratings.head()

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,4.125,3.727273,4.0,3.869565,4.0625,2.0,3.867925,3.6,3.5,3.5,3.7,3.5,4.166667,3.7,4.3,3.9,2.5,0.0
2,3.69697,3.906667,3.617647,3.31746,3.138889,0.0,3.571429,3.982759,0.0,4.0,3.272727,3.6875,3.161765,4.017857,3.833333,3.433333,2.5,0.0
3,3.640719,3.689394,3.98,3.454545,3.886364,3.166667,3.890086,3.737179,4.3,3.544444,3.583333,3.891667,3.533333,3.696429,3.692469,3.692308,3.75,3.5
4,3.186207,3.074561,3.467742,3.611111,3.972973,4.1,3.744898,2.833333,0.0,3.15,3.642857,3.805556,3.45,3.164706,3.544643,3.333333,2.833333,0.0
5,3.722222,3.857143,3.75,3.571429,4.142857,0.0,3.822222,3.5,0.0,4.666667,3.714286,4.714286,3.55,4.090909,4.0,4.5,3.0,0.0


In [20]:
#Create a DataFrame that specifies how often a user rates a genre
ratings_counts = ratings.groupby("userId").count().drop(columns=["movieId", "timestamp"])
ratings_counts.head()

Unnamed: 0_level_0,rating,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,70,4,11,2,23,8,1,53,5,1,1,5,4,18,5,5,5,1,0
2,184,66,75,17,63,18,0,91,29,0,3,11,8,34,28,30,15,5,0
3,656,334,198,50,176,132,3,232,78,5,45,6,60,60,224,239,26,8,1
4,242,145,114,31,81,37,5,49,39,0,10,7,18,10,85,56,9,9,0
5,101,18,21,4,49,14,0,45,8,0,3,7,7,20,11,24,2,5,0


In [32]:
#Turn these counts into weighted percentages like in StandardScaler so that they do not overweight the model
for genre in genre_columns:
    ratings_counts[genre] = 5*ratings_counts[genre]/ratings_counts["rating"]
ratings_counts.head()

Unnamed: 0_level_0,rating,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,70,0.020408,0.056122,0.010204,0.117347,0.040816,0.005102,0.270408,0.02551,0.005102,0.005102,0.02551,0.020408,0.091837,0.02551,0.02551,0.02551,0.005102,0.0
2,184,0.048736,0.055382,0.012553,0.046521,0.013292,0.0,0.067196,0.021414,0.0,0.002215,0.008123,0.005907,0.025106,0.020676,0.022153,0.011076,0.003692,0.0
3,656,0.019403,0.011503,0.002905,0.010225,0.007668,0.000174,0.013478,0.004531,0.00029,0.002614,0.000349,0.003486,0.003486,0.013013,0.013884,0.00151,0.000465,5.8e-05
4,242,0.061898,0.048665,0.013233,0.034578,0.015795,0.002134,0.020917,0.016648,0.0,0.004269,0.002988,0.007684,0.004269,0.036285,0.023905,0.003842,0.003842,0.0
5,101,0.044113,0.051466,0.009803,0.120086,0.03431,0.0,0.110283,0.019606,0.0,0.007352,0.017155,0.017155,0.049015,0.026958,0.058818,0.004901,0.012254,0.0


In [22]:
#Combine our two new DataFrames to form the inputs for our model
weighted_ratings= pd.merge(ratings_counts,weighted_ratings, on = "userId", how = "left")
weighted_ratings = weighted_ratings.drop(columns=["rating"])
weighted_ratings.head()

Unnamed: 0_level_0,Action_x,Adventure_x,Animation_x,Comedy_x,Crime_x,Documentary_x,Drama_x,Fantasy_x,Film-Noir_x,Horror_x,...,Film-Noir_y,Horror_y,Musical_y,Mystery_y,Romance_y,Sci-Fi_y,Thriller_y,War_y,Western_y,(no genres listed)_y
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.285714,0.785714,0.142857,1.642857,0.571429,0.071429,3.785714,0.357143,0.071429,0.071429,...,3.5,3.5,3.7,3.5,4.166667,3.7,4.3,3.9,2.5,0.0
2,1.793478,2.038043,0.461957,1.711957,0.48913,0.0,2.472826,0.788043,0.0,0.081522,...,0.0,4.0,3.272727,3.6875,3.161765,4.017857,3.833333,3.433333,2.5,0.0
3,2.545732,1.509146,0.381098,1.341463,1.006098,0.022866,1.768293,0.594512,0.03811,0.342988,...,4.3,3.544444,3.583333,3.891667,3.533333,3.696429,3.692469,3.692308,3.75,3.5
4,2.995868,2.355372,0.640496,1.673554,0.764463,0.103306,1.012397,0.805785,0.0,0.206612,...,0.0,3.15,3.642857,3.805556,3.45,3.164706,3.544643,3.333333,2.833333,0.0
5,0.891089,1.039604,0.19802,2.425743,0.693069,0.0,2.227723,0.39604,0.0,0.148515,...,0.0,4.666667,3.714286,4.714286,3.55,4.090909,4.0,4.5,3.0,0.0


In [23]:
#Run an elbow test to determine a minimum cluster number
inertia = []
k = list(range(1,50))

In [24]:
#Iterate through possible cluster numbers
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1, n_init='auto')
    k_model.fit(weighted_ratings)
    inertia.append(k_model.inertia_)

In [25]:
#Store the inertia and elbow data
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

In [26]:
#Plot the elbow curve
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

We can see here that the main crook of the "elbow" happens at around 10 or 11, but since our computing resources are capable of it we will use a higher n_cluster value of 20.

In [27]:
#Initiate the model
model = KMeans(n_clusters=20, random_state=1, n_init='auto')

#Fit the model to our data
model.fit(weighted_ratings)

#Make cluster predictions for the main model, so that we can examine cluster results to determine if our model is accurate
clusters = model.predict(weighted_ratings)

In [28]:
#Copy our DataFrame and attach cluster results
weighted_ratings_with_cluster = weighted_ratings.copy()
weighted_ratings_with_cluster["cluster"] = clusters

In [29]:
#View a DataFrame summarizing cluster results to scan for trends
weighted_ratings_with_cluster.groupby("cluster").mean()

Unnamed: 0_level_0,Action_x,Adventure_x,Animation_x,Comedy_x,Crime_x,Documentary_x,Drama_x,Fantasy_x,Film-Noir_x,Horror_x,...,Film-Noir_y,Horror_y,Musical_y,Mystery_y,Romance_y,Sci-Fi_y,Thriller_y,War_y,Western_y,(no genres listed)_y
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.332477,1.068063,0.287927,1.720845,0.93748,0.095093,2.42485,0.517194,0.093498,0.317799,...,4.177636,3.687621,3.773704,4.005919,3.866869,3.751097,3.866829,4.055641,3.739196,0.003875
1,1.337403,1.027637,0.26356,1.814065,0.87822,0.092582,2.329693,0.509881,0.076808,0.365723,...,3.670535,2.910904,3.017741,3.402623,3.226741,3.039161,3.225607,3.419633,2.90702,0.047855
2,1.748773,1.093942,0.152213,1.19075,1.321043,0.02051,2.529834,0.388671,0.149855,0.354439,...,4.154238,3.419428,0.200107,3.985712,3.720192,3.658091,3.889767,3.873001,3.883745,0.11487
3,1.80791,1.422714,0.25713,1.620389,0.890469,0.01998,1.831447,0.576692,0.024624,0.502808,...,0.389818,2.50269,0.260395,3.502842,3.306658,3.614858,3.725314,0.098152,0.390267,0.074832
4,1.910802,1.144502,0.027276,1.417304,1.166348,0.019043,2.446973,0.301572,0.004592,0.307454,...,0.081169,3.028957,0.341082,3.376939,3.594356,3.427542,3.732705,3.812792,3.889047,0.048011
5,1.782016,1.492775,0.439728,1.78622,0.856675,0.001359,1.995563,0.634868,0.000622,0.291477,...,0.019822,3.901471,4.031302,4.033988,3.960726,3.853517,3.938495,4.146192,4.019119,0.105283
6,1.526133,1.519392,0.519632,2.131494,0.798863,0.006554,2.062951,0.587932,0.001815,0.188387,...,0.028026,2.28542,3.570507,0.060013,3.78146,3.377898,3.482947,3.756203,2.694643,0.014195
7,1.418736,1.204852,0.369149,1.93142,0.805466,0.101235,2.210349,0.576485,0.001368,0.265239,...,0.048604,3.240816,3.410384,3.653431,3.659112,3.488376,3.62767,3.81503,3.698973,0.063769
8,1.673068,0.993277,0.028384,1.291268,1.184195,0.027474,2.616071,0.340184,0.051476,0.361133,...,0.806718,2.842719,0.362078,3.638404,3.582841,3.513463,3.806578,4.057787,0.045059,0.064745
9,2.127293,1.750259,0.439483,1.304617,0.985827,0.006108,2.151643,0.720387,0.005445,0.265918,...,0.105405,2.699206,0.047159,3.736916,3.770077,3.870268,3.925921,4.123942,0.037078,0.113542


In [30]:
#Testing plot to check for visual trends between clusters
weighted_ratings_with_cluster.hvplot.scatter(
    x="Film-Noir_y", 
    y="Film-Noir_x", 
    by="cluster"
)

In [31]:
#Tester prediction based on fake data to check that the model runs as expected
my_prediction=model.predict(pd.DataFrame({'Action_x':5/19, 'Adventure_x':5/19, 'Animation_x':5/19, 'Comedy_x':5/19, 'Crime_x':5/19,
       'Documentary_x':5/19, 'Drama_x':5/19, 'Fantasy_x':5/19, 'Film-Noir_x':5/19, 'Horror_x':5/19,
       'Musical_x':5/19, 'Mystery_x':5/19, 'Romance_x':5/19, 'Sci-Fi_x':5/19, 'Thriller_x':5/19,
       'War_x':5/19, 'Western_x':5/19, '(no genres listed)_x':5/19,'Action_y':2, 'Adventure_y':5,
       'Animation_y':2, 'Comedy_y':3, 'Crime_y':4, 'Documentary_y':4, 'Drama_y':4,
       'Fantasy_y':1, 'Film-Noir_y':1, 'Horror_y':2, 'Musical_y':0, 'Mystery_y':5, 'Romance_y':4,
       'Sci-Fi_y':2, 'Thriller_y':1, 'War_y':5, 'Western_y':5, '(no genres listed)_y':1}, index=[0]))
my_prediction

array([7])

In [33]:
#Export the model for use in Flask
joblib.dump(model, 'kmeans_model.pkl')

['kmeans_model.pkl']