In [238]:
import joblib
import pandas as pd

In [239]:
inputs1 = {'Action':2, 'Adventure':5,
       'Animation':2, 'Comedy':3, 'Crime':4, 'Documentary':4, 'Drama':4,
       'Fantasy':1, 'Film-Noir':1, 'Horror':2, 'Musical':0, 'Mystery':5, 'Romance':4,
       'Sci-Fi':2, 'Thriller':1, 'War':5, 'Western':5, '(no genres listed)':1}

In [240]:
inputs2 = {'Action':10/19, 'Adventure':10/19, 'Animation':10/19, 'Comedy':5/19, 'Crime':5/19,
       'Documentary':5/19, 'Drama':5/19, 'Fantasy':5/19, 'Film-Noir':5/19, 'Horror':5/19,
       'Musical':5/19, 'Mystery':5/19, 'Romance':5/19, 'Sci-Fi':5/19, 'Thriller':5/19,
       'War':0/19, 'Western':0/19, '(no genres listed)':0/19}


In [241]:
#Load both models and have them make predictions
model1 = joblib.load("Models/weights_model.pkl")
model2 = joblib.load("Models/counts_model.pkl")

prediction1 = model1.predict(pd.DataFrame(inputs1, index = [0]))
prediction2 = model2.predict(pd.DataFrame(inputs2, index = [0]))

In [242]:
#This combines the predictions to make an overall cluster
cluster = int(str(prediction1[0])+str(prediction2[0]))

In [243]:
#Read in the csvs
movie_df = pd.read_csv("Resources/movies.csv")
genre_df = pd.read_csv("Resources/movies_modified.csv")
df = pd.read_csv("Resources/average_ratings.csv")

#Retrieve the movieId
movie_id = movie_df.loc[movie_df["title"]=="Jumanji (1995)"].values[0][0]

#Retrieve the genres
genre_list = genre_df.loc[genre_df["movieId"]==movie_id].values[0][2:]

In [244]:
#Collect average ratings by genre per cluster
means = df.loc[df["clusters"]==cluster].groupby("clusters").mean()

#Calculate the average rating for the input movie
means = means.values[0][2:]

In [245]:
#Find the average rating by the cluster of a movie with each of these genres
weights_list = []
for i in range(len(genre_list)):
    weights_list.append(means[i]*genre_list[i])
output_average = sum(weights_list)/sum(genre_list)

#This is our output guess at how they would rate the movie
output_average

3.5631408860213325

In [246]:
#Alternative movie recommendation code
genre_columns = ['Action', 'Adventure',
       'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western', '(no genres listed)']

sums = []
counts = []
averages =[]

for index, row in genre_df.iterrows():
    counts.append(sum(row.values[2:]))

for genre in genre_columns:
    genre_df[genre] = genre_df[genre]*means[genre_columns.index(genre)]

for index, row in genre_df.iterrows():
    sums.append(sum(row.values[2:]))

for i in range(len(sums)):
    if counts[i] != 0:
        averages.append(sums[i]/counts[i])
    else:
        averages.append(0)

genre_df["predicted_average"] = averages

In [247]:
means

array([3.47314399, 3.54647842, 3.62848177, 3.48067732, 3.60095174,
       3.33884037, 3.71257239, 3.57980335, 1.96920616, 3.307295  ,
       3.60999817, 3.61864258, 3.59928373, 3.50703428, 3.50908108,
       3.65033423, 3.35190154, 3.93155964])

In [248]:
genre_df.sort_values(by = ["predicted_average"], ascending=False).head(20)

Unnamed: 0.1,Unnamed: 0,movieId,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Fantasy,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),predicted_average
55294,55294,191641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156
39311,39311,156911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156
25056,25056,122888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156
39304,39304,156894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156
39298,39298,156873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156
25059,25059,122896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156
49885,49885,179823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156
58949,58949,199768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156
39288,39288,156848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156
58952,58952,199774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93156,3.93156


In [249]:
top_recommendations = genre_df.sort_values(by = "predicted_average", ascending=False)["movieId"][0:10].values
top_recommendations

array([191641, 156911, 122888, 156894, 156873, 122896, 179823, 199768,
       156848, 199774], dtype=int64)

In [250]:
print("We would also recommend that you check out:")
for i in range(0,10):
    print(movie_df.loc[movie_df["movieId"]==top_recommendations[i], "title"].values[0])

We would also recommend that you check out:
The Man from the Restaurant (1927)
Lúcio Flávio, o Passageiro da Agonia (1977)
Ben-hur (2016)
A Way of Life (2004)
The Spiral Staircase (2000)
Pirates of the Caribbean: Dead Men Tell No Tales (2017)
Matilda (2017)
The New Math(s) (2000)
Touch and Go (1991)
NYC 3/94 (1994)
