In [2]:
import joblib
import pandas as pd

# Setup

In [21]:
#The next two cells are hard-coded inputs for testing purposes.

#Per-genre rating figures for the test user; entries equal to 0 are left out of the mean below
inputs1 = {
    'Action':4.125,
    'Adventure':3.727,
    'Animation':4,
    'Children':3.83,
    'Comedy':3.869,
    'Crime':4.0625,
    'Documentary':2,
    'Drama':3.867,
    'Fantasy':3.6,
    'Film-Noir':3.5,
    'Horror':3.5,
    'Musical':3.7,
    'Mystery':3.5,
    'Romance':4.16,
    'Sci-Fi':3.7,
    'Thriller':4.3,
    'War':3.9,
    'Western':2.5,
    '(no genres listed)':0,
}

#The user's overall mean only takes the non-zero entries into account
rated_values = [rating for rating in inputs1.values() if rating != 0]
user_mean = sum(rated_values)/len(rated_values)

#Re-express every entry as its relative deviation from that mean
#(an entry of 0 therefore becomes -1)
inputs1 = {genre: (rating-user_mean)/user_mean for genre, rating in inputs1.items()}

In [22]:
#Second hard-coded test input, passed to the counts model further down
#NOTE(review): presumably per-genre rating-count shares for the same user - confirm
inputs2 = {
    'Action':0.313,
    'Adventure':0.255,
    'Animation':0.068,
    'Children':0.896,
    'Comedy':0.347,
    'Crime':0.179,
    'Documentary':0.008,
    'Drama':0.442,
    'Fantasy':0.115,
    'Film-Noir':0.008,
    'Horror':0.062,
    'Musical':0.039,
    'Mystery':0.079,
    'Romance':0.188,
    'Sci-Fi':0.179,
    'Thriller':0.280,
    'War':0.056,
    'Western':0.019,
    '(no genres listed)':0.000,
}


In [23]:
#Title to score; it is compared for exact equality with the "title" column of movies.csv
movie_input = "Pulp Fiction (1994)"

# Model Application

In [24]:
#Load the two saved models from disk
#NOTE(review): joblib.load unpickles the file, which can execute arbitrary code - only load trusted model files
model1 = joblib.load("Models/weights_model.pkl")
model2 = joblib.load("Models/counts_model.pkl")

#Each model is fed its input dict as a one-row DataFrame (one column per dict key)
features1 = pd.DataFrame(inputs1, index = [0])
features2 = pd.DataFrame(inputs2, index = [0])

prediction1 = model1.predict(features1)
prediction2 = model2.predict(features2)

In [25]:
#Build the overall cluster id by writing the two predicted labels side by side
#and reading the result back as one integer
first_label = str(prediction1[0])
second_label = str(prediction2[0])
cluster = int(first_label+second_label)

In [26]:
#Read in the csvs
movie_df = pd.read_csv("Resources/movies.csv")
genre_df = pd.read_csv("Resources/movies_modified.csv")
df = pd.read_csv("Resources/average_ratings.csv")
counts_df = pd.read_csv("Resources/ratings_counts.csv")

#Retrieve the movieId (first column of the first row whose title matches exactly)
#An unknown title used to surface as a bare IndexError from .values[0]; report it clearly instead
title_matches = movie_df.loc[movie_df["title"]==movie_input]
if title_matches.empty:
    raise ValueError("Title not found in Resources/movies.csv: " + repr(movie_input))
movie_id = title_matches.values[0][0]

#Retrieve the genres (every column after the first two of the matching row)
genre_matches = genre_df.loc[genre_df["movieId"]==movie_id]
if genre_matches.empty:
    raise ValueError("movieId not found in Resources/movies_modified.csv: " + repr(movie_id))
genre_list = genre_matches.values[0][2:]

In [27]:
#Keep only the rows of the predicted cluster, then average every column
#(grouping by "clusters" moves that column into the index)
cluster_ratings = df[df["clusters"]==cluster]
cluster_counts = counts_df[counts_df["clusters"]==cluster]
means = cluster_ratings.groupby("clusters").mean()
pre_weights = cluster_counts.groupby("clusters").mean()

#Reduce each result to a plain array: take its single row and skip the first remaining column
#NOTE(review): presumably that skipped column is an id/index column - confirm against the csv layout
means = means.to_numpy()[0, 1:]
pre_weights = pre_weights.to_numpy()[0, 1:]


In [28]:
#Multiply each cluster mean by the movie's genre entry at the same position
weights_list = [means[pos]*genre_list[pos] for pos in range(len(genre_list))]

#Average over the movie's genre entries, then scale by user_mean and add it back
#(the inverse of the (value-mean)/mean scaling applied to inputs1)
genre_total = sum(genre_list)
output_average = sum(weights_list)/genre_total*user_mean+user_mean

#This is our output guess at the odds that the user would like this movie taking into account how this specific user tends to rate
#(the estimate is expressed as a percentage of the maximum rating of 5)
output_percent = round(output_average/5*100)
print(movie_input + " is a " + str(output_percent) + "% match for you.")

Pulp Fiction (1994) is a 74% match for you.


# Movie Recommendation Attempt 1 (Less Accurate)

In [29]:
#Alternative movie recommendation code: Based on best-rated categories of each cluster
genre_columns = ['Action', 'Adventure',
       'Animation', 'Children','Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western', '(no genres listed)']

#Read the genre columns into a separate frame instead of overwriting them inside genre_df.
#The previous version scaled genre_df's columns in place, so re-running this cell compounded
#the weights and any later genre lookup on genre_df saw scaled values.
#NOTE(review): assumes the genre columns are exactly the columns after the first two of genre_df
genre_flags = genre_df[genre_columns]

#Row-wise total of the unscaled genre entries (used as the divisor below)
counts = genre_flags.sum(axis=1)

#One factor per genre: the cluster's means entry times its pre_weights entry at the same position
genre_scale = means[:len(genre_columns)]*pre_weights[:len(genre_columns)]

#Row-wise total of the scaled genre entries
sums = genre_flags.mul(genre_scale, axis="columns").sum(axis=1)

#Scaled total divided by unscaled total; rows whose unscaled total is 0 get a score of 0
averages = (sums/counts).where(counts != 0, 0)

genre_df["predicted_average"] = averages

In [30]:
#Rank every movie by its weighted estimate and keep the ids of the ten highest
ranked_movies = genre_df.sort_values(by = "predicted_average", ascending=False)
top_recommendations = ranked_movies["movieId"].iloc[:10].to_numpy()
top_recommendations

array([  8014, 164873,  91065,  26714,  26719, 124394, 165359,  26728,
       124416, 124420], dtype=int64)

In [31]:
print("We would also recommend that you check out:")
#Walk over the ids themselves (at most ten) rather than a fixed range(0,10),
#so a recommendation list shorter than ten no longer raises IndexError
for recommended_id in top_recommendations[:10]:
    print(movie_df.loc[movie_df["movieId"]==recommended_id, "title"].values[0])

We would also recommend that you check out:
Spring, Summer, Fall, Winter... and Spring (Bom yeoreum gaeul gyeoul geurigo bom) (2003)
Little Crumb (1999)
Bluebeard (Landru) (1963)
Across the Tracks (1991)
Cadence (1990)
If Winter Comes (1947)
Terror - Ihr Urteil (2016)
Guilty by Suspicion (1991)
I, a Man (1967)
Women in Revolt (1971)


# Movie Recommendation Attempt 2 (Improved)

In [14]:
#Secondary method for calculating movie recommendations: best movies by ratings of others in the cluster
ratings_path = "Resources/ratings_updated.csv"
ratings = pd.read_csv(ratings_path)

In [15]:
#Keep only the ratings whose "cluster" value equals the predicted cluster,
#then average every column per movieId
in_cluster = ratings["cluster"]==cluster
user_cluster = ratings[in_cluster]
movie_means = user_cluster.groupby("movieId").mean()

In [19]:
#Order the movies by their mean rating within the cluster and keep the ten highest movieIds
ranked_means = movie_means.sort_values(by = "rating", ascending=False)
top_recommendations = ranked_means.index[:10]
top_recommendations

Index([5489, 100714, 2356, 2357, 2362, 96832, 956, 2889, 99437, 99917], dtype='int64', name='movieId')

In [20]:
print("We would also recommend that you check out:")
#Walk over the ids themselves (at most ten) rather than a fixed range(0,10),
#so a cluster with fewer than ten rated movies no longer raises IndexError
for recommended_id in top_recommendations[:10]:
    print(movie_df.loc[movie_df["movieId"]==recommended_id, "title"].values[0])

We would also recommend that you check out:
Nosferatu the Vampyre (Nosferatu: Phantom der Nacht) (1979)
Before Midnight (2013)
Celebrity (1998)
Central Station (Central do Brasil) (1998)
Glen or Glenda (1953)
Holy Motors (2012)
Penny Serenade (1941)
Mystery, Alaska (1999)
John Dies at the End (2012)
Upstream Color (2013)
