# **Install and import necessary packages**

In [72]:
import os
import json
import gdown
import pandas as pd
import numpy as np
import sys
!pip install scikit-surprise
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
!pip install git+https://github.com/microsoft/recommenders.git
from recommenders.evaluation.python_evaluation import (
    map, ndcg_at_k, precision_at_k, recall_at_k
)
import nltk
nltk.download('punkt')
nltk.download('wordnet')
import re
from sklearn.decomposition import TruncatedSVD
from surprise import Dataset, Reader
from sklearn.metrics import f1_score, precision_score, recall_score
from scipy.sparse import hstack
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

Collecting git+https://github.com/microsoft/recommenders.git
  Cloning https://github.com/microsoft/recommenders.git to /tmp/pip-req-build-5lg3066h
  Running command git clone --filter=blob:none --quiet https://github.com/microsoft/recommenders.git /tmp/pip-req-build-5lg3066h
  Resolved https://github.com/microsoft/recommenders.git to commit c2ea583d27bb1a4d58a09a1621d5ce95672ef1dc
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




# **Load dataset**

In [73]:
!gdown https://drive.google.com/uc?id=1vbteMX-kHyYAAS7JIrrog_aZvvOY77Q0
data = pd.read_csv('final_dataset.csv')
print(data.columns)
print(data.head())

Downloading...
From (original): https://drive.google.com/uc?id=1vbteMX-kHyYAAS7JIrrog_aZvvOY77Q0
From (redirected): https://drive.google.com/uc?id=1vbteMX-kHyYAAS7JIrrog_aZvvOY77Q0&confirm=t&uuid=a49a347f-22c7-4197-a8d2-8e5b771b8902
To: /content/final_dataset.csv
100% 456M/456M [00:05<00:00, 85.6MB/s]
Index(['Movie', 'Reviewer', 'Review', 'Score', 'Sentiment_scores', 'overview',
       'cast_list', 'crew_list', 'director_list', 'first_three_casts',
       'unique_genres', 'Date', 'recommend'],
      dtype='object')
       Movie          Reviewer  \
0  SENSATION   Dennis Schwartz   
1  SENSATION  Carey-Ann Pawsey   
2  SENSATION        Rob Rector   
3  SENSATION        Rich Cline   
4  SENSATION       Allen Adams   

                                              Review     Score  \
0  A sci-fi film that made no sense, as it tells ...  0.686869   
1  A thriller that ultimately makes little to no ...  0.393939   
2  There's still much to admire with Sensation, d...  0.595960   
3  There's

# **Prepare training and test sets**

In [74]:
#split data into train and test sets, make sure that only users and items present in train set appear in test set
cf_data = data.loc[:, ['Reviewer', 'Movie', 'Score', 'recommend']]
train, test = train_test_split(cf_data, test_size = 0.15, random_state = 42, shuffle = True)
test = test[test["Reviewer"].isin(train["Reviewer"].unique())]
test = test[test["Movie"].isin(train["Movie"].unique())]

# **Content-based filtering**



*   create a dataset of unique movies



In [75]:
#keep only unique movies
unique_movies = data.drop_duplicates(subset='Movie')
unique_movies = unique_movies.reset_index()




*   convert genres to lower case and combine into 1 string
*   convert director list to lower case, join first and last name, combine into 1 string
*   convert first 3 casts to lower case, join first and last name, combine into 1 string







In [76]:
#convert genres to lowercase and combine into 1 long string
unique_movies['unique_genres'] = unique_movies['unique_genres'].apply(lambda x: eval(x))
unique_movies['processed_genres'] = unique_movies['unique_genres'].apply(lambda x: ' '.join([word.lower() for word in x]))
print(unique_movies['processed_genres'])

#convert director list to lower case, join first and last name, combine into 1 long string
unique_movies['director_list'] = unique_movies['director_list'].apply(lambda x: eval(x))

def process_names(names_list):
  processed_names = [''.join(name.lower().split()) for name in names_list]
  return ' '.join(processed_names)

unique_movies['processed_directors'] = unique_movies['director_list'].apply(lambda x: process_names(x))
print(unique_movies['processed_directors'])

#convert first 3 casts to lower case, join first and last name, combine into 1 long string
unique_movies['first_three_casts'] = unique_movies['first_three_casts'].apply(lambda x: eval(x))
unique_movies['processed_cast'] = unique_movies['first_three_casts'].apply(lambda x: process_names(x))
print(unique_movies['processed_cast'])

0       drama romance comedy foreign
1                  drama history war
2                   thriller mystery
3                             comedy
4             drama thriller mystery
                    ...             
5211                          comedy
5212           drama thriller comedy
5213                        thriller
5214       drama history war foreign
5215                  romance comedy
Name: processed_genres, Length: 5216, dtype: object
0                           tomhall
1                     romanpolanski
2                  wolfgangpetersen
3                      chuckvincent
4                  barbarastepansky
                   ...             
5211    jasonfriedberg aaronseltzer
5212                      henrybean
5213                    chrisjaymes
5214                 enniodeconcini
5215                     carlreiner
Name: processed_directors, Length: 5216, dtype: object
0                luannegordon kellycampbell owenroe
1                jonfinch francescaann



*   Combine overview, genres, directors and first 3 casts into one string
*   Create a bert model



In [77]:
#create soup2 for bert
unique_movies['soup2'] = unique_movies['overview'] + ' ' + unique_movies['processed_genres'] + ' ' + unique_movies['processed_directors'] + ' ' + unique_movies['processed_cast']

#create bert model
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings = bert_model.encode(unique_movies["soup2"], show_progress_bar=True)

Batches:   0%|          | 0/163 [00:00<?, ?it/s]

In [6]:
print(unique_movies['Movie'].head(50))

0                  SENSATION
1     THE TRAGEDY OF MACBETH
2                  SHATTERED
3                 SEX APPEAL
4                       HURT
5                 CINDERELLA
6                   BETRAYED
7                      GREED
8               SILENT NIGHT
9              BOILING POINT
10              BLACK FRIDAY
11         THIS IS THE NIGHT
12                      BUGS
13                THE GRUDGE
14                 DANGEROUS
15                      DUNE
16           HELD FOR RANSOM
17                  JOY RIDE
18                 ISOLATION
19                WITCH HUNT
20                 IMPLANTED
21             ICE GUARDIANS
22                   STALKER
23                     HABIT
24              REMINISCENCE
25                THE COLONY
26         UNDER THE VOLCANO
27                  TOO LATE
28                   DEMONIC
29                       PIG
30                     TWIST
31                  THE EAST
32                      ROMA
33               THE PHANTOM
34            

**Create cosine scores for the matrix derived from the embeddings from the BERT model**

In [78]:
#calculate cosine similarity
cosine_scores_bert = cosine_similarity(embeddings)

# Flatten the cosine similarity matrix into a 1D array
cosine_scores_bert_flat = cosine_scores_bert.flatten()

**Determine the threshold for cosine similarity scores to decide whether to recommend or not.**

In [79]:
percentile_90 = np.percentile(cosine_scores_bert_flat, 90)
print(percentile_90)

0.7116745710372925


**Predict on train set**

In [80]:
#find all the liked movies in train set, for all similar movies > threshold, recommend
cbf_train = train[['Reviewer', 'Movie', 'recommend']]
train_liked_movies = cbf_train[cbf_train['recommend'] == 1].drop_duplicates(subset = ['Movie'])['Movie'].tolist()

all_similar_movies = []
for movie in train_liked_movies:
  curr_movie_index = unique_movies.index[unique_movies['Movie'] == movie][0]
  for index, element in enumerate(cosine_scores_bert[curr_movie_index]):
    if (element > percentile_90):
      sim_movie = unique_movies.iloc[index]['Movie']
      all_similar_movies.append(sim_movie)
all_similar_movies = list(set(all_similar_movies))

def assign_value(a):
  if a in all_similar_movies:
    return 1
  else:
    return 0

cbf_train['pred'] = cbf_train['Movie'].apply(lambda x: assign_value(x))

In [81]:
precision_cbf = precision_at_k(rating_pred = cbf_train[['Reviewer', 'Movie', 'pred']], rating_true = cbf_train[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'pred')
print(f'Precision: {precision_cbf}')

# Recall
recall_cbf = recall_at_k(rating_pred = cbf_train[['Reviewer', 'Movie', 'pred']], rating_true = cbf_train[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'pred')
print(f'Recall: {recall_cbf}')


#F1 score
f1_cbf = 2 * (precision_cbf * recall_cbf) / (precision_cbf + recall_cbf) if (precision_cbf + recall_cbf) > 0 else 0
print(f'F1 Score: {f1_cbf}')

Precision: 0.6851463521188292
Recall: 0.6384821968541985
F1 Score: 0.6609917085981266


**Predict on test set**

In [82]:
#find all the liked movies in test set, for all similar movies > threshold, recommend
cbf_test = test[['Reviewer', 'Movie', 'recommend']]
test_liked_movies = cbf_test[cbf_test['recommend'] == 1].drop_duplicates(subset = ['Movie'])['Movie'].tolist()

all_similar_movies = []
for movie in test_liked_movies:
  curr_movie_index = unique_movies.index[unique_movies['Movie'] == movie][0]
  for index, element in enumerate(cosine_scores_bert[curr_movie_index]):
    if (element > percentile_90):
      sim_movie = unique_movies.iloc[index]['Movie']
      all_similar_movies.append(sim_movie)
all_similar_movies = list(set(all_similar_movies))

def assign_value(a):
  if a in all_similar_movies:
    return 1
  else:
    return 0

cbf_test['pred'] = cbf_test['Movie'].apply(lambda x: assign_value(x))

In [83]:
#Precision
precision_cbf = precision_at_k(rating_pred = cbf_test[['Reviewer', 'Movie', 'pred']], rating_true = cbf_test[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'pred')
print(f'Precision: {precision_cbf}')

# Recall
recall_cbf = recall_at_k(rating_pred = cbf_test[['Reviewer', 'Movie', 'pred']], rating_true = cbf_test[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'pred')
print(f'Recall: {recall_cbf}')

#F1 score
f1_cbf = 2 * (precision_cbf * recall_cbf) / (precision_cbf + recall_cbf) if (precision_cbf + recall_cbf) > 0 else 0
print(f'F1 Score: {f1_cbf}')

Precision: 0.6667181467181468
Recall: 0.7160752207569212
F1 Score: 0.6905158143267566


**Recommending top 10 movies using content based filtering**

In [91]:
#content based filtering method
def recommend_movies(movie, cosine_scores, unique_movies, top_n=5):
    movie_index = unique_movies[unique_movies['Movie'] == movie].index[0]
    sim_scores = list(enumerate(cosine_scores[movie_index]))
    sim_scores.sort(key=lambda x: x[1], reverse=True)

    #get top n similar movies (excluding the input movie itself)
    top_movies = sim_scores[1:top_n + 1]

    #extract movie titles from indices
    recommended_movies = [(unique_movies.iloc[i]['Movie'], cosine_scores[movie_index][i]) for i, _ in top_movies]

    return recommended_movies

In [92]:
#test content based filtering
recommended_movies = recommend_movies('INTERSTELLAR', cosine_scores_bert, unique_movies)
print(pd.DataFrame(recommended_movies, columns = ['Movie', 'Cosine_Scores_Bert']))

                          Movie  Cosine_Scores_Bert
0                    PROMETHEUS            0.866442
1              STAR TREK BEYOND            0.856962
2               RATCHET & CLANK            0.841057
3       APPROACHING THE UNKNOWN            0.834513
4  ROGUE ONE: A STAR WARS STORY            0.828726


# **Collaborative filtering using SVD**

**Train SVD model and make predictions**

In [84]:
#prepare training set
rating_scale = (train['Score'].min(), train['Score'].max())
reader = Reader(rating_scale=rating_scale)
svd_train = Dataset.load_from_df(train[['Reviewer', 'Movie', 'Score']], reader).build_full_trainset()

In [85]:
#train svd
svd = SVD(random_state=42, n_factors=200, n_epochs=30, verbose=True)
svd.fit(svd_train)

testset = list(test[['Reviewer', 'Movie', 'Score']].apply(tuple, axis=1))
test_predictions = svd.test(testset)

trainset = list(train[['Reviewer', 'Movie', 'Score']].apply(tuple, axis = 1))
train_predictions = svd.test(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


**Predict on train set and test set**

In [88]:
#evaluation of SVD Model
def rating_grouping(rating):
    if (rating > 0.797980):
        return 1
    else:
        return 0

real_user_ratings_train = pd.DataFrame([(pred.uid, pred.iid, rating_grouping(pred.r_ui)) for pred in train_predictions], columns=['Reviewer', 'Movie', 'Actual_Rating'])
predicted_ratings_train = pd.DataFrame([(pred.uid, pred.iid, rating_grouping(pred.est)) for pred in train_predictions], columns=['Reviewer', 'Movie', 'Predicted_Rating'])
#extract true and predicted ratings from the predictions
real_user_ratings_test = pd.DataFrame([(pred.uid, pred.iid, rating_grouping(pred.r_ui)) for pred in test_predictions], columns=['Reviewer', 'Movie', 'Actual_Rating'])
predicted_ratings_test = pd.DataFrame([(pred.uid, pred.iid, rating_grouping(pred.est)) for pred in test_predictions], columns=['Reviewer', 'Movie', 'Predicted_Rating'])

# Precision
precision = precision_at_k(rating_pred = predicted_ratings_train, rating_true = real_user_ratings_train, k = 5, col_user='Reviewer', col_item='Movie', col_rating='Actual_Rating', col_prediction = 'Predicted_Rating')
print(f'Precision (train): {precision}')

# Recall
recall = recall_at_k(rating_pred = predicted_ratings_train, rating_true = real_user_ratings_train, k = 5, col_user='Reviewer', col_item='Movie', col_rating='Actual_Rating', col_prediction = 'Predicted_Rating')
print(f'Recall (train): {recall}')

#F1 score
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f'F1 Score (train): {f1}')

# Precision
precision = precision_at_k(rating_pred = predicted_ratings_test, rating_true = real_user_ratings_test, k = 5, col_user='Reviewer', col_item='Movie', col_rating='Actual_Rating', col_prediction = 'Predicted_Rating')
print(f'Precision (test): {precision}')

# Recall
recall = recall_at_k(rating_pred = predicted_ratings_test, rating_true = real_user_ratings_test, k = 5, col_user='Reviewer', col_item='Movie', col_rating='Actual_Rating', col_prediction = 'Predicted_Rating')
print(f'Recall (test): {recall}')

#F1 score
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f'F1 Score (test): {f1}')

Precision (train): 0.6855395369156838
Recall (train): 0.6384954590143511
F1 Score (train): 0.6611817401216108
Precision (test): 0.6664864864864865
Recall (test): 0.7160172031015732
F1 Score (test): 0.6903645806561904


# **Hybrid Recommendation System**

**Combine predictions**

In [89]:
new_training_set = np.column_stack([cbf_train['pred'], [pred.est for pred in train_predictions], cbf_train['recommend']])
new_test_set = np.column_stack([cbf_test['pred'], [pred.est for pred in test_predictions], cbf_test['recommend']])

**Train random forest classifier, svm classifier and mlp classifier**

In [90]:
rf_classifier = RandomForestClassifier(random_state = 42)
svm_classifier = LinearSVC(max_iter = 100, random_state=42)
mlp_classifier = MLPClassifier(hidden_layer_sizes=(128, 64), random_state = 42)
X_train = new_training_set[:, :-1]
y_train = new_training_set[:,-1]
X_test = new_test_set[:, :-1]
y_test = new_test_set[:, -1]
rf_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)
mlp_classifier.fit(X_train, y_train)

**Predict on train and test sets**

In [93]:
train_pred_rf = rf_classifier.predict(X_train)
test_pred_rf = rf_classifier.predict(X_test)

train_pred_rf = pd.DataFrame(train_pred_rf, columns=['Predicted_Label'])
test_pred_rf = pd.DataFrame(test_pred_rf, columns=['Predicted_Label'])

train_pred_rf = pd.concat([train, train_pred_rf], axis=1)
test_pred_rf = pd.concat([test, test_pred_rf], axis = 1)

train_pred_svm = svm_classifier.predict(X_train)
test_pred_svm = svm_classifier.predict(X_test)

train_pred_svm = pd.DataFrame(train_pred_svm, columns=['Predicted_Label'])
test_pred_svm = pd.DataFrame(test_pred_svm, columns=['Predicted_Label'])

train_pred_svm = pd.concat([train, train_pred_svm], axis=1)
test_pred_svm = pd.concat([test, test_pred_svm], axis = 1)

train_pred_mlp = mlp_classifier.predict(X_train)
test_pred_mlp = mlp_classifier.predict(X_test)

train_pred_mlp = pd.DataFrame(train_pred_mlp, columns=['Predicted_Label'])
test_pred_mlp = pd.DataFrame(test_pred_mlp, columns=['Predicted_Label'])

train_pred_mlp = pd.concat([train, train_pred_mlp], axis=1)
test_pred_mlp = pd.concat([test, test_pred_mlp], axis = 1)

In [94]:
# Precision
precision = precision_at_k(rating_pred = train_pred_rf[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = train_pred_rf[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Precision (train rf): {precision}')

# Recall
recall = recall_at_k(rating_pred = train_pred_rf[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = train_pred_rf[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Recall (train rf): {recall}')

#F1 score
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f'F1 Score (train rf): {f1}')

# Precision
precision = precision_at_k(rating_pred = test_pred_rf[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = test_pred_rf[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Precision (test rf): {precision}')

# Recall
recall = recall_at_k(rating_pred = test_pred_rf[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = test_pred_rf[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Recall (test rf): {recall}')

#F1 score
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f'F1 Score (test rf): {f1}')

print('------')

# Precision
precision = precision_at_k(rating_pred = train_pred_svm[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = train_pred_svm[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Precision (train svm): {precision}')

# Recall
recall = recall_at_k(rating_pred = train_pred_svm[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = train_pred_svm[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Recall (train svm): {recall}')

#F1 score
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f'F1 Score (train svm): {f1}')

# Precision
precision = precision_at_k(rating_pred = test_pred_svm[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = test_pred_svm[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Precision (test svm): {precision}')

# Recall
recall = recall_at_k(rating_pred = test_pred_svm[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = test_pred_svm[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Recall (test svm): {recall}')

#F1 score
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f'F1 Score (test svm): {f1}')

print('------')

# Precision
precision = precision_at_k(rating_pred = train_pred_mlp[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = train_pred_mlp[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Precision (train mlp): {precision}')

# Recall
recall = recall_at_k(rating_pred = train_pred_mlp[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = train_pred_mlp[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Recall (train mlp): {recall}')

#F1 score
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f'F1 Score (train mlp): {f1}')

# Precision
precision = precision_at_k(rating_pred = test_pred_mlp[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = test_pred_mlp[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Precision (test mlp): {precision}')

# Recall
recall = recall_at_k(rating_pred = test_pred_mlp[['Reviewer', 'Movie', 'Predicted_Label']], rating_true = test_pred_mlp[['Reviewer', 'Movie', 'recommend']], k = 5, col_user='Reviewer', col_item='Movie', col_rating='recommend', col_prediction = 'Predicted_Label')
print(f'Recall (test mlp): {recall}')

#F1 score
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f'F1 Score (test mlp): {f1}')

Precision (train rf): 0.6849093688578293
Recall (train rf): 0.6382272351906111
F1 Score (train rf): 0.6607447961227674
Precision (test rf): 0.6664608259359321
Recall (test rf): 0.7158293212127711
F1 Score (test rf): 0.6902634756222406
------
Precision (train svm): 0.6849093688578293
Recall (train svm): 0.6382788338176132
F1 Score (train svm): 0.6607724469450429
Precision (test svm): 0.6663836356619066
Recall (test svm): 0.7158266222521408
F1 Score (test svm): 0.6902208174317094
------
Precision (train mlp): 0.6846036252456869
Recall (train mlp): 0.6382508177843174
F1 Score (train mlp): 0.6606151205424153
Precision (test mlp): 0.6663836356619066
Recall (test mlp): 0.7158266222521408
F1 Score (test mlp): 0.6902208174317094


**Demonstrate top 5 movie recommendations using hybrid recommendation system**

In [95]:
def hybrid_recommend(user):
  all_movies_liked = data[(data['Reviewer'] == user) & (data['recommend'] == 1)]['Movie'].tolist()
  all_similar_movies = []
  for movie in all_movies_liked:
    curr_movie_index = unique_movies.index[unique_movies['Movie'] == movie][0]
    for index, element in enumerate(cosine_scores_bert[curr_movie_index]):
      if (element > percentile_90):
        sim_movie = unique_movies.iloc[index]['Movie']
        all_similar_movies.append(sim_movie)
  all_similar_movies = list(set(all_similar_movies))
  svd_predictions = [svd.predict(user,x).est for x in all_similar_movies]
  all_similar_movies = pd.DataFrame(all_similar_movies, columns = ['Movie'])
  all_similar_movies['recommend'] = 1
  final = []
  stacked = np.column_stack([all_similar_movies['recommend'], svd_predictions])
  stacked_df = pd.DataFrame(stacked, columns = ['cbf_predictions', 'svd_predictions'])
  stacked_df['Movie'] = all_similar_movies['Movie']
  stacked_df = stacked_df.sort_values(by = 'svd_predictions', ascending = False)
  reccs = rf_classifier.predict(np.column_stack([stacked_df['cbf_predictions'], stacked_df['svd_predictions']]))
  for id, recc in enumerate(reccs):
    if recc == 1:
      final.append(stacked_df.iloc[id]['Movie'])
  final = pd.DataFrame(final[:5], columns = ['Movie'])
  final['Overview'] = final['Movie'].apply(lambda x: unique_movies[unique_movies['Movie'] == x]['overview'].tolist()[0])
  final['Directors'] = final['Movie'].apply(lambda x: unique_movies[unique_movies['Movie'] == x]['director_list'].tolist()[0])
  final['Cast'] = final['Movie'].apply(lambda x: unique_movies[unique_movies['Movie'] == x]['first_three_casts'].tolist()[0])
  final['Genres'] = final['Movie'].apply(lambda x: unique_movies[unique_movies['Movie'] == x]['unique_genres'].tolist()[0])
  return final

In [96]:
hybrid_recommend('Dennis Schwartz')

Unnamed: 0,Movie,Overview,Directors,Cast,Genres
0,KES,"A young, English working-class boy spends his ...",[Ken Loach],"[David Bradley, Freddie Fletcher, Lynne Perrie]",[Drama]
1,MODERN TIMES,The Tramp struggles to live in modern industri...,[Charlie Chaplin],"[Charlie Chaplin, Paulette Goddard, Henry Berg...","[Drama, Comedy]"
2,NO COUNTRY FOR OLD MEN,"Llewelyn Moss stumbles upon dead bodies, $2 mi...","[Joel Coen, Ethan Coen]","[Tommy Lee Jones, Javier Bardem, Josh Brolin]","[Drama, Crime, Thriller]"
3,TOY STORY 3,"Woody, Buzz, and the rest of Andy's toys haven...",[Lee Unkrich],"[Tom Hanks, Tim Allen, Ned Beatty]","[Animation, Comedy, Family]"
4,PARASITE,Paul Dean has created a deadly parasite that i...,[Charles Band],"[Robert Glaudini, Demi Moore, Luca Bercovici]","[Science Fiction, Horror]"
