In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import (cosine_similarity,
                                     euclidean_distances,
                                     cosine_distances,
                                     pairwise_distances)

from thefuzz import process
import fuzzyset

from scipy import sparse

In [4]:
# Build the long-format ratings table for the person-to-person recommender.
#
# After this cell:
#   ratings   - one row per rating, with the film identified by `tconst`
#               instead of `movieId` (the `timestamp` column is not kept)
#   movie_ids - lookup table with columns tconst, primary_title_x, movieId

# Skip `timestamp` while parsing instead of loading it and dropping it
# afterwards; every other column in the file is kept as-is.
ratings = pd.read_csv(
    './data/ml-25m/ml-25m/ratings.csv',
    usecols=lambda column: column != 'timestamp',
)

# Unique id/title for each film in our movies database.
movie_ids = pd.read_csv('./data/movies_with_review_id.csv')
movie_ids = movie_ids[['tconst', 'primary_title_x', 'movieId']]

# Limit the ratings to the films that are in our movies database.
ratings = ratings[ratings['movieId'].isin(movie_ids['movieId'])]

# Swap `movieId` for `tconst`. Only the id mapping is merged: the title column
# was previously joined on, renamed and then dropped again straight away, so
# it is simply left out of the join here.
ratings = (
    ratings
    .merge(movie_ids[['movieId', 'tconst']], on='movieId', how='left')
    .drop(columns=['movieId'])
)

In [6]:
# Reshape the long ratings table into the matrix the recommender needs:
# one row per film (`tconst`), one column per `userId`, cells holding the
# rating. Combinations with no rating come out as NaN, and repeated
# (tconst, userId) pairs are combined by pivot_table's default aggregation.
ratings = ratings.pivot_table(
    index='tconst',
    columns='userId',
    values='rating',
)

# The lookup table is rebound to a placeholder value here, presumably to let
# the DataFrame's memory be reclaimed; it is read from disk again later on.
movie_ids = 0

In [7]:
# Preview the first five rows of the film (tconst) x user (userId) matrix.
ratings.head()

userId,1,2,3,4,5,6,7,8,9,10,...,162532,162533,162534,162535,162536,162537,162538,162539,162540,162541
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0033373,,,,,,,,,,,...,,,,,,,,,,
tt0033467,,,,,,,,3.0,,,...,,,,,,,,,,
tt0033553,,,,,,,,,,,...,,,,,,,,,,
tt0033563,,,,,,,,3.0,3.0,,...,,4.0,,,,,,,,3.5
tt0033717,,,,,,,,,,,...,,,,,,,,,,


In [8]:
#creating a sparse matrix
sparse = sparse.csr_matrix(ratings.fillna(0))
print(sparse)

  (0, 30)	3.5
  (0, 802)	5.0
  (0, 2176)	5.0
  (0, 3974)	3.5
  (0, 4827)	4.0
  (0, 5571)	4.0
  (0, 5864)	3.5
  (0, 5989)	5.0
  (0, 7216)	4.0
  (0, 7287)	3.0
  (0, 8071)	3.5
  (0, 8618)	4.5
  (0, 8753)	3.5
  (0, 9129)	4.0
  (0, 9761)	4.5
  (0, 9946)	3.0
  (0, 10280)	5.0
  (0, 11538)	4.0
  (0, 12375)	3.5
  (0, 12592)	4.5
  (0, 12856)	3.0
  (0, 14374)	3.5
  (0, 14553)	3.0
  (0, 14921)	3.5
  (0, 17002)	4.0
  :	:
  (7991, 160920)	4.0
  (7991, 161035)	3.0
  (7991, 161045)	2.5
  (7991, 161155)	4.0
  (7991, 161161)	3.0
  (7991, 161163)	4.0
  (7991, 161182)	2.5
  (7991, 161221)	3.0
  (7991, 161381)	4.0
  (7991, 161419)	4.0
  (7991, 161542)	1.5
  (7991, 161573)	3.5
  (7991, 161640)	4.0
  (7991, 161723)	3.5
  (7991, 161908)	3.5
  (7991, 161937)	3.5
  (7991, 162045)	3.5
  (7991, 162169)	5.0
  (7991, 162249)	3.0
  (7991, 162269)	3.0
  (7991, 162357)	2.0
  (7991, 162431)	2.5
  (7991, 162506)	4.5
  (7992, 40013)	5.0
  (7992, 134914)	5.0


In [9]:
# Cosine distance between every pair of rows (films) of the ratings matrix.
dists = pairwise_distances(
    X=sparse,
    metric='cosine',
)

In [10]:
# Label both axes of the distance matrix with the film ids (tconst) so that
# distances can be looked up by film.
recommender_df = pd.DataFrame(
    data=dists,
    index=ratings.index,
    columns=ratings.index,
)

In [11]:
# Display the film-by-film cosine-distance matrix (pandas truncates the view).
recommender_df

tconst,tt0033373,tt0033467,tt0033553,tt0033563,tt0033717,tt0033729,tt0033836,tt0033870,tt0033891,tt0033922,...,tt9419834,tt9426210,tt9495224,tt9541602,tt9617456,tt9742794,tt9775360,tt9806192,tt9820556,tt9900782
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0033373,0.000000,0.913985,0.883199,0.930917,0.821282,0.841040,0.754090,0.897476,0.828196,0.873327,...,0.986538,1.000000,0.976681,1.000000,1.000000,0.994842,0.982968,1.0,0.901601,1.0
tt0033467,0.913985,0.000000,0.917602,0.746670,0.869842,0.822533,0.892177,0.543265,0.819736,0.910934,...,0.993474,0.992564,0.949446,0.996330,0.994727,0.988472,0.994593,1.0,0.890349,1.0
tt0033553,0.883199,0.917602,0.000000,0.913680,0.808388,0.890935,0.843562,0.909673,0.882522,0.889686,...,1.000000,1.000000,0.987702,1.000000,1.000000,1.000000,0.986667,1.0,0.945470,1.0
tt0033563,0.930917,0.746670,0.913680,0.000000,0.911847,0.857695,0.912489,0.766583,0.850839,0.912713,...,0.987982,0.993918,0.954589,0.998228,0.994550,0.970728,0.984752,1.0,0.921433,1.0
tt0033717,0.821282,0.869842,0.808388,0.911847,0.000000,0.839461,0.819890,0.821025,0.848386,0.896160,...,1.000000,1.000000,0.983963,1.000000,1.000000,1.000000,0.986224,1.0,0.923696,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9742794,0.994842,0.988472,1.000000,0.970728,1.000000,0.992190,0.997565,0.995019,1.000000,1.000000,...,0.929193,1.000000,0.945918,0.951813,1.000000,0.000000,0.941573,1.0,0.958331,1.0
tt9775360,0.982968,0.994593,0.986667,0.984752,0.986224,0.992614,0.981905,0.993793,1.000000,0.982924,...,0.950985,1.000000,0.959572,1.000000,1.000000,0.941573,0.000000,1.0,0.981384,1.0
tt9806192,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.0,1.000000,1.0
tt9820556,0.901601,0.890349,0.945470,0.921433,0.923696,0.941994,0.941594,0.907169,0.947349,0.949162,...,0.989482,0.994197,0.965520,0.992683,1.000000,0.958331,0.981384,1.0,0.000000,1.0


In [16]:
# Ten nearest films to tt0325980 by cosine distance (smallest first).
#
# The query film is removed by label rather than by skipping the first sorted
# position: if another film also has distance 0 to it, the query film is not
# guaranteed to sort first, and a positional skip would then list the film as
# its own neighbour while discarding a genuine one.
print(
    recommender_df['tt0325980']
    .drop(labels='tt0325980')
    .nsmallest(10)
)

tconst
tt0167261    0.348581
tt0167260    0.349852
tt0266543    0.360070
tt0120737    0.363365
tt0126029    0.369767
tt0145487    0.390792
tt0198781    0.406938
tt0317705    0.407275
tt0383574    0.417227
tt0372784    0.426718
Name: tt0325980, dtype: float64


In [12]:
# Read the film lookup table again, keeping the id, title and movieId columns
# in that order.
movie_ids = (
    pd.read_csv('./data/movies_with_review_id.csv')
    .loc[:, ['tconst', 'primary_title_x', 'movieId']]
)

In [21]:
# Show the lookup row(s) for film tt0198781.
movie_ids.loc[movie_ids['tconst'].eq('tt0198781')]

Unnamed: 0,tconst,primary_title_x,movieId
66,tt0198781,"Monsters, Inc.",4886


In [22]:
# Save the distance matrix together with its row labels. Writing it with
# index=False threw the tconst index away, so the reloaded frame came back
# with a plain 0..n-1 index and per-film lookups returned positions instead
# of film ids.
recommender_df.to_csv(
    './data/review_based_recommender_df.csv',
    index_label='tconst',
)

# Read it back, restoring the tconst column as the index so the frame has the
# same film-by-film layout as before it was saved.
recommender_df = pd.read_csv(
    './data/review_based_recommender_df.csv',
    index_col='tconst',
)