In [1]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [3]:
import csv
import numpy as np
import pandas as pd

In [4]:
movies_metadata = pd.read_csv('drive/MyDrive/machine learning/finalproject/movies_metadata.csv')
keywords = pd.read_csv('drive/MyDrive/machine learning/finalproject/keywords.csv')
credits = pd.read_csv('drive/MyDrive/machine learning/finalproject/credits.csv')
ratings_small = pd.read_csv('drive/MyDrive/machine learning/finalproject/ratings_small.csv')
links_small = pd.read_csv('drive/MyDrive/machine learning/finalproject/links_small.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
movies_df = pd.DataFrame()

# delete invalid ids
movies = movies_metadata[pd.to_numeric(movies_metadata['id'], errors='coerce').notnull()]
movies = movies[movies['title'].notna()]

In [7]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies['id'] = movies['id'].astype('int')

In [21]:
def userRatings(r, userIds):
  return r[r['userId'].isin(userIds)]
def getMovie(r, movieIds):
  return r[r['movieId'].isin(movieIds)]
def getMovieTitle(id):
  return movies[movies['id'] == id][['title']]


In [None]:
def canPredict(movieId):
  ratedMovies = ratings_small[ratings_small['movieId'].isin(movies['id'])]['movieId'].unique()
  return (movieId in ratedMovies)

canPredict(1371)
getMovieTitle(1371)
getMovie(ratings_small, [1371])

In [10]:
from fuzzywuzzy import fuzz

def fuzzy_matching(mapper, fav_book, verbose=True):
    matched_items = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_book.lower())
        if ratio >= 60:
            matched_items.append((title, idx, ratio))
    # sort
    matched_items = sorted(matched_items, key=lambda x: x[2])[::-1]
    if not matched_items:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in matched_items]))
    return matched_items[0][1]



In [11]:
from sklearn.neighbors import NearestNeighbors

class MovieRecommender:
  def __init__(self, ratings_mx, movies_idx):
    self.ratings_mx = ratings_mx
    self.movies_idx = movies_idx;
    self.knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
    self.knn.fit(ratings_mx)

  def findSimiliarMovies(self, target_movie, rec_cnt):
    
    idx = fuzzy_matching(self.movies_idx, target_movie, verbose=True)
    distances, indices = self.knn.kneighbors(self.ratings_mx[idx], n_neighbors=rec_cnt+1)

    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[1:]

    movie_titles = {v:k for k,v in self.movies_idx.items()}
    results = []
    result_idx = []
    for i, (idx, dist) in enumerate(raw_recommends):
      if not idx in movie_titles.keys():
        print("movie with id {0} is not available: ".format(idx))
        continue
      print('{0}: {1}, with distance of {2}'.format(i+1, movie_titles[idx], dist))
      results.append(movie_titles[idx])
      result_idx.append(idx)
    return results, result_idx


  def recommendMovie(self, movie_title, count):
    recomms, ids = self.findSimiliarMovies(movie_title, count)
    return recomms, ids




In [12]:
def getInfo(movies):
  return movies.drop(columns=['budget', 'homepage',
       'imdb_id', 'original_language',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'video'], inplace=False)


In [None]:
# TODO add original titles 
movies[['id', 'original_title']]
movies[movies['original_title'] != movies['title']]

In [13]:
from scipy.sparse import csr_matrix

ratings_pivot = ratings_small.pivot(index='movieId', columns='userId', values='rating').fillna(0)
ratings_mx = csr_matrix(ratings_pivot.values)

movie_titles = movies[['id', 'title']]
movies_idx = pd.Series(movies['id'].values, index=movies['title']).astype('int64')

recommender = MovieRecommender(ratings_mx, movies_idx)
title = "Four Rooms"
recomms, ids = recommender.recommendMovie(title, 10)
print("recoms : \n", recomms)
getInfo(movies[movies['id'].isin(ids)])

Found possible matches in our database: ['Four Rooms', 'Four of Us', 'Four Rode Out', 'Four Times', 'Four Lions', 'Four Brothers', 'War Room', 'Four Lovers', 'Powder Room', 'Boiler Room', 'Shared Rooms', 'Four Mothers', 'Four Minutes', 'Control Room', 'Four Friends', 'Our Times', 'Our Curse', 'Four Sons', 'Our Folks', 'Four Sons', 'Our Music', 'Home Room', 'Four Horsemen', 'Far from Home', 'Our Relations', "Fermat's Room", 'Shoulder Arms', 'Major Grom', 'Our Lovers', 'Hotel Room', 'Green Room', 'Your Honor', 'Four Wives', "Leo's Room", 'Mr. Brooks']

1: The Passion of the Christ, with distance of 0.4742419566942303
2: Kill Bill: Vol. 1, with distance of 0.49513058155839584
movie with id 31 is not available: 
4: Boyz n the Hood, with distance of 0.5325538642689402
5: A.I. Artificial Intelligence, with distance of 0.53482532717979
6: Indiana Jones and the Temple of Doom, with distance of 0.5383533091562768
7: Flashdance, with distance of 0.539475353930275
8: Wild Things, with distance of

Unnamed: 0,adult,belongs_to_collection,genres,id,original_title,overview,tagline,title,vote_average,vote_count
1704,False,"{'id': 33059, 'name': 'Wild Things Collection'...","[{'id': 53, 'name': 'Thriller'}]",617,Wild Things,When teen-socialite Kelly Van Ryan (Richards) ...,They're dying to play with you.,Wild Things,6.3,454.0
2006,False,"{'id': 84, 'name': 'Indiana Jones Collection',...","[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",87,Indiana Jones and the Temple of Doom,"After arriving in India, Indiana Jones is aske...",If adventure has a name... it must be Indiana ...,Indiana Jones and the Temple of Doom,7.1,2841.0
2826,False,,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",535,Flashdance,The popular 1980’s dance movie that depicts th...,When the dancer becomes the dance.,Flashdance,6.1,313.0
4242,False,,"[{'id': 18, 'name': 'Drama'}, {'id': 878, 'nam...",644,A.I. Artificial Intelligence,"A robotic boy, the first programmed to love, D...",David is 11 years old. He weighs 60 pounds. He...,A.I. Artificial Intelligence,6.8,2011.0
6647,False,,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",650,Boyz n the Hood,Boyz n the Hood is the popular and successful ...,Once upon a time in South Central L.A... It ai...,Boyz n the Hood,7.4,377.0
6725,False,"{'id': 2883, 'name': 'Kill Bill Collection', '...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",24,Kill Bill: Vol. 1,An assassin is shot at the altar by her ruthle...,Go for the kill.,Kill Bill: Vol. 1,7.7,5091.0
7165,False,,"[{'id': 18, 'name': 'Drama'}]",615,The Passion of the Christ,"""The Passion of the Christ"" is a film about th...","By his wounds, we were healed.",The Passion of the Christ,6.9,888.0
9709,False,"{'id': 437628, 'name': 'Constantine Collection...","[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",561,Constantine,John Constantine has literally been to Hell an...,"Hell Wants Him, Heaven Won't Take Him, Earth N...",Constantine,6.6,1837.0


### Analyzing ratings

In [20]:
ratings = ratings_small.drop_duplicates(subset=['userId', 'movieId'], keep=False, inplace=False )
ratings = ratings.groupby(['movieId'], as_index=False) \
  .agg({'userId':'size', 'rating':'mean'}) \
  .rename(columns={'userId':'ratings_cnt', 'rating':'rating_avg'}) \
  .reset_index()[['movieId', 'rating_avg', 'ratings_cnt']]
ratings

Unnamed: 0,movieId,rating_avg,ratings_cnt
0,1,3.872470,247
1,2,3.401869,107
2,3,3.161017,59
3,4,2.384615,13
4,5,3.267857,56
...,...,...,...
9061,161944,5.000000,1
9062,162376,4.500000,1
9063,162542,5.000000,1
9064,162672,3.000000,1
