In [1]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
import nltk
nltk.download('popular')

In [4]:
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
movies_metadata = pd.read_csv('drive/MyDrive/machine learning/finalproject/movies_metadata.csv')
keywords = pd.read_csv('drive/MyDrive/machine learning/finalproject/keywords.csv')
credits = pd.read_csv('drive/MyDrive/machine learning/finalproject/credits.csv')
ratings_small = pd.read_csv('drive/MyDrive/machine learning/finalproject/ratings_small.csv')
links_small = pd.read_csv('drive/MyDrive/machine learning/finalproject/links_small.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [49]:
movies_df = pd.DataFrame()

# delete invalid ids
movies = movies_metadata[pd.to_numeric(movies_metadata['id'], errors='coerce').notnull()]
movies = movies[movies['title'].notna()]
#movies = movies.dropna(axis=0, thresh=20)

movies['id'] = movies['id'].astype('int')
movies['budget'] = movies['budget'].astype('int')
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

movies = movies[movies['budget'] > 0]

In [7]:
def CanUseCF(movie_title):
  movieId = getMovieId(movie_title)
  if movieId == -1:
    return False
  ratedMovies = ratings_small[ratings_small['movieId'].isin(movies['id'])]['movieId'].unique()
  return (movieId in ratedMovies)

def userRatings(userIds):
  return ratings_small[ratings_small['userId'].isin(userIds)]
def getRatedMovies(movieIds):
  return ratings_small[ratings_small['movieId'].isin(movieIds)]
def getMovieTitle(id):
  return movies[movies['id'] == id][['title']]
def getMovieId(title):
  ids = movies[movies['title'] == title][['id']].values
  if len(ids) > 0:
    return ids[0][0]
  else:
    return -1

In [16]:
getMovieId("The Dark Knight")
movies[movies['id'].isin([155, 72003])]
getMovieTitle(231293)
getMovieId("Planet Earth")

-1

In [9]:
from fuzzywuzzy import fuzz

def fuzzy_matching(mapper, fav_book, verbose=True):
    matched_items = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_book.lower())
        if ratio >= 60:
            matched_items.append((title, idx, ratio))
    # sort
    matched_items = sorted(matched_items, key=lambda x: x[2])[::-1]
    if not matched_items:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in matched_items]))
    return matched_items[0][1]



In [38]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

class CFRecommender:
  def __init__(self, ratings, movies):
    ratings_pivot = ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
    self.ratings_mx = csr_matrix(ratings_pivot.values)
    self.movies_idx = pd.Series(movies['id'].values, index=movies['title']).astype('int64')
    self.knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
    self.knn.fit(self.ratings_mx)

  def findSimiliarMovies(self, target_movie, rec_cnt):
    
    idx = fuzzy_matching(self.movies_idx, target_movie, verbose=True)
    distances, indices = self.knn.kneighbors(self.ratings_mx[idx], n_neighbors=rec_cnt+1)

    raw_recommends = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[1:]
    #print("raw recomms : ", raw_recommends)

    movie_titles = {v:k for k,v in self.movies_idx.items()}
    results = []
    result_idx = []
    result_tuple = []
    for i, (idx, dist) in enumerate(raw_recommends):
      if not idx in movie_titles.keys():
        #print("movie with id {0} is not available: ".format(idx))
        continue
      #print('{0}: {1}, with distance of {2}'.format(i+1, movie_titles[idx], dist))
      #results.append(movie_titles[idx])
      #result_idx.append(idx)
      result_tuple.append((idx, movie_titles[idx], dist))
    return result_tuple


  def recommendMovie(self, movie_title, count):
    result = pd.DataFrame(columns=['title', 'dist'])
    if CanUseCF(movie_title):
      recomms = self.findSimiliarMovies(movie_title, count)
      result['title'] = [title for id, title, dist in recomms]
      result['dist'] = [dist for id, title, dist in recomms]
      #movie_titles = {v:k for k,v in self.movies_idx.items()}
    return result


In [11]:
def getInfo(movies):
  return movies.drop(columns=['budget', 'homepage',
       'imdb_id', 'original_language',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'video'], inplace=False)


In [48]:
recommender = CFRecommender(ratings_small, movies)
#title = "The Dark Knight"
title = "Four Rooms"
recomms = recommender.recommendMovie(title, 10)
print("recoms : \n")
recomms

Found possible matches in our database: ['Four Rooms', 'Four Lions', 'Four Brothers', 'War Room', 'Boiler Room', 'Four Minutes', 'Our Relations', 'Our Lovers', 'Green Room', 'Mr. Brooks']

recoms : 



Unnamed: 0,title,dist
0,The Passion of the Christ,0.474242
1,Kill Bill: Vol. 1,0.495131
2,Boyz n the Hood,0.532554
3,A.I. Artificial Intelligence,0.534825
4,Indiana Jones and the Temple of Doom,0.538353
5,Flashdance,0.539475
6,Wild Things,0.542851
7,Constantine,0.547751


In [12]:
def cb_preprocess(movies, keywords, credits):
  # movie_meta_data = movie_meta_data.drop([19730, 29503, 35587])

  movies_data = movies.drop(['adult','homepage','video','runtime','revenue','release_date','imdb_id','budget','poster_path','original_title',
                              'original_language','belongs_to_collection','production_countries'],axis=1)

  df_k_c = keywords.merge(credits,on="id",how="left")
  movies_data = df_k_c.merge(movies_data,on="id",how="left")

  movies_data['genres'] = movies_data['genres'].fillna('[]').apply(lambda x: [i['name'] for i in eval(x)])
  movies_data['genres'] = movies_data['genres'].apply(lambda x: ' '.join([i.replace(",", "") for i in x]))
  movies_data['production_companies'] = movies_data['production_companies'].fillna('[]').apply(lambda x: [i['name'] for i in eval(x)] if isinstance(eval(x), list) else [])
  movies_data['production_companies'] = movies_data['production_companies'].apply(lambda x: ' '.join([i.replace(",", "") for i in x]))
  movies_data['keywords'] = movies_data['keywords'].fillna('[]').apply(lambda x: [i['name'] for i in eval(x)] if isinstance(eval(x), list) else [])
  movies_data['keywords'] = movies_data['keywords'].apply(lambda x: ' '.join([i.replace(",", "") for i in x]))
  movies_data['cast'] = movies_data['cast'].fillna('[]').apply(lambda x: [i['name'] for i in eval(x)] if isinstance(eval(x), list) else [])
  movies_data['cast'] = movies_data['cast'].apply(lambda x: ' '.join([i.replace(",", "") for i in x]))
  movies_data['crew'] = movies_data['crew'].fillna('[]').apply(lambda x: [i['name'] for i in eval(x) if i['job'] == 'Director'] if isinstance(eval(x), list) else [])
  movies_data['crew'] = movies_data['crew'].apply(lambda x: ' '.join([i.replace(",", "") for i in x]))
  movies_data['spoken_languages'] = movies_data['spoken_languages'].fillna('[]').apply(lambda x: [i['name'] for i in eval(x)] if isinstance(eval(x), list) else [])
  movies_data['spoken_languages'] = movies_data['spoken_languages'].apply(lambda x: ' '.join([i.replace(",", "") for i in x]))
  movies_data['overview'] = movies_data['overview'].fillna('')
  movies_data['tagline'] = movies_data['tagline'].fillna('')

  movies_des=pd.DataFrame()
  movies_des['title']=movies_data['title']
  movies_des['vote_average']=movies_data['vote_average']
  movies_des['vote_count']=movies_data['vote_count']
  movies_des['data']=movies_data['genres'] +' '+movies_data['overview']+' '+movies_data['production_companies'] + ' ' +movies_data['title'] +' '+ movies_data['tagline']+' '+movies_data['crew'] +' '+ movies_data['keywords']
  movies_des['data']=movies_des['data'].str.lower()
  movies_des.drop_duplicates(inplace=True)
  movies_des=movies_des.dropna()
  pst = PorterStemmer()
  movies_des['data'] = movies_des['data'].apply(word_tokenize)
  movies_des['data'] = movies_des['data'].apply(lambda x: ' '.join([pst.stem(y) for y in x]))
  movies_des=movies_des.reset_index()
  movies_des=movies_des.drop("index" , axis=1)
  
  return movies_des

In [17]:
class CBRecommender :
    def __init__(self, movies, keywords, credits):
        self.movies = cb_preprocess(movies, keywords, credits)
        self.tfid_vector = self.vectorize()
    
    
    def vectorize(self):
        tfid = TfidfVectorizer(analyzer='word',stop_words='english')
        tfid_vector = tfid.fit_transform(self.movies['data'])
        return tfid_vector
    
    
    def recommendMovie(self, movie_title, count) :
        indx = self.movies[self.movies["title"] == movie_title].index
        sim_vector = linear_kernel(self.tfid_vector, self.tfid_vector[indx[0]])
        same_rate = sorted(list(enumerate(sim_vector)), key=lambda x: x[1], reverse=True)
        same_rate = same_rate[1:30]
        movie_indices = [i[0] for i in same_rate]
        similarities = [i[1] for i in same_rate]
        result = self.movies.iloc[movie_indices]
        result.loc[:, 'dist'] = 1 - np.array(similarities)
        # result['score'] = result['vote_average'] + (result['vote_count']//1000)/10
        # result.sort_values(by=['score'], ascending=False, inplace=True)
        #result.sort_values(by=["vote_average","vote_count"], ascending=[False,False], inplace=True)  
        result = result.drop(['data', 'vote_average', 'vote_count'],axis=1).reset_index(drop=True)
        return result[:count]

cb_recommender = CBRecommender(movies, keywords, credits)
result = cb_recommender.recommendMovie("Four Rooms", 10)
result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,title,dist
0,Insane,0.811546
1,1408,0.813021
2,The Million Dollar Hotel,0.816458
3,Out Of Order,0.817269
4,The Second Best Exotic Marigold Hotel,0.819849
5,New Year's Eve,0.827699
6,Ploy,0.838345
7,Grand Hotel,0.847683
8,Hotel Transylvania 2,0.84919
9,Mystery Train,0.852349


In [28]:
class EnsembleRecommender:
  def __init__(self, ratings, movies, keywords, credits):
    self.cfr = CFRecommender(ratings, movies)
    self.cbr = CBRecommender(movies, keywords, credits)

  
  def recommendMovie(self, movie_title, count):
    cf_results = self.cfr.recommendMovie(movie_title, count)
    print("cf results : ", cf_results)
    cb_results = cb_recommender.recommendMovie(movie_title, count)
    print("cb results : ", cb_results)
    results = pd.concat([cf_results, cb_results]).sort_values(by=['dist'])[:count].reset_index(drop=True)
    print("results: ", results)
    
    return results

In [29]:
ensembleRec = EnsembleRecommender(ratings_small, movies, keywords, credits)

In [30]:
rec_movies = ensembleRec.recommendMovie("The Dark Knight", 10)
rec_movies

Found possible matches in our database: ['The Dark Knight', 'One Dark Night', 'The Dark Knight Rises', 'Shark Night', 'The Dark Lurking', 'The Good Night', 'The Dark Half', 'The Dark Tower', 'The Dark Tapes', 'The Kids Are Alright', 'The Last Light', 'The Dark House', 'The Dark Hours', 'The Darkness', 'The Last Flight', 'The Master Gunfighter', 'The Dead Pit', 'The Hollywood Knights', 'In the Dark Half', 'The Kids Are All Right', 'The Dark Crystal', 'The Last King', 'The Nightmare', 'The Rig', 'The Awakening', 'Date Night', 'Harlem Nights', 'Hard Eight', 'The Hateful Eight', 'The Fighter', 'The Karate Kid', 'The Damned United', 'Into the Night', 'The Pianist', 'The Burning', 'The Karate Kid', 'The Big Hit', 'The Odd Angry Shot', 'The Rite', 'The Heartbreak Kid', 'Chimes at Midnight', 'The Ring', "A Hard Day's Night", 'The Danish Girl', 'The Dark Valley', 'The Monkey King', 'The Fisher King', 'The Night Flier', 'The Deer Hunter', 'The Right Stuff', 'The Frighteners']

cf results :      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,title,dist
0,Heat,0.387918
1,Pink Flamingos,0.446438
2,The Dark Knight Rises,0.459963
3,Donnie Darko,0.498209
4,The Good German,0.510471
5,Batman Begins,0.51407
6,Shaft,0.516186
7,Scoop,0.529562
8,"Batman: The Dark Knight Returns, Part 2",0.559047
9,"Batman: The Dark Knight Returns, Part 1",0.600064


In [46]:
ratedMovies = ratings_small[ratings_small['movieId'].isin(movies['id'])]['movieId'].unique()
print(ratedMovies)
print(len(ratedMovies))

[1371 1405 2105 ...  167  563  129]
1262
