In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def convert_ids(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('int64')

def convert_to_float(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('float64')

def to_json(csv_entry):
    return json.loads(re.sub('\'', '"', csv_entry))

In [4]:
md = pd.read_csv('../data/movies_metadata.csv'
                , converters={'id': lambda x: convert_ids(x), 'imdb_id': lambda x: convert_ids(x)}
                ,usecols=['id', 'original_title', 'budget', 'genres','spoken_languages', 'title','release_date','vote_count','vote_average'])
md.drop_duplicates(subset ="id", keep = 'first', inplace = True)
md = md[md.spoken_languages == """[{'iso_639_1': 'en', 'name': 'English'}]"""]

In [5]:
credits = pd.read_csv('../data/credits.csv')
credits.drop_duplicates(subset ="id", keep = 'first', inplace = True)
keywords = pd.read_csv('../data/keywords.csv')
keywords.drop_duplicates(subset ="id", keep = 'first', inplace = True)

In [6]:
md.shape

(22381, 9)

In [7]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
#md = md.drop([19730, 29503, 35587])
#md = md.drop([19730])
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [8]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [9]:
print(md['id'].nunique(),md['original_title'].nunique(),credits['id'].nunique(),keywords['id'].nunique())

22381 21342 45432 45432


In [10]:
md = md.merge(credits,how='inner', left_on='id', right_on='id')
md = md.merge(keywords,how='inner', left_on='id', right_on='id')

In [11]:
print(md['id'].nunique(),md['title'].nunique(),credits['id'].nunique(),keywords['id'].nunique())

22381 21330 45432 45432


In [12]:
md['cast'] = md['cast'].apply(literal_eval)
md['crew'] = md['crew'].apply(literal_eval)
md['keywords'] = md['keywords'].apply(literal_eval)
md['cast_size'] = md['cast'].apply(lambda x: len(x))
md['crew_size'] = md['crew'].apply(lambda x: len(x))

In [13]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [14]:
md['director'] = md['crew'].apply(get_director)

In [15]:
md['cast'] = md['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['cast'] = md['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

md['keywords'] = md['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

md['cast'] = md['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

md['director'] = md['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
md['director'] = md['director'].apply(lambda x: [x,x, x])

In [16]:
s = md.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

s = s.value_counts()
s = s[s > 1]

In [19]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [20]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [21]:
md['keywords'] = md['keywords'].apply(filter_keywords)
md['keywords'] = md['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
md['keywords'] = md['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [22]:
md['soup'] = md['keywords'] + md['cast'] + md['director'] + md['genres']
md['soup'] = md['soup'].apply(lambda x: ' '.join(x))

In [23]:
md = md.sort_values(by='id')
md = md.reset_index()
titles = md['title']
indices = pd.Series(md.index, index=md['title'])

In [24]:
md.head()

Unnamed: 0,index,budget,genres,id,original_title,release_date,spoken_languages,title,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size,director,soup
0,10,4000000,"[Crime, Comedy]",5,Four Rooms,1995-12-09,"[{'iso_639_1': 'en', 'name': 'English'}]",Four Rooms,6.5,539.0,1995,"[timroth, antoniobanderas, jenniferbeals]","[{'credit_id': '52fe420dc3a36847f800011b', 'de...","[hotel, newyear'sev, witch, bet, hotelroom, sp...",24,88,"[allisonanders, allisonanders, allisonanders]",hotel newyear'sev witch bet hotelroom sperm lo...
1,323,0,"[Action, Thriller, Crime]",6,Judgment Night,1993-10-15,"[{'iso_639_1': 'en', 'name': 'English'}]",Judgment Night,6.4,79.0,1993,"[emilioestevez, cubagoodingjr., denisleary]","[{'credit_id': '52fe420dc3a36847f800023d', 'de...","[chicago, drugdeal, boxingmatch, escap, onenight]",15,16,"[stephenhopkins, stephenhopkins, stephenhopkins]",chicago drugdeal boxingmatch escap onenight em...
2,168,11000000,"[Adventure, Action, Science Fiction]",11,Star Wars,1977-05-25,"[{'iso_639_1': 'en', 'name': 'English'}]",Star Wars,8.1,6778.0,1977,"[markhamill, harrisonford, carriefisher]","[{'credit_id': '52fe420dc3a36847f8000437', 'de...","[android, galaxi, hermit, deathstar, lightsab,...",106,20,"[georgelucas, georgelucas, georgelucas]",android galaxi hermit deathstar lightsab jedi ...
3,3976,94000000,"[Animation, Family]",12,Finding Nemo,2003-05-30,"[{'iso_639_1': 'en', 'name': 'English'}]",Finding Nemo,7.6,6292.0,2003,"[albertbrooks, ellendegeneres, alexandergould]","[{'credit_id': '52fe420ec3a36847f80006b1', 'de...","[fathersonrelationship, harbor, underwat, grea...",24,104,"[andrewstanton, andrewstanton, andrewstanton]",fathersonrelationship harbor underwat greatbar...
4,233,55000000,"[Comedy, Drama, Romance]",13,Forrest Gump,1994-07-06,"[{'iso_639_1': 'en', 'name': 'English'}]",Forrest Gump,8.2,8147.0,1994,"[tomhanks, robinwright, garysinise]","[{'credit_id': '52fe420ec3a36847f800072d', 'de...","[vietnamveteran, hippi, mentallydis, run, base...",66,131,"[robertzemeckis, robertzemeckis, robertzemeckis]",vietnamveteran hippi mentallydis run basedonno...


In [46]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(md['soup'])

In [47]:
count_matrix

<22381x198063 sparse matrix of type '<class 'numpy.int64'>'
	with 444820 stored elements in Compressed Sparse Row format>

In [48]:
from scipy import sparse

sparse.save_npz("countmatrix.npz", count_matrix)


In [49]:
# '''Content Filtering results without taking IMDB votings into account'''
# def get_recommendations(title):
#     idx = indices[title]
#     cosine_sim = linear_kernel(count_matrix[idx], count_matrix)
#     sim_scores = list(enumerate(cosine_sim[0]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:31]
#     movie_indices = [i[0] for i in sim_scores]
#     return titles.iloc[movie_indices]

# get_recommendations('The Dark Knight').head(10)

In [50]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.158527322282293

In [51]:
sim_movie_list = {}

In [52]:
def cosine_sim(count_matrix, idx, title):
    if(title in sim_movie_list):
        #print('call from here')
        return sim_movie_list[title]
    else:
        cosine_sim = linear_kernel(count_matrix[idx], count_matrix)
        sim_movie_list[title]= cosine_sim[0]
        return cosine_sim[0]

def improved_recommendations(title):
    idx = indices[title]
    
    sim_scores = list(enumerate(cosine_sim(count_matrix, idx, title)))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    print(movie_indices)
    movies = md.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.50)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

def weighted_rating(x):
    m = vote_counts.quantile(0.60)
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [55]:
improved_recommendations('Inception')

[17160, 10929, 21, 119, 377, 2642, 16733, 12119, 208, 11168, 21656, 185, 477, 628, 638, 763, 1433, 1447, 1733, 3484, 10326, 10644, 12238, 13303, 15183]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,title,vote_count,vote_average,year,wr
17160,Interstellar,11187,8,2014,7.995182
377,The Prestige,4510,8,2006,7.988079
21,Memento,4168,8,2000,7.987106
10929,The Dark Knight Rises,9263,7,2012,6.996231
12119,Looper,4777,6,2012,5.996666
628,X-Men Origins: Wolverine,4086,6,2009,5.996105
208,The Matrix Reloaded,3500,6,2003,5.995457
477,The Island,1813,6,2005,5.991273
185,Starship Troopers,1584,6,1997,5.990026
1733,Mad Max,1235,6,1979,5.98725
