# Demographic Filtering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
meta = pd.read_csv('movies_metadata.csv',low_memory=False)

In [3]:
vote_data = meta[['original_title','vote_average','vote_count']]

In [4]:
vote_data['vote_average'].mean()

5.618207215133889

In [5]:
def weighted_rating(row):
    v = row['vote_count']
    m = 1500
    r = row['vote_average']
    c = 5.6
    wr = ((v / (v+m))*r) + ((m/(v+m))*c)
    return wr
    

In [6]:
vote_data['score'] = vote_data.apply(weighted_rating,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [7]:
vote_data.sort_values(by='score',ascending=False).head(10)

Unnamed: 0,original_title,vote_average,vote_count,score
314,The Shawshank Redemption,8.5,8358.0,8.058734
12481,The Dark Knight,8.3,12269.0,8.005861
2843,Fight Club,8.3,9678.0,7.937681
834,The Godfather,8.5,6024.0,7.92185
292,Pulp Fiction,8.3,8670.0,7.90177
15480,Inception,8.1,14075.0,7.85923
22879,Interstellar,8.1,11187.0,7.804422
351,Forrest Gump,8.2,8147.0,7.795729
7000,The Lord of the Rings: The Return of the King,8.1,8226.0,7.714436
1154,The Empire Strikes Back,8.2,5998.0,7.679861


In [8]:
vote_data.sort_values(by='vote_count',ascending=False).head(10)

Unnamed: 0,original_title,vote_average,vote_count,score
15480,Inception,8.1,14075.0,7.85923
12481,The Dark Knight,8.3,12269.0,8.005861
14551,Avatar,7.2,12114.0,7.023711
17818,The Avengers,7.4,12000.0,7.2
26564,Deadpool,7.4,11444.0,7.191409
22879,Interstellar,8.1,11187.0,7.804422
20051,Django Unchained,7.8,10297.0,7.520268
23753,Guardians of the Galaxy,7.9,10014.0,7.600365
2843,Fight Club,8.3,9678.0,7.937681
18244,The Hunger Games,6.9,9634.0,6.724861


# Content Based

### Based on Genre

In [9]:
from ast import literal_eval

> Convert string to List of Object with Literal_eval

In [10]:
cb_genres = meta[meta['vote_count'] > 300][['original_title','genres']]

In [11]:
cb_genres.reset_index(drop=True,inplace=True)

In [12]:
def new_genre(x):
    genres = []
    x = literal_eval(x)
    for item in x:
        genres.append(item['name'])
    return ' '.join(genres)

In [13]:
cb_genres['new_genre'] = cb_genres['genres'].apply(new_genre)

In [14]:
cb_genres.head()

Unnamed: 0,original_title,genres,new_genre
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Animation Comedy Family
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Adventure Fantasy Family
2,Heat,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Action Crime Drama Thriller
3,GoldenEye,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",Adventure Action Thriller
4,Balto,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",Family Animation Adventure


> Convert to Sparsematrix with countVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
cv = CountVectorizer() 
cv_genres_result = cv.fit_transform(cb_genres['new_genre'])

In [17]:
cv_df = pd.DataFrame(cv_genres_result.todense() ,
                     columns=cv.get_feature_names(),index=cb_genres['original_title'])

In [18]:
cv_df.head()

Unnamed: 0_level_0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,fiction,...,horror,movie,music,mystery,romance,science,thriller,tv,war,western
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Jumanji,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
Heat,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
GoldenEye,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Balto,0,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Measure similarity
# Cosine similarity

> Cosine Similarity Example

In [20]:
data_1 = 'animation comedy'
data_2 = 'animation adventure'
data_3 = 'horror comedy'
data_4 = 'animation'
data_5 = 'animation comedy'

In [21]:
data = [data_1,data_2,data_3,data_4,data_5]
data = pd.DataFrame(data,columns=['genre'] ,index=['movie 1','movie 2' , 'movie 3' , 'movie 4','movie 5'])

In [22]:
cv_2  = CountVectorizer()
cv_res = cv_2.fit_transform(data['genre'])

In [23]:
pd.DataFrame(cv_res.todense(),columns=cv_2.get_feature_names(),index=data.index)

Unnamed: 0,adventure,animation,comedy,horror
movie 1,0,1,1,0
movie 2,1,1,0,0
movie 3,0,0,1,1
movie 4,0,1,0,0
movie 5,0,1,1,0


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
cos_sim = cosine_similarity(cv_res)

In [26]:
pd.DataFrame(cos_sim , index=data.index,columns=data.index)

Unnamed: 0,movie 1,movie 2,movie 3,movie 4,movie 5
movie 1,1.0,0.5,0.5,0.707107,1.0
movie 2,0.5,1.0,0.0,0.707107,0.5
movie 3,0.5,0.0,1.0,0.0,0.5
movie 4,0.707107,0.707107,0.0,1.0,0.707107
movie 5,1.0,0.5,0.5,0.707107,1.0


> Simmilarity for our data

In [27]:
cos_sim_movie = cosine_similarity(cv_genres_result)

In [28]:
index_to_search = cb_genres[cb_genres['original_title'] == 'Jumanji'].index[0]
cb_genres.iloc[pd.Series(cos_sim_movie[index_to_search]).sort_values(ascending=False).head(10).index]

Unnamed: 0,original_title,genres,new_genre
1618,City of Ember,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
2007,Harry Potter and the Deathly Hallows: Part 2,"[{'id': 10751, 'name': 'Family'}, {'id': 14, '...",Family Fantasy Adventure
1299,Harry Potter and the Goblet of Fire,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Adventure Fantasy Family
2901,Pete's Dragon,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
862,Harry Potter and the Philosopher's Stone,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Adventure Fantasy Family
165,The Wizard of Oz,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
2666,Pan,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
1577,The Chronicles of Narnia: Prince Caspian,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
402,Labyrinth,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
1303,"The Chronicles of Narnia: The Lion, the Witch ...","[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy


In [29]:
def get_recommendation_based_genres(title):
    index_to_search = cb_genres[cb_genres['original_title'] == title].index[0]
    series_similar = pd.Series(cos_sim_movie[index_to_search])
    index_similar = series_similar.sort_values(ascending=False).head(10).index
    return cb_genres.loc[index_similar]

In [30]:
get_recommendation_based_genres('Toy Story')

Unnamed: 0,original_title,genres,new_genre
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Animation Comedy Family
1173,The SpongeBob SquarePants Movie,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Animation Comedy Family
1293,Chicken Little,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",Animation Family Comedy
1314,Hoodwinked!,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Animation Comedy Family
1345,Over the Hedge,"[{'id': 35, 'name': 'Comedy'}, {'id': 16, 'nam...",Comedy Animation Family
1364,Garfield: A Tail of Two Kitties,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Animation Comedy Family
2374,Cloudy with a Chance of Meatballs 2,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",Animation Family Comedy
1442,Meet the Robinsons,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Animation Comedy Family
1444,How the Grinch Stole Christmas!,"[{'id': 16, 'name': 'Animation'}, {'id': 10751...",Animation Family Comedy
1464,Surf's Up,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Animation Comedy Family


### Based on Cast and Director

In [31]:
credits = pd.read_csv('credits.csv')
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [32]:
id_movie = pd.to_numeric(meta[meta['vote_count'] > 300]['id'].reset_index(drop=True))
cb_genres['id'] = id_movie
cb_genres_cast_crew = cb_genres.merge(credits,on='id')
cb_genres_cast_crew.shape

(3032, 6)

In [33]:
def new_cast(x):
    casts = []
    x = literal_eval(x)
    for item in x:
        name = item['name'].replace(' ','')
        casts.append(name)
    return ' '.join(casts)

In [34]:
cb_genres_cast_crew['new_cast'] = cb_genres_cast_crew['cast'].apply(new_cast)

In [35]:
cv_res_cast = CountVectorizer().fit_transform(cb_genres_cast_crew['new_cast'])
cos_sim_cast = cosine_similarity(cv_res_cast)

In [36]:
def get_recommendation_based_cast(title):
    index_to_search = cb_genres_cast_crew[cb_genres_cast_crew['original_title'] == title].index[0]
    series_similar = pd.Series(cos_sim_cast[index_to_search])
    index_similar = series_similar.sort_values(ascending=False).head(10).index
    return cb_genres_cast_crew.loc[index_similar][['original_title','new_cast']]

> decide the composition of the similarity metrics

In [37]:
cos_sim_cast_genres = (cos_sim_movie * 0.6) + (0.4 * cos_sim_cast)

In [38]:
def get_recommendation_based_genre_cast(title):
    df_cos_sin_genre_cast = pd.DataFrame(cos_sim_cast_genres,columns=cb_genres_cast_crew.original_title , 
                 index=cb_genres_cast_crew.original_title)
    movie_similar = df_cos_sin_genre_cast[title]
    return movie_similar.sort_values(ascending=False).head(10)

> Recomendation Based On Director

In [39]:
a = [{'name' : 'budi' , 'job' : 'Writer'},{'name' : 'fikri' , 'job' : 'Director'}]

In [40]:
for item in a:
    if(item['job'] == 'Director'):
        print(item['name'])

fikri


In [41]:
def new_director(x):
    
    for item in literal_eval(x):
        if (item['job'] == 'Director'):
            return item['name'].replace(' ','')

In [42]:
cb_genres_cast_crew['director'] = cb_genres_cast_crew['crew'].apply(new_director)

In [43]:
cv_director = CountVectorizer().fit_transform(cb_genres_cast_crew['director'])

In [44]:
sim_director = cosine_similarity(cv_director)

In [45]:
sim_combination = (cos_sim_movie * 0.5) + (cos_sim_cast * 0.4) + (sim_director * 0.1)

In [46]:
def get_recomendation_combination(title):
    index_movie = cb_genres_cast_crew[cb_genres_cast_crew['original_title'] == title].index[0]
    index_similar = pd.Series(sim_combination[index_movie]).sort_values(ascending=False).head(10).index
    return cb_genres_cast_crew[['original_title','new_cast','new_genre','director']].iloc[index_similar]

In [47]:
get_recomendation_combination('Rio')

Unnamed: 0,original_title,new_cast,new_genre,director
1972,Rio,JesseEisenberg AnneHathaway LeslieMann JaneLyn...,Animation Adventure Comedy Family,CarlosSaldanha
2463,Rio 2,JesseEisenberg AnneHathaway LeslieMann JamieFo...,Animation Adventure Comedy Family,CarlosSaldanha
1717,Ice Age: Dawn of the Dinosaurs,RayRomano JohnLeguizamo DenisLeary QueenLatifa...,Animation Comedy Family Adventure,CarlosSaldanha
1327,Ice Age: The Meltdown,RayRomano JohnLeguizamo DenisLeary QueenLatifa...,Animation Family Comedy Adventure,CarlosSaldanha
2175,ParaNorman,KodiSmit-McPhee TuckerAlbrizzi JodelleFerland ...,Family Animation Adventure Comedy,SamFell
2899,Ice Age: Collision Course,RayRomano JohnLeguizamo DenisLeary QueenLatifa...,Adventure Animation Family Comedy,MikeThurmeier
2164,Ice Age: Continental Drift,JohnLeguizamo RayRomano ChrisWedge DenisLeary ...,Animation Comedy Adventure Family,SteveMartino
2216,Wreck-It Ralph,JohnC.Reilly SarahSilverman JackMcBrayer JaneL...,Family Animation Comedy Adventure,RichMoore
1761,Fantastic Mr. Fox,GeorgeClooney MerylStreep JasonSchwartzman Bil...,Adventure Animation Comedy Family,WesAnderson
1350,Cars,OwenWilson PaulNewman BonnieHunt LarrytheCable...,Animation Adventure Comedy Family,JohnLasseter


# Collaborative Filtering

## Item Based

In [48]:
ratings = pd.read_csv('ratings_small.csv')

In [49]:
ratings.shape

(100004, 4)

In [50]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [51]:
index_to_drop = meta[meta.id.str.contains('-')].index
meta.drop(index_to_drop,inplace=True)

In [52]:
# Convert to Numeric
meta['id'] = pd.to_numeric(meta['id'])

In [53]:
meta.rename({'id' : 'movieId'},axis=1,inplace=True)

In [54]:
user_movie_ratings = meta.merge(ratings,on='movieId')[['userId','original_title','rating']]

In [55]:
user_movie_ratings.head()

Unnamed: 0,userId,original_title,rating
0,23,Heat,3.5
1,102,Heat,4.0
2,232,Heat,2.0
3,242,Heat,5.0
4,263,Heat,3.0


In [56]:
user_movie_ratings_pivot = user_movie_ratings.pivot_table(
    index='original_title',columns='userId',values='rating')

In [57]:
user_movie_ratings_pivot.loc['!Women Art Revolution'].isnull().sum()

669

In [58]:
user_movie_ratings_pivot.dropna(thresh=10,inplace=True)

In [59]:
user_movie_ratings_pivot.fillna(0,inplace=True)

In [60]:
similarity_movie = cosine_similarity(user_movie_ratings_pivot)

In [61]:
df_item_similarity = pd.DataFrame(similarity_movie,
             index=user_movie_ratings_pivot.index,columns=user_movie_ratings_pivot.index)

> Simple Recomendation Test

In [62]:
users = [['Titanic',5],['A Perfect Murder', 2],['2 Days in Paris',4]]

In [63]:
ser_1 = (df_item_similarity['Titanic'] * (5 - 2.5)).sort_values(ascending=False)
ser_2 = (df_item_similarity['A Perfect Murder'] * (2 - 2.5)).sort_values(ascending=False)
ser_3 = (df_item_similarity['2 Days in Paris'] * (4 - 2.5)).sort_values(ascending=False)

In [64]:
(ser_1 + ser_2 + ser_3).sort_values(ascending=False).head(10)

original_title
Titanic                      2.730555
2 Days in Paris              1.962323
Psycho                       1.774745
A Nightmare on Elm Street    1.724984
Monsoon Wedding              1.680280
Big Fish                     1.679542
Reservoir Dogs               1.603702
Sissi                        1.597969
Rain Man                     1.572642
Wag the Dog                  1.533183
dtype: float64

In [65]:
def get_cf_item_based(movies):
    serr = []
    for item in movies:
        ser_similar = df_item_similarity[item[0]] * (item[1] - 2.5)
        serr.append(ser_similar)
    similar_total = sum(serr)
    return similar_total.sort_values(ascending=False).head(10)

In [66]:
get_cf_item_based([['Titanic',2],['A Perfect Murder',5]])

original_title
A Perfect Murder           2.407151
Cold Mountain              1.110334
Аэлита                     1.078367
Prime                      1.034842
Yella                      1.002543
Frankenstein               0.981409
Lonely Hearts              0.970602
Dr. Jekyll and Mr. Hyde    0.969958
Payback                    0.964822
8 femmes                   0.939310
dtype: float64

## User Based CF

In [69]:
ratings.userId.describe()

count    100004.000000
mean        347.011310
std         195.163838
min           1.000000
25%         182.000000
50%         367.000000
75%         520.000000
max         671.000000
Name: userId, dtype: float64

In [71]:
user_rating_pivot = ratings.pivot_table(index='userId', columns='movieId',values='rating')

In [78]:
user_rating_pivot = user_rating_pivot.dropna(thresh=30).fillna(0)

In [80]:
# Measure similarity each users

In [81]:
user_similarity_score = cosine_similarity(user_rating_pivot)

In [84]:
df_similarity = pd.DataFrame(user_similarity_score,index=user_rating_pivot.index,columns=user_rating_pivot.index)

In [96]:
user_rating_pivot.loc[2].sort_values(ascending=False).head()

movieId
265    5.0
266    5.0
592    5.0
590    5.0
589    5.0
Name: 2, dtype: float64

In [97]:
new_user = [[265,5],[266,5],[592,5],[590,5],[589,5]]

In [98]:
user_rating_pivot.loc[(user_rating_pivot.index[-1] + 1), [265,266,592,590,589] ] = [5,5,5,5,5] 

In [106]:
user_rating_pivot.fillna(0,inplace=True)

In [107]:
similarity = cosine_similarity(user_rating_pivot)

In [113]:
user_id_similar = pd.DataFrame(similarity,
                               columns=user_rating_pivot.index,
                               index=user_rating_pivot.index).loc[672].sort_values(ascending=False)
[1:11].index

In [119]:
serr = []
for id in user_id_similar:
    movie_rating = user_rating_pivot.loc[id]
    serr.append(movie_rating)

In [125]:
(sum(serr) / 10).sort_values(ascending=False)

movieId
589      4.2
590      4.2
377      4.0
480      3.7
356      3.6
150      3.5
457      3.4
592      3.2
588      3.1
364      3.1
165      3.0
380      3.0
110      2.9
349      2.9
316      2.8
266      2.7
527      2.6
296      2.6
593      2.5
318      2.4
153      2.4
47       2.4
344      2.3
595      2.3
440      2.2
292      2.1
500      2.1
434      2.1
357      2.1
265      2.1
        ... 
31700    0.0
31696    0.0
31694    0.0
31689    0.0
31682    0.0
32031    0.0
31660    0.0
31658    0.0
31549    0.0
31547    0.0
31524    0.0
31522    0.0
31747    0.0
31770    0.0
31804    0.0
31878    0.0
31903    0.0
31921    0.0
31923    0.0
31930    0.0
31952    0.0
31956    0.0
31963    0.0
31973    0.0
32017    0.0
32019    0.0
32022    0.0
32025    0.0
32029    0.0
6247     0.0
Length: 9066, dtype: float64

In [134]:
def get_recomendation_user_based(ratings):
    movie_index = np.array(ratings)[:,0]
    movie_rating = np.array(ratings)[:,1]
    user_rating_pivot.loc[(user_rating_pivot.index[-1] + 1), movie_index ] = movie_rating
    user_rating_pivot.fillna(0,inplace=True)
    similarity = cosine_similarity(user_rating_pivot)
    user_id_similar = pd.DataFrame(similarity,
                               columns=user_rating_pivot.index,
                               index=user_rating_pivot.index).loc[user_rating_pivot.index[-1]].sort_values(ascending=False)[1:11].index
    serr = []
    for id in user_id_similar:
        movie_rating = user_rating_pivot.loc[id]
        serr.append(movie_rating)
    return (sum(serr) / 10).sort_values(ascending=False)