# Demographic Filtering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
meta = pd.read_csv('movies_metadata.csv',low_memory=False)

In [3]:
vote_data = meta[['original_title','vote_average','vote_count']]

In [4]:
vote_data['vote_average'].mean()

5.618207215133889

In [5]:
def weighted_rating(row):
    v = row['vote_count']
    m = 1500
    r = row['vote_average']
    c = 5.6
    wr = ((v / (v+m))*r) + ((m/(v+m))*c)
    return wr
    

In [6]:
vote_data['score'] = vote_data.apply(weighted_rating,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [7]:
vote_data.sort_values(by='score',ascending=False).head(10)

Unnamed: 0,original_title,vote_average,vote_count,score
314,The Shawshank Redemption,8.5,8358.0,8.058734
12481,The Dark Knight,8.3,12269.0,8.005861
2843,Fight Club,8.3,9678.0,7.937681
834,The Godfather,8.5,6024.0,7.92185
292,Pulp Fiction,8.3,8670.0,7.90177
15480,Inception,8.1,14075.0,7.85923
22879,Interstellar,8.1,11187.0,7.804422
351,Forrest Gump,8.2,8147.0,7.795729
7000,The Lord of the Rings: The Return of the King,8.1,8226.0,7.714436
1154,The Empire Strikes Back,8.2,5998.0,7.679861


In [8]:
vote_data.sort_values(by='vote_count',ascending=False).head(10)

Unnamed: 0,original_title,vote_average,vote_count,score
15480,Inception,8.1,14075.0,7.85923
12481,The Dark Knight,8.3,12269.0,8.005861
14551,Avatar,7.2,12114.0,7.023711
17818,The Avengers,7.4,12000.0,7.2
26564,Deadpool,7.4,11444.0,7.191409
22879,Interstellar,8.1,11187.0,7.804422
20051,Django Unchained,7.8,10297.0,7.520268
23753,Guardians of the Galaxy,7.9,10014.0,7.600365
2843,Fight Club,8.3,9678.0,7.937681
18244,The Hunger Games,6.9,9634.0,6.724861


# Content Based

### Based on Genre

In [9]:
from ast import literal_eval

> Convert string to List of Object with Literal_eval

In [10]:
cb_genres = meta[meta['vote_count'] > 300][['original_title','genres']]

In [11]:
cb_genres.reset_index(drop=True,inplace=True)

In [12]:
def new_genre(x):
    genres = []
    x = literal_eval(x)
    for item in x:
        genres.append(item['name'])
    return ' '.join(genres)

In [13]:
cb_genres['new_genre'] = cb_genres['genres'].apply(new_genre)

In [14]:
cb_genres.head()

Unnamed: 0,original_title,genres,new_genre
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Animation Comedy Family
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Adventure Fantasy Family
2,Heat,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Action Crime Drama Thriller
3,GoldenEye,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",Adventure Action Thriller
4,Balto,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",Family Animation Adventure


> Convert to Sparsematrix with countVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
cv = CountVectorizer() 
cv_genres_result = cv.fit_transform(cb_genres['new_genre'])

In [17]:
cv_df = pd.DataFrame(cv_genres_result.todense() ,
                     columns=cv.get_feature_names(),index=cb_genres['original_title'])

In [18]:
cv_df.head()

Unnamed: 0_level_0,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,fiction,...,horror,movie,music,mystery,romance,science,thriller,tv,war,western
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Jumanji,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
Heat,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
GoldenEye,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Balto,0,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Measure similarity
# Cosine similarity

> Cosine Similarity Example

In [20]:
data_1 = 'animation comedy'
data_2 = 'animation adventure'
data_3 = 'horror comedy'
data_4 = 'animation'
data_5 = 'animation comedy'

In [21]:
data = [data_1,data_2,data_3,data_4,data_5]
data = pd.DataFrame(data,columns=['genre'] ,index=['movie 1','movie 2' , 'movie 3' , 'movie 4','movie 5'])

In [22]:
cv_2  = CountVectorizer()
cv_res = cv_2.fit_transform(data['genre'])

In [23]:
pd.DataFrame(cv_res.todense(),columns=cv_2.get_feature_names(),index=data.index)

Unnamed: 0,adventure,animation,comedy,horror
movie 1,0,1,1,0
movie 2,1,1,0,0
movie 3,0,0,1,1
movie 4,0,1,0,0
movie 5,0,1,1,0


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
cos_sim = cosine_similarity(cv_res)

In [26]:
pd.DataFrame(cos_sim , index=data.index,columns=data.index)

Unnamed: 0,movie 1,movie 2,movie 3,movie 4,movie 5
movie 1,1.0,0.5,0.5,0.707107,1.0
movie 2,0.5,1.0,0.0,0.707107,0.5
movie 3,0.5,0.0,1.0,0.0,0.5
movie 4,0.707107,0.707107,0.0,1.0,0.707107
movie 5,1.0,0.5,0.5,0.707107,1.0


> Simmilarity for our data

In [27]:
cos_sim_movie = cosine_similarity(cv_genres_result)

In [28]:
index_to_search = cb_genres[cb_genres['original_title'] == 'Jumanji'].index[0]
cb_genres.iloc[pd.Series(cos_sim_movie[index_to_search]).sort_values(ascending=False).head(10).index]

Unnamed: 0,original_title,genres,new_genre
1618,City of Ember,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
2007,Harry Potter and the Deathly Hallows: Part 2,"[{'id': 10751, 'name': 'Family'}, {'id': 14, '...",Family Fantasy Adventure
1299,Harry Potter and the Goblet of Fire,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Adventure Fantasy Family
2901,Pete's Dragon,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
862,Harry Potter and the Philosopher's Stone,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Adventure Fantasy Family
165,The Wizard of Oz,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
2666,Pan,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
1577,The Chronicles of Narnia: Prince Caspian,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
402,Labyrinth,"[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy
1303,"The Chronicles of Narnia: The Lion, the Witch ...","[{'id': 12, 'name': 'Adventure'}, {'id': 10751...",Adventure Family Fantasy


In [29]:
def get_recommendation_based_genres(title):
    index_to_search = cb_genres[cb_genres['original_title'] == title].index[0]
    series_similar = pd.Series(cos_sim_movie[index_to_search])
    index_similar = series_similar.sort_values(ascending=False).head(10).index
    return cb_genres.loc[index_similar]

### Based on Cast and Director

In [30]:
credits = pd.read_csv('credits.csv')

In [31]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [32]:
id_movie = pd.to_numeric(meta[meta['vote_count'] > 300]['id'])
cb_genres['id'] = id_movie

In [33]:
cb_genres_cast_crew = cb_genres.merge(credits,on='id')

In [34]:
cb_genres_cast_crew.shape

(635, 6)

In [35]:
def new_cast(x):
    casts = []
    x = literal_eval(x)
    for item in x:
        name = item['name'].replace(' ','')
        casts.append(name)
    return ' '.join(casts)

In [36]:
cb_genres_cast_crew['new_cast'] = cb_genres_cast_crew['cast'].apply(new_cast)

In [37]:
cv_res_cast = CountVectorizer().fit_transform(cb_genres_cast_crew['new_cast'])

In [38]:
cos_sim_cast = cosine_similarity(cv_res_cast)

In [42]:
def get_recommendation_based_cast(title):
    index_to_search = cb_genres_cast_crew[cb_genres_cast_crew['original_title'] == title].index[0]
    series_similar = pd.Series(cos_sim_cast[index_to_search])
    index_similar = series_similar.sort_values(ascending=False).head(10).index
    return cb_genres_cast_crew.loc[index_similar][['original_title','new_cast']]

In [43]:
get_recommendation_based_cast('Jumanji')

Unnamed: 0,original_title,new_cast
1,Jumanji,RobinWilliams JonathanHyde KirstenDunst Bradle...
448,Transformers: Dark of the Moon,JamieLeeCurtis JoshHartnett AdamArkin Michelle...
634,Shot Caller,TomHanks MichaelClarkeDuncan DavidMorse Bonnie...
29,Trainspotting,RobinWilliams GeneHackman NathanLane DianneWie...
81,Fallen,MacaulayCulkin JohnLarroquette EdwardHerrmann ...
138,Remember the Titans,RobinWilliams DianeLane BrianKerwin JenniferLo...
317,Dead Silence,JenniferLopez IceCube JonVoight EricStoltz Owe...
188,Runaway Jury,ScottWeinger LindaLarkin RobinWilliams JohnRhy...
356,Bangkok Dangerous,MegRyan JohnCusack ChristopherLloyd AngelaLans...
53,The Graduate,BradPitt TomCruise KirstenDunst AntonioBandera...
