In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('movies_metadata.csv', low_memory=False)

In [3]:
df.shape

(45466, 24)

In [4]:
C = df['vote_average'].mean()

In [5]:
C

5.618207215133889

In [6]:
m = df['vote_count'].quantile(0.90)

In [7]:
m

160.0

In [8]:
q_movies = df.copy().loc[df['vote_count']>= m]

In [9]:
q_movies.shape

(4555, 24)

In [10]:
#calificacion ponderada por pelicula
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [11]:
q_movies['Score']= q_movies.apply(weighted_rating,axis=1)

In [12]:
q_movies = q_movies.sort_values('Score', ascending=False)
q_movies[['title', 'vote_count', 'vote_average', 'Score']].head(15)

Unnamed: 0,title,vote_count,vote_average,Score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


In [13]:
q_movies['overview'].head()

314      Framed in the 1940s for the double murder of h...
834      Spanning the years 1945 to 1955, a chronicle o...
10309    Raj is a rich, carefree, happy-go-lucky second...
12481    Batman raises the stakes in his war on crime. ...
2843     A ticking-time-bomb insomniac and a slippery s...
Name: overview, dtype: object

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')

q_movies['overview'] = q_movies['overview'].fillna('')

tfidf_matrix = tfidf.fit_transform(q_movies['overview'])

tfidf_matrix.shape

(4555, 19694)

In [15]:
tfidf.get_feature_names()[1:100]



['000',
 '007',
 '10',
 '100',
 '1000',
 '100th',
 '101',
 '108',
 '10b',
 '10mn',
 '10th',
 '11',
 '1138',
 '114',
 '117',
 '119',
 '11th',
 '12',
 '120',
 '1200',
 '1215',
 '1250',
 '12th',
 '13',
 '1300',
 '13th',
 '14',
 '140',
 '1408',
 '142',
 '1429',
 '1492',
 '14pm',
 '14th',
 '15',
 '150',
 '1536',
 '155',
 '15th',
 '15yrs',
 '16',
 '161',
 '1630s',
 '164',
 '16th',
 '17',
 '1740',
 '1760',
 '17th',
 '18',
 '180',
 '1800',
 '1820',
 '1820s',
 '1831',
 '1839',
 '1860',
 '1863',
 '1870s',
 '1874',
 '1875',
 '1879',
 '1880',
 '1880s',
 '1885',
 '1890',
 '1890s',
 '1893',
 '18th',
 '19',
 '1900',
 '1900s',
 '1902',
 '191',
 '1910',
 '1910s',
 '1912',
 '1913',
 '1914',
 '1919',
 '192',
 '1920',
 '1920s',
 '1921',
 '1923',
 '1924',
 '1925',
 '1926',
 '1927',
 '1929',
 '1930',
 '1930s',
 '1931',
 '1932',
 '1933',
 '1934',
 '1935',
 '1936',
 '1937']

In [16]:
from sklearn.metrics.pairwise import linear_kernel

## Creamos la matriz de coseno de similitud
consine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [17]:
consine_sim.shape

(4555, 4555)

In [18]:
consine_sim[1]

array([0.00522362, 1.        , 0.01249039, ..., 0.        , 0.01420965,
       0.01535064])

In [19]:
indices = pd.Series(q_movies.index, index = q_movies['title']).drop_duplicates()

In [20]:
indices[:10]

title
The Shawshank Redemption         314
The Godfather                    834
Dilwale Dulhania Le Jayenge    10309
The Dark Knight                12481
Fight Club                      2843
Pulp Fiction                     292
Schindler's List                 522
Whiplash                       23673
Spirited Away                   5481
Life Is Beautiful               2211
dtype: int64

In [21]:
def recomendar(title, consine_sim = consine_sim):
    idx= indices[title]
    sim_scores = list(enumerate(consine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:11]
    movies_indices = [i[0] for i in sim_scores]
    return q_movies['title'].iloc[movies_indices] 
    
    
    

In [22]:
recomendar('Forrest Gump')

309                              The Swan Princess
29675                          Barbie of Swan Lake
14425                  The Twilight Saga: New Moon
2826                                    Flashdance
18519                     Barbie in the Nutcracker
13133                                     Twilight
18150    The Twilight Saga: Breaking Dawn - Part 1
4309                                      Suspiria
13336                                   The Unborn
10874                                Take the Lead
Name: title, dtype: object

In [23]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

In [24]:
keywords

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
...,...,...
46414,439050,"[{'id': 10703, 'name': 'tragic love'}]"
46415,111109,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,..."
46416,67758,[]
46417,227506,[]


In [25]:
df = df.drop([19730, 29503, 35587])

In [26]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
df['id'] = df['id'].astype('int')

In [27]:
df = df.merge(credits, on='id')
df = df.merge(keywords, on='id')

In [28]:
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [29]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feacture in features:
    df[feacture] = df[feacture].apply(literal_eval)

In [30]:
import numpy as np

In [31]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [32]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names
    #Return empty list in case of missing/malformed data
    return []

In [33]:
df['director'] = df['crew'].apply(get_director)

In [34]:
features = ['cast', 'keywords', 'genres']
for feacture in features:
    df[feacture] = df[feacture].apply(get_list)
    

In [35]:
df[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [36]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [37]:
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)



In [38]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [39]:
df['soup'] = df.apply(create_soup, axis=1)

In [40]:
df[['soup']].head(2)

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...


In [41]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [42]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

MemoryError: Unable to allocate 16.2 GiB for an array with shape (46628, 46628) and data type float64

In [None]:
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:5]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [None]:
get_recommendations('Jumanji', cosine_sim2)

In [None]:
df