In [62]:
import json
import os

import pandas as pd
import requests

pd.options.display.max_columns = 30


In [63]:
def page_url(page=1):
    '''
    Build the TMDB "discover movie" URL for one page of results.

    Uses default query specifications: en-US language, sorted by descending
    popularity, adult titles excluded, and a vote count of at least 500.

    The API key is read from the TMDB_API_KEY environment variable when it is
    set and non-empty; otherwise the key originally embedded in this notebook
    is used, so existing runs keep working.

    page: page number interpolated into the query string (default 1).
    returns: the complete request URL as a string.
    '''
    # NOTE(review): a key committed to the notebook is readable by anyone who
    # can see the file. Prefer exporting TMDB_API_KEY and rotating this one.
    key_value = os.environ.get("TMDB_API_KEY") or "d2ba87d601823962c8cc04aa877278a1"
    api_key = "api_key=" + key_value
    discover_api = "https://api.themoviedb.org/3/discover/movie?"
    query = "&language=en-US&sort_by=popularity.desc&include_adult=false&page={}&vote_count.gte=500".format(page)
    url = discover_api + api_key + query
    return url

In [64]:
# Default (first page) request URL. The URL pieces are local variables of
# page_url(), so they are not available here; build the URL through the helper.
url = page_url()

def make_big_dataframe(max_pages):
    '''
    Download the first `max_pages` result pages and combine them into one
    pandas DataFrame.

    The API provides data in pages (the run recorded in this notebook yields
    200 rows for 10 pages). Each page is requested from the URL built by
    page_url(), its "results" records are normalized into a DataFrame, and all
    pages are concatenated with a fresh 0..n-1 index.

    max_pages: number of pages to request, starting at page 1; must be
        below 214.
    returns: a DataFrame limited to the columns title, genre_ids,
        release_date, overview, popularity, vote_count, poster_path and
        vote_average. An empty DataFrame with those columns is returned when
        max_pages is below 1. None is returned (after printing a message)
        when max_pages is 214 or more, or when a request does not come back
        with HTTP status 200.
    '''
    columns = ["title",
               "genre_ids",
               "release_date",
               "overview",
               "popularity",
               "vote_count",
               "poster_path",
               "vote_average"]

    if (max_pages >= 214):
        print("max is 213")
        return

    frames = []

    for page_num in range(1, max_pages + 1):
        url = page_url(page_num)
        # A timeout keeps one stalled connection from hanging the notebook.
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            # Something went wrong
            print("Error")
            return
        data = r.json()
        # normalize the "results" records while converting them to a df
        page_df = pd.json_normalize(data=data, record_path="results")
        frames.append(page_df)

    if not frames:
        # pd.concat raises ValueError on an empty list (max_pages < 1)
        return pd.DataFrame(columns=columns)

    # concat the per-page dfs into one; ignore_index renumbers the rows so
    # the index does not restart on every page
    df = pd.concat(frames, ignore_index=True)

    return df[columns]

# Smoke run: fetch the first 10 pages (functional).
test_frame = make_big_dataframe(max_pages=10)


In [68]:
# Work on a copy so that columns added to df in later cells do not also
# modify test_frame, and re-running this cell starts from the fetched data.
df = test_frame.copy()
df

Unnamed: 0,title,genre_ids,release_date,overview,popularity,vote_count,poster_path,vote_average
0,Ad Astra,"[18, 878]",2019-09-17,"The near future, a time when both hope and har...",165.765,3802,/xBHvZcjRiWyobQ9kxBhO6B2dtRI.jpg,6.1
1,Sonic the Hedgehog,"[28, 35, 878, 10751]",2020-02-12,Based on the global blockbuster videogame fran...,117.793,4784,/aQvJ5WPzZgYVDrxLX4R6cLJCEaQ.jpg,7.5
2,Parasite,"[35, 18, 53]",2019-05-30,"All unemployed, Ki-taek's family takes peculia...",96.606,8123,/7IiTTgloJzvGI1TAYymCfbfl3vT.jpg,8.5
3,Avengers: Infinity War,"[28, 12, 878]",2018-04-25,As the Avengers and their allies have continue...,82.688,18663,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,8.3
4,The Invisible Man,"[27, 878, 53]",2020-02-26,When Cecilia's abusive ex takes his own life a...,94.348,2225,/5EufsDwXdY2CVttYOk2WtYhgKpa.jpg,7.2
...,...,...,...,...,...,...,...,...
195,Thor,"[28, 12, 14]",2011-04-21,"Against his father Odin's will, The Mighty Tho...",27.471,15015,/prSfAi1xGrhLQNxVSUFh61xQ4Qy.jpg,6.7
196,Alien: Covenant,"[27, 878]",2017-05-09,Bound for a remote planet on the far side of t...,25.225,5795,/zecMELPbU5YMQpC81Z8ImaaXuf9.jpg,5.9
197,The Mask,"[35, 80, 14, 10749]",1994-07-29,When timid bank clerk Stanley Ipkiss discovers...,34.309,6684,/yd3wjLLeQHJ3oRDy4wwiNNFVyLW.jpg,6.9
198,Mission: Impossible - Fallout,"[28, 12]",2018-07-13,"When an IMF mission ends badly, the world is f...",32.475,5100,/AkJQpZp9WoNdj7pLYSj1L0RcMMN.jpg,7.3


### Now the ML processing starts

In [69]:
 ## add a column with the concatenation of TITLE and OVERVIEW

In [70]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [71]:
FEATURES = ["title", "overview"]

def combine_features(row):
    '''
    Join the FEATURES fields of one row into a single string.

    Every field value is followed by one space, so the result ends with a
    trailing space. Missing values (None / NaN / pd.NA) contribute an empty
    string and other non-string values are converted with str(), so a row
    without an overview no longer raises TypeError.

    row: mapping-like object (e.g. a DataFrame row) indexable by the names
        in FEATURES.
    returns: the combined string.
    '''
    parts = []
    for feat in FEATURES:
        value = row[feat]
        if not isinstance(value, str):
            # is_scalar guards pd.isna, which returns an array for list-likes
            is_missing = pd.api.types.is_scalar(value) and pd.isna(value)
            value = "" if is_missing else str(value)
        parts.append(value + " ")
    return "".join(parts)


# Run at notebook level to add the combined text as a new column.
# With axis="columns", apply() hands each row to combine_features in turn
# (like map over rows) and the returned strings are stored in the
# "combined_features" column.
df["combined_features"] = df.apply(combine_features, axis="columns")

    

In [72]:
# Display the frame again to check the new combined_features column.
df

Unnamed: 0,title,genre_ids,release_date,overview,popularity,vote_count,poster_path,vote_average,combined_features
0,Ad Astra,"[18, 878]",2019-09-17,"The near future, a time when both hope and har...",165.765,3802,/xBHvZcjRiWyobQ9kxBhO6B2dtRI.jpg,6.1,"Ad Astra The near future, a time when both hop..."
1,Sonic the Hedgehog,"[28, 35, 878, 10751]",2020-02-12,Based on the global blockbuster videogame fran...,117.793,4784,/aQvJ5WPzZgYVDrxLX4R6cLJCEaQ.jpg,7.5,Sonic the Hedgehog Based on the global blockbu...
2,Parasite,"[35, 18, 53]",2019-05-30,"All unemployed, Ki-taek's family takes peculia...",96.606,8123,/7IiTTgloJzvGI1TAYymCfbfl3vT.jpg,8.5,"Parasite All unemployed, Ki-taek's family take..."
3,Avengers: Infinity War,"[28, 12, 878]",2018-04-25,As the Avengers and their allies have continue...,82.688,18663,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,8.3,Avengers: Infinity War As the Avengers and the...
4,The Invisible Man,"[27, 878, 53]",2020-02-26,When Cecilia's abusive ex takes his own life a...,94.348,2225,/5EufsDwXdY2CVttYOk2WtYhgKpa.jpg,7.2,The Invisible Man When Cecilia's abusive ex ta...
...,...,...,...,...,...,...,...,...,...
195,Thor,"[28, 12, 14]",2011-04-21,"Against his father Odin's will, The Mighty Tho...",27.471,15015,/prSfAi1xGrhLQNxVSUFh61xQ4Qy.jpg,6.7,"Thor Against his father Odin's will, The Might..."
196,Alien: Covenant,"[27, 878]",2017-05-09,Bound for a remote planet on the far side of t...,25.225,5795,/zecMELPbU5YMQpC81Z8ImaaXuf9.jpg,5.9,Alien: Covenant Bound for a remote planet on t...
197,The Mask,"[35, 80, 14, 10749]",1994-07-29,When timid bank clerk Stanley Ipkiss discovers...,34.309,6684,/yd3wjLLeQHJ3oRDy4wwiNNFVyLW.jpg,6.9,The Mask When timid bank clerk Stanley Ipkiss ...
198,Mission: Impossible - Fallout,"[28, 12]",2018-07-13,"When an IMF mission ends badly, the world is f...",32.475,5100,/AkJQpZp9WoNdj7pLYSj1L0RcMMN.jpg,7.3,Mission: Impossible - Fallout When an IMF miss...


In [83]:
# Now feed the combined_features strings into a CountVectorizer object.
# fit_transform() fits the vectorizer on that column and returns the count
# matrix that the following cells use.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"]) 


In [84]:
# Show the matrix summary (shape, dtype, number of stored elements).
count_matrix

<200x3223 sparse matrix of type '<class 'numpy.int64'>'
	with 8404 stored elements in Compressed Sparse Row format>

In [85]:
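# Optional inspection (uncomment to run):
# print(cv.get_feature_names())   # on scikit-learn >= 1.0 use cv.get_feature_names_out()
# print(count_matrix.toarray())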

In [88]:
## get cosine similarity matrix
# Pairwise cosine similarity between the rows of count_matrix.
cosine_matrix = cosine_similarity(count_matrix)
cosine_matrix   # symmetric matrix with 1s on the diagonal (each movie is 100% similar to itself)

array([[1.        , 0.44984383, 0.19820624, ..., 0.40729903, 0.41024184,
        0.45457002],
       [0.44984383, 1.        , 0.19813746, ..., 0.43294434, 0.40560277,
        0.38781737],
       [0.19820624, 0.19813746, 1.        , ..., 0.16145842, 0.1711842 ,
        0.12427395],
       ...,
       [0.40729903, 0.43294434, 0.16145842, ..., 1.        , 0.29021087,
        0.35113883],
       [0.41024184, 0.40560277, 0.1711842 , ..., 0.29021087, 1.        ,
        0.29783232],
       [0.45457002, 0.38781737, 0.12427395, ..., 0.35113883, 0.29783232,
        1.        ]])

In [117]:
# HELPER functions
def get_title_from_index(index, df):
    '''Return the "title" of the first row of df whose index label equals index.'''
    row_matches = df.index == index
    titles = df.loc[row_matches, "title"]
    return titles.values[0]
def get_index_from_title(title, df):
    '''
    Return the index label of the first row of df matching title.

    Matching ignores case. An exact (case-insensitive) title match is
    preferred; if there is none, the first row whose title contains the
    given text is used instead.

    The previous implementation normalised the query with str.title(), which
    rewrites titles such as "Sonic the Hedgehog" to "Sonic The Hedgehog" and
    therefore could not find them.

    title: movie title (or part of one) to look up.
    df: DataFrame with a "title" column.
    returns: the index label of the first matching row.
    raises: IndexError when no row matches.
    '''
    wanted = title.casefold()
    known_titles = df["title"].str.casefold()

    # Missing titles compare as False here, so they never match.
    matches = df[known_titles == wanted]
    if matches.empty and wanted:
        # Fall back to a literal (non-regex) substring search.
        contains = known_titles.str.contains(wanted, regex=False, na=False)
        matches = df[contains]
    if matches.empty:
        raise IndexError("no movie title matches {!r}".format(title))
    return matches.index.values[0]

# Quick sanity checks for both helpers.
print(get_title_from_index(4, df))
for sample_title in ("Thor", "Avatar"):
    print(get_index_from_title(sample_title, df))

The Invisible Man
195
126


Our next step is to take the title of a movie that the user currently likes and find the index of that movie. With that index we access the corresponding row of the similarity matrix, which holds the similarity score between the chosen movie and every movie in the data (including itself). We then enumerate that row to build a list of (movie index, similarity score) tuples. For example, a row of similarity scores such as [1, 0.5, 0.2, 0.9] becomes [(0, 1), (1, 0.5), (2, 0.2), (3, 0.9)], where each item has the form (movie index, similarity score).

In [121]:
movie_user_likes = "Avatar"
movie_index = get_index_from_title(movie_user_likes, df)
# Take the similarity row that belongs to the chosen movie and pair every
# score in it with its position in the row.
similar_movies = [
    (other_index, score)
    for other_index, score in enumerate(cosine_matrix[movie_index])
]
similar_movies  ## list of tuples (movie_index, similarity score to movie_user_likes)

[(0, 0.37181247169316833),
 (1, 0.32448554053666706),
 (2, 0.2105587219030789),
 (3, 0.2856851356570029),
 (4, 0.1287565506561512),
 (5, 0.2586032735340944),
 (6, 0.35349605916013666),
 (7, 0.29821002598961344),
 (8, 0.18832944617230332),
 (9, 0.16081688022566923),
 (10, 0.18761969359661412),
 (11, 0.3062059890175074),
 (12, 0.17868542247296582),
 (13, 0.1939524551505708),
 (14, 0.28661838988431687),
 (15, 0.304275935729018),
 (16, 0.3011721230063664),
 (17, 0.2495492591311128),
 (18, 0.22287124708010464),
 (19, 0.27429019252949666),
 (20, 0.17855051462249658),
 (21, 0.25997347344787264),
 (22, 0.33765162622095446),
 (23, 0.24675590100659828),
 (24, 0.36594210704893765),
 (25, 0.3652311999283316),
 (26, 0.2394201085048437),
 (27, 0.161627045958809),
 (28, 0.36883813286720696),
 (29, 0.20632815353005762),
 (30, 0.23932498098297528),
 (31, 0.3032392174315614),
 (32, 0.2214546129433066),
 (33, 0.20057388927143846),
 (34, 0.3376516262209545),
 (35, 0.3304993437657777),
 (36, 0.321027475726

In [124]:
# Order the (movie index, similarity score) tuples from most to least similar.
# The key lambda picks element 1 of each tuple, i.e. the similarity score.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
sorted_similar_movies

## boom we done

[(126, 0.9999999999999997),
 (87, 0.40585624702096074),
 (143, 0.3995746439394742),
 (38, 0.39868420696101237),
 (96, 0.3855289616378948),
 (37, 0.37820575262078365),
 (196, 0.3761086016708255),
 (56, 0.37523938719322814),
 (0, 0.37181247169316833),
 (134, 0.37164566483340783),
 (28, 0.36883813286720696),
 (111, 0.3677315656764611),
 (69, 0.3662346092966271),
 (24, 0.36594210704893765),
 (25, 0.3652311999283316),
 (187, 0.360432779158473),
 (112, 0.36030187928883595),
 (82, 0.35959747611403803),
 (166, 0.3543059966305303),
 (133, 0.35377908700348826),
 (6, 0.35349605916013666),
 (93, 0.34776361258027355),
 (104, 0.34045327482409776),
 (157, 0.339502302698635),
 (51, 0.3394167965134989),
 (117, 0.338446709524953),
 (136, 0.337941364303121),
 (34, 0.3376516262209545),
 (22, 0.33765162622095446),
 (55, 0.33259278156261735),
 (176, 0.3319062115909338),
 (144, 0.3315938128703508),
 (35, 0.3304993437657777),
 (120, 0.32912131438150566),
 (116, 0.3282660821493064),
 (74, 0.32810591327802474),

In [125]:
# Print the titles of the six highest-scoring entries; the liked movie itself
# is the first one because it has the highest similarity score.
for index, score in sorted_similar_movies[:6]:
    print(get_title_from_index(index, df))

Avatar
Brave
Avengers: Age of Ultron
Thor: Ragnarok
The Hot Chick
The Lord of the Rings: The Return of the King


In [None]:
### TODO: instead of this for loop, select the entire rows by index and output them as HTML