# Adding keywords, collections and regions features

## Extracting keywords features

### Import all movies dataframe

In [1]:
import pandas as pd
df_movies = pd.read_json("json_dict\movie_ids_03_16_2022.json", lines=True)
df_movies.set_index('id', inplace=True)
df_movies.sort_index(inplace=True)
df_movies.head()

Unnamed: 0_level_0,adult,original_title,popularity,video
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,False,Ariel,9.408,False
3,False,Varjoja paratiisissa,10.931,False
5,False,Four Rooms,13.555,False
6,False,Judgment Night,10.25,False
8,False,Life in Loops (A Megacities RMX),3.416,False


### Import all keywords dictionary

In [2]:
import json

def get_keywords_dict():
    keywords_list = []
    with open("json_dict\keyword_ids_03_24_2022.json", "r", encoding="utf-8") as f:
        for line in f:
            keywords_list.append(json.loads(line))
    keywords_dict = {}
    for keyword in keywords_list:
        keywords_dict[keyword["id"]] = keyword["name"]
    return keywords_dict

keywords_dict = get_keywords_dict()

### Request keywords movie function
The function return a list of keywords for one movie.

In [3]:
import requests

def movie_keywords(movie_id: int):
    # Api data and movie index example
    api_key = "44f9a7f09387a49408460a6d158e1f44"

    # url using movie id = 2 example
    url = "https://api.themoviedb.org/3/movie/" + \
        str(movie_id) + "/keywords?api_key=" + api_key

    # Request keywords info
    request = requests.get(url)
    
    if request.status_code == 200:
        request = json.loads(request.text)
        if request.get("keywords") != None:
            word_list = []
            for line in request.get("keywords"):
                word_list.append(line["name"])
            return word_list

## Looking for top 100 keywords in all movies
We extract keywords for aleatory movies, we will use 100000 movies size sample to obtain a top 100 of keywords and adding to "dataset_movies.csv".

### Iterative configuration

In [15]:
# Iteration parameters
n = len(df_movies)  # All movies size

nbatch_it = 100   # Number of batches to process in iteration
batch_size = 100    # Number of movies per batch
nmovies = batch_size*nbatch_it  # Total movies in iteration
counter = 0         # Count iterations movies

# Stop conditions (stop when batch has been processed)
batch_stop = None      # None for no stop

# Print resume configuration
print("------------ Iteration conditions ------------")
print("Total number of batches:", nbatch_it)
print("Batch size is:", batch_size)
print("Iteration will process {} movies".format(nmovies))
if batch_stop != None:
    print("Stop condition: {} batch".format(batch_stop))

------------ Iteration conditions ------------
Total number of batches: 100
Batch size is: 100
Iteration will process 10000 movies


### Iterative process

In [19]:
import datetime
import numpy as np
from time import time
import glob
import gc

start_time = time()
# Iterate batches
for i in range(nbatch_it+1)[1:]:
    # Create aleatory indexes
    indexs = np.random.randint(2, n, batch_size)  # not 1 because delete csv file

    # Print iteration info
    print('Processing Batch {}/{}.'.format(i, nbatch_it))
    print('Processed movies {}/{}.'.format(counter, nmovies), end='')

    data_list = []
    # Iterate movies batch
    for movie_id in df_movies.index[indexs]:
        
        # Get keywords for movies
        keywords_per_list = movie_keywords(movie_id)
        if keywords_per_list != None:
            # Append keywords to data list
            data_list.extend(keywords_per_list)
            counter += 1
        
        # Print iteration counter and status time every 10 iterations
        if counter % 10 == 0:
            estimate_time = np.round(
                (time()-start_time)/(3600*counter)*(nmovies-counter), 4)
            print('\rProcessed movies {}/{}. Remaining time: {:5.4f} h'.format(counter,
                  nmovies, estimate_time), end='')

    # Remove None values in list
    before = len(data_list)
    data_list = list(filter(None, data_list))
    print("\nNumber of error lines:", before - len(data_list))
    print("------------------------------------------------------")

    if glob.glob("export_files\keywords_list.csv"):
        # Append dataframe to csv file without header
        data = pd.DataFrame(data_list)
        data.to_csv("export_files\keywords_list.csv", mode="a", header=False)
    else:
        # Append dataframe to csv file with header
        data = pd.DataFrame(data_list,columns=["keywords"])
        data.to_csv("export_files\keywords_list.csv")

    # del data and clean memory
    del data
    del data_list
    gc.collect()

    # Stop condition
    if i == batch_stop:
        print("Iteration stop condition")
        print("batch {}/{} has been processed".format(i, nbatch_it))
        print("movie {}/{} has been processed".format(counter, nmovies))
        print("Elapsed time: {} h.".format((time()-start_time)/3600))
        break

if batch_stop == None:
    print("Iteration process complete")
    print("batch {}/{} has been processed".format(i, nbatch_it))
    print("movie {} has been processed".format(counter))
    print("Process end:",datetime.datetime.now())
    print("Elapsed time: {} h.".format((time()-start_time)/3600))


Processing Batch 1/100.
Processed movies 10090/10000. Remaining time: -0.0000 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 2/100.
Processed movies 10190/10000. Remaining time: -0.0001 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 3/100.
Processed movies 10290/10000. Remaining time: -0.0003 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 4/100.
Processed movies 10390/10000. Remaining time: -0.0006 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 5/100.
Processed movies 10490/10000. Remaining time: -0.0009 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 6/100.
Processed movies 10590/10000. Remaining time: -0.0013 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 7/100.
Processed movies

### Top 100 keywords

In [4]:
keywords = pd.read_csv("export_files\keywords_list.csv")
keywords.drop("Unnamed: 0",axis=1,inplace=True)
print("keywords list length:",keywords.shape[0])
top_100_keywords = keywords.value_counts().sort_values(ascending=False)[:100]
del keywords
top_100_keywords

keywords list length: 62521


keywords              
short film                1892
woman director            1555
based on novel or book     511
murder                     451
musical                    327
                          ... 
1970s                       65
sibling relationship        65
surrealism                  64
anthology                   64
france                      64
Length: 100, dtype: int64

In [5]:
temp = []
for keyword in top_100_keywords.index:
    temp.append(keyword[0])
top_100_keywords = temp
del temp
top_100_keywords

['short film',
 'woman director',
 'based on novel or book',
 'murder',
 'musical',
 'concert',
 'silent film',
 'biography',
 'sports',
 'stand-up comedy',
 'lgbt',
 'christmas',
 'world war ii',
 'revenge',
 'family',
 'love',
 'anime',
 'philippines',
 'based on true story',
 'martial arts',
 'friendship',
 'romance',
 'coming of age',
 'softcore',
 'black and white',
 'kidnapping',
 'wrestling',
 'opera',
 'new york city',
 'ghost',
 'based on play or musical',
 'sequel',
 'police',
 'politics',
 'serial killer',
 'horror',
 'pre-code',
 'found footage',
 'prison',
 'holiday',
 'erotic movie',
 'drugs',
 'vampire',
 'zombie',
 'rape',
 'parent child relationship',
 'death',
 'gay interest',
 'dance',
 'remake',
 'high school',
 'art',
 'dark comedy',
 'monster',
 'slasher',
 'gay',
 'religion',
 'stop motion',
 'dog',
 'gore',
 'time travel',
 'supernatural',
 'lost film',
 'marriage',
 'alien',
 'football (soccer)',
 'gangster',
 'detective',
 'superhero',
 'nazi',
 'africa',
 'su

### Binarize keywords features

In [6]:
def binary_keyword(movie_id: int):
    row = movie_keywords(movie_id)
    if row != None:
        keywords_by_movie_dict = {}
        keywords_by_movie_dict["id"] = movie_id
            
        for all_key in top_100_keywords:
            keywords_by_movie_dict[all_key] = 0
            for keyword in row:
                if keyword == all_key:
                    keywords_by_movie_dict[all_key] = 1
        return keywords_by_movie_dict

In [7]:
binary_keyword(2)

{'id': 2,
 'short film': 0,
 'woman director': 0,
 'based on novel or book': 0,
 'murder': 0,
 'musical': 0,
 'concert': 0,
 'silent film': 0,
 'biography': 0,
 'sports': 0,
 'stand-up comedy': 0,
 'lgbt': 0,
 'christmas': 0,
 'world war ii': 0,
 'revenge': 0,
 'family': 0,
 'love': 0,
 'anime': 0,
 'philippines': 0,
 'based on true story': 0,
 'martial arts': 0,
 'friendship': 0,
 'romance': 0,
 'coming of age': 0,
 'softcore': 0,
 'black and white': 0,
 'kidnapping': 0,
 'wrestling': 0,
 'opera': 0,
 'new york city': 0,
 'ghost': 0,
 'based on play or musical': 0,
 'sequel': 0,
 'police': 0,
 'politics': 0,
 'serial killer': 0,
 'horror': 0,
 'pre-code': 0,
 'found footage': 0,
 'prison': 1,
 'holiday': 0,
 'erotic movie': 0,
 'drugs': 0,
 'vampire': 0,
 'zombie': 0,
 'rape': 0,
 'parent child relationship': 0,
 'death': 0,
 'gay interest': 0,
 'dance': 0,
 'remake': 0,
 'high school': 0,
 'art': 0,
 'dark comedy': 0,
 'monster': 0,
 'slasher': 0,
 'gay': 0,
 'religion': 0,
 'stop mo

## Iterative keyword features extraction

### Iterative configuration

In [12]:
import numpy as np
# Iteration parameters
n = len(df_movies)  # All movies size
nbatch = 4000       # Number of Batches

# Create index partition
indexs = np.linspace(0, len(df_movies), nbatch, dtype=int)
batch_size = indexs[1]

# Initial batch, stop batch and initial counter (movies)
a = 152                 # Initiates on batch a -> [1,nbatch]
b = 200                 # End on batch b -> [1,nbatch]
nbatch_it = b-a+1   # Number of batches to process in iteration
nmovies = batch_size*nbatch_it  # Total movies in iteration
counter = 0         # Count iterations movies

# Stop conditions (stop when batch has been processed)
batch_stop = None      # None for no stop

# Print resume configuration
print("------------ Iteration conditions ------------")
print("Total number of batches:", nbatch)
print("Batch size is:", batch_size)
print("Iteration will process from {} to {} batch".format(a, b))
print("Iteration will process {} movies".format(nmovies))
if batch_stop != None:
    print("Stop condition: {} batch".format(batch_stop))

------------ Iteration conditions ------------
Total number of batches: 4000
Batch size is: 172
Iteration will process from 152 to 200 batch
Iteration will process 8428 movies


### Iterative process

In [13]:
import datetime
from time import time
import gc
start_time = time()
# Iterate batches
for i in range(len(indexs)+1)[a:(b+1)]:
    # Print iteration info
    print('Processing Batch {}/{}.'.format(i, b))
    print('Processed movies {}/{}.'.format(counter, nmovies), end='')

    data_list = []
    # Iterate movies batch
    for movie_id in df_movies.index[indexs[i-1]:indexs[i]]:
        # Append row to data list
        data_list.append(binary_keyword(movie_id))
        counter += 1
        # Print iteration counter and status time every 10 iterations
        if counter % 10 == 0:
            estimate_time = np.round(
                (time()-start_time)/(3600*counter)*(nmovies-counter), 4)
            print('\rProcessed movies {}/{}. Remaining time: {:5.4f} h'.format(counter,
                  nmovies, estimate_time), end='')

    # Remove None values in list
    before = len(data_list)
    data_list = list(filter(None, data_list))
    print("\nNumber of error lines:", before - len(data_list))
    print("------------------------------------------------------")

    if i == 1:
        # Append dataframe to csv file with header
        data = pd.DataFrame(data_list)
        data.set_index("id", inplace=True)
        data.to_csv("export_files\dataset_keywords.csv")
    else:
        # Append dataframe to csv file without header
        data = pd.DataFrame(data_list)
        data.set_index("id", inplace=True)
        data.to_csv("export_files\dataset_keywords.csv", mode="a", header=False)

    # del data and clean memory
    del data
    del data_list
    gc.collect()

    # Stop condition
    if i == batch_stop:
        print("Iteration stop condition")
        print("batch {}/{} has been processed".format(i, b))
        print("movie {}/{} has been processed".format(counter, nmovies))
        print("Process end:", datetime.datetime.now())
        print("Elapsed time: {} h.".format((time()-start_time)/3600))
        break

if batch_stop == None:
    print("Iteration process complete")
    print("batch {}/{} has been processed".format(i, b))
    print("movie {} has been processed".format(counter))
    print("Process end:", datetime.datetime.now())
    print("Elapsed time: {} h.".format((time()-start_time)/3600))

Processing Batch 152/200.
Processed movies 170/8428. Remaining time: 0.2973 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 153/200.
Processed movies 340/8428. Remaining time: 0.2937 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 154/200.
Processed movies 510/8428. Remaining time: 0.2878 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 155/200.
Processed movies 680/8428. Remaining time: 0.2829 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 156/200.
Processed movies 860/8428. Remaining time: 0.2757 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 157/200.
Processed movies 1030/8428. Remaining time: 0.2718 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 158/200.
Processed movies 1200/842

In [46]:
keywords_df = pd.read_csv("export_files\dataset_keywords.csv")
keywords_df.set_index("id",inplace=True)
keywords_df

Unnamed: 0_level_0,short film,woman director,based on novel or book,murder,musical,concert,silent film,biography,sports,stand-up comedy,...,avant-garde,school,robbery,dutch cabaret,"london, england",1970s,sibling relationship,surrealism,anthology,france
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50789,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50791,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Removing duplicates

In [47]:
print("Number of movies before removing:", keywords_df.shape[0])
new_idx = keywords_df.index.drop_duplicates()
keywords_df = keywords_df.loc[new_idx]
print("Number of movies after drop duplicates:", keywords_df.shape[0])

Number of movies before removing: 34424
Number of movies after drop duplicates: 34424


### Update dataframe

In [48]:
keywords_df.to_csv("export_files\dataset_keywords.csv")

## Extracting collections features

In [17]:
collections = pd.read_json(
    "json_dict\collection_ids_03_24_2022.json", lines=True)
collections.set_index("id", inplace=True)
collections

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
10,Star Wars Collection
84,Indiana Jones Collection
119,The Lord of the Rings Collection
131,Trois Couleurs Collection
151,Star Trek: The Original Series Collection
...,...
952748,Luccas Neto O Mapa do Tesouro
953097,Les fables en délire
953155,Košarkar naj bo Collection
953177,Mince alors ! - Saga


### Function belong_collection()

In [18]:
#Process request function
def belong_collection(movie_id):
    api_key = "44f9a7f09387a49408460a6d158e1f44"
    url = "https://api.themoviedb.org/3/movie/" + \
        str(movie_id) + "?api_key=" + api_key
    request = requests.get(url)
    if request.status_code == 200:
        movie = json.loads(requests.get(url).text)
        if movie["belongs_to_collection"]:
            return 1
        else:
            return 0

In [19]:
print(belong_collection(11))

1


### Iterative configuration

In [21]:
# Iteration parameters
n = len(df_movies)  # All movies size
nbatch = 4000       # Number of Batches

# Create index partition
indexs = np.linspace(0, len(df_movies), nbatch, dtype=int)
batch_size = indexs[1]

# Initial batch, stop batch and initial counter (movies)
a = 102                 # Initiates on batch a -> [1,nbatch]
b = 150                 # End on batch b -> [1,nbatch]
nbatch_it = b-a+1   # Number of batches to process in iteration
nmovies = batch_size*nbatch_it # Total movies in iteration
counter = 0         # Count iterations movies

# Stop conditions (stop when batch has been processed)
batch_stop = None      # None for no stop

# Print resume configuration
print("------------ Iteration conditions ------------")
print("Total number of batches:",nbatch)
print("Batch size is:",batch_size)
print("Iteration will process from {} to {} batch".format(a,b))
print("Iteration will process {} movies".format(nmovies))
if batch_stop != None:
    print("Stop condition: {} batch".format(batch_stop))

------------ Iteration conditions ------------
Total number of batches: 4000
Batch size is: 172
Iteration will process from 102 to 150 batch
Iteration will process 8428 movies


### Iterative process

In [22]:
import datetime
start_time = time()
# Iterate batches
for i in range(len(indexs)+1)[a:(b+1)]:
    # Print iteration info
    print('Processing Batch {}/{}.'.format(i, b))
    print('Processed movies {}/{}.'.format(counter, nmovies), end='')
    
    data_list = []
    # Iterate movies batch
    for movie_id in df_movies.index[indexs[i-1]:indexs[i]]:
        # Append row to data list
        data_list.append([movie_id,belong_collection(movie_id)])
        counter += 1
        # Print iteration counter and status time every 10 iterations
        if counter % 10 == 0:
            estimate_time = np.round(
                (time()-start_time)/(3600*counter)*(nmovies-counter), 4)
            print('\rProcessed movies {}/{}. Remaining time: {:5.4f} h'.format(counter,
                  nmovies, estimate_time), end='')

    # Remove None values in list
    before = len(data_list)
    data_list = list(filter(None, data_list))
    print("\nNumber of error lines:", before - len(data_list))
    print("------------------------------------------------------")
    
    if i == 1:
        # Append dataframe to csv file with header
        data = pd.DataFrame(data_list,columns=["id","collection"])
        data.set_index("id",inplace=True)
        data.to_csv("export_files\dataset_collections.csv")
    else:
        # Append dataframe to csv file without header
        data = pd.DataFrame(data_list,columns=["id","collection"])
        data.set_index("id",inplace=True)
        data.to_csv("export_files\dataset_collections.csv",mode="a",header=False)
    
    # del data and clean memory
    del data
    del data_list
    gc.collect()
    
    # Stop condition    
    if i == batch_stop:
        print("Iteration stop condition")
        print("batch {}/{} has been processed".format(i,b))
        print("movie {}/{} has been processed".format(counter,nmovies))
        print("Process end:", datetime.datetime.now())
        print("Elapsed time: {} h.".format((time()-start_time)/3600))
        break

if batch_stop == None:
    print("Iteration process complete")
    print("batch {}/{} has been processed".format(i,b))
    print("movie {} has been processed".format(counter))
    print("Process end:", datetime.datetime.now())
    print("Elapsed time: {} h.".format((time()-start_time)/3600))

Processing Batch 102/150.
Processed movies 170/8428. Remaining time: 0.4246 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 103/150.
Processed movies 340/8428. Remaining time: 0.4797 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 104/150.
Processed movies 510/8428. Remaining time: 0.4906 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 105/150.
Processed movies 680/8428. Remaining time: 0.4904 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 106/150.
Processed movies 860/8428. Remaining time: 0.4878 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 107/150.
Processed movies 1030/8428. Remaining time: 0.4831 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 108/150.
Processed movies 1200/842

### Loading dataframe from file

In [35]:
collections_df = pd.read_csv("export_files\dataset_collections.csv")
collections_df.set_index("id", inplace=True)
collections_df

Unnamed: 0_level_0,collection
id,Unnamed: 1_level_1
2,0
3,0
5,0
6,0
8,0
...,...
40484,0
40485,0
40486,0
40487,1


### Removing duplicates

In [43]:
print("Number of movies before removing:", collections_df.shape[0])
new_idx = collections_df.index.drop_duplicates()
collections_df = collections_df.loc[new_idx]
print("Number of movies after drop duplicates:", collections_df.shape[0])

Number of movies before removing: 28056
Number of movies after drop duplicates: 28056


### Update dataframe

In [45]:
collections_df.to_csv("export_files\dataset_collections.csv")

## Extract regions features

In [1]:
import requests
import json
import pandas as pd

# Api data and movie index example
api_key = "44f9a7f09387a49408460a6d158e1f44"

# URL to retrieve movie info
url = "https://api.themoviedb.org/3/watch/providers/regions?api_key=" + \
    str(api_key)+"&language=en-US"

# Making a request example
row = json.loads(requests.get(url).text).get("results")

# Create pandas dataframe
regions = pd.DataFrame(row)
regions.set_index("iso_3166_1", inplace=True)
regions.drop("native_name", axis=1, inplace=True)
regions

Unnamed: 0_level_0,english_name
iso_3166_1,Unnamed: 1_level_1
AE,United Arab Emirates
AR,Argentina
AT,Austria
AU,Australia
BE,Belgium
BG,Bulgaria
BR,Brazil
CA,Canada
CH,Switzerland
CZ,Czech Republic
