# Adding keywords, collections and regions features

## Extracting keywords features

### Import all movies dataframe

In [1]:
import pandas as pd
df_movies = pd.read_json("json_dict\movie_ids_03_16_2022.json", lines=True)
df_movies.set_index('id', inplace=True)
df_movies.sort_index(inplace=True)
df_movies.head()

Unnamed: 0_level_0,adult,original_title,popularity,video
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,False,Ariel,9.408,False
3,False,Varjoja paratiisissa,10.931,False
5,False,Four Rooms,13.555,False
6,False,Judgment Night,10.25,False
8,False,Life in Loops (A Megacities RMX),3.416,False


### Import all keywords dictionary

In [2]:
import json

keywords_list = []
with open("json_dict\keyword_ids_03_24_2022.json", "r", encoding="utf-8") as f:
    for line in f:
        keywords_list.append(json.loads(line))
keywords_dict = {}
for keyword in keywords_list:
    keywords_dict[keyword["id"]] = keyword["name"]
del keywords_list
keywords_dict

{378: 'prison',
 240: 'underdog',
 1787: 'helsinki, finland',
 730: 'factory worker',
 1361: 'salesclerk',
 4529: 'diplomat',
 1008: 'guerrilla warfare',
 612: 'hotel',
 613: "new year's eve",
 616: 'witch',
 622: 'bet',
 2700: 'sperm',
 922: 'hotel room',
 2231: 'drug dealer',
 520: 'chicago, illinois',
 3737: 'dying and death',
 544: 'sailboat',
 10013: 'death star',
 11195: 'empire',
 4270: 'galaxy',
 7376: 'princess',
 10084: 'rescue',
 11196: 'rebellion',
 4932: 'farm',
 1589: 'sniper',
 9678: 'mind control',
 2964: 'future',
 434: 'destruction of a civilization',
 803: 'android',
 8122: 'canyon',
 4271: 'hermit',
 6917: 'epic',
 1612: 'spacecraft',
 5419: 'bootlegger',
 2546: 'mask',
 1357: 'fish',
 658: 'sea',
 5656: 'sydney, australia',
 5657: 'australia',
 2957: 'vietnam war',
 422: 'vietnam veteran',
 791: 'mentally disabled',
 5136: 'uniform',
 1525: 'puberty',
 2918: 'estate agent',
 2919: 'rose garden',
 2337: 'pedophilia',
 5600: 'daughter',
 2921: 'school friend',
 2922:

### Request keywords movie function
The function return a list of keywords for one movie.

In [106]:
import requests

def movie_keywords(movie_id: int):
    # Api data and movie index example
    api_key = "44f9a7f09387a49408460a6d158e1f44"

    # url using movie id = 2 example
    url = "https://api.themoviedb.org/3/movie/" + \
        str(movie_id) + "/keywords?api_key=" + api_key

    # Request keywords info
    request = requests.get(url)
    
    if request.status_code == 200:
        request = json.loads(request.text)
        if request.get("keywords") != None:
            word_list = []
            for line in request.get("keywords"):
                word_list.append(line["name"])
            return word_list

## Looking for top 100 keywords in all movies
We extract keywords for aleatory movies, we will use 100000 movies size sample to obtain a top 100 of keywords and adding to "dataset_movies.csv".

### Iterative configuration

In [114]:
# Iteration parameters
n = len(df_movies)  # All movies size

nbatch_it = 100   # Number of batches to process in iteration
batch_size = 100    # Number of movies per batch
nmovies = batch_size*nbatch_it  # Total movies in iteration
counter = 0         # Count iterations movies

# Stop conditions (stop when batch has been processed)
batch_stop = None      # None for no stop

# Print resume configuration
print("------------ Iteration conditions ------------")
print("Total number of batches:", nbatch_it)
print("Batch size is:", batch_size)
print("Iteration will process {} movies".format(nmovies))
if batch_stop != None:
    print("Stop condition: {} batch".format(batch_stop))

------------ Iteration conditions ------------
Total number of batches: 100
Batch size is: 100
Iteration will process 10000 movies


### Iterative process

In [115]:
import numpy as np
from time import time
import glob

start_time = time()
# Iterate batches
for i in range(nbatch_it+1)[1:]:
    # Create aleatory indexes
    indexs = np.random.randint(2, n, batch_size)  # not 1 because delete csv file

    # Print iteration info
    print('Processing Batch {}/{}.'.format(i, nbatch_it))
    print('Processed movies {}/{}.'.format(counter, nmovies), end='')

    data_list = []
    # Iterate movies batch
    for movie_id in df_movies.index[indexs]:
        
        # Get keywords for movies
        keywords_per_list = movie_keywords(movie_id)
        if keywords_per_list != None:
            # Append keywords to data list
            data_list.extend(keywords_per_list)
            counter += 1
        
        # Print iteration counter and status time every 10 iterations
        if counter % 10 == 0:
            estimate_time = np.round(
                (time()-start_time)/(3600*counter)*(nmovies-counter), 4)
            print('\rProcessed movies {}/{}. Remaining time: {:5.4f} h'.format(counter,
                  nmovies, estimate_time), end='')

    # Remove None values in list
    before = len(data_list)
    data_list = list(filter(None, data_list))
    print("\nNumber of error lines:", before - len(data_list))
    print("------------------------------------------------------")

    if glob.glob("export_files\keywords_list.csv"):
        # Append dataframe to csv file without header
        data = pd.DataFrame(data_list)
        data.to_csv("export_files\keywords_list.csv", mode="a", header=False)
    else:
        # Append dataframe to csv file with header
        data = pd.DataFrame(data_list,columns=["keywords"])
        data.to_csv("export_files\keywords_list.csv")

    # del data and clean memory
    del data
    del data_list
    gc.collect()

    # Stop condition
    if i == batch_stop:
        print("Iteration stop condition")
        print("batch {}/{} has been processed".format(i, nbatch_it))
        print("movie {}/{} has been processed".format(counter, nmovies))
        print("Elapsed time: {} h.".format((time()-start_time)/3600))
        break

if batch_stop == None:
    print("Iteration process complete")
    print("batch {}/{} has been processed".format(i, nbatch_it))
    print("movie {} has been processed".format(counter))
    print("Elapsed time: {} h.".format((time()-start_time)/3600))

Processing Batch 1/100.
Processed movies 100/10000. Remaining time: 0.3781 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 2/100.
Processed movies 200/10000. Remaining time: 0.3817 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 3/100.
Processed movies 300/10000. Remaining time: 0.3749 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 4/100.
Processed movies 400/10000. Remaining time: 0.3731 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 5/100.
Processed movies 500/10000. Remaining time: 0.3642 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 6/100.
Processed movies 600/10000. Remaining time: 0.3606 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 7/100.
Processed movies 700/10000. Remain

### Top 100 keywords

In [116]:
keywords = pd.read_csv("export_files\keywords_list.csv")
keywords.drop("Unnamed: 0",axis=1,inplace=True)
top_100_keywords = keywords.value_counts().sort_values(ascending=False)[:100]
top_100_keywords

keywords              
short film                563
woman director            418
based on novel or book    154
murder                    151
concert                   100
                         ... 
world war i                20
gay interest               20
1980s                      19
school                     19
gangster                   19
Length: 100, dtype: int64

In [117]:
temp = []
for keyword in top_100_keywords.index:
    temp.append(keyword[0])
top_100_keywords = temp
del temp
top_100_keywords

['short film',
 'woman director',
 'based on novel or book',
 'murder',
 'concert',
 'silent film',
 'biography',
 'musical',
 'sports',
 'christmas',
 'lgbt',
 'world war ii',
 'stand-up comedy',
 'revenge',
 'softcore',
 'anime',
 'martial arts',
 'philippines',
 'friendship',
 'based on true story',
 'family',
 'love',
 'black and white',
 'sequel',
 'coming of age',
 'politics',
 'wrestling',
 'romance',
 'drugs',
 'horror',
 'parent child relationship',
 'serial killer',
 'ghost',
 'kidnapping',
 'supernatural',
 'monster',
 'pre-code',
 'vampire',
 'high school',
 'slasher',
 'new york city',
 'based on play or musical',
 'police',
 'satire',
 'gay',
 'gore',
 'found footage',
 'holiday',
 'religion',
 'stop motion',
 'opera',
 "rock 'n' roll",
 'erotic movie',
 'rape',
 'prison',
 'nature',
 'dark comedy',
 'art',
 'football (soccer)',
 'japan',
 'remake',
 'zombie',
 'dance',
 'superhero',
 'infidelity',
 'dog',
 'africa',
 'nazi',
 'mexico',
 'mockumentary',
 'racism',
 'surre

### Binarize keywords features

In [93]:
def binary_keyword(movie_id: int):
    row = movie_keywords(movie_id)
    if row != None:
        keywords_by_movie_dict = {}
        keywords_by_movie_dict["id"] = movie_id
            
        for all_key in top_100_keywords:
            keywords_by_movie_dict[all_key] = 0
            for keyword in row:
                if keyword == all_key:
                    keywords_by_movie_dict[all_key] = 1
        return keywords_by_movie_dict

In [98]:
binary_keyword(2)

{'id': 2,
 'short film': 0,
 'woman director': 0,
 'based on novel or book': 0,
 'silent film': 0,
 'murder': 0,
 'sports': 0,
 'revenge': 0,
 'biography': 0,
 'lgbt': 0,
 'anime': 0,
 'musical': 0,
 'coming of age': 0,
 'love': 0,
 'martial arts': 0,
 'stand-up comedy': 0,
 'parody': 0,
 'world war ii': 0,
 'black and white': 0,
 'concert': 0,
 'softcore': 0,
 'parent child relationship': 0,
 'christmas': 0,
 'wrestling': 0,
 'friendship': 0,
 'live performance': 0,
 'drugs': 0,
 'found footage': 0,
 'new york city': 0,
 'philippines': 0,
 'family': 0,
 'rape': 0,
 'stop motion': 0,
 'vampire': 0,
 'prison': 1,
 'based on true story': 0,
 'sequel': 0,
 'police': 0,
 'kidnapping': 0,
 'gay': 0,
 'spy': 0,
 'remake': 0,
 'supernatural': 0,
 'small town': 0,
 'gay interest': 0,
 'based on play or musical': 0,
 '19th century': 0,
 'sibling relationship': 0,
 'high school': 0,
 'romance': 0,
 'football (soccer)': 0,
 'spoof': 0,
 'infidelity': 0,
 'surrealism': 0,
 'road trip': 0,
 'space'

## Iterative keyword features extraction

### Iterative configuration

In [99]:
# Iteration parameters
n = len(df_movies)  # All movies size
nbatch = 4000       # Number of Batches

# Create index partition
indexs = np.linspace(0, len(df_movies), nbatch, dtype=int)
batch_size = indexs[1]

# Initial batch, stop batch and initial counter (movies)
a = 1                 # Initiates on batch a -> [1,nbatch]
b = 100                 # End on batch b -> [1,nbatch]
nbatch_it = b-a+1   # Number of batches to process in iteration
nmovies = batch_size*nbatch_it  # Total movies in iteration
counter = 0         # Count iterations movies

# Stop conditions (stop when batch has been processed)
batch_stop = None      # None for no stop

# Print resume configuration
print("------------ Iteration conditions ------------")
print("Total number of batches:", nbatch)
print("Batch size is:", batch_size)
print("Iteration will process from {} to {} batch".format(a, b))
print("Iteration will process {} movies".format(nmovies))
if batch_stop != None:
    print("Stop condition: {} batch".format(batch_stop))

------------ Iteration conditions ------------
Total number of batches: 4000
Batch size is: 172
Iteration will process from 1 to 100 batch
Iteration will process 17200 movies


### Iterative process

In [100]:
start_time = time()
# Iterate batches
for i in range(len(indexs)+1)[a:(b+1)]:
    # Print iteration info
    print('Processing Batch {}/{}.'.format(i, b))
    print('Processed movies {}/{}.'.format(counter, nmovies), end='')

    data_list = []
    # Iterate movies batch
    for movie_id in df_movies.index[indexs[i-1]:indexs[i]]:
        # Append row to data list
        data_list.append(binary_keyword(movie_id))
        counter += 1
        # Print iteration counter and status time every 10 iterations
        if counter % 10 == 0:
            estimate_time = np.round(
                (time()-start_time)/(3600*counter)*(nmovies-counter), 4)
            print('\rProcessed movies {}/{}. Remaining time: {:5.4f} h'.format(counter,
                  nmovies, estimate_time), end='')

    # Remove None values in list
    before = len(data_list)
    data_list = list(filter(None, data_list))
    print("\nNumber of error lines:", before - len(data_list))
    print("------------------------------------------------------")

    if i == 1:
        # Append dataframe to csv file with header
        data = pd.DataFrame(data_list)
        data.set_index("id", inplace=True)
        data.to_csv("export_files\dataset_keywords.csv")
    else:
        # Append dataframe to csv file without header
        data = pd.DataFrame(data_list)
        data.set_index("id", inplace=True)
        data.to_csv("export_files\dataset_keywords.csv", mode="a", header=False)

    # del data and clean memory
    del data
    del data_list
    gc.collect()

    # Stop condition
    if i == batch_stop:
        print("Iteration stop condition")
        print("batch {}/{} has been processed".format(i, b))
        print("movie {}/{} has been processed".format(counter, nmovies))
        print("Elapsed time: {} h.".format((time()-start_time)/3600))
        break

if batch_stop == None:
    print("Iteration process complete")
    print("batch {}/{} has been processed".format(i, b))
    print("movie {} has been processed".format(counter))
    print("Elapsed time: {} h.".format((time()-start_time)/3600))

Processing Batch 1/100.
Processed movies 170/17200. Remaining time: 0.6259 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 2/100.
Processed movies 340/17200. Remaining time: 0.6111 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 3/100.
Processed movies 360/17200. Remaining time: 0.6106 h

KeyboardInterrupt: 

### Join Dataframe

## Extracting collections features

In [None]:
collections = pd.read_json(
    "export_files\collection_ids_03_24_2022.json", lines=True)
collections.set_index("id", inplace=True)
collections

## Extract regions features

In [None]:
import requests
import json
import pandas as pd

# Api data and movie index example
api_key = "44f9a7f09387a49408460a6d158e1f44"

# URL to retrieve movie info
url = "https://api.themoviedb.org/3/watch/providers/regions?api_key=" + \
    str(api_key)+"&language=en-US"

# Making a request example
row = json.loads(requests.get(url).text).get("results")

# Create pandas dataframe
regions = pd.DataFrame(row)
regions.set_index("iso_3166_1", inplace=True)
regions.drop("native_name", axis=1, inplace=True)
regions.to_json("json_dict\regions.json")
regions