# Adding keywords, collections and regions features

## Extracting keywords features

### Import all movies dataframe

In [60]:
df_movies = pd.read_json("json_dict\movie_ids_03_16_2022.json",lines=True)
df_movies.set_index('id',inplace=True)
df_movies.sort_index(inplace=True)
df_movies.head()

Unnamed: 0_level_0,adult,original_title,popularity,video
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,False,Ariel,9.408,False
3,False,Varjoja paratiisissa,10.931,False
5,False,Four Rooms,13.555,False
6,False,Judgment Night,10.25,False
8,False,Life in Loops (A Megacities RMX),3.416,False


### Import all keywords dictionary

In [61]:
import pandas as pd
import json

#keywords = pd.read_json("keyword_ids_03_24_2022.json", lines=True)
#keywords.set_index("id", inplace=True)
keywords_list = []
with open("json_dict\keyword_ids_03_24_2022.json","r",encoding="utf-8") as f:
    for line in f:
        keywords_list.append(json.loads(line))
keywords_dict = {}
for keyword in keywords_list:
    keywords_dict[keyword["id"]] = keyword["name"]
del keywords_list
keywords_dict

{378: 'prison',
 240: 'underdog',
 1787: 'helsinki, finland',
 730: 'factory worker',
 1361: 'salesclerk',
 4529: 'diplomat',
 1008: 'guerrilla warfare',
 612: 'hotel',
 613: "new year's eve",
 616: 'witch',
 622: 'bet',
 2700: 'sperm',
 922: 'hotel room',
 2231: 'drug dealer',
 520: 'chicago, illinois',
 3737: 'dying and death',
 544: 'sailboat',
 10013: 'death star',
 11195: 'empire',
 4270: 'galaxy',
 7376: 'princess',
 10084: 'rescue',
 11196: 'rebellion',
 4932: 'farm',
 1589: 'sniper',
 9678: 'mind control',
 2964: 'future',
 434: 'destruction of a civilization',
 803: 'android',
 8122: 'canyon',
 4271: 'hermit',
 6917: 'epic',
 1612: 'spacecraft',
 5419: 'bootlegger',
 2546: 'mask',
 1357: 'fish',
 658: 'sea',
 5656: 'sydney, australia',
 5657: 'australia',
 2957: 'vietnam war',
 422: 'vietnam veteran',
 791: 'mentally disabled',
 5136: 'uniform',
 1525: 'puberty',
 2918: 'estate agent',
 2919: 'rose garden',
 2337: 'pedophilia',
 5600: 'daughter',
 2921: 'school friend',
 2922:

### Request keywords movie

In [73]:
import requests

def top_keywords(movie_id):
    # Api data and movie index example
    api_key = "44f9a7f09387a49408460a6d158e1f44"

    # url using movie id = 2 example
    url = "https://api.themoviedb.org/3/movie/" + \
        str(movie_id) + "/keywords?api_key=" + api_key

    # Request keywords info
    request = json.loads(requests.get(url).text)
    
    
    word_list = []
    for line in request.get("keywords"):
        word_list.append(line["name"])
    return word_list

['underdog',
 'prison',
 'factory worker',
 'prisoner',
 'helsinki, finland',
 'falling in love']

In [81]:
import numpy as np

# Iteration parameters
n = len(df_movies)  # All movies size
nbatch = 4000       # Number of Batches

# Create index aleatory indexs
indexs = np.random.randint(0,n,100)
batch_size = len(indexs)

# Initial batch, stop batch and initial counter (movies)
a = 1                 # Initiates on batch a -> [1,nbatch]
b = 100                 # End on batch b -> [1,nbatch]
nbatch_it = b-a+1   # Number of batches to process in iteration
nmovies = batch_size*nbatch_it # Total movies in iteration
counter = 0         # Count iterations movies

# Stop conditions (stop when batch has been processed)
batch_stop = None      # None for no stop

# Print resume configuration
print("------------ Iteration conditions ------------")
print("Total number of batches:",nbatch)
print("Batch size is:",batch_size)
print("Iteration will process from {} to {} batch".format(a,b))
print("Iteration will process {} movies".format(nmovies))
if batch_stop != None:
    print("Stop condition: {} batch".format(batch_stop))

------------ Iteration conditions ------------
Total number of batches: 4000
Batch size is: 100
Iteration will process from 1 to 100 batch
Iteration will process 10000 movies


In [82]:
from time import time
import gc

start_time = time()
# Iterate batches
for i in range(len(indexs)+1)[a:(b+1)]:
    # Print iteration info
    print('Processing Batch {}/{}.'.format(i, b))
    print('Processed movies {}/{}.'.format(counter, nmovies), end='')
    
    data_list = []
    # Iterate movies batch
    for movie_id in df_movies.index[indexs]:
        # Append row to data list
        data_list.extend(top_keywords(movie_id))
        counter += 1
        # Print iteration counter and status time every 10 iterations
        if counter % 10 == 0:
            print('\rProcessed movies {}/{}. Remaining time: {:5.4f} h'.format(counter,
                  nmovies, np.round((time()-start_time)/(3600*counter)*(nmovies-counter),4)), end='')

    # Remove None values in list
    before = len(data_list)
    data_list = list(filter(None, data_list))
    print("\nNumber of error lines:", before - len(data_list))
    print("------------------------------------------------------")
    
    if i == 1:
        # Append dataframe to csv file with header
        data = pd.DataFrame(data_list)
        #data.set_index("id", inplace=True)
        data.to_csv("export_files\list_words.csv")
    else:
        # Append dataframe to csv file without header
        data = pd.DataFrame(data_list)
        #data.set_index("id", inplace=True)
        data.to_csv("export_files\list_words.csv",mode="a",header=False)
    
    # del data and clean memory
    del data
    del data_list
    gc.collect()
    
    # Stop condition    
    if i == batch_stop:
        print("Iteration stop condition")
        print("batch {}/{} has been processed".format(i,b))
        print("movie {}/{} has been processed".format(counter,nmovies))
        print("Elapsed time: {} h.".format((time()-start_time)/3600))
        break

if batch_stop == None:
    print("Iteration process complete")
    print("batch {}/{} has been processed".format(i,b))
    print("movie {} has been processed".format(counter))
    print("Elapsed time: {} h.".format((time()-start_time)/3600))

Processing Batch 1/100.
Processed movies 100/10000. Remaining time: 0.3947 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 2/100.
Processed movies 200/10000. Remaining time: 0.3434 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 3/100.
Processed movies 300/10000. Remaining time: 0.2912 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 4/100.
Processed movies 400/10000. Remaining time: 0.2648 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 5/100.
Processed movies 500/10000. Remaining time: 0.2478 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 6/100.
Processed movies 600/10000. Remaining time: 0.2359 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 7/100.
Processed movies 700/10000. Remain

KeyboardInterrupt: 

### Function to binarize keywords

In [63]:
def binary_keyword(row: list):
    """Convert a list of dictionaries with generes of a movie.
    input: Row is a list of dictionaries."""
    keywords_by_movie_dict = {}
    for values in keywords_dict.values():
        keywords_by_movie_dict[values] = 0
        for keyword in row:
            if keyword["name"] == values:
                keywords_by_movie_dict[values] = 1
    return keywords_by_movie_dict

### Binarize example movie

In [64]:
row_keywords = binary_keyword(request.get("keywords"))
row_keywords

{'prison': 1,
 'underdog': 1,
 'helsinki, finland': 1,
 'factory worker': 1,
 'salesclerk': 0,
 'diplomat': 0,
 'guerrilla warfare': 0,
 'hotel': 0,
 "new year's eve": 0,
 'witch': 0,
 'bet': 0,
 'sperm': 0,
 'hotel room': 0,
 'drug dealer': 0,
 'chicago, illinois': 0,
 'dying and death': 0,
 'sailboat': 0,
 'death star': 0,
 'empire': 0,
 'galaxy': 0,
 'princess': 0,
 'rescue': 0,
 'rebellion': 0,
 'farm': 0,
 'sniper': 0,
 'mind control': 0,
 'future': 0,
 'destruction of a civilization': 0,
 'android': 0,
 'canyon': 0,
 'hermit': 0,
 'epic': 0,
 'spacecraft': 0,
 'bootlegger': 0,
 'mask': 0,
 'fish': 0,
 'sea': 0,
 'sydney, australia': 0,
 'australia': 0,
 'vietnam war': 0,
 'vietnam veteran': 0,
 'mentally disabled': 0,
 'uniform': 0,
 'puberty': 0,
 'estate agent': 0,
 'rose garden': 0,
 'pedophilia': 0,
 'daughter': 0,
 'school friend': 0,
 'suburbian idyll': 0,
 'adultery': 0,
 'coming out': 0,
 'depression': 0,
 'lolita': 0,
 'first time': 0,
 'sexual identity': 0,
 'virgin': 0

### Function to extract info

In [65]:
#Process request function
def process_request(movie_id):
    url = "https://api.themoviedb.org/3/movie/" + \
        str(movie_id) + "/keywords?api_key=" + api_key
        
    request = requests.get(url)
    if request.status_code == 200:
        movie = json.loads(requests.get(url).text)
        movie_keywords = binary_keyword(movie["keywords"])
        movie_keywords["id"] = movie_id
        return movie_keywords

### Configure iterative process

In [66]:
import numpy as np

# Iteration parameters
n = len(df_movies)  # All movies size
nbatch = 4000       # Number of Batches

# Create index partition
indexs = np.linspace(0, len(df_movies), nbatch, dtype=int)
batch_size = indexs[1]

# Initial batch, stop batch and initial counter (movies)
a = 6                 # Initiates on batch a -> [1,nbatch]
b = 10                 # End on batch b -> [1,nbatch]
nbatch_it = b-a+1   # Number of batches to process in iteration
nmovies = batch_size*nbatch_it # Total movies in iteration
counter = 0         # Count iterations movies

# Stop conditions (stop when batch has been processed)
batch_stop = None      # None for no stop

# Print resume configuration
print("------------ Iteration conditions ------------")
print("Total number of batches:",nbatch)
print("Batch size is:",batch_size)
print("Iteration will process from {} to {} batch".format(a,b))
print("Iteration will process {} movies".format(nmovies))
if batch_stop != None:
    print("Stop condition: {} batch".format(batch_stop))

------------ Iteration conditions ------------
Total number of batches: 4000
Batch size is: 172
Iteration will process from 6 to 10 batch
Iteration will process 860 movies


### Iterative process

In [68]:
from time import time
import gc

start_time = time()
# Iterate batches
for i in range(len(indexs)+1)[a:(b+1)]:
    # Print iteration info
    print('Processing Batch {}/{}.'.format(i, b))
    print('Processed movies {}/{}.'.format(counter, nmovies), end='')
    
    data_list = []
    # Iterate movies batch
    for movie_id in df_movies.index[indexs[i-1]:indexs[i]]:
        # Append row to data list
        data_list.append(process_request(movie_id))
        counter += 1
        # Print iteration counter and status time every 10 iterations
        if counter % 10 == 0:
            print('\rProcessed movies {}/{}. Remaining time: {:5.4f} h'.format(counter,
                  nmovies, np.round((time()-start_time)/(3600*counter)*(nmovies-counter),4)), end='')

    # Remove None values in list
    before = len(data_list)
    data_list = list(filter(None, data_list))
    print("\nNumber of error lines:", before - len(data_list))
    print("------------------------------------------------------")
    
    if i == 1:
        # Append dataframe to csv file with header
        data = pd.DataFrame(data_list)
        data.set_index("id", inplace=True)
        data.to_csv("export_files\dataset_movies_keywords.csv")
    else:
        # Append dataframe to csv file without header
        data = pd.DataFrame(data_list)
        data.set_index("id", inplace=True)
        data.to_csv("export_files\dataset_movies_keywords.csv",mode="a",header=False)
    
    # del data and clean memory
    del data
    del data_list
    gc.collect()
    
    # Stop condition    
    if i == batch_stop:
        print("Iteration stop condition")
        print("batch {}/{} has been processed".format(i,b))
        print("movie {}/{} has been processed".format(counter,nmovies))
        print("Elapsed time: {} h.".format((time()-start_time)/3600))
        break

if batch_stop == None:
    print("Iteration process complete")
    print("batch {}/{} has been processed".format(i,b))
    print("movie {} has been processed".format(counter))
    print("Elapsed time: {} h.".format((time()-start_time)/3600))

Processing Batch 6/10.
Processed movies 190/860. Remaining time: 0.0421 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 7/10.
Processed movies 360/860. Remaining time: 0.0565 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 8/10.
Processed movies 530/860. Remaining time: 0.0431 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 9/10.
Processed movies 700/860. Remaining time: 0.0223 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 10/10.
Processed movies 880/860. Remaining time: -0.0029 h
Number of error lines: 0
------------------------------------------------------
Iteration process complete
batch 10/10 has been processed
movie 880 has been processed
Elapsed time: 0.14171439932452307 h.


### Reading keywords movies dataframe

In [70]:
df = pd.read_csv("export_files\dataset_movies_keywords.csv")
df

Unnamed: 0,id,prison,underdog,"helsinki, finland",factory worker,salesclerk,diplomat,guerrilla warfare,hotel,new year's eve,...,broken marriage,秀才遇到兵,春江英雄,春江英雄之秀才遇到兵,xiucai encountered,adipapam,aadhya papam,original sin,first sin,abhilasha
0,2,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1742,2459,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1743,2460,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1744,2462,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1745,2463,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
df.sum().sort_values(ascending=False).head(50)

id                           2106420
based on novel or book           148
loss of loved one                114
murder                           113
parent child relationship        108
new york city                     91
woman director                    90
friendship                        82
dying and death                   80
los angeles, california           78
paris, france                     77
sibling relationship              76
daughter                          64
cult film                         63
love of one's life                61
revenge                           61
london, england                   59
police                            56
dystopia                          54
black and white                   53
love                              51
prison                            48
jealousy                          47
neo-noir                          47
adultery                          46
extramarital affair               45
husband wife relationship         45
d

## Extracting collections features

In [None]:
collections = pd.read_json("export_files\collection_ids_03_24_2022.json", lines=True)
collections.set_index("id", inplace=True)
collections

## Extract regions features

In [None]:
import requests
import json
import pandas as pd

# Api data and movie index example
api_key = "44f9a7f09387a49408460a6d158e1f44"

# URL to retrieve movie info
url = "https://api.themoviedb.org/3/watch/providers/regions?api_key=" + \
    str(api_key)+"&language=en-US"

# Making a request example
row = json.loads(requests.get(url).text).get("results")

# Create pandas dataframe
regions = pd.DataFrame(row)
regions.set_index("iso_3166_1", inplace=True)
regions.drop("native_name", axis=1, inplace=True)
regions

In [None]:
regions.to_json("json_dict\regions.json")