# Adding keywords features

### Import all movies dataframe

In [94]:
import pandas as pd
# Load the movie id file (one JSON object per line), index it by movie id and
# sort by id so positional slices of the index are stable between runs.
# Forward slashes keep the path valid on every OS and avoid the invalid "\m"
# escape sequence the backslash form produced inside the string literal.
df_movies = (
    pd.read_json("json_dict/movie_ids_03_16_2022.json", lines=True)
    .set_index('id')
    .sort_index()
)
df_movies.head()

Unnamed: 0_level_0,adult,original_title,popularity,video
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,False,Ariel,9.408,False
3,False,Varjoja paratiisissa,10.931,False
5,False,Four Rooms,13.555,False
6,False,Judgment Night,10.25,False
8,False,Life in Loops (A Megacities RMX),3.416,False


## Extracting keywords features

### Import all keywords dictionary

In [95]:
import json

def get_keywords_dict(path: str = "json_dict/keyword_ids_03_24_2022.json") -> dict:
    """Build an ``{id: name}`` lookup from a JSON-lines keyword file.

    Parameters
    ----------
    path : str, optional
        File to read. Every non-blank line must be a JSON object holding at
        least the keys ``"id"`` and ``"name"``. The default uses forward
        slashes so it resolves on every OS (the previous backslash form relied
        on the invalid escape sequence ``\\k``).

    Returns
    -------
    dict
        Maps each keyword ``"id"`` to its ``"name"``. When an id occurs more
        than once, the last occurrence wins.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks ``"id"`` or ``"name"``.
    """
    keywords_dict = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            keyword = json.loads(line)
            keywords_dict[keyword["id"]] = keyword["name"]
    return keywords_dict

# Notebook-level id -> name keyword lookup, built from the keyword file.
keywords_dict = get_keywords_dict()

### Movie keywords extraction function
The function returns a list of keywords for one movie.

In [96]:
import requests
import json

def movie_keywords(movie_id: int):
    """Fetch the keyword names that the TMDB API lists for one movie.

    The API key is read from the ``TMDB_API_KEY`` environment variable so that
    the credential is never stored in the notebook.

    Parameters
    ----------
    movie_id : int
        Movie id, inserted into the ``/3/movie/<id>/keywords`` endpoint URL.

    Returns
    -------
    list of str or None
        The ``"name"`` of every entry under the response's ``"keywords"`` key.
        ``None`` when the HTTP status is not 200 or the response has no
        ``"keywords"`` key.

    Raises
    ------
    RuntimeError
        If ``TMDB_API_KEY`` is not set (raised before any network access).
    requests.exceptions.RequestException
        On connection problems, including the request timeout.
    """
    import os  # local import: the cell's import lines are not touched

    api_key = os.environ.get("TMDB_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the TMDB_API_KEY environment variable to your TMDB API key "
            "before calling movie_keywords()."
        )

    url = "https://api.themoviedb.org/3/movie/" + str(movie_id) + "/keywords"

    # A timeout keeps one stalled connection from hanging a whole batch run.
    response = requests.get(url, params={"api_key": api_key}, timeout=30)

    if response.status_code != 200:
        return None

    keywords = json.loads(response.text).get("keywords")
    if keywords is None:
        return None
    return [entry["name"] for entry in keywords]

In [97]:
# Example call: request the keywords of movie id 2 (performs an HTTP request).
movie_keywords(2)

['underdog',
 'prison',
 'factory worker',
 'prisoner',
 'helsinki, finland',
 'falling in love']

## Iterative feature extraction

### Loading top 100 keywords function
The top 100 keywords were extracted in the top_100_keywords.ipynb Jupyter notebook.

In [98]:
def load_top_keywords(n: int, path: str = "export_files/keywords_list.csv"):
    """Return the ``n`` most frequent keywords found in a keyword CSV.

    Parameters
    ----------
    n : int
        How many keywords to keep, most frequent first.
    path : str, optional
        CSV file to read. A saved index column named ``"Unnamed: 0"`` is
        discarded when present. The default uses forward slashes so it
        resolves on every OS (the previous backslash form relied on the
        invalid escape sequence ``\\k``).

    Returns
    -------
    list
        The value of the first remaining column for each of the ``n`` most
        frequent rows, in descending order of frequency.
    """
    keywords = pd.read_csv(path)
    # The file may have been written with its index; that column is not data.
    keywords = keywords.drop(columns="Unnamed: 0", errors="ignore")

    # DataFrame.value_counts() counts whole rows, so each index entry is a
    # tuple with one item per column; .iloc makes the top-n cut positional.
    ranked = keywords.value_counts().sort_values(ascending=False).iloc[:n]

    return [key[0] if isinstance(key, tuple) else key for key in ranked.index]

# Keep the 100 most frequent keywords; they become the feature columns.
top_100_keywords = load_top_keywords(100)
top_100_keywords

['short film',
 'woman director',
 'based on novel or book',
 'murder',
 'musical',
 'concert',
 'silent film',
 'biography',
 'sports',
 'stand-up comedy',
 'lgbt',
 'christmas',
 'world war ii',
 'revenge',
 'family',
 'love',
 'anime',
 'philippines',
 'based on true story',
 'martial arts',
 'friendship',
 'romance',
 'coming of age',
 'softcore',
 'black and white',
 'kidnapping',
 'wrestling',
 'opera',
 'new york city',
 'ghost',
 'based on play or musical',
 'sequel',
 'police',
 'politics',
 'serial killer',
 'horror',
 'pre-code',
 'found footage',
 'prison',
 'holiday',
 'erotic movie',
 'drugs',
 'vampire',
 'zombie',
 'rape',
 'parent child relationship',
 'death',
 'gay interest',
 'dance',
 'remake',
 'high school',
 'art',
 'dark comedy',
 'monster',
 'slasher',
 'gay',
 'religion',
 'stop motion',
 'dog',
 'gore',
 'time travel',
 'supernatural',
 'lost film',
 'marriage',
 'alien',
 'football (soccer)',
 'gangster',
 'detective',
 'superhero',
 'nazi',
 'africa',
 'su

### Binarize keyword features function

In [99]:
#Load Top 100 keywords function before
def binary_keyword(movie_id: int, top_100: list):
    """Encode one movie's keywords as 0/1 flags over a reference keyword list.

    Parameters
    ----------
    movie_id : int
        Movie id, forwarded to ``movie_keywords`` and stored under ``"id"``.
    top_100 : list
        Reference keywords; each one becomes a key of the result.

    Returns
    -------
    dict or None
        ``{"id": movie_id, <keyword>: 0 or 1, ...}`` with the keywords in the
        order of ``top_100``; a flag is 1 when the movie has that keyword.
        ``None`` (implicitly falsy for the caller) when ``movie_keywords``
        returned ``None``.
    """
    found = movie_keywords(movie_id)
    if found is None:
        return None

    flags = {name: int(name in found) for name in top_100}
    return {"id": movie_id, **flags}

# Example call: 0/1 keyword flags for movie id 2 over the top keyword list.
binary_keyword(2, top_100_keywords)

{'id': 2,
 'short film': 0,
 'woman director': 0,
 'based on novel or book': 0,
 'murder': 0,
 'musical': 0,
 'concert': 0,
 'silent film': 0,
 'biography': 0,
 'sports': 0,
 'stand-up comedy': 0,
 'lgbt': 0,
 'christmas': 0,
 'world war ii': 0,
 'revenge': 0,
 'family': 0,
 'love': 0,
 'anime': 0,
 'philippines': 0,
 'based on true story': 0,
 'martial arts': 0,
 'friendship': 0,
 'romance': 0,
 'coming of age': 0,
 'softcore': 0,
 'black and white': 0,
 'kidnapping': 0,
 'wrestling': 0,
 'opera': 0,
 'new york city': 0,
 'ghost': 0,
 'based on play or musical': 0,
 'sequel': 0,
 'police': 0,
 'politics': 0,
 'serial killer': 0,
 'horror': 0,
 'pre-code': 0,
 'found footage': 0,
 'prison': 1,
 'holiday': 0,
 'erotic movie': 0,
 'drugs': 0,
 'vampire': 0,
 'zombie': 0,
 'rape': 0,
 'parent child relationship': 0,
 'death': 0,
 'gay interest': 0,
 'dance': 0,
 'remake': 0,
 'high school': 0,
 'art': 0,
 'dark comedy': 0,
 'monster': 0,
 'slasher': 0,
 'gay': 0,
 'religion': 0,
 'stop mo

### Iterative configuration

In [100]:
import numpy as np

#--------------------------------------------------------------
# Iteration parameters (edit these before each run)
n = len(df_movies)  # Total number of movies in df_movies
nbatch = 4000       # Number of partition points passed to np.linspace below
                    # NOTE(review): nbatch points delimit nbatch-1 batches

# First and last batch to process in this run (both inclusive)
a = 1005                 # First batch to process; batch i uses indexs[i-1]:indexs[i]
b = 1006                 # Last batch to process; must stay below len(indexs)
#--------------------------------------------------------------
# Batch 1004 is already in the exported set, so start with 1005

# Stop condition (the loop breaks once batch `batch_stop` has been processed)
batch_stop = None      # None for no stop

# Positions into df_movies.index that delimit consecutive batches
indexs = np.linspace(0, len(df_movies), nbatch, dtype=int)
# Size of the first batch; the integer truncation above can make other
# batches differ by one row, so the totals below are estimates.
batch_size = indexs[1]

nbatch_it = b-a+1   # Number of batches to process in this run
nmovies = batch_size*nbatch_it # Estimated number of movies in this run

# Print a summary of the configuration
print("------------ Iteration conditions ------------")
print("Total number of batches:",nbatch)
print("Batch size is:",batch_size)
print("Iteration will process from {} to {} batch".format(a,b))
print("Iteration will process {} movies".format(nmovies))
if batch_stop != None:
    print("Stop condition: {} batch".format(batch_stop))

------------ Iteration conditions ------------
Total number of batches: 4000
Batch size is: 172
Iteration will process from 1005 to 1006 batch
Iteration will process 344 movies


### Iterative process

In [101]:
import datetime
from time import time
import gc

# Fetch keyword features batch by batch and append them to the export CSV.
# Uses names defined by earlier cells: df_movies, indexs, a, b, nmovies,
# batch_stop, top_100_keywords and binary_keyword.
total_errors = 0    # Movies for which binary_keyword returned nothing
counter = 0         # Movies attempted so far in this run
start_time = time()

# Forward slashes resolve on every OS and avoid the invalid "\d" escape
# sequence that the backslash form produced inside the string literal.
output_path = "export_files/dataset_keywords.csv"

# Batch i covers df_movies.index[indexs[i-1]:indexs[i]], so valid batches are
# 1 .. len(indexs)-1; the range is clipped so indexs[i] is always in bounds.
first_batch = max(a, 1)
last_batch = min(b, len(indexs) - 1)

for i in range(first_batch, last_batch + 1):
    # Print iteration info
    print('Processing Batch {}/{}.'.format(i, b))
    print('Processed movies {}/{}.'.format(counter, nmovies), end='')

    data_list = []

    # Iterate over the movies of this batch
    for movie_id in df_movies.index[indexs[i-1]:indexs[i]]:

        # One feature row per movie (None when the lookup failed)
        data_list.append(binary_keyword(movie_id, top_100_keywords))
        counter += 1

        # Refresh the progress line and time estimate every 10 movies
        if counter % 10 == 0:
            estimate_time = np.round(
                (time()-start_time)/(3600*counter)*(nmovies-counter), 4)
            print('\rProcessed movies {}/{}. Remaining time: {:5.4f} h'.format(counter,
                  nmovies, estimate_time), end='')

    # Drop failed lookups and count them; the total counts movies, not batches
    before = len(data_list)
    data_list = list(filter(None, data_list))
    error = before - len(data_list)
    total_errors += error
    print("\nError lines: {}".format(error))
    print("------------------------------------------------------")

    # A batch where every lookup failed has no "id" column to index on,
    # so there is nothing to write for it.
    if data_list:
        data = pd.DataFrame(data_list)
        data.set_index("id", inplace=True)
        if i == 1:
            # First batch (re)creates the file and writes the header
            data.to_csv(output_path)
        else:
            # Later batches are appended without repeating the header
            data.to_csv(output_path, mode="a", header=False)
        del data

    # Release the batch before fetching the next one
    del data_list
    gc.collect()

    # Stop condition
    if i == batch_stop:
        print("Iteration stop condition")
        print("batch {}/{} has been processed".format(i,b))
        print("{} movies has been processed with {} errors".format(counter,total_errors))
        print("Process end:", datetime.datetime.now())
        print("Elapsed time: {} h.".format((time()-start_time)/3600))
        break
else:
    # for/else: reached only when the loop was not ended by the stop condition,
    # including the case where batch_stop lies outside the processed range.
    print("Iteration process complete")
    print("batch {}/{} has been processed".format(last_batch,b))
    print("{} movies has been processed with {} errors".format(counter,total_errors))
    print("Process end:", datetime.datetime.now())
    print("Elapsed time: {} h.".format((time()-start_time)/3600))

Processing Batch 1005/1006.
Processed movies 170/344. Remaining time: 0.0034 h
Error lines: 0
------------------------------------------------------
Processing Batch 1006/1006.
Processed movies 340/344. Remaining time: 0.0001 h
Error lines: 0
------------------------------------------------------
Iteration process complete
batch 1006/1006 has been processed
344 movies has been processed with 0 errors
Process end: 2022-03-30 04:20:27.228135
Elapsed time: 0.006832752890057034 h.


### Loading dataframe from file

In [102]:
# Reload everything exported so far. Forward slashes resolve on every OS and
# avoid the invalid "\d" escape sequence of the backslash form.
keywords_df = pd.read_csv("export_files/dataset_keywords.csv")
keywords_df

Unnamed: 0,id,short film,woman director,based on novel or book,murder,musical,concert,silent film,biography,sports,...,avant-garde,school,robbery,dutch cabaret,"london, england",1970s,sibling relationship,surrealism,anthology,france
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173472,286244,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
173473,286245,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
173474,286246,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
173475,286247,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Removing duplicates and filling None values

In [103]:
# Rows that are identical in every column are collapsed, keeping the first
# occurrence; the row count is reported before and after.
print("Number of movies before removing:", len(keywords_df))
keywords_df = keywords_df.drop_duplicates(keep="first")
print("Number of movies after drop duplicates:", len(keywords_df))

Number of movies before removing: 173477
Number of movies after drop duplicates: 173133


In [104]:
# Use the movie id column as the row index of the feature table.
keywords_df = keywords_df.set_index("id")
keywords_df

Unnamed: 0_level_0,short film,woman director,based on novel or book,murder,musical,concert,silent film,biography,sports,stand-up comedy,...,avant-garde,school,robbery,dutch cabaret,"london, england",1970s,sibling relationship,surrealism,anthology,france
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286244,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
286245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
286246,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
286247,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Export clean dataset to csv (update)

In [105]:
# Run to overwrite the export with the de-duplicated dataframe.
# Forward slashes resolve on every OS and avoid the invalid "\d" escape
# sequence of the backslash form.
keywords_df.to_csv("export_files/dataset_keywords.csv")