## Looking for top 100 keywords in all movies
We extract keywords for a random sample of movies: a sample of 100000 movies is used to obtain the top 100 keywords, which are then added to "dataset_movies.csv".

### Import all movies dataframe

In [None]:
import pandas as pd
# Load the movie-id export (one JSON object per line), index it by movie id and
# keep it sorted by that id.
# The path uses a forward slash: "\m" is an invalid escape sequence in a regular
# string literal, and a backslash separator only works on Windows (forward
# slashes are accepted there as well).
df_movies = (
    pd.read_json("json_dict/movie_ids_03_16_2022.json", lines=True)
    .set_index('id')
    .sort_index()
)
df_movies.head()

Unnamed: 0_level_0,adult,original_title,popularity,video
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,False,Ariel,9.408,False
3,False,Varjoja paratiisissa,10.931,False
5,False,Four Rooms,13.555,False
6,False,Judgment Night,10.25,False
8,False,Life in Loops (A Megacities RMX),3.416,False


### Movie keyword extraction function
The function returns the list of keywords for one movie.

In [None]:
import requests
import json

def movie_keywords(movie_id: int, api_key: str = None, timeout: float = 10.0):
    """Fetch the keyword names that the TMDB API lists for one movie.

    Parameters
    ----------
    movie_id : int
        Movie identifier placed in the request URL.
    api_key : str, optional
        API key sent with the request. When omitted, the ``TMDB_API_KEY``
        environment variable is used, falling back to the key that was
        historically embedded in this function.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    list of str or None
        Keyword names in the order the API returned them, or ``None`` when the
        request fails, the status code is not 200, the body is not valid JSON
        or the payload carries no ``"keywords"`` entry.
    """
    import os  # local import: only needed for the optional environment override

    if api_key is None:
        # NOTE(review): a credential committed in the notebook is visible to every
        # reader; rotate this key and drop the fallback once TMDB_API_KEY is set.
        api_key = os.environ.get(
            "TMDB_API_KEY", "44f9a7f09387a49408460a6d158e1f44")

    url = "https://api.themoviedb.org/3/movie/{}/keywords".format(movie_id)

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(
            url, params={"api_key": api_key}, timeout=timeout)
    except requests.RequestException:
        # Same "no data" signal as a non-200 answer, so callers can simply skip.
        return None

    if response.status_code != 200:
        return None

    try:
        payload = json.loads(response.text)
    except ValueError:
        return None

    if not isinstance(payload, dict):
        return None

    keywords = payload.get("keywords")
    if keywords is None:
        return None
    return [entry["name"] for entry in keywords]

In [None]:
# Smoke test: fetch the keywords of movie id 2 (performs a live API request).
movie_keywords(2)

['underdog',
 'prison',
 'factory worker',
 'prisoner',
 'helsinki, finland',
 'falling in love']

### Iteration configuration

In [None]:
# --- Sampling plan ----------------------------------------------------------
n = df_movies.shape[0]            # rows available in the movies dataframe

batch_size = 100                  # movies requested in each batch
nbatch_it = 100                   # batches executed by one run of the loop
nmovies = nbatch_it * batch_size  # movies requested over the whole run
counter = 0                       # running count of processed movies

# Optional early exit: batch number after which the loop stops (None = run all)
batch_stop = None

# --- Summary of the configuration -------------------------------------------
print("------------ Iteration conditions ------------")
print("Total number of batches:", nbatch_it)
print("Batch size is:", batch_size)
print("Iteration will process {} movies".format(nmovies))
if batch_stop is not None:
    print("Stop condition: {} batch".format(batch_stop))

------------ Iteration conditions ------------
Total number of batches: 100
Batch size is: 100
Iteration will process 10000 movies


### Iterative process

In [None]:
import datetime
import numpy as np
from time import time
import glob
import gc

# CSV file that accumulates one keyword per row across batches and runs.
# A forward slash is valid on every OS (Windows included) and avoids the invalid
# "\k" escape sequence of the backslash form.
keywords_csv = "export_files/keywords_list.csv"

# Start from zero on every run of this cell; otherwise the counter keeps the value
# left by a previous run and the progress line overshoots the total.
counter = 0
start_time = time()

# Draw every position up front and without replacement so that no movie is
# queried twice (duplicates would inflate its keywords in the final ranking).
# Positions cover the whole index, including its first two rows.
sample_size = min(nmovies, n)
sample_positions = np.random.choice(n, size=sample_size, replace=False)

last_batch = 0  # last batch actually processed; used by the final summary

# Iterate batches (numbered from 1)
for i in range(1, nbatch_it + 1):
    indexs = sample_positions[(i - 1) * batch_size:i * batch_size]
    if len(indexs) == 0:
        # Fewer movies are available than the configuration asked for
        break
    last_batch = i

    # Print iteration info
    print('Processing Batch {}/{}.'.format(i, nbatch_it))
    print('Processed movies {}/{}.'.format(counter, sample_size), end='')

    data_list = []
    # Iterate movies batch
    for movie_id in df_movies.index[indexs]:
        keywords_per_list = movie_keywords(movie_id)
        if keywords_per_list is None:
            # No usable answer for this movie: skip it without counting it
            continue

        # Append keywords to data list
        data_list.extend(keywords_per_list)
        counter += 1

        # Refresh the status line every 10 processed movies. This runs only right
        # after an increment, so counter is never zero in the division below.
        if counter % 10 == 0:
            estimate_time = np.round(
                (time() - start_time) / (3600 * counter) * (sample_size - counter), 4)
            print('\rProcessed movies {}/{}. Remaining time: {:5.4f} h'.format(
                counter, sample_size, estimate_time), end='')

    # Remove empty values (None or empty strings) from the list
    before = len(data_list)
    data_list = list(filter(None, data_list))
    print("\nNumber of error lines:", before - len(data_list))
    print("------------------------------------------------------")

    # The index column is written on purpose: the cell that reads this file back
    # expects it. The header is written only when the file is created.
    data = pd.DataFrame(data_list, columns=["keywords"])
    if glob.glob(keywords_csv):
        data.to_csv(keywords_csv, mode="a", header=False)
    else:
        data.to_csv(keywords_csv)

    # del data and clean memory
    del data
    del data_list
    gc.collect()

    # Stop condition
    if i == batch_stop:
        print("Iteration stop condition")
        print("batch {}/{} has been processed".format(i, nbatch_it))
        print("movie {}/{} has been processed".format(counter, sample_size))
        print("Elapsed time: {} h.".format((time() - start_time) / 3600))
        break

if batch_stop is None:
    print("Iteration process complete")
    print("batch {}/{} has been processed".format(last_batch, nbatch_it))
    print("movie {} has been processed".format(counter))
    print("Process end:", datetime.datetime.now())
    print("Elapsed time: {} h.".format((time() - start_time) / 3600))


Processing Batch 1/100.
Processed movies 10090/10000. Remaining time: -0.0000 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 2/100.
Processed movies 10190/10000. Remaining time: -0.0001 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 3/100.
Processed movies 10290/10000. Remaining time: -0.0003 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 4/100.
Processed movies 10390/10000. Remaining time: -0.0006 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 5/100.
Processed movies 10490/10000. Remaining time: -0.0009 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 6/100.
Processed movies 10590/10000. Remaining time: -0.0013 h
Number of error lines: 0
------------------------------------------------------
Processing Batch 7/100.
Processed movies

### Top 100 keywords

In [None]:
# Read back the accumulated keywords. usecols keeps only the keyword column, so the
# unnamed index column stored next to it never has to be dropped by name.
# The forward-slash path avoids the invalid "\k" escape sequence and is accepted
# on Windows as well.
keywords = pd.read_csv("export_files/keywords_list.csv", usecols=["keywords"])
print("keywords list length:", keywords.shape[0])
# Count each distinct keyword and keep the 100 most frequent ones
top_100_keywords = keywords.value_counts().sort_values(ascending=False).head(100)
del keywords
top_100_keywords

keywords list length: 62521


keywords              
short film                1892
woman director            1555
based on novel or book     511
murder                     451
musical                    327
                          ... 
1970s                       65
sibling relationship        65
surrealism                  64
anthology                   64
france                      64
Length: 100, dtype: int64