# Part 3A - Planning & Preparing Data

- 01/26/24

- NEED TO PREPARE/PLAN FOR FULL APP WITH TABULAR DATA + REVIEWS + MANUSCRIPTS.

### Prep TMDB API Data (Was Split Across Several Notebooks)

In [1]:
## Importing custom function for project
%load_ext autoreload
%autoreload 2
import custom_functions as fn

import os,glob,json
from pprint import pprint
import pandas as pd
pd.set_option("display.max_columns",200)

In [2]:
# !pip install -U dojo_ds -q
# import dojo_ds as ds
# ds.__version__

## Setting Filepaths Config Files

>- Filepath to-dos:
>    - Add names of model insights figures to FPATHS     

In [3]:
import json, os
from pprint import pprint

# Define filename for project config filepaths json file
FPATHS_FILE = "config/filepaths.json"
# Make sure the folder that will hold the config file exists before writing to it below
os.makedirs(os.path.dirname(FPATHS_FILE), exist_ok=True)

# Define Filepaths
# Nested dict of project filepaths grouped by purpose: data, images, metadata, eda, models.
# Key suffixes (_csv, _json, _joblib, _dir, _png, _html) mirror the type of each target path.
FPATHS = dict(
    data={
        "raw": {
            # Combined TMDB API Results
            "combined-tmdb-movie-data_csv":"Data/combined_tmdb_api_data.csv.gz",
            # Movie Reviews (no movie data or categories)
            "movie-reviews-tmdb_csv": "Data-NLP/tmdb-movie-reviews.csv.gz",
            # TMDB Movie Info to Use for...NLP? (TODO: decide why this is saved separately)
            "cleaned-tmdb-movie-info-tmdb_csv": "Data-NLP/tmdb-movie-info.csv.gz", 
            # Combined Movie Reviews with financial data
            "reviews-with-movie-info_json": "Data-NLP/combined-tmdb-movie-reviews-with-info.json", 
            # "movie-reviews-with-info_json":
            # ...??
            # "eda": "Data-NLP/eda-movie-reviews.csv.gz",
            "processed-reviews-spacy_joblib": "Data-NLP/processed-nlp-reviews.joblib",
            "processed-reviews-spacy_json": "Data-NLP/processed-nlp-reviews.json",
        },
        "app": {
            # Version of tabular data to load into streamlit app (unless decide to do AWS)
            "movie-data_csv": "app-assets/movie-data-streamlit.csv", 
            # "movie-reviews": ???:
                # Same as raw>movie-reviews-with-info-json?
                # or Same as ml>'reviews-with-target-json'?? 
        },
        "ml-nlp": {
            # Final dataframe of machine learning for NLP ( Use Data-NLP Folder for these models)
            "reviews-with-target_json": "Data-NLP/modeling/processed-nlp-reviews-for-ml.json",
            "train_joblib": "Data-NLP/modeling/training-data.joblib",  # (X_train,y_train)
            "test_joblib": "Data-NLP/modeling/testing-data.joblib",  # (X_test,y_test)
        },
        "ml-tabular": {
            # Final dataframe of machine learning ( Use Data Folder for these models)
            "movie-info-with-ml-target_json": "Data/modeling/processed-movie-data-for-ml.json", # "Data-NLP/modeling/processed-nlp-reviews-for-ml.json",
            "train_joblib": "Data/modeling/training-data.joblib",  # (X_train,y_train)
            "test_joblib": "Data/modeling/testing-data.joblib",  # (X_test,y_test)
        },
        "nn": {
            "train_dir": "Data/modeling/training-data-tf/",  # train_ds
            "test_dir": "Data/modeling/testing-data-tf/",  # test_ds
        },
    },
    images={
        "banner": "images/app-banner.png",
    },
    # Additional metadata (target lookup, etc.)
    metadata={
        "target_lookup": "Data-NLP/target-lookup.json",
    },
    # Any images to be displayed in the app
    eda={
        "wordclouds-by-roi_png": "images/wordclouds-compare-roi.png",
        "wordclouds-by-rating_png": "images/wordclouds-compare-rating.png",
        "scattertext-by-roi_html": "app-assets/scattertext-roi.html",
        "scattertext-by-rating_html": "app-assets/scattertext-rating.html",

    },
    models={
        # Machine Learning Models and results
        "ml": {
            "bayes_joblib": "Models/bayes-clf.joblib",
            "random_forest_joblib": "Models/random-forest.joblib",
            "logreg_joblib": "Models/log-reg.joblib",
        },
        # Neural networks and results
        "nn": {
            "LSTM_dir": "Models/keras/lstm/",
            "GRU_dir": "Models/keras/gru/",
            "Attention_model_dir": "Models/keras/attn/",
        },
    },
)
# Use fn for local package, ds for pip version
# NOTE(review): presumably creates the folders needed by every path in FPATHS --
# confirm against custom_functions.utils.create_directories_from_paths
fn.utils.create_directories_from_paths(FPATHS)
# ds.utils.create_directories_from_paths(FPATHS)

print('[i] FPATHS Dictionary:\n')
pprint(FPATHS)

## Save the filepaths
# Written as JSON to FPATHS_FILE; the next cell loads it back with json.load
with open(FPATHS_FILE, "w") as f:
    json.dump(FPATHS, f)
    print(f"\n[i] Saved FPATHS to {FPATHS_FILE}")

[i] FPATHS Dictionary:

{'data': {'app': {'movie-data_csv': 'app-assets/movie-data-streamlit.csv'},
          'ml-nlp': {'reviews-with-target_json': 'Data-NLP/modeling/processed-nlp-reviews-for-ml.json',
                     'test_joblib': 'Data-NLP/modeling/testing-data.joblib',
                     'train_joblib': 'Data-NLP/modeling/training-data.joblib'},
          'ml-tabular': {'movie-info-with-ml-target_json': 'Data/modeling/processed-movie-data-for-ml.json',
                         'test_joblib': 'Data/modeling/testing-data.joblib',
                         'train_joblib': 'Data/modeling/training-data.joblib'},
          'nn': {'test_dir': 'Data/modeling/testing-data-tf/',
                 'train_dir': 'Data/modeling/training-data-tf/'},
          'raw': {'cleaned-tmdb-movie-info-tmdb_csv': 'Data-NLP/tmdb-movie-info.csv.gz',
                  'combined-tmdb-movie-data_csv': 'Data/combined_tmdb_api_data.csv.gz',
                  'movie-reviews-tmdb_csv': 'Data-NLP/tmdb-movie-re

In [4]:
# Round-trip check: load the JSON file back to confirm FPATHS was saved as expected
with open(FPATHS_FILE) as f:
    TEST = json.load(f)
# pprint(TEST)
TEST

{'data': {'raw': {'combined-tmdb-movie-data_csv': 'Data/combined_tmdb_api_data.csv.gz',
   'movie-reviews-tmdb_csv': 'Data-NLP/tmdb-movie-reviews.csv.gz',
   'cleaned-tmdb-movie-info-tmdb_csv': 'Data-NLP/tmdb-movie-info.csv.gz',
   'reviews-with-movie-info_json': 'Data-NLP/combined-tmdb-movie-reviews-with-info.json',
   'processed-reviews-spacy_joblib': 'Data-NLP/processed-nlp-reviews.joblib',
   'processed-reviews-spacy_json': 'Data-NLP/processed-nlp-reviews.json'},
  'app': {'movie-data_csv': 'app-assets/movie-data-streamlit.csv'},
  'ml-nlp': {'reviews-with-target_json': 'Data-NLP/modeling/processed-nlp-reviews-for-ml.json',
   'train_joblib': 'Data-NLP/modeling/training-data.joblib',
   'test_joblib': 'Data-NLP/modeling/testing-data.joblib'},
  'ml-tabular': {'movie-info-with-ml-target_json': 'Data/modeling/processed-movie-data-for-ml.json',
   'train_joblib': 'Data/modeling/training-data.joblib',
   'test_joblib': 'Data/modeling/testing-data.joblib'},
  'nn': {'train_dir': 'Data/m

In [5]:
TEST['data'].keys()

dict_keys(['raw', 'app', 'ml-nlp', 'ml-tabular', 'nn'])

## Combining and Cleaning TMDB API Data 
**(Also done at the end of the API calls in Notebook 2A)**

In [6]:
## Should show students glob
# Collect the per-year TMDB result files stored in the Data folder
import glob
FOLDER = "Data/"
# Glob pattern: every gzipped CSV in FOLDER whose file name starts with "final"
q  = f"{FOLDER}final*.csv.gz"
# sorted() makes the load order deterministic (glob.glob returns paths in arbitrary order)
files = sorted(glob.glob(q))
files

['Data/final_tmdb_data_2000.csv.gz',
 'Data/final_tmdb_data_2001.csv.gz',
 'Data/final_tmdb_data_2002.csv.gz',
 'Data/final_tmdb_data_2003.csv.gz',
 'Data/final_tmdb_data_2004.csv.gz',
 'Data/final_tmdb_data_2005.csv.gz',
 'Data/final_tmdb_data_2006.csv.gz',
 'Data/final_tmdb_data_2007.csv.gz',
 'Data/final_tmdb_data_2008.csv.gz',
 'Data/final_tmdb_data_2009.csv.gz',
 'Data/final_tmdb_data_2010.csv.gz',
 'Data/final_tmdb_data_2011.csv.gz',
 'Data/final_tmdb_data_2012.csv.gz',
 'Data/final_tmdb_data_2013.csv.gz',
 'Data/final_tmdb_data_2014.csv.gz',
 'Data/final_tmdb_data_2015.csv.gz',
 'Data/final_tmdb_data_2016.csv.gz',
 'Data/final_tmdb_data_2017.csv.gz',
 'Data/final_tmdb_data_2018.csv.gz',
 'Data/final_tmdb_data_2019.csv.gz',
 'Data/final_tmdb_data_2020.csv.gz',
 'Data/final_tmdb_data_2021.csv.gz',
 'Data/final_tmdb_data_2022.csv.gz',
 'Data/final_tmdb_data_2023.csv.gz',
 'Data/final_tmdb_data_2024.csv.gz']

In [7]:
# Read each yearly file on its own so that one unreadable file does not abort the
# whole load; failures are reported and their paths collected in files_error.
df_list =  []
files_error = []
for f in files:
    try:
        temp_df = pd.read_csv(f,lineterminator='\n')
        df_list.append(temp_df)
    except Exception as e:
        print(f"[!] Error for {f}")
        display(e)
        files_error.append(f)

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list, so fail here with a message that says what actually went wrong.
if not df_list:
    raise ValueError(
        f"No TMDB files could be loaded (matched: {len(files)}, failed: {len(files_error)})."
    )

# ignore_index=True builds a fresh 0..n-1 index, which is the same result as
# concatenating and then calling reset_index(drop=True).
df = pd.concat(df_list, ignore_index=True)
df

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127,tt0113026,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.559,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-22,0,86,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,False,5.500,22,
1,False,,,0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977,tt0113092,en,For the Cause,Earth is in a state of constant war and two co...,3.393,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-11-15,0,100,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,False,4.958,12,
2,False,/krEZg9tb6blhc7sV6Us2ZGQ0gA.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869,tt0116391,hi,Gang,"After falling prey to underworld, four friends...",2.748,/dYcuiiBDpPUvCcPbiWdH4REjGn3.jpg,[],"[{'iso_3166_1': 'IN', 'name': 'India'}]",2000-04-14,0,165,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,False,5.000,2,
3,False,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843,tt0118694,cn,花樣年華,Two neighbors become intimate after discoverin...,27.940,/iYypPT4bhqXfq1b6EnmxvRt6b2Y.jpg,"[{'id': 539, 'logo_path': '/iPLtePguIzOPNtAWfT...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2000-09-29,14204632,99,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,False,8.109,2433,PG
4,False,/vceiGZ3uavAEHlTA7v0GjQsGVKe.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,49511,tt0118852,en,Chinese Coffee,"When Harry Levine, an aging, unsuccessful Gree...",5.495,/nZGWnSuf1FIuzyEuMRZHHZWViAp.jpg,"[{'id': 67930, 'logo_path': None, 'name': 'Cha...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-02,0,99,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,False,6.600,56,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93054,False,/dyQvD0BDlWk187fDBmJTU2uUVGH.jpg,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",https://www.todiealonefilm.com,1167732,tt8736506,en,To Die Alone,After suffering a terrible injury while hiking...,5.474,/d7rHwkRlCHhVGBTQQkvGALeC5oz.jpg,"[{'id': 114245, 'logo_path': None, 'name': 'Gl...","[{'iso_3166_1': 'US', 'name': 'United States o...",2024-02-10,0,84,"[{'english_name': 'English', 'iso_639_1': 'en'...",In Production,,To Die Alone,False,0.000,0,NR
93055,False,,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",,560016,tt9214772,en,Monkey Man,An unlikely hero emerges from prison to take o...,7.925,,"[{'id': 3528, 'logo_path': '/cCzCClIzIh81Fa79h...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",2024-01-22,0,0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Post Production,,Monkey Man,False,0.000,0,
93056,False,,,0,"[{'id': 18, 'name': 'Drama'}]",,1163894,tt9357860,en,The Hopeful,Aboard a steamship sailing across the Atlantic...,2.594,/9zRA1Vefx1gJdt6fKPDt4JJ6FGC.jpg,"[{'id': 206035, 'logo_path': None, 'name': 'Ad...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2024-02-14,0,90,"[{'english_name': 'English', 'iso_639_1': 'en'...",Post Production,The end of the world is just the beginning.,The Hopeful,False,0.000,0,
93057,False,/C28T7GAlCJQFWVDB4vUV8eZyf9.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,1184693,tt9680562,mr,सत्यशोधक,The film follows the life of social reformer a...,4.889,/jzQPp0oSYPgu8crP3aq8EIjQ6Ti.jpg,"[{'id': 209731, 'logo_path': None, 'name': 'Sa...","[{'iso_3166_1': 'IN', 'name': 'India'}]",2024-01-05,0,145,"[{'english_name': 'Marathi', 'iso_639_1': 'mr'...",Released,HE BRINGS THE TORCH OF REVOLUTION TO ENLIGHTEN...,Satyashodhak,False,0.000,0,


In [8]:
## drop placeholder imdb ids of 0
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so assigning new columns to it afterwards does not trigger pandas'
# SettingWithCopyWarning.
df = df.loc[ df['imdb_id']!='0'].copy()
df

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127,tt0113026,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.559,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-22,0,86,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,False,5.500,22,
1,False,,,0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977,tt0113092,en,For the Cause,Earth is in a state of constant war and two co...,3.393,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-11-15,0,100,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,False,4.958,12,
2,False,/krEZg9tb6blhc7sV6Us2ZGQ0gA.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869,tt0116391,hi,Gang,"After falling prey to underworld, four friends...",2.748,/dYcuiiBDpPUvCcPbiWdH4REjGn3.jpg,[],"[{'iso_3166_1': 'IN', 'name': 'India'}]",2000-04-14,0,165,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,False,5.000,2,
3,False,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843,tt0118694,cn,花樣年華,Two neighbors become intimate after discoverin...,27.940,/iYypPT4bhqXfq1b6EnmxvRt6b2Y.jpg,"[{'id': 539, 'logo_path': '/iPLtePguIzOPNtAWfT...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2000-09-29,14204632,99,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,False,8.109,2433,PG
4,False,/vceiGZ3uavAEHlTA7v0GjQsGVKe.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,49511,tt0118852,en,Chinese Coffee,"When Harry Levine, an aging, unsuccessful Gree...",5.495,/nZGWnSuf1FIuzyEuMRZHHZWViAp.jpg,"[{'id': 67930, 'logo_path': None, 'name': 'Cha...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-02,0,99,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,False,6.600,56,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93054,False,/dyQvD0BDlWk187fDBmJTU2uUVGH.jpg,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",https://www.todiealonefilm.com,1167732,tt8736506,en,To Die Alone,After suffering a terrible injury while hiking...,5.474,/d7rHwkRlCHhVGBTQQkvGALeC5oz.jpg,"[{'id': 114245, 'logo_path': None, 'name': 'Gl...","[{'iso_3166_1': 'US', 'name': 'United States o...",2024-02-10,0,84,"[{'english_name': 'English', 'iso_639_1': 'en'...",In Production,,To Die Alone,False,0.000,0,NR
93055,False,,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",,560016,tt9214772,en,Monkey Man,An unlikely hero emerges from prison to take o...,7.925,,"[{'id': 3528, 'logo_path': '/cCzCClIzIh81Fa79h...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",2024-01-22,0,0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Post Production,,Monkey Man,False,0.000,0,
93056,False,,,0,"[{'id': 18, 'name': 'Drama'}]",,1163894,tt9357860,en,The Hopeful,Aboard a steamship sailing across the Atlantic...,2.594,/9zRA1Vefx1gJdt6fKPDt4JJ6FGC.jpg,"[{'id': 206035, 'logo_path': None, 'name': 'Ad...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2024-02-14,0,90,"[{'english_name': 'English', 'iso_639_1': 'en'...",Post Production,The end of the world is just the beginning.,The Hopeful,False,0.000,0,
93057,False,/C28T7GAlCJQFWVDB4vUV8eZyf9.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,1184693,tt9680562,mr,सत्यशोधक,The film follows the life of social reformer a...,4.889,/jzQPp0oSYPgu8crP3aq8EIjQ6Ti.jpg,"[{'id': 209731, 'logo_path': None, 'name': 'Sa...","[{'iso_3166_1': 'IN', 'name': 'India'}]",2024-01-05,0,145,"[{'english_name': 'Marathi', 'iso_639_1': 'mr'...",Released,HE BRINGS THE TORCH OF REVOLUTION TO ENLIGHTEN...,Satyashodhak,False,0.000,0,


In [9]:
# Release year = the text before the first "-" of release_date (e.g. "2000-09-22" -> 2000.0).
# Vectorised string ops replace the per-row lambda; a missing, empty or non-numeric
# value becomes NaN instead of raising. astype(float) keeps the column's float dtype
# even when no value is missing.
df['Release Year'] = pd.to_numeric(
    df['release_date'].astype(str).str.split('-').str[0],
    errors='coerce',
).astype(float)
df

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification,Release Year
0,False,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127,tt0113026,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.559,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-22,0,86,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,False,5.500,22,,2000.0
1,False,,,0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977,tt0113092,en,For the Cause,Earth is in a state of constant war and two co...,3.393,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-11-15,0,100,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,False,4.958,12,,2000.0
2,False,/krEZg9tb6blhc7sV6Us2ZGQ0gA.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869,tt0116391,hi,Gang,"After falling prey to underworld, four friends...",2.748,/dYcuiiBDpPUvCcPbiWdH4REjGn3.jpg,[],"[{'iso_3166_1': 'IN', 'name': 'India'}]",2000-04-14,0,165,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,False,5.000,2,,2000.0
3,False,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843,tt0118694,cn,花樣年華,Two neighbors become intimate after discoverin...,27.940,/iYypPT4bhqXfq1b6EnmxvRt6b2Y.jpg,"[{'id': 539, 'logo_path': '/iPLtePguIzOPNtAWfT...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2000-09-29,14204632,99,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,False,8.109,2433,PG,2000.0
4,False,/vceiGZ3uavAEHlTA7v0GjQsGVKe.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,49511,tt0118852,en,Chinese Coffee,"When Harry Levine, an aging, unsuccessful Gree...",5.495,/nZGWnSuf1FIuzyEuMRZHHZWViAp.jpg,"[{'id': 67930, 'logo_path': None, 'name': 'Cha...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-02,0,99,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,False,6.600,56,R,2000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93054,False,/dyQvD0BDlWk187fDBmJTU2uUVGH.jpg,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",https://www.todiealonefilm.com,1167732,tt8736506,en,To Die Alone,After suffering a terrible injury while hiking...,5.474,/d7rHwkRlCHhVGBTQQkvGALeC5oz.jpg,"[{'id': 114245, 'logo_path': None, 'name': 'Gl...","[{'iso_3166_1': 'US', 'name': 'United States o...",2024-02-10,0,84,"[{'english_name': 'English', 'iso_639_1': 'en'...",In Production,,To Die Alone,False,0.000,0,NR,2024.0
93055,False,,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",,560016,tt9214772,en,Monkey Man,An unlikely hero emerges from prison to take o...,7.925,,"[{'id': 3528, 'logo_path': '/cCzCClIzIh81Fa79h...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",2024-01-22,0,0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Post Production,,Monkey Man,False,0.000,0,,2024.0
93056,False,,,0,"[{'id': 18, 'name': 'Drama'}]",,1163894,tt9357860,en,The Hopeful,Aboard a steamship sailing across the Atlantic...,2.594,/9zRA1Vefx1gJdt6fKPDt4JJ6FGC.jpg,"[{'id': 206035, 'logo_path': None, 'name': 'Ad...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2024-02-14,0,90,"[{'english_name': 'English', 'iso_639_1': 'en'...",Post Production,The end of the world is just the beginning.,The Hopeful,False,0.000,0,,2024.0
93057,False,/C28T7GAlCJQFWVDB4vUV8eZyf9.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,1184693,tt9680562,mr,सत्यशोधक,The film follows the life of social reformer a...,4.889,/jzQPp0oSYPgu8crP3aq8EIjQ6Ti.jpg,"[{'id': 209731, 'logo_path': None, 'name': 'Sa...","[{'iso_3166_1': 'IN', 'name': 'India'}]",2024-01-05,0,145,"[{'english_name': 'Marathi', 'iso_639_1': 'mr'...",Released,HE BRINGS THE TORCH OF REVOLUTION TO ENLIGHTEN...,Satyashodhak,False,0.000,0,,2024.0


#### Save CSV of Combined API Results

In [10]:
fname_out= FPATHS['data']['raw']["combined-tmdb-movie-data_csv"]
fname_out

'Data/combined_tmdb_api_data.csv.gz'

In [11]:
## Saving combined data 
# fname =FOLDER+'combined_tmdb_api_data.csv.gz'
# Write the combined frame gzip-compressed to fname_out; index=False leaves the row index out of the file
df.to_csv(fname_out,compression='gzip',index=False)

In [12]:
# Reload the combined file from the path stored in FPATHS, then inspect
# its columns/dtypes (info) and first rows (head).
df = pd.read_csv(FPATHS['data']['raw']['combined-tmdb-movie-data_csv'], lineterminator="\n", low_memory=False)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93059 entries, 0 to 93058
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  93059 non-null  bool   
 1   backdrop_path          57215 non-null  object 
 2   belongs_to_collection  5218 non-null   object 
 3   budget                 93059 non-null  int64  
 4   genres                 93059 non-null  object 
 5   homepage               25567 non-null  object 
 6   id                     93059 non-null  int64  
 7   imdb_id                93059 non-null  object 
 8   original_language      93059 non-null  object 
 9   original_title         93059 non-null  object 
 10  overview               91122 non-null  object 
 11  popularity             93059 non-null  float64
 12  poster_path            84711 non-null  object 
 13  production_companies   93059 non-null  object 
 14  production_countries   93059 non-null  object 
 15  re

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification,Release Year
0,False,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127,tt0113026,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.559,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-22,0,86,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,False,5.5,22,,2000.0
1,False,,,0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977,tt0113092,en,For the Cause,Earth is in a state of constant war and two co...,3.393,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-11-15,0,100,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,False,4.958,12,,2000.0
2,False,/krEZg9tb6blhc7sV6Us2ZGQ0gA.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869,tt0116391,hi,Gang,"After falling prey to underworld, four friends...",2.748,/dYcuiiBDpPUvCcPbiWdH4REjGn3.jpg,[],"[{'iso_3166_1': 'IN', 'name': 'India'}]",2000-04-14,0,165,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,False,5.0,2,,2000.0
3,False,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843,tt0118694,cn,花樣年華,Two neighbors become intimate after discoverin...,27.94,/iYypPT4bhqXfq1b6EnmxvRt6b2Y.jpg,"[{'id': 539, 'logo_path': '/iPLtePguIzOPNtAWfT...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2000-09-29,14204632,99,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,False,8.109,2433,PG,2000.0
4,False,/vceiGZ3uavAEHlTA7v0GjQsGVKe.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,49511,tt0118852,en,Chinese Coffee,"When Harry Levine, an aging, unsuccessful Gree...",5.495,/nZGWnSuf1FIuzyEuMRZHHZWViAp.jpg,"[{'id': 67930, 'logo_path': None, 'name': 'Cha...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-02,0,99,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,False,6.6,56,R,2000.0


### Continue Cleaning API Results: Genre/Production Company Names as Strings

In [13]:
## Function to get just the genre names (as one joined string by default, or as a list)
import json
def get_genre_name(x, as_str=True,sep="; "):
    """Extract the genre names from one value of the ``genres`` column.

    Args:
        x: String form of a list of dicts, e.g.
            "[{'id': 35, 'name': 'Comedy'}, {'id': 10402, 'name': 'Music'}]".
            Any non-string value (e.g. NaN) is treated as "no genres".
        as_str: If truthy (default), return the names joined with ``sep``;
            otherwise return them as a list.
        sep: Separator placed between names when ``as_str`` is truthy.

    Returns:
        str or list[str]: The whitespace-stripped genre names.

    Raises:
        ValueError: If ``x`` is a string that can be parsed neither as a Python
            literal nor as JSON (json.JSONDecodeError is a ValueError subclass).
        KeyError: If a parsed entry has no 'name' key.
    """
    import ast  # local import, like the other parsing helpers in this notebook

    if isinstance(x, str):
        try:
            # The column holds Python reprs (single quotes, None), which
            # literal_eval reads directly -- including names with apostrophes.
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            # Fall back to the previous approach for JSON-style text (null/true/false).
            parsed = json.loads(x.replace("'",'"'))
    else:
        parsed = []

    genres = [genre['name'].strip() for genre in parsed]

    if as_str:
        genres = str(sep).join(genres)
    return genres

In [14]:
get_genre_name(df.loc[0,'genres'], as_str=True)

'Comedy; Music; Romance'

In [15]:
df['genre_names'] = df['genres'].map(lambda x: get_genre_name(x, as_str=True))
df['genre_names']

0        Comedy; Music; Romance
1               Science Fiction
2          Drama; Action; Crime
3                Drama; Romance
4                         Drama
                  ...          
93054           Thriller; Drama
93055           Thriller; Drama
93056                     Drama
93057            Drama; History
93058          Horror; Thriller
Name: genre_names, Length: 93059, dtype: object

In [16]:
def get_prod_company_names(x,if_missing='', as_str=True,sep="; "):
    """Extract the company names from one value of the ``production_companies`` column.

    Args:
        x: String form of a list of dicts, each with a 'name' entry, e.g.
            "[{'id': 1, 'logo_path': None, 'name': 'TOHO', 'origin_country': 'JP'}]".
        if_missing: Value used when there are no companies, i.e. ``x`` is the
            string '[]' or is not a string at all (e.g. NaN).
        as_str: If truthy (default), return the names joined with ``sep``;
            otherwise return them as a list.
        sep: Separator placed between names when ``as_str`` is truthy.

    Returns:
        The joined names (or ``if_missing`` itself when there are none) if
        ``as_str`` is truthy; otherwise a list of names (``[if_missing]`` when
        there are none).
    """
    import ast
    import re

    if not isinstance(x, str) or x=='[]':
        # Returning if_missing directly equals joining [if_missing] for any
        # string, and also works for non-string placeholders such as None.
        return if_missing if as_str else [if_missing]

    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, list) and all(isinstance(c, dict) for c in parsed):
        # Parsing the literal keeps names containing an apostrophe; the regex
        # below misses them because their repr is wrapped in double quotes.
        companies = [c['name'] for c in parsed if isinstance(c.get('name'), str)]
    else:
        # Best-effort fallback for text that is not a clean list-of-dicts literal.
        exp= r"\'name\'\:.?\'(\w*.*?)\'"
        companies = re.findall(exp, x)

    if as_str:
        companies = str(sep).join(companies)
    return companies

In [17]:
df['production_companies_names'] = df['production_companies'].map(lambda x: get_prod_company_names(x, as_str=True))
df['production_companies_names']                                                                  

0        Sullivan Street Productions; Michael Ritchie P...
1        Dimension Films; Grand Design Entertainment; M...
2                                                         
3        Block 2 Pictures; Orly Films; Jet Tone Films; ...
4                   Chal Productions; The Shooting Gallery
                               ...                        
93054           Glass Creek Films; Charming Stranger Films
93055    Thunder Road; 87Eleven; Bron Studios; Creative...
93056    Advent Hope Productions; Hope Studios; Kyle Po...
93057    Samata Films; Abhita Films Production Pvt. Ltd...
93058                Atomic Monster; Blumhouse Productions
Name: production_companies_names, Length: 93059, dtype: object

In [18]:
df['production_companies_names'].value_counts(dropna=False)

                                                                                                                       33826
The Asylum                                                                                                                97
Star Cinema – ABS-CBN Film Productions                                                                                    95
Maverick Entertainment Group                                                                                              84
TOHO                                                                                                                      77
                                                                                                                       ...  
Interwoven Studios                                                                                                         1
Star Com Productions                                                                                                       1


### Collections

In [19]:
df['belongs_to_collection'].value_counts(dropna=False)

NaN                                                                                                                                                               87841
{'id': 39199, 'name': 'Detective Conan Collection', 'poster_path': '/bV6EHK0Q65hHKSoVDeACbc960jQ.jpg', 'backdrop_path': '/mwz7lYimh8da0zZHOI41xNd86yH.jpg'}          23
{'id': 1035073, 'name': 'Exhibition on Screen Collection', 'poster_path': '/eXHVT6aubcscvIG6ORlLxtbKDfy.jpg', 'backdrop_path': None}                                 23
{'id': 148065, 'name': 'Doraemon Collection', 'poster_path': '/4TLSP1KD1uAlp2q1rTrc6SFlktX.jpg', 'backdrop_path': '/rc6OFcSasL5YxBRPUQVwxmVF6h5.jpg'}                18
{'id': 403643, 'name': 'Troublesome Night Collection', 'poster_path': '/bPTx3TP4UJTHQfcLx4qIub9LXmi.jpg', 'backdrop_path': '/n3a7zF5GuxM2X8oPF6pKXqYS6ER.jpg'}       15
                                                                                                                                                                

In [20]:
';'.join(['Testing'])

'Testing'

In [21]:
# Filtering for an example collection
# Boolean mask of the rows whose belongs_to_collection value is not null
filter_notna = df['belongs_to_collection'].notna()
idx_notna = df[filter_notna].index
# df.loc[idx_notna]

# Take the first non-null value as a sample input for testing the parsing below
test_collection = df.loc[idx_notna[0],'belongs_to_collection']
test_collection

"{'id': 141086, 'name': 'Heavy Metal Collection', 'poster_path': '/tgPpYcsjSo1DK0wublqYItYDwSW.jpg', 'backdrop_path': '/iao9hIahX41T1Lxpa5h62J28rQF.jpg'}"

In [22]:
import ast, json
ast.literal_eval(test_collection)

{'id': 141086,
 'name': 'Heavy Metal Collection',
 'poster_path': '/tgPpYcsjSo1DK0wublqYItYDwSW.jpg',
 'backdrop_path': '/iao9hIahX41T1Lxpa5h62J28rQF.jpg'}

In [23]:
## Function to get just the collection name from a belongs_to_collection value

def get_collection(x, if_missing=None, as_str=True,sep="; "):
    """Return the collection name stored in one ``belongs_to_collection`` value.

    Args:
        x: String form of a dict with a 'name' entry, e.g.
            "{'id': 141086, 'name': 'Heavy Metal Collection', ...}".
            Any non-string value (e.g. NaN) counts as missing.
        if_missing: Value returned when ``x`` is not a string, does not parse
            to a dict, or has no (non-None) 'name'.
        as_str: Unused; kept so the signature matches the other helpers.
        sep: Unused; kept so the signature matches the other helpers.

    Returns:
        The collection name, or ``if_missing``.

    Raises:
        ValueError, SyntaxError: If ``x`` is a string that is not a valid
            Python literal.
    """
    import ast

    if not isinstance(x,str):
        return if_missing

    parsed = ast.literal_eval(x)
    # A literal that is not a dict (e.g. "[]") has no name to return.
    if not isinstance(parsed, dict):
        return if_missing

    collection = parsed.get('name')
    return if_missing if collection is None else collection


In [24]:
get_collection(test_collection)

'Heavy Metal Collection'

In [25]:
## Apply the function to the entire column
df['collection_name'] = df['belongs_to_collection'].apply(get_collection)
df['collection_name'].value_counts(dropna=False)

None                                        87841
Detective Conan Collection                     23
Exhibition on Screen Collection                23
Doraemon Collection                            18
Troublesome Night Collection                   15
                                            ...  
JL Family Ranch Collection                      1
¿Usted No Sabe Quien Soy Yo? - Colección        1
Anděl Páně (kolekce)                            1
Happy Bhag Jayegi Collection                    1
PVCU                                            1
Name: collection_name, Length: 2568, dtype: int64

In [26]:
df['belongs_to_collection'].isna().sum()

87841

### Countries

In [27]:

def get_countries(x, name_or_abbrev='abbrev',as_str=False,sep="; "):
    """Pull the country codes or names out of a `production_countries` value.

    Parameters
    ----------
    x : str, list, or missing value
        String representation of a list of dicts, e.g.
        "[{'iso_3166_1': 'US', 'name': 'United States of America'}]",
        an already-parsed list, or a missing value such as NaN/None.
    name_or_abbrev : str, default 'abbrev'
        'abbrev' reads the 'iso_3166_1' key; any other value reads 'name'.
    as_str : bool, default False
        If truthy, join the values with `sep` and return a single string.
    sep : str, default "; "
        Separator used when `as_str` is truthy.

    Returns
    -------
    list of str, or str when `as_str` is truthy. Missing or blank input
    gives an empty list / empty string (same result as the string '[]').

    Raises
    ------
    ValueError, SyntaxError
        If `x` is a non-blank string that `ast.literal_eval` cannot parse.
    """
    import ast

    if isinstance(x, str):
        x = ast.literal_eval(x) if x.strip() else []

    # NaN/None (or any other non-sequence) means "no countries listed"
    if not isinstance(x, (list, tuple)):
        x = []

    key = 'iso_3166_1' if name_or_abbrev == 'abbrev' else 'name'

    # `or ''` protects against an explicit None stored under the key
    countries = [(country.get(key) or '').strip() for country in x]

    if as_str:
        return str(sep).join(countries)
    return countries
  


In [28]:
df['production_countries_iso'] = df['production_countries'].map(lambda x: get_countries(x, name_or_abbrev='abbrev',as_str=True))
df['production_countries_iso'].value_counts(dropna=False)

US            24992
              22975
IN             3540
JP             3001
GB             2689
              ...  
EE; SE            1
TR; BA            1
IE; ZA; GB        1
AU; SG            1
BR; IN            1
Name: production_countries_iso, Length: 4722, dtype: int64

In [29]:
df['production_countries_name'] = df['production_countries'].map(lambda x: get_countries(x, name_or_abbrev='name',as_str=True))
df['production_countries_name'].value_counts(dropna=False)

United States of America                                   24992
                                                           22975
India                                                       3540
Japan                                                       3001
United Kingdom                                              2689
                                                           ...  
Turkey; Bosnia and Herzegovina                                 1
Ireland; South Africa; United Kingdom                          1
Australia; Singapore                                           1
Serbia; Spain; United Kingdom; United States of America        1
Brazil; India                                                  1
Name: production_countries_name, Length: 4721, dtype: int64

### Spoken Languages

In [30]:
df['spoken_languages']

0        [{'english_name': 'English', 'iso_639_1': 'en'...
1        [{'english_name': 'English', 'iso_639_1': 'en'...
2        [{'english_name': 'Hindi', 'iso_639_1': 'hi', ...
3        [{'english_name': 'Cantonese', 'iso_639_1': 'c...
4        [{'english_name': 'English', 'iso_639_1': 'en'...
                               ...                        
93054    [{'english_name': 'English', 'iso_639_1': 'en'...
93055    [{'english_name': 'English', 'iso_639_1': 'en'...
93056    [{'english_name': 'English', 'iso_639_1': 'en'...
93057    [{'english_name': 'Marathi', 'iso_639_1': 'mr'...
93058    [{'english_name': 'English', 'iso_639_1': 'en'...
Name: spoken_languages, Length: 93059, dtype: object

In [31]:

def get_languages(x, name_or_abbrev='abbrev',as_str=True,sep="; "):
    """Pull the language codes or English names out of a `spoken_languages` value.

    Parameters
    ----------
    x : str, list, or missing value
        String representation of a list of dicts, e.g.
        "[{'english_name': 'English', 'iso_639_1': 'en', ...}]",
        an already-parsed list, or a missing value such as NaN/None.
    name_or_abbrev : str, default 'abbrev'
        'abbrev' reads the 'iso_639_1' key; any other value reads
        'english_name'.
    as_str : bool, default True
        If truthy, join the values with `sep` and return a single string.
    sep : str, default "; "
        Separator used when `as_str` is truthy.

    Returns
    -------
    str when `as_str` is truthy, otherwise list of str. Missing or blank
    input gives an empty string / empty list (same result as the string '[]').

    Raises
    ------
    ValueError, SyntaxError
        If `x` is a non-blank string that `ast.literal_eval` cannot parse.
    """
    import ast

    if isinstance(x, str):
        x = ast.literal_eval(x) if x.strip() else []

    # NaN/None (or any other non-sequence) means "no languages listed"
    if not isinstance(x, (list, tuple)):
        x = []

    key = 'iso_639_1' if name_or_abbrev == 'abbrev' else 'english_name'

    # Collect the requested key for each language;
    # `or ''` protects against an explicit None stored under the key
    languages = [(lang.get(key) or '').strip() for lang in x]

    if as_str:
        return str(sep).join(languages)
    return languages
  


In [32]:
df['spoken_language_iso'] = df['spoken_languages'].map(lambda x: get_languages(x,name_or_abbrev='abbrev'))
df['spoken_language_iso'].value_counts(dropna=False)

en                36080
                  17108
ja                 3006
es                 2722
fr                 2462
                  ...  
fi; tl                1
en; tl; th            1
ro; it; en            1
en; fr; ca; es        1
ms; th                1
Name: spoken_language_iso, Length: 3541, dtype: int64

### ✅ Future To-Do: Add Financial Calculations Here

## Save Final

In [33]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93059 entries, 0 to 93058
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   adult                       93059 non-null  bool   
 1   backdrop_path               57215 non-null  object 
 2   belongs_to_collection       5218 non-null   object 
 3   budget                      93059 non-null  int64  
 4   genres                      93059 non-null  object 
 5   homepage                    25567 non-null  object 
 6   id                          93059 non-null  int64  
 7   imdb_id                     93059 non-null  object 
 8   original_language           93059 non-null  object 
 9   original_title              93059 non-null  object 
 10  overview                    91122 non-null  object 
 11  popularity                  93059 non-null  float64
 12  poster_path                 84711 non-null  object 
 13  production_companies        930

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification,Release Year,genre_names,production_companies_names,collection_name,production_countries_iso,production_countries_name,spoken_language_iso
0,False,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127,tt0113026,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.559,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,"[{'id': 51207, 'logo_path': None, 'name': 'Sul...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-22,0,86,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,False,5.5,22,,2000.0,Comedy; Music; Romance,Sullivan Street Productions; Michael Ritchie P...,,US,United States of America,en
1,False,,,0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977,tt0113092,en,For the Cause,Earth is in a state of constant war and two co...,3.393,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,"[{'id': 7405, 'logo_path': '/rfnws0uY8rsNAsrLb...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-11-15,0,100,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,False,4.958,12,,2000.0,Science Fiction,Dimension Films; Grand Design Entertainment; M...,,US,United States of America,en
2,False,/krEZg9tb6blhc7sV6Us2ZGQ0gA.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869,tt0116391,hi,Gang,"After falling prey to underworld, four friends...",2.748,/dYcuiiBDpPUvCcPbiWdH4REjGn3.jpg,[],"[{'iso_3166_1': 'IN', 'name': 'India'}]",2000-04-14,0,165,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,False,5.0,2,,2000.0,Drama; Action; Crime,,,IN,India,hi
3,False,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843,tt0118694,cn,花樣年華,Two neighbors become intimate after discoverin...,27.94,/iYypPT4bhqXfq1b6EnmxvRt6b2Y.jpg,"[{'id': 539, 'logo_path': '/iPLtePguIzOPNtAWfT...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",2000-09-29,14204632,99,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,False,8.109,2433,PG,2000.0,Drama; Romance,Block 2 Pictures; Orly Films; Jet Tone Films; ...,,FR; HK; NL; CN,France; Hong Kong; Netherlands; China,cn; fr; es
4,False,/vceiGZ3uavAEHlTA7v0GjQsGVKe.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,49511,tt0118852,en,Chinese Coffee,"When Harry Levine, an aging, unsuccessful Gree...",5.495,/nZGWnSuf1FIuzyEuMRZHHZWViAp.jpg,"[{'id': 67930, 'logo_path': None, 'name': 'Cha...","[{'iso_3166_1': 'US', 'name': 'United States o...",2000-09-02,0,99,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,False,6.6,56,R,2000.0,Drama,Chal Productions; The Shooting Gallery,,US,United States of America,en


In [34]:
fname_out_final = FPATHS['data']['raw']['cleaned-tmdb-movie-info-tmdb_csv']
fname_out_final

'Data-NLP/tmdb-movie-info.csv.gz'

### Change belongs_to_collection to bool

In [38]:
# Replace the raw collection string with a has-collection flag.
# Guarded so that re-running this cell is a no-op: once the column is boolean
# it contains no missing values, and .notna() would flip every row to True.
if not pd.api.types.is_bool_dtype(df['belongs_to_collection']):
    df['belongs_to_collection'] = df['belongs_to_collection'].notna()
df['belongs_to_collection'].value_counts()

False    87841
True      5218
Name: belongs_to_collection, dtype: int64

##### Change to just relocating the drop columns to index

In [39]:
# Build the final dataframe: leave out the raw nested columns that were
# replaced by flattened versions, and push rarely-used columns to the end.

# Raw columns superseded by the *_names / *_iso columns created in this notebook
# ('belongs_to_collection' is intentionally not listed here, so it is kept)
drop_replaced_cols = ['production_companies', 'production_countries',
                      'genres', 'spoken_languages']

# Columns to relocate to the back of the dataframe
move_back_of_df_cols = ['poster_path', 'adult', 'backdrop_path', 'homepage']

### EITHER DROP OR REORDER
## Drop-only alternative:
# df_final = df.drop(columns=drop_replaced_cols, errors='ignore')

## Reorder: remaining columns first (original order), relocated columns last
excluded_cols = drop_replaced_cols + move_back_of_df_cols
leading_cols = df.drop(columns=excluded_cols).columns.tolist()
final_columns = leading_cols + move_back_of_df_cols
df_final = df[final_columns].copy()

df_final.head()

Unnamed: 0,belongs_to_collection,budget,id,imdb_id,original_language,original_title,overview,popularity,release_date,revenue,runtime,status,tagline,title,video,vote_average,vote_count,certification,Release Year,genre_names,production_companies_names,collection_name,production_countries_iso,production_countries_name,spoken_language_iso,poster_path,adult,backdrop_path,homepage
0,False,10000000,62127,tt0113026,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.559,2000-09-22,0,86,Released,Try to remember the first time magic happened,The Fantasticks,False,5.5,22,,2000.0,Comedy; Music; Romance,Sullivan Street Productions; Michael Ritchie P...,,US,United States of America,en,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,False,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,
1,False,0,110977,tt0113092,en,For the Cause,Earth is in a state of constant war and two co...,3.393,2000-11-15,0,100,Released,The ultimate showdown on a forbidden planet.,For the Cause,False,4.958,12,,2000.0,Science Fiction,Dimension Films; Grand Design Entertainment; M...,,US,United States of America,en,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,False,,
2,False,0,442869,tt0116391,hi,Gang,"After falling prey to underworld, four friends...",2.748,2000-04-14,0,165,Released,,Gang,False,5.0,2,,2000.0,Drama; Action; Crime,,,IN,India,hi,/dYcuiiBDpPUvCcPbiWdH4REjGn3.jpg,False,/krEZg9tb6blhc7sV6Us2ZGQ0gA.jpg,
3,False,150000,843,tt0118694,cn,花樣年華,Two neighbors become intimate after discoverin...,27.94,2000-09-29,14204632,99,Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,False,8.109,2433,PG,2000.0,Drama; Romance,Block 2 Pictures; Orly Films; Jet Tone Films; ...,,FR; HK; NL; CN,France; Hong Kong; Netherlands; China,cn; fr; es,/iYypPT4bhqXfq1b6EnmxvRt6b2Y.jpg,False,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,
4,False,0,49511,tt0118852,en,Chinese Coffee,"When Harry Levine, an aging, unsuccessful Gree...",5.495,2000-09-02,0,99,Released,There's a fine line between friendship and bet...,Chinese Coffee,False,6.6,56,R,2000.0,Drama,Chal Productions; The Shooting Gallery,,US,United States of America,en,/nZGWnSuf1FIuzyEuMRZHHZWViAp.jpg,False,/vceiGZ3uavAEHlTA7v0GjQsGVKe.jpg,


In [40]:
fname_out_final

'Data-NLP/tmdb-movie-info.csv.gz'

In [41]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93059 entries, 0 to 93058
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   belongs_to_collection       93059 non-null  bool   
 1   budget                      93059 non-null  int64  
 2   id                          93059 non-null  int64  
 3   imdb_id                     93059 non-null  object 
 4   original_language           93059 non-null  object 
 5   original_title              93059 non-null  object 
 6   overview                    91122 non-null  object 
 7   popularity                  93059 non-null  float64
 8   release_date                90778 non-null  object 
 9   revenue                     93059 non-null  int64  
 10  runtime                     93059 non-null  int64  
 11  status                      93059 non-null  object 
 12  tagline                     33222 non-null  object 
 13  title                       930

In [42]:
# Make sure the output folder exists so the save works on a fresh checkout
out_dir = os.path.dirname(fname_out_final)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Save the final dataframe as a gzip-compressed csv (index not written)
df_final.to_csv(fname_out_final, compression='gzip',index=False)

In [43]:
pd.read_csv(fname_out_final, low_memory=False, lineterminator='\n')

Unnamed: 0,belongs_to_collection,budget,id,imdb_id,original_language,original_title,overview,popularity,release_date,revenue,runtime,status,tagline,title,video,vote_average,vote_count,certification,Release Year,genre_names,production_companies_names,collection_name,production_countries_iso,production_countries_name,spoken_language_iso,poster_path,adult,backdrop_path,homepage
0,False,10000000,62127,tt0113026,en,The Fantasticks,Two rural teens sing and dance their way throu...,2.559,2000-09-22,0,86,Released,Try to remember the first time magic happened,The Fantasticks,False,5.500,22,,2000.0,Comedy; Music; Romance,Sullivan Street Productions; Michael Ritchie P...,,US,United States of America,en,/hfO64mXz3DgUxkBVU7no2UWRP7x.jpg,False,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,
1,False,0,110977,tt0113092,en,For the Cause,Earth is in a state of constant war and two co...,3.393,2000-11-15,0,100,Released,The ultimate showdown on a forbidden planet.,For the Cause,False,4.958,12,,2000.0,Science Fiction,Dimension Films; Grand Design Entertainment; M...,,US,United States of America,en,/h9bWO13nWRGZJo4XVPiElXyrRMU.jpg,False,,
2,False,0,442869,tt0116391,hi,Gang,"After falling prey to underworld, four friends...",2.748,2000-04-14,0,165,Released,,Gang,False,5.000,2,,2000.0,Drama; Action; Crime,,,IN,India,hi,/dYcuiiBDpPUvCcPbiWdH4REjGn3.jpg,False,/krEZg9tb6blhc7sV6Us2ZGQ0gA.jpg,
3,False,150000,843,tt0118694,cn,花樣年華,Two neighbors become intimate after discoverin...,27.940,2000-09-29,14204632,99,Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,False,8.109,2433,PG,2000.0,Drama; Romance,Block 2 Pictures; Orly Films; Jet Tone Films; ...,,FR; HK; NL; CN,France; Hong Kong; Netherlands; China,cn; fr; es,/iYypPT4bhqXfq1b6EnmxvRt6b2Y.jpg,False,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,
4,False,0,49511,tt0118852,en,Chinese Coffee,"When Harry Levine, an aging, unsuccessful Gree...",5.495,2000-09-02,0,99,Released,There's a fine line between friendship and bet...,Chinese Coffee,False,6.600,56,R,2000.0,Drama,Chal Productions; The Shooting Gallery,,US,United States of America,en,/nZGWnSuf1FIuzyEuMRZHHZWViAp.jpg,False,/vceiGZ3uavAEHlTA7v0GjQsGVKe.jpg,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93054,False,0,1167732,tt8736506,en,To Die Alone,After suffering a terrible injury while hiking...,5.474,2024-02-10,0,84,In Production,,To Die Alone,False,0.000,0,NR,2024.0,Thriller; Drama,Glass Creek Films; Charming Stranger Films,,US,United States of America,en,/d7rHwkRlCHhVGBTQQkvGALeC5oz.jpg,False,/dyQvD0BDlWk187fDBmJTU2uUVGH.jpg,https://www.todiealonefilm.com
93055,False,0,560016,tt9214772,en,Monkey Man,An unlikely hero emerges from prison to take o...,7.925,2024-01-22,0,0,Post Production,,Monkey Man,False,0.000,0,,2024.0,Thriller; Drama,Thunder Road; 87Eleven; Bron Studios; Creative...,,CA; US,Canada; United States of America,en; hi,,False,,
93056,False,0,1163894,tt9357860,en,The Hopeful,Aboard a steamship sailing across the Atlantic...,2.594,2024-02-14,0,90,Post Production,The end of the world is just the beginning.,The Hopeful,False,0.000,0,,2024.0,Drama,Advent Hope Productions; Hope Studios; Kyle Po...,,CA,Canada,en,/9zRA1Vefx1gJdt6fKPDt4JJ6FGC.jpg,False,,
93057,False,0,1184693,tt9680562,mr,सत्यशोधक,The film follows the life of social reformer a...,4.889,2024-01-05,0,145,Released,HE BRINGS THE TORCH OF REVOLUTION TO ENLIGHTEN...,Satyashodhak,False,0.000,0,,2024.0,Drama; History,Samata Films; Abhita Films Production Pvt. Ltd...,,IN,India,mr,/jzQPp0oSYPgu8crP3aq8EIjQ6Ti.jpg,False,/C28T7GAlCJQFWVDB4vUV8eZyf9.jpg,


## APPENDIX

### Experimenting with ast.literal_eval

In [40]:
explode_genre_dicts = df['genres'].map(ast.literal_eval).explode()
explode_genre_dicts

0                  {'id': 35, 'name': 'Comedy'}
0                {'id': 10402, 'name': 'Music'}
0              {'id': 10749, 'name': 'Romance'}
1        {'id': 878, 'name': 'Science Fiction'}
2                   {'id': 18, 'name': 'Drama'}
                          ...                  
93056               {'id': 18, 'name': 'Drama'}
93057               {'id': 18, 'name': 'Drama'}
93057             {'id': 36, 'name': 'History'}
93058              {'id': 27, 'name': 'Horror'}
93058            {'id': 53, 'name': 'Thriller'}
Name: genres, Length: 161716, dtype: object

In [41]:
# explode_genre_dicts.map(pd.DataFrame)