# Part 3A - Planning & Preparing Data

- 01/26/24

- NEED TO PREPARE/PLAN FOR FULL APP WITH TABULAR DATA + REVIEWS + MANUSCRIPTS.

### Prep TMDB API Data (Was Split Across Several Notebooks)

In [24]:
## Importing custom function for project
%load_ext autoreload
%autoreload 2
import custom_functions as fn

import os,glob,json
import pandas as pd
from pprint import pprint

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Optional: install the pip-distributed helper package (`dojo_ds`, imported as `ds`)
# as an alternative to the local `custom_functions` module (imported as `fn`).
# Left commented out so a normal run does not reinstall anything.
# NOTE(review): the recorded '1.0.9' output presumably came from an earlier run
# with these lines uncommented — confirm before relying on that version.
# !pip install -U dojo_ds -q
# import dojo_ds as ds
# ds.__version__

'1.0.9'

## Setting Filepaths Config Files

In [21]:
import json, os
from pprint import pprint

# Location of the JSON file that stores every project filepath.
FPATHS_FILE = "config/filepaths.json"

# Make sure the folder holding the config file exists before anything is written.
config_dir = os.path.dirname(FPATHS_FILE)
os.makedirs(config_dir, exist_ok=True)

# Central registry of project filepaths, grouped by purpose.
# Key order is kept stable because it is written to the JSON file as-is.
FPATHS = {
    "data": {
        "raw": {
            # Movie reviews only (no movie data or categories)
            "movie-reviews-tmdb_csv": "Data-NLP/tmdb-movie-reviews.csv.gz",
            # TMDB movie info to use for...NLP? (Open question: why save this separately?)
            "movie-info-tmdb_csv": "Data-NLP/tmdb-movie-info.csv.gz",
            # Movie reviews combined with the financial data
            "reviews-with-movie-info_json": "Data-NLP/combined-tmdb-movie-reviews-with-info.json",
            # Undecided / currently disabled entries:
            #   "movie-reviews-with-info_json": ...??
            #   "eda": "Data-NLP/eda-movie-reviews.csv.gz",
            "processed-reviews-spacy_joblib": "Data-NLP/processed-nlp-reviews.joblib",
            "processed-reviews-spacy_json": "Data-NLP/processed-nlp-reviews.json",
        },
        "app": {
            # Tabular data version loaded by the streamlit app (unless we decide to use AWS)
            "movie-data_csv": "app-assets/movie-data-streamlit.csv",
            # Undecided: "movie-reviews": ???
            #   Same as raw > movie-reviews-with-info-json?
            #   or same as ml > 'reviews-with-target-json'??
        },
        "ml-nlp": {
            # Final dataframe for NLP machine learning (use the Data-NLP folder for these models)
            "reviews-with-target_json": "Data-NLP/modeling/processed-nlp-reviews-for-ml.json",
            # (X_train, y_train)
            "train_joblib": "Data-NLP/modeling/training-data.joblib",
            # (X_test, y_test)
            "test_joblib": "Data-NLP/modeling/testing-data.joblib",
        },
        "ml-tabular": {
            # Final dataframe for tabular machine learning (use the Data folder for these models)
            # Previously considered: "Data-NLP/modeling/processed-nlp-reviews-for-ml.json"
            "movie-info-with-ml-target_json": "Data/modeling/processed-movie-data-for-ml.json",
            # (X_train, y_train)
            "train_joblib": "Data/modeling/training-data.joblib",
            # (X_test, y_test)
            "test_joblib": "Data/modeling/testing-data.joblib",
        },
        "nn": {
            # train_ds
            "train_dir": "Data/modeling/training-data-tf/",
            # test_ds
            "test_dir": "Data/modeling/testing-data-tf/",
        },
    },
    "images": {
        "banner": "images/app-banner.png",
    },
    # Additional metadata (target lookup, etc.)
    "metadata": {
        "target_lookup": "Data-NLP/target-lookup.json",
    },
    # EDA images / HTML assets to be displayed in the app
    "eda": {
        "wordclouds-by-roi_png": "images/wordclouds-compare-roi.png",
        "wordclouds-by-rating_png": "images/wordclouds-compare-rating.png",
        "scattertext-by-roi_html": "app-assets/scattertext-roi.html",
        "scattertext-by-rating_html": "app-assets/scattertext-rating.html",
    },
    "models": {
        # Machine learning models and results
        "ml": {
            "bayes_joblib": "Models/bayes-clf.joblib",
            "random_forest_joblib": "Models/random-forest.joblib",
            "logreg_joblib": "Models/log-reg.joblib",
        },
        # Neural networks and results
        "nn": {
            "LSTM_dir": "Models/keras/lstm/",
            "GRU_dir": "Models/keras/gru/",
            "Attention_model_dir": "Models/keras/attn/",
        },
    },
}

# `fn` is the local package; swap in `ds` when using the pip-installed version:
#   ds.utils.create_directories_from_paths(FPATHS)
# NOTE(review): presumably creates the folders needed by each path above — confirm in custom_functions.
fn.utils.create_directories_from_paths(FPATHS)

# Show the full dictionary for a visual check.
print('[i] FPATHS Dictionary:\n')
pprint(FPATHS)

# Persist the filepaths so other notebooks and the app can load them.
with open(FPATHS_FILE, "w") as out_file:
    json.dump(FPATHS, out_file)
    print(f"\n[i] Saved FPATHS to {FPATHS_FILE}")

[i] FPATHS Dictionary:

{'data': {'app': {'movie-data_csv': 'app-assets/movie-data-streamlit.csv'},
          'ml-nlp': {'reviews-with-target_json': 'Data-NLP/modeling/processed-nlp-reviews-for-ml.json',
                     'test_joblib': 'Data-NLP/modeling/testing-data.joblib',
                     'train_joblib': 'Data-NLP/modeling/training-data.joblib'},
          'ml-tabular': {'movie-info-with-ml-target_json': 'Data/modeling/processed-movie-data-for-ml.json',
                         'test_joblib': 'Data/modeling/testing-data.joblib',
                         'train_joblib': 'Data/modeling/training-data.joblib'},
          'nn': {'test_dataset': 'Data/modeling/testing-data-tf/',
                 'train_dataset': 'Data/modeling/training-data-tf/'},
          'raw': {'movie-info-tmdb_csv': 'Data-NLP/tmdb-movie-info.csv.gz',
                  'movie-reviews-tmdb_csv': 'Data-NLP/tmdb-movie-reviews.csv.gz',
                  'processed-reviews-spacy_joblib': 'Data-NLP/processed-nlp-re

In [22]:
# Round-trip check: read the saved config back in to confirm it is valid JSON.
with open(FPATHS_FILE) as saved_config:
    TEST = json.loads(saved_config.read())
# Alternative display: pprint(TEST)
TEST

{'data': {'raw': {'movie-reviews-tmdb_csv': 'Data-NLP/tmdb-movie-reviews.csv.gz',
   'movie-info-tmdb_csv': 'Data-NLP/tmdb-movie-info.csv.gz',
   'reviews-with-movie-info_json': 'Data-NLP/combined-tmdb-movie-reviews-with-info.json',
   'processed-reviews-spacy_joblib': 'Data-NLP/processed-nlp-reviews.joblib',
   'processed-reviews-spacy_json': 'Data-NLP/processed-nlp-reviews.json'},
  'app': {'movie-data_csv': 'app-assets/movie-data-streamlit.csv'},
  'ml-nlp': {'reviews-with-target_json': 'Data-NLP/modeling/processed-nlp-reviews-for-ml.json',
   'train_joblib': 'Data-NLP/modeling/training-data.joblib',
   'test_joblib': 'Data-NLP/modeling/testing-data.joblib'},
  'ml-tabular': {'movie-info-with-ml-target_json': 'Data/modeling/processed-movie-data-for-ml.json',
   'train_joblib': 'Data/modeling/training-data.joblib',
   'test_joblib': 'Data/modeling/testing-data.joblib'},
  'nn': {'train_dataset': 'Data/modeling/training-data-tf/',
   'test_dataset': 'Data/modeling/testing-data-tf/'}},

In [23]:
# List the groups stored under the "data" section of the reloaded config.
TEST["data"].keys()

dict_keys(['raw', 'app', 'ml-nlp', 'ml-tabular', 'nn'])

## Combining and Cleaning TMDB API Data 
**(Also done at the end of the API calls in Notebook 2A)**