# Preprocess TMDB 5000 Movies
This notebook converts the list columns to one-hot-encoded columns, preserving only the `name` member of the JSON objects in each list.  The `id` members are not kept, since they are redundant.

In [None]:
import pandas as pd
import ast
import json

# Directory prefix (with trailing slash) joined to the data set name to build
# both the input CSV path and the '_prepped' output CSV path.
DATA_DIR_NAME = '/Users/karenblakemore/merck/data/'

# Base file name of the data set, without the '.csv' extension.
DATA_SET_NAME = 'tmdb_5000_movies'

# Columns read from the input frame, in the order they are processed.
COLUMNS = [
    'budget',
    'genres',
    'homepage',
    'id',
    'keywords',
    'original_language',
    'original_title',
    'overview',
    'popularity',
    'production_companies',
    'production_countries',
    'spoken_languages',
    'release_date',
    'revenue',
    'runtime',
    'status',
    'tagline',
    'title',
    'vote_average',
    'vote_count',
]

def list_column(value):
    """Return True if ``value`` is a string that parses to a Python list.

    The value is parsed with ``ast.literal_eval``; anything that fails to
    parse (free text, numbers passed as non-strings, NaN, ...) or that parses
    to something other than a ``list`` yields False.

    Args:
        value: A single cell value, typically a string.

    Returns:
        bool: True only when the parsed literal is a ``list``.
    """
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the failures literal_eval raises for input that is not a
        # valid literal; anything else (e.g. KeyboardInterrupt) propagates.
        return False
    # Successfully parsing is not enough: '5' or "{'a': 1}" are valid
    # literals but not lists.
    return isinstance(parsed, list)
    
def dict_to_string(value):
    """Extract the lower-cased ``name`` of every object in a JSON list.

    Args:
        value: JSON text encoding a list of objects, each with a ``name`` key.

    Returns:
        list: The ``name`` values, lower-cased, in their original order.
    """
    return [entry['name'].lower() for entry in json.loads(value)]

# Load the raw data set from '<DATA_DIR_NAME><DATA_SET_NAME>.csv'.
pdf = pd.read_csv(DATA_DIR_NAME + DATA_SET_NAME + '.csv')

# Per-column pieces of the output frame, joined once after the loop instead
# of re-concatenating (and re-copying) the growing frame on every iteration.
prepped_parts = []

for col in COLUMNS:
    print(pdf[col].describe())

    if pdf[col].apply(list_column).all():
        print('\nLIST COLUMN: ', col, '\n')
        column = pdf[col].apply(dict_to_string)

        # One entry per (row, name) pair, keeping the original row label as
        # the index. Rows whose list is empty explode to NaN and are dropped,
        # so they end up absent from dummy_columns (NaN after the concat).
        exploded = column.explode().dropna()

        # dtype=int keeps the summed counts integral regardless of the pandas
        # default dummy dtype, so the string forms are '0', '1', ...
        # groupby(level=0).sum() replaces DataFrame.sum(level=0), which was
        # removed in pandas 2.0.
        dummy_columns = (
            pd.get_dummies(exploded, prefix=col, prefix_sep='-', dtype=int)
            .groupby(level=0)
            .sum()
            .astype('str')
            .replace('0', '')
        )
        prepped_parts.append(dummy_columns)
    else:
        # Non-list columns are carried over unchanged.
        prepped_parts.append(pdf[col])

# Align all pieces on the row index; pd.concat rejects an empty list.
pdf_prepped = pd.concat(prepped_parts, axis=1) if prepped_parts else pd.DataFrame()

display(pdf_prepped.head())

pdf_prepped.to_csv(DATA_DIR_NAME + DATA_SET_NAME + '_prepped.csv', index=False)