In [None]:
# import packages
import pandas as pd
import numpy as np
from datetime import datetime

import ast

In [None]:
# read the datasets
orginal_data_path = 'original_data/'

# Load every CSV in a single unpacking statement; the order of the tuple below
# fixes both the read order and which DataFrame name each file is bound to.
movies_metadata, ratings, links, keywords, credits = (
    pd.read_csv(f"{orginal_data_path}{table}.csv")
    for table in ('movies_metadata', 'ratings', 'links', 'keywords', 'credits')
)

# unit='s' tells pandas to read 'timestamp' as seconds since the Unix epoch.
ratings['date_time'] = pd.to_datetime(ratings['timestamp'], unit='s')

# movies_metadata

In [None]:
# general data type cleaning
# Keep only rows whose 'adult' flag is a well-formed 'True'/'False' string.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below do not write into a slice of the raw frame
# (which would trigger SettingWithCopyWarning).
movies_metadata = movies_metadata[movies_metadata['adult'].isin(['True', 'False'])].copy()
movies_metadata['id'] = movies_metadata['id'].astype(int)
movies_metadata['popularity'] = movies_metadata['popularity'].astype(float)
# errors='coerce' turns unparseable dates into NaT instead of raising.
movies_metadata['release_date'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce')

movies_metadata['adult'] = movies_metadata['adult'].map({'True': True, 'False': False})

movies_metadata['budget'] = movies_metadata['budget'].astype(int)
# Treat 0 as "missing" in these columns. np.nan is used rather than None:
# depending on the pandas version, replace(0, None) either forward-fills the
# zeros with the previous row's value or inserts None objects and turns the
# column into object dtype. np.nan keeps the columns numeric (float).
for col in ['budget', 'revenue', 'vote_average', 'vote_count']:
    movies_metadata[col] = movies_metadata[col].replace(0, np.nan)

In [None]:
def get_list_dict(original_list, name, type = 'list'):
    """Extract the `name` entry (or entries) from a stringified Python literal.

    Parameters
    ----------
    original_list : str
        Text that is parsed with ``ast.literal_eval``. With ``type='list'`` it
        should evaluate to an iterable of subscriptable items (e.g. a list of
        dicts); with ``type='dict'`` to a single subscriptable object.
    name : hashable
        Key looked up in every item (``'list'``) or in the parsed object
        (``'dict'``).
    type : {'list', 'dict'}, default 'list'
        Shape of the parsed literal.

    Returns
    -------
    list, object or None
        For ``'list'``: a list holding ``item[name]`` for every item.
        For ``'dict'``: the value stored under `name`.
        ``None`` when the input cannot be parsed, when the key is missing
        (for ``'list'`` a single item without the key voids the whole cell),
        or when `type` is neither ``'list'`` nor ``'dict'``.
    """
    if type not in ('list', 'dict'):
        return None

    # Only failures that a malformed or missing cell can cause are mapped to
    # "no value". Anything else (e.g. KeyboardInterrupt) propagates instead of
    # being silently swallowed by a bare except.
    expected_errors = (
        ValueError, SyntaxError, TypeError, KeyError, IndexError,
        MemoryError, RecursionError,
    )
    try:
        parsed = ast.literal_eval(original_list)
        if type == 'list':
            return [item[name] for item in parsed]
        return parsed[name]
    except expected_errors:
        return None

In [None]:
def count_times(column_list, re = False):
    """Print how many distinct values a column holds; optionally return the counts.

    Parameters
    ----------
    column_list : pandas.Series
        Either a column whose cells are lists (every list item is counted) or
        a column of scalar values. Its ``name`` appears in the printed message.
    re : bool, default False
        When true, the occurrence counts are returned as a dict.

    Returns
    -------
    dict or None
        ``{value: number_of_occurrences}`` when `re` is true, otherwise None.
        For a scalar column the dict comes from ``value_counts()``, which
        leaves missing values out, while the printed total still counts a
        missing value as one distinct entry (``unique()`` keeps it).
    """
    # Inspect the cells themselves rather than the one labelled 0: that label
    # may not exist after filtering, and the first cell may be None even
    # though the column holds lists.
    holds_lists = any(isinstance(cell, list) for cell in column_list)

    if holds_lists:
        count_dict = dict()
        for cell_list in column_list:
            # Cells that are not lists (e.g. None for rows that could not be
            # parsed) contribute nothing.
            if not isinstance(cell_list, list):
                continue
            for item in cell_list:
                count_dict[item] = count_dict.get(item, 0) + 1
        final = len(count_dict)
    else:
        final = len(column_list.unique())
        # Build the counts here as well so that re=True works for scalar
        # columns instead of failing on an undefined count_dict.
        count_dict = column_list.value_counts().to_dict() if re else None

    print(f"Total unique {column_list.name}: {final}")

    if re:
        return count_dict

In [None]:
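# # print the languages that appear at most 100 times
# # NOTE: lang_count_dict must be built first, e.g. with count_times(..., re = True)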
# onehundred_time_langs_list = []

# for lang in lang_count_dict.keys():
#     if lang_count_dict[lang] <= 100:
#         onehundred_time_langs_list.append(lang)

# print(onehundred_time_langs_list)

## Spoken Languages

In [None]:
# Collect the 'iso_639_1' entry of every item in each 'spoken_languages' cell.
movies_metadata['lang_code_list'] = movies_metadata['spoken_languages'].apply(
    lambda raw_cell: get_list_dict(raw_cell, name='iso_639_1', type='list')
)

# Report the number of distinct values found across all rows.
count_times(column_list=movies_metadata['lang_code_list'])

## Production Companies

In [None]:
# Collect the 'name' entry of every item in each 'production_companies' cell.
movies_metadata['pro_comp_list'] = movies_metadata['production_companies'].apply(
    lambda raw_cell: get_list_dict(raw_cell, name='name', type='list')
)

# Report the number of distinct values found across all rows.
count_times(column_list=movies_metadata['pro_comp_list'])

## Production Countries

In [None]:
# Collect the 'iso_3166_1' entry of every item in each 'production_countries' cell.
movies_metadata['pro_coun_list'] = movies_metadata['production_countries'].apply(
    lambda raw_cell: get_list_dict(raw_cell, name='iso_3166_1', type='list')
)

# Report the number of distinct values found across all rows.
count_times(column_list=movies_metadata['pro_coun_list'])

## Belongs to Collection

In [None]:
# 'belongs_to_collection' is parsed as a single object (type='dict'), from
# which only the 'id' entry is kept.
movies_metadata['collection'] = movies_metadata['belongs_to_collection'].apply(
    lambda raw_cell: get_list_dict(raw_cell, name='id', type='dict')
)

# Report the number of distinct values found across all rows.
count_times(column_list=movies_metadata['collection'])

## Genre

In [None]:
# Collect the 'name' entry of every item in each 'genres' cell.
movies_metadata['genre_list'] = movies_metadata['genres'].apply(
    lambda raw_cell: get_list_dict(raw_cell, name='name', type='list')
)

# re=True also hands back the per-genre occurrence counts for later cells.
all_genres = count_times(column_list=movies_metadata['genre_list'], re=True)

In [None]:
# print one-time genre
# A genre is "one-time" when its occurrence count in all_genres is exactly 1.
onetime_genres = [label for label, occurrences in all_genres.items() if occurrences == 1]

print(onetime_genres)

In [None]:
# get the dummy variables for genres
# Genres that occur only once are skipped here instead of being created and
# then dropped again; the resulting set and order of columns is the same.
for genre in all_genres:
    if genre in onetime_genres:
        continue
    # Rows whose genre_list is not a list (get_list_dict yields None when a
    # cell cannot be parsed) get 0 instead of raising TypeError on
    # `genre in None`.
    movies_metadata[f"genre_{genre}"] = movies_metadata['genre_list'].apply(
        lambda x: int(isinstance(x, list) and genre in x)
    )

## Save as .csv File

In [None]:
# Raw columns removed from the frame before it is saved.
drop_list = [
    'spoken_languages',
    'production_companies',
    'production_countries',
    'belongs_to_collection',
    'genres',
]
movies_metadata = movies_metadata.drop(columns=drop_list)

In [None]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
movies_metadata.to_csv(path_or_buf='movies_cleaned.csv', index=False)