In [1]:
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning category for the whole kernel,
# including pandas' SettingWithCopyWarning and deprecation notices.
# Consider narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

# Load packages

In [21]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns 

import ast
from tqdm import tqdm, tqdm_notebook
from datetime import datetime

import os

#plt.style.use("dark_background")

# Seed value kept in one place for reproducibility.
# NOTE(review): no cell visible in this notebook reads it — confirm it is
# actually passed to whatever stochastic step is meant to use it.
random_seed = 2019


# Load data  

In [22]:
from utils import load_data

# load_data() returns eight objects in a fixed order: the movies table first,
# followed by seven tables that are later handed to join_macroeconomics().
(
    movies_df,
    cci_df,
    cpi_df,
    gdp_df,
    rir_df,
    uer_df,
    eacc_df,
    rotwcc_df,
) = load_data()

# Data Processing

In [23]:
from utils import clean_data, date_features, eval_dict_columns

# Apply the three preprocessing helpers in order; each step receives the
# frame returned by the previous one.
movies_df = (
    movies_df
    .pipe(clean_data)
    .pipe(date_features)
    .pipe(eval_dict_columns)
)

# Missing Values

In [24]:
# Keep only films released in 1961 or later. The explicit .copy() makes the
# result an independent frame, so the column assignments below never write
# into a view of the unfiltered data (the cause of SettingWithCopyWarning).
movies_df = movies_df[movies_df['year'] >= 1961].copy()

# A budget of 0 is a placeholder for "unknown". Blank it out BEFORE taking the
# per-year median: the median skips NaN, but it would count the zeros and be
# pulled towards (or all the way to) 0 in years where many budgets are unknown.
movies_df['budget'] = movies_df['budget'].mask(movies_df['budget'] == 0)

# Median of the *known* budgets of each release year, aligned row-by-row
# with movies_df.
budget_median_in_buckets = movies_df.groupby('year')['budget'].transform('median')

# Impute unknown budgets with their year's median. A year with no known
# budget at all has a NaN median, so its rows stay NaN after this step.
movies_df['budget'] = movies_df['budget'].fillna(budget_median_in_buckets)

# Scores per Cast

In [25]:
from utils import cast_crew_features

# "Scores per Cast" step: delegated entirely to utils.cast_crew_features,
# which returns the frame used by all later cells (it reports its progress
# through tqdm bars).
movies_df = movies_df.pipe(cast_crew_features)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3116 [00:00<?, ?it/s]

  0%|          | 0/3116 [00:00<?, ?it/s]

  0%|          | 0/3116 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/8416 [00:00<?, ?it/s]

  0%|          | 0/8416 [00:00<?, ?it/s]

  0%|          | 0/8416 [00:00<?, ?it/s]

# Categorical Cut

In [26]:
from utils import categorical_cut

# One threshold per multi-valued categorical column, passed to categorical_cut.
# Presumably a minimum occurrence count: columns with threshold 0 keep the
# same number of categories in the before/after summary — confirm in utils.
cutoff_thresholds = dict(
    genres=0,
    production_companies=30,
    production_countries=0,
    spoken_languages=100,
    Keywords=30,
    cast=0,
    crew=0,
)

# categorical_cut returns the transformed frame together with a summary
# table, which the next cell displays.
movies_df, results_summary_df = categorical_cut(movies_df, cutoff_thresholds)

In [27]:
# Display the summary returned by categorical_cut: one row per column with
# its "before" and "after" counts.
results_summary_df

Unnamed: 0_level_0,before,after
name,Unnamed: 1_level_1,Unnamed: 2_level_1
genres,20,20
production_companies,3640,20
production_countries,74,74
spoken_languages,56,6
Keywords,7191,49
cast,36692,36692
crew,37591,37591


In [28]:
from utils import prepare_additional_features

# Delegate to utils.prepare_additional_features and continue with the frame
# it returns.
movies_df = movies_df.pipe(prepare_additional_features)

# Macroeconomics

In [29]:
from utils import join_macroeconomics

# Hand the movies frame plus the seven tables loaded alongside it to
# join_macroeconomics, keeping the same positional order as load_data().
movies_df = movies_df.pipe(
    join_macroeconomics,
    cci_df,
    cpi_df,
    gdp_df,
    rir_df,
    uer_df,
    eacc_df,
    rotwcc_df,
)

# Check NaNs

In [34]:
# Fraction of missing values per column (mean of the boolean NaN mask).
nan_count_df = movies_df.isna().mean()
# Show only the columns that contain at least one NaN.
nan_count_df[nan_count_df > 0]

Series([], dtype: float64)

In [31]:
# Per-year median of every numeric column, aligned row-by-row with movies_df.
# numeric_only=True keeps this working on pandas >= 2.0, where taking the
# median of a non-numeric column raises TypeError instead of silently
# skipping that column; on all-numeric frames the result is unchanged.
df_median_in_buckets = movies_df.groupby('year').transform('median', numeric_only=True)

# Fill remaining gaps with the median of the same release year, matched by
# column name and row index. Re-assigning (instead of inplace=True) avoids
# writing into a frame that may be a slice of another and keeps the cell
# safe to re-run.
movies_df = movies_df.fillna(df_median_in_buckets)

In [32]:
# Recompute the per-column share of missing values (mean of the NaN mask).
nan_count_df = movies_df.isna().mean()
# Show only the columns that still contain at least one NaN.
nan_count_df[nan_count_df > 0]

Series([], dtype: float64)

# Outliers

In [33]:
# Return on investment relative to budget: (revenue - budget) / budget.
movies_df = movies_df.assign(
    roi=lambda frame: (frame['revenue'] - frame['budget']) / frame['budget']
)
# Outlier filter: keep only rows whose ROI is below 150. Rows with a NaN ROI
# fail the comparison and are removed as well.
movies_df = movies_df[movies_df['roi'] < 150]

# Drop columns

In [35]:
from utils import drop_columns

# Delegate column pruning to utils.drop_columns and keep the frame it returns.
movies_df = movies_df.pipe(drop_columns)

# Save data

In [36]:
# Remove the identifier column before persisting; `columns=` already names
# the axis, so no separate `axis` argument is needed.
movies_df = movies_df.drop(columns='id')
# Write the processed frame to disk (to_csv also writes the index by default).
movies_df.to_csv('data/processed_data.csv')

In [17]:
# Display the column labels of the processed frame.
movies_df.columns

Index(['belongs_to_collection', 'budget', 'production_countries', 'runtime',
       'revenue', 'year', 'month', 'day', 'dayofweek', 'quarter',
       ...
       'isOriginalLanguageEng', 'production_countries_count',
       'production_companies_count', 'country_category', 'cci', 'cpi', 'gdp',
       'rir', 'uer', 'roi'],
      dtype='object', length=229)