## Import modules

In [1]:
# Load IPython's autoreload extension so edits to imported modules
# (e.g. the local `python_files` package) are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [41]:
from sklearn.pipeline import Pipeline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from python_files.fetch_missing_data import fetch_missing_data
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor
from python_files.data import GetData
from python_files.basic_preprocessing import BasicPreprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

## Clean the data

In [28]:
# Fetch the 'AllMoviesDetailsCleaned' entry from whatever GetData().get_data() returns
# (GetData is the project-local loader imported from python_files.data).
data = GetData().get_data()['AllMoviesDetailsCleaned']
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,id,budget,genres,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,...,production_companies_number,production_countries_number,spoken_languages_number,budget_new,revenue_new,popularity_new,vote_average_new,vote_count_new,belongs_to_collection_x,belongs_to_collection
0,2,0,Drama|Crime,tt0094675,fi,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,,Villealfa Filmproduction Oy,Finland,...,2,1,2,,,,,,,
1,3,0,Drama|Comedy,tt0092149,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",,Villealfa Filmproduction Oy,Finland,...,1,1,3,,,,,,,
2,5,4000000,Crime|Comedy,tt0113101,en,Four Rooms,It's Ted the Bellhop's first night on the job....,,Miramax Films,United States of America,...,2,1,1,,,,,,,
3,6,0,Action|Thriller|Crime,tt0107286,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",,Universal Pictures,Japan,...,3,2,1,,,,,,,
4,8,42000,Documentary,tt0825671,en,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,,inLoops,Austria,...,1,1,5,,,,,,,


In [29]:
# (rows, columns) of the raw table, as a baseline before any preprocessing.
data.shape

(329074, 29)

### Apply basic preprocessing

In [35]:
# Run the project-local cleaning step on the raw table; its logic lives in
# python_files.basic_preprocessing and is not shown in this notebook.
df = BasicPreprocessing.apply(data)
# Preview the cleaned frame.
df.head()

Unnamed: 0,id,budget,genres,original_language,original_title,production_companies,production_countries,release_date,revenue,runtime,...,production_companies_number,production_countries_number,spoken_languages_number,budget_new,revenue_new,popularity_new,vote_average_new,vote_count_new,belongs_to_collection_x,collection
2,5,4000000,Crime|Comedy,en,Four Rooms,Miramax Films,United States of America,25/12/1995,4300000,98.0,...,2,1,1,,,,,,,1
5,9,4000000,Drama,de,Sonntag im August,none,Germany,2/9/2004,4257354,15.0,...,0,1,1,4000000.0,4257354.0,22.475,5.7,2256.0,,1
6,11,11000000,Adventure|Action|Science Fiction,en,Star Wars,Lucasfilm,United States of America,25/05/1977,775398007,121.0,...,2,1,1,,,,,,,1
7,12,94000000,Animation|Family,en,Finding Nemo,Pixar Animation Studios,United States of America,30/05/2003,940335536,100.0,...,1,1,1,,,,,,,1
8,13,55000000,Comedy|Drama|Romance,en,Forrest Gump,Paramount Pictures,United States of America,6/7/1994,677945399,142.0,...,1,1,1,,,,,,,1


In [36]:
# Remove the `*_new` columns and `belongs_to_collection_x` from the cleaned frame.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError
# on columns that are already gone.
df = df.drop(
    columns=[
        'budget_new',
        'revenue_new',
        'popularity_new',
        'vote_average_new',
        'vote_count_new',
        'belongs_to_collection_x',
    ],
    errors='ignore',
)

In [37]:
# List the columns that remain after the drop above.
df.columns

Index(['id', 'budget', 'genres', 'original_language', 'original_title',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'title', 'vote_average',
       'vote_count', 'production_companies_number',
       'production_countries_number', 'spoken_languages_number', 'collection'],
      dtype='object')

In [38]:
# (rows, columns) of the preprocessed frame.
df.shape

(8628, 18)

### Impute the remaining missing values

In [39]:
# Count missing values per column to see what still needs imputing.
df.isna().sum()

id                               0
budget                           0
genres                         209
original_language                0
original_title                   0
production_companies             0
production_countries             0
release_date                     0
revenue                          0
runtime                         16
spoken_languages                70
title                            0
vote_average                     0
vote_count                       0
production_companies_number      0
production_countries_number      0
spoken_languages_number          0
collection                       0
dtype: int64

Before imputing, I need to split the data.

### Create `X_basic` that contains only numerical and categorical features.

In [45]:
# Inspect dtypes to tell the numeric columns apart from the object (string) columns.
df.dtypes

id                               int64
budget                           int64
genres                          object
original_language               object
original_title                  object
production_companies            object
production_countries            object
release_date                    object
revenue                          int64
runtime                        float64
spoken_languages                object
title                           object
vote_average                   float64
vote_count                       int64
production_companies_number      int64
production_countries_number      int64
spoken_languages_number          int64
collection                       int64
dtype: object

In [46]:
# Target vector: the `revenue` column.
y = df.loc[:, 'revenue']

# Feature frame: every remaining column except the target and `release_date`
# (an object-dtype column per the dtypes listing above).
# NOTE(review): `id`, `title` and `original_title` are still included here —
# confirm they are meant to be treated as features.
X_basic = df.drop(['revenue', 'release_date'], axis='columns')

In [42]:
# Numeric features: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical features: object columns such as `genres` and `spoken_languages`
# still contain NaN (see the missing-value summary above), so label those rows
# explicitly as 'missing' before one-hot encoding. Without this step the
# encoder either rejects NaN or silently creates a NaN category, depending on
# the installed scikit-learn version.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

In [44]:
# Sanity-check the numeric pipeline on `runtime` alone, the one numeric column
# with missing values in the summary above.
# NOTE(review): this fits the imputer and scaler on all rows of `df`; no
# train/test split is visible before this cell — confirm this is exploratory only.
numeric_transformer.fit_transform(df[['runtime']])

array([[-1.78265107e-01],
       [-2.80186882e+00],
       [ 5.48757609e-01],
       ...,
       [-4.49200617e-16],
       [ 1.87636431e+00],
       [ 1.06222043e-01]])