In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import ast
from datetime import datetime

In [2]:
#Read in csv file
train = pd.read_csv('Datasets/train.csv')
test = pd.read_csv('Datasets/test.csv')

In [3]:
#Dropping columns with a lot of missing values and won't be used in analysis
train = train.drop(columns = ['id', 'homepage', 'imdb_id', 'tagline', 'overview', 'crew', 'status','original_language', 'poster_path', 'title'])

In [4]:
test = test.drop(columns = ['id', 'homepage', 'imdb_id', 'tagline', 'overview','crew','status','original_language', 'poster_path','title'])

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  604 non-null    object 
 1   budget                 3000 non-null   int64  
 2   genres                 2993 non-null   object 
 3   original_title         3000 non-null   object 
 4   popularity             3000 non-null   float64
 5   production_companies   2844 non-null   object 
 6   production_countries   2945 non-null   object 
 7   release_date           3000 non-null   object 
 8   runtime                2998 non-null   float64
 9   spoken_languages       2980 non-null   object 
 10  Keywords               2724 non-null   object 
 11  cast                   2987 non-null   object 
 12  revenue                3000 non-null   int64  
dtypes: float64(2), int64(2), object(9)
memory usage: 304.8+ KB


### Dropping rows with misssing values

Dropping rows if the total missing value is ~10% of the total data.

In [6]:
# Dropping rows if the total missing value is ~10% of the total data.
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html
train = train.dropna(subset = ['Keywords', 'spoken_languages', 'production_countries', 'production_companies',  'genres'])

In [7]:
test = test.dropna(subset = ['Keywords', 'spoken_languages', 'production_countries', 'production_companies',  'genres'])

### belongs_to_collection -- binarizing

In [8]:
train['belongs_to_collection'] = train['belongs_to_collection']. notna().astype(int)

In [9]:
test['belongs_to_collection'] = test['belongs_to_collection']. notna().astype(int)

## Converting list of dictionaries which are written as strings, into actual lists

### Temporarily dropping 'cast' and 'keywords', 'production_companies' because we are on time crunch. If we have time, we can do it.

In [10]:
test = test.drop(columns = ['cast','Keywords','production_companies'])
train = train.drop(columns = ['cast','Keywords','production_companies'])

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2623 entries, 0 to 2999
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  2623 non-null   int32  
 1   budget                 2623 non-null   int64  
 2   genres                 2623 non-null   object 
 3   original_title         2623 non-null   object 
 4   popularity             2623 non-null   float64
 5   production_countries   2623 non-null   object 
 6   release_date           2623 non-null   object 
 7   runtime                2622 non-null   float64
 8   spoken_languages       2623 non-null   object 
 9   revenue                2623 non-null   int64  
dtypes: float64(2), int32(1), int64(2), object(5)
memory usage: 215.2+ KB


In [12]:
big_dict = {'genres': 'name', 
 #'production_companies': 'name', 
 'production_countries': 'name',
 'spoken_languages': 'iso_639_1' # 'name' may be used... compare iso to original language, if same we drop
 # 'Keywords': 'name' 
 #'cast': ['gender','order'], # take the 'gender' from 'order' = 0 ******************
}

##### This is a function for the loop below it

In [13]:
def feature_extractor(l, key_we_want):
    # Input: l is the list we want to extract from.
    #        key_we_want is the key we want lol
    
    return [d[key_we_want] for d in l]

#### This is where we tutrn those crazy columns into things we can work with

In [14]:
for col_name, info_we_want in big_dict.items():
    
    
    train[col_name] = train[col_name].apply(ast.literal_eval)
    test[col_name] = test[col_name].apply(ast.literal_eval)
    
    train[col_name] = train[col_name].apply(lambda x: feature_extractor(x, info_we_want))
    test[col_name] = test[col_name].apply(lambda x: feature_extractor(x, info_we_want))
    
    
    

### ok turn releasedate into unix

In [15]:
train['release_date'] = train['release_date'].apply(lambda x: int(datetime.strptime(x, "%m/%d/%y").timestamp()))
test['release_date'] = test['release_date'].apply(lambda x: int(datetime.strptime(x, "%m/%d/%y").timestamp()))

## dummifying the categorical columns

Will not be dropping first in the following three categorical values because they are all multicategory(not mutually exclusive)

In [16]:
combined_df = pd.concat([train,test],axis=0,ignore_index=True)

genres

In [17]:
genres_dummies = pd.get_dummies(combined_df['genres'].explode(), prefix='genre')

genres_dummies = genres_dummies.groupby(genres_dummies.index).sum()

combined_df = pd.concat([combined_df,genres_dummies], axis=1)

combined_df.drop('genres', axis=1, inplace=True)

production_company

In [18]:
production_country_dummies = pd.get_dummies(combined_df['production_countries'].explode(), prefix='prod_country')

production_country_dummies = production_country_dummies.groupby(production_country_dummies.index).sum()

combined_df = pd.concat([combined_df,production_country_dummies], axis=1)

combined_df.drop('production_countries', axis=1, inplace=True)

spoken language

In [19]:
spoken_language_dummies = pd.get_dummies(combined_df['spoken_languages'].explode(), prefix='spoken_lang')

spoken_language_dummies = spoken_language_dummies.groupby(spoken_language_dummies.index).sum()

combined_df = pd.concat([combined_df,spoken_language_dummies], axis=1)

combined_df.drop('spoken_languages', axis=1, inplace=True)

In [20]:
combined_df.shape

(6457, 216)

# Making a length of title column

In [22]:
combined_df['length_of_title'] = combined_df['original_title'].str.len()


In [23]:
combined_df.drop(columns=['original_title'], inplace=True)

Ok splitting the daatafram back into train and test

In [24]:
object_cols = list(combined_df.select_dtypes('object').columns.values)
object_cols

['release_date']

In [25]:
numeric_cols = list(combined_df._get_numeric_data().columns)

In [26]:
len(numeric_cols)

215

In [27]:
train_size = len(train)
train_dummified = combined_df[:train_size]
test_dummified = combined_df[train_size:]

In [29]:
test_dummified = test_dummified.drop(columns=['revenue'])

In [30]:
test_dummified.shape

(3834, 215)

In [31]:
train_dummified.shape

(2623, 216)