In [29]:
import pandas as pd
from collections import Counter

In [30]:
# Load the merged movie / cast / directors table produced upstream.
# NOTE(review): unpickling can execute arbitrary code, so only load files from
# a trusted source.
input_pickle = './movie+cast+directors.pkl'
train = pd.read_pickle(input_pickle)

## Genres

In [31]:
# One list of genre names per movie; the `{}` placeholder becomes an empty list.
list_of_genres = [
    [genre['name'] for genre in entry] if entry != {} else []
    for entry in train['genres'].values
]

In [32]:
# Derive per-movie genre features from the raw `genres` column.
# Each cell is treated as an iterable of dicts with a 'name' key, with `{}`
# standing in for "no genres".
genre_names = train['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
train['num_genres'] = genre_names.apply(len)
train['all_genres'] = genre_names.apply(lambda names: ' '.join(sorted(names)))
# The 15 most frequent genre names across all movies.
top_genres = [m[0] for m in Counter([i for j in list_of_genres for i in j]).most_common(15)]
for g in top_genres:
    # Test membership in the movie's own list of names rather than in the
    # space-joined `all_genres` string: a substring test would also fire when
    # one genre name happens to be contained in another.
    train['genre_' + g] = genre_names.apply(lambda names, g=g: 1 if g in names else 0)

In [33]:
# The raw `genres` column is no longer needed once the features are derived.
train = train.drop(columns=['genres'])

## Languages

In [34]:
# Gather the spoken-language names of every movie (an empty list for the `{}`
# placeholder), then show the 15 most common names across the whole data set.
list_of_languages = [
    [language['name'] for language in entry] if entry != {} else []
    for entry in train['spoken_languages'].values
]
Counter(name for names in list_of_languages for name in names).most_common(15)

[('English', 5646),
 ('Français', 713),
 ('Español', 616),
 ('Deutsch', 359),
 ('Italiano', 337),
 ('Pусский', 255),
 ('日本語', 215),
 ('普通话', 162),
 ('العربية', 120),
 ('', 110),
 ('Latin', 100),
 ('广州话 / 廣州話', 83),
 ('Português', 81),
 ('한국어/조선말', 74),
 ('svenska', 47)]

In [35]:
# Derive per-movie language features from the raw `spoken_languages` column.
# Each cell is treated as an iterable of dicts with a 'name' key, with `{}`
# standing in for "no languages".
language_names = train['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
train['num_languages'] = language_names.apply(len)
# The 30 most frequent language names across all movies. This includes the
# empty name '' that some entries carry.
top_languages = [m[0] for m in Counter([i for j in list_of_languages for i in j]).most_common(30)]
for g in top_languages:
    # Membership is tested against the movie's own list of names. Testing
    # against a space-joined string is wrong here: '' is a substring of every
    # string, so the `language_` flag would be 1 for every row, and any name
    # contained in another name would produce false positives as well.
    train['language_' + g] = language_names.apply(lambda names, g=g: 1 if g in names else 0)
# Only the derived columns are kept; no joined helper column is created.
train = train.drop(['spoken_languages'], axis=1)

## Collections 

If the movie belongs to a collection, the name of that collection is stored in the `collection` column.
Otherwise, `None` is stored.

In [36]:
# Keep only the collection's name; movies without a collection stay None.
# `is None` is the identity check for the None singleton and cannot be
# affected by a custom `__eq__` on the cell value.
train['collection'] = train['belongs_to_collection'].apply(
    lambda x: None if x is None else x['name']
)

In [37]:
# The raw collection column is replaced by `collection`.
train = train.drop(columns=['belongs_to_collection'])

In [38]:
# Remove the `index` column from the frame.
train = train.drop(columns=['index'])

## Homepage

`has_homepage` is `True` if the movie has a homepage and `False` otherwise.

In [39]:
# An empty string counts as "no homepage"; every other value yields True.
train['has_homepage'] = train['homepage'].apply(lambda url: not url == '')

In [40]:
# Drop the raw homepage URL and the two identifier columns.
train = train.drop(columns=['homepage', 'id', 'imdb_id'])

We drop the `poster_path` column from the data frame.

In [41]:
# Remove the `poster_path` column.
train = train.drop(columns=['poster_path'])

## Production

`production` column stores the list of the names of the production companies.

`production_countries` column stores the list of the ISO 3166-1 codes of the production countries.

In [42]:
# For each movie, keep just the names of its production companies.
train['production'] = train['production_companies'].apply(
    lambda companies: [company['name'] for company in companies]
)

In [43]:
# Replace each movie's list of country dicts by the list of their
# `iso_3166_1` codes. The column is overwritten in place, so this cell
# cannot be run twice on the same frame.
train['production_countries'] = train['production_countries'].apply(
    lambda countries: [country['iso_3166_1'] for country in countries]
)

In [44]:
# The raw company dicts are replaced by the `production` column.
train = train.drop(columns=['production_companies'])

## Normalize release date

The `dates` column stores the release date, min-max normalized to the range [-1, 1].

In [45]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import time

def convert_to_timestamp(x):
    """Convert a date-like object to a POSIX timestamp in seconds.

    The date is interpreted as UTC, so the result does not depend on the
    timezone of the machine running the notebook, and dates before 1970
    simply map to negative values.

    Parameters
    ----------
    x : object exposing ``timetuple()`` (e.g. ``datetime.date``,
        ``datetime.datetime``).

    Returns
    -------
    float
        Seconds since 1970-01-01 00:00:00 UTC.
    """
    # Local import keeps this cell self-contained.
    import calendar

    # calendar.timegm is the UTC counterpart of time.mktime, which would use
    # the local timezone and has a platform-dependent earliest date.
    return float(calendar.timegm(x.timetuple()))

In [46]:
# Parse `release_date` into datetimes, then map every parsed date to the
# numeric timestamp returned by convert_to_timestamp.
train['dates'] = pd.to_datetime(train['release_date']).apply(convert_to_timestamp)

In [47]:
# Affine (min-max) transform of the timestamps from [date_min, date_max] to [-1, 1].
# The bounds get their own names instead of `max` / `min`, so the built-in
# max() and min() functions stay usable in every later cell.
date_max = train['dates'].max()
date_min = train['dates'].min()

train['dates'] = (train['dates'] - date_min) * (1 - (-1)) / (date_max - date_min) + (-1)

In [48]:
# Persist the cleaned frame for the next stage.
output_pickle = 'movie_clean.pkl'
train.to_pickle(output_pickle)