In [1]:
!pip install category_encoders



In [2]:
# Import libraries.
import category_encoders as ce  # explicit namespace for the category_encoders encoders
import numpy as np
import pandas as pd
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, learning_curve, validation_curve
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import make_pipeline, make_union
# NOTE: this import rebinds OneHotEncoder / OrdinalEncoder to the scikit-learn
# classes, shadowing the category_encoders names imported above. Use
# `ce.OneHotEncoder` / `ce.OrdinalEncoder` to reach the category_encoders versions.
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

from src.utils import (  # local functions.
    extract_only_names,
    select_first_element
)

# Modélisation

## Importation des données

In [3]:
# Load the movie data into a pandas DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
movies_df = pd.read_pickle("./data/movies_tmdb.pkl")
print("Initial dimension :", movies_df.shape)

# Keep only movies with a strictly positive budget and revenue.
# .copy() makes the filtered frame independent of the loaded one, so the
# column assignments done in later cells do not act on a slice
# (which is what triggers pandas' SettingWithCopyWarning).
movies_df = movies_df.query('budget > 0 and revenue > 0').copy()
print("Dimension after restriction:", movies_df.shape)

movies_df.head(1)

Initial dimension : (10000, 25)
Dimension after restriction: (4974, 25)


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/Yc9q6QuWrMp9nuDm5R8ExNqbEq.jpg,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.avatar.com/movies/avatar,19995,tt0499549,en,Avatar,...,2009-12-15,2920357254,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,False,7.542,27004


## Pré-traitement des données

In [4]:
# Columns to which the cleaning functions are applied.
colnames = ["genres", "production_countries", "production_companies"]

# Apply cleaning functions.
for colname in colnames:
    try:
        # Extract only names, then replace empty (falsy) values with numpy.nan.
        cleaned = (
            movies_df[colname]
            .apply(extract_only_names)
            .apply(lambda x: np.nan if not x else x)
        )
    except Exception as err:
        # Stay best-effort (presumably so the cell can be re-run on columns
        # that are already cleaned — TODO confirm), but report the failure
        # instead of silently swallowing it.
        print(f"Skipping column {colname!r}: {err!r}")
    else:
        # Assign only once both steps succeeded, so a column is never left
        # half-processed.
        movies_df[colname] = cleaned

movies_df[colnames].head(2)

Unnamed: 0,genres,production_countries,production_companies
0,"[Action, Adventure, Fantasy, Science Fiction]","[United States of America, United Kingdom]","[20th Century Fox, Ingenious Media, Dune Enter..."
3,"[Family, Comedy, Fantasy]","[Germany, United States of America]","[Imagine Entertainment, Universal Pictures, LU..."


In [5]:
# Sanity check: after replacing empty (falsy) values with numpy.nan, every
# remaining `genres` entry should be indexable. Print the ones that are not.
df = movies_df['genres'].apply(lambda x: np.nan if not x else x)

# `position` is the 0-based position given by enumerate, not the index label.
for position, genre in enumerate(df):
    try:
        genre[0]
    except (TypeError, IndexError, KeyError):
        print(position, genre)
        # Test for NaN by value: an identity check against np.nan misses NaN
        # floats that are not the np.nan singleton itself.
        if isinstance(genre, float) and np.isnan(genre):
            print("OK")

2728 nan
OK


In [6]:
# Subset columns.
# NOTE(review): the saved output of this cell shows a `tagline` column that
# is not in this list — re-run the cell to refresh the output.
columns = [
    "release_date", "budget", "genres", "popularity", 
    "production_companies", "production_countries", "runtime", 
    "vote_average", "vote_count", "revenue", 
]
# .copy() so that the column assignments in the following cell operate on an
# independent frame rather than on a slice (avoids SettingWithCopyWarning).
movies_df = movies_df[columns].copy()
movies_df.head(1)

Unnamed: 0,release_date,budget,genres,popularity,production_companies,production_countries,runtime,tagline,vote_average,vote_count,revenue
0,2009-12-15,237000000,"[Action, Adventure, Fantasy, Science Fiction]",3424.983,"[20th Century Fox, Ingenious Media, Dune Enter...","[United States of America, United Kingdom]",162,Enter the world of Pandora.,7.542,27004,2920357254


* [ ] **`genres`, `production_countries`, `production_companies`** 

**Extracting the primary genre / production country / production company**: As a first step, we only consider the primary genre, production country and production company of each movie: we extract the first element of each list and use it as a categorical feature. We will then use one-hot encoding or label encoding to represent these categories as numerical values later in the modeling.

In [7]:
# Reduce each list-valued column to the value returned by
# select_first_element, logging progress column by column.
for column in colnames:
    print(column)
    first_values = movies_df[column].apply(select_first_element)
    movies_df[column] = first_values
    print("OK")

movies_df[colnames].head(5)

genres
OK
production_countries
OK
production_companies
OK


Unnamed: 0,genres,production_countries,production_companies
0,Action,United States of America,20th Century Fox
3,Family,Germany,Imagine Entertainment
5,Adventure,United Kingdom,Walt Disney Pictures
6,Action,United States of America,DreamWorks Animation
8,Comedy,United States of America,Orion Pictures
...,...,...,...
9986,Mystery,Germany,Epsilon Motion Pictures
9993,Drama,Denmark,Zentropa Entertainments
9994,Action,France,Paradis Films
9996,Drama,Canada,Universal Pictures


In [12]:
# Columns kept for modelling (features plus the `revenue` column).
modelcols = [
    "genres",
    "popularity",
    "production_companies",
    "production_countries",
    "budget",
    "runtime",
    "vote_average",
    "vote_count",
    "revenue",
]
movies2_df = movies_df.loc[:, modelcols]

X shape: (4974, 8)
y shape: (4974,)
X_train shape: (3979, 8)
y_train shape: (3979,)
X_test shape: (995, 8)
y_test shape: (995,)


- [ ] **`Release date`** 

Nous avons réfléchi à plusieurs façons de gérer ces caractéristiques pour notre modélisation. Parmi celles-ci :
1. Utiliser les dates **telles quelles**. Cela est intéressant, mais cela impliquerait trop de catégories à inclure et ferait exploser la matrice des caractéristiques pour les modèles qui ont besoin d'un encodage catégoriel. 
2. **Regrouper les dates** dans des catégories, telles que "récent", "moyen" et "ancien", en fonction de la date de sortie, puis utiliser ces catégories comme une caractéristique catégorielle dans notre modèle. Nous ne souffrons plus du problème de cardinalité élevée. Néanmoins, il est clair, d'après l'analyse descriptive, que les recettes du box-office ont augmenté au fil des ans et que cette tendance se poursuit.
3. 



## Création de pipelines.

**Pipelines**
- [x] Dealing with missing values
- [x] Handling Outliers
- [x] One Hot Encoding (categorical features)
- [x] Normalization Operations (numerical features)
- [ ] Vectorization? (for features with list as values)

In [None]:
# Separate the target column from the predictors.
target = "revenue"
y = movies2_df[target]
X = movies2_df.drop(columns=target)
print("X shape:", X.shape)
print("y shape:", y.shape)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
for label, part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

In [179]:
# Column selectors: numeric dtypes on one side, object dtype on the other.
numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_include=object)

# Numerical branch: fill missing values with the median, then standardize.
numerical_steps = [SimpleImputer(strategy="median"), StandardScaler()]
numerical_pipeline = make_pipeline(*numerical_steps)

# Categorical branch: fill missing values with the mode, then one-hot encode.
# NOTE(review): `drop=` is the scikit-learn OneHotEncoder signature; the import
# cell binds OneHotEncoder twice (category_encoders, then sklearn.preprocessing),
# so this relies on the sklearn import coming last.
categorical_steps = [
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(drop=None, handle_unknown="ignore"),
]
categorical_pipeline = make_pipeline(*categorical_steps)

# Route each group of columns through its own pipeline.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
)

**Low- and high-cardinality features**

The next thing we need to look out for are categorical columns with **low or high cardinality**. If there's only one category in a column, it won't provide any unique information to our model. At the other extreme, columns where nearly every row has its own category won't help our model in identifying useful trends in the data.

Let's take a look at the cardinality of our features.

In [28]:
# Number of distinct values in each column (cardinality check).
movies2_df.nunique()

genres                    18
popularity              4589
production_companies     944
production_countries      65
budget                   600
runtime                  150
vote_average            2333
vote_count              2753
revenue                 4703
dtype: int64

The feature `production_companies` has too many unique values (944 for about 5,000 observations). This increases the dimensionality of the data and makes it difficult to model. We are going to group the production companies into bins based on the number of movies they have produced (`binning`). This could reduce the number of unique values and make the data easier to model.

In [34]:
def number_values_cumulative(self, column, quantile):
    """Count the values of `column` whose cumulative share stays below `quantile`.

    The relative frequency of each distinct value is computed with
    ``value_counts(normalize=True)``, accumulated in that order, and the
    number of running totals strictly smaller than `quantile` is returned.
    """
    relative_frequencies = self[column].value_counts(normalize=True)
    running_share = relative_frequencies.cumsum()
    is_below = running_share < quantile
    return is_below.sum()


# Attach the function to pandas.DataFrame so it can be called as a method.
pd.DataFrame.number_values_cumulative = number_values_cumulative

# Number of production companies whose cumulative share of movies is below 50%.
movies2_df.number_values_cumulative(column="production_companies", quantile=0.50)

In [51]:
# Tabulate, for ten evenly spaced thresholds, how many production companies
# fall below each cumulative share.
quantile_levels = np.linspace(0.1, 1.0, num=10)
pd.DataFrame({
    "quantile": quantile_levels,
    "nombre de compagnies": [
        movies2_df.number_values_cumulative("production_companies", q)
        for q in quantile_levels
    ],
})

Unnamed: 0,quantile,nombre de compagnies
0,0.1,1
1,0.2,2
2,0.3,4
3,0.4,7
4,0.5,12
5,0.6,28
6,0.7,65
7,0.8,167
8,0.9,447
9,1.0,943


In [53]:
# Distinct production countries (the saved output also contains a None entry).
movies2_df["production_countries"].unique()

array(['United States of America', 'Germany', 'United Kingdom',
       'New Zealand', 'Japan', 'France', 'Canada', 'Hong Kong', 'Belgium',
       'South Korea', 'Australia', 'Czech Republic', 'China', 'Spain',
       'Mexico', 'Indonesia', 'Bulgaria', 'Italy', 'Luxembourg',
       'Argentina', 'Brazil', 'India', 'Chile', 'United Arab Emirates',
       'Netherlands', 'Norway', 'Thailand', 'Denmark', 'Cambodia',
       'Ireland', 'Russia', 'Hungary', 'Ghana', 'Iceland', 'Austria',
       'South Africa', 'Greece', 'Switzerland', 'Romania', 'Sweden',
       'Finland', 'Slovakia', 'Peru', 'Uruguay', 'Singapore', 'Namibia',
       'Taiwan', None, 'Iran', 'Libyan Arab Jamahiriya', 'Kuwait',
       'Soviet Union', 'Afghanistan', 'Colombia', 'Morocco', 'Estonia',
       'Poland', 'Aruba', 'Ecuador', 'Venezuela', 'Philippines',
       'Belarus', 'Turkey', 'Israel', 'Malaysia', 'Qatar'], dtype=object)

In [48]:
# Total revenue per production company, largest first.
revenue_by_company = (
    movies2_df.groupby("production_companies")["revenue"]
    .sum()
    .sort_values(ascending=False)
)
# Running share of the overall revenue captured by the top companies.
cumulative_share = revenue_by_company.cumsum() / movies2_df["revenue"].sum()
# Number of companies whose running share stays below 40%.
(cumulative_share < 0.4).sum()

6

In [29]:
# Exploratory call: one-hot encode the whole feature frame.
# NOTE(review): the visible import cell only uses `from sklearn... import ...`,
# which does not bind the name `sklearn`; on a fresh kernel this line would
# need `import sklearn.preprocessing` first.
# NOTE(review): the saved output is a TypeError about list-valued inputs —
# confirm whether this failing experiment should be kept in the notebook.
sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore").fit_transform(X, y)

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['list']

In [None]:




# Baseline mean squared error: score a naive model that always predicts the
# mean of the training target. (The target is a continuous revenue, so the
# share of the most frequent value is not a meaningful baseline.)
baseline_prediction = y_train.mean()
mse_baseline = ((y_train - baseline_prediction) ** 2).mean()
print("Baseline MSE:", round(mse_baseline, 2))


## Iterate.

# Create a pipeline.
# `use_cat_names` is a category_encoders option. The bare name OneHotEncoder
# is rebound to scikit-learn's encoder by the later sklearn.preprocessing
# import, so the category_encoders class is referenced explicitly through `ce`.
model = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge()
)

# Fit the model.
model.fit(X_train, y_train)

# Evaluate.



## Testing results using embedding / vectorisation for categorical features

# Communicate Results