<a href="https://colab.research.google.com/github/hnishi/hello-automl/blob/main/autogluon_titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://auto.gluon.ai/stable/install.html
!pip install -q autogluon.tabular[all]

[K     |████████████████████████████████| 250 kB 5.3 MB/s 
[K     |████████████████████████████████| 22.3 MB 1.4 MB/s 
[K     |████████████████████████████████| 296 kB 68.0 MB/s 
[K     |████████████████████████████████| 334 kB 65.5 MB/s 
[K     |████████████████████████████████| 48 kB 4.4 MB/s 
[K     |████████████████████████████████| 27.4 MB 97 kB/s 
[K     |████████████████████████████████| 67.3 MB 4.0 kB/s 
[K     |████████████████████████████████| 157.5 MB 52 kB/s 
[K     |████████████████████████████████| 2.0 MB 49.1 MB/s 
[K     |████████████████████████████████| 188 kB 37.5 MB/s 
[K     |████████████████████████████████| 81 kB 7.9 MB/s 
[K     |████████████████████████████████| 769 kB 41.1 MB/s 
[K     |████████████████████████████████| 102 kB 63.5 MB/s 
[K     |████████████████████████████████| 206 kB 63.3 MB/s 
[K     |████████████████████████████████| 950 kB 61.8 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build 

In [None]:
%matplotlib inline

import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import plot_partial_dependence, permutation_importance

# https://auto.gluon.ai/stable/index.html
import autogluon.tabular
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
!pip show autogluon.tabular

Name: autogluon.tabular
Version: 0.2.0
Summary: AutoML for Text, Image, and Tabular Data
Home-page: https://github.com/awslabs/autogluon
Author: AutoGluon Community
Author-email: None
License: Apache-2.0
Location: /usr/local/lib/python3.7/dist-packages
Requires: pandas, autogluon.core, networkx, numpy, psutil, autogluon.features, scipy, pytest, scikit-learn
Required-by: 


## Data Loading and Preprocessing



In [None]:
# Using Titanic dataset https://www.openml.org/d/40945.
# fetch_openml downloads the dataset by its OpenML id (40945).
# as_frame=True returns pandas objects instead of arrays, and
# return_X_y=True returns the features (X) and the target (y) separately.
X, y = sklearn.datasets.fetch_openml(data_id=40945, return_X_y=True, as_frame=True)

In [None]:
# Remove the columns that are not used as features.
# Reassigning the result (instead of inplace=True) together with
# errors="ignore" keeps this cell re-runnable: executing it a second time no
# longer raises KeyError for columns that were already dropped.
X = X.drop(
    columns=["name", "cabin", "ticket", "body", "home.dest", "boat"],
    errors="ignore",
)

# Set explicit dtypes for every remaining column in a single pass:
# integer for class/count columns, float for age and fare (both contain
# missing values), and categorical for the string-valued columns.
X = X.astype(
    {
        "pclass": "int",
        "sibsp": "int",
        "parch": "int",
        "age": "float",
        "fare": "float",
        "embarked": "category",
        "sex": "category",
    }
)

# The target is handled as a categorical label.
y = y.astype("category")

In [None]:
# Report how many entries are missing in each feature column.
missing_counts = X.isnull().sum()
for column, n_missing in missing_counts.items():
    print(f"{column}: {n_missing} missing values")

pclass: 0 missing values
sex: 0 missing values
age: 263 missing values
sibsp: 0 missing values
parch: 0 missing values
fare: 1 missing values
embarked: 2 missing values


In [None]:
# Inspect the dtype of every feature column after the conversions above.
X.dtypes

pclass         int64
sex         category
age          float64
sibsp          int64
parch          int64
fare         float64
embarked    category
dtype: object

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1
)

## Baseline

In [None]:
# Gender model: a rule-based baseline to compare the AutoML result against.
# Every woman is predicted to survive ('1') and every man to die ('0').
gender_baseline_pred = X_test["sex"].map(lambda sex: '1' if sex == "female" else '0')
gender_baseline_accuracy = sklearn.metrics.accuracy_score(y_test, gender_baseline_pred)
print("Accuracy score", gender_baseline_accuracy)

Accuracy score 0.7888040712468194


## Build and Fit a Classifier



In [None]:
# prepare input data
train = X_train.copy()
train['survive'] = y_train

test = X_test.copy()
test["survive"] = y_test

In [None]:
%%time

# https://auto.gluon.ai/dev/api/autogluon.task.html#module-0
model = TabularPredictor(
    label='survive',
    problem_type='binary',
    eval_metric='accuracy',  # 'accuracy' is default for binary/multiclass classification
)
# https://auto.gluon.ai/dev/api/autogluon.task.html#autogluon.tabular.TabularPredictor.fit
model.fit(
    train,
    time_limit=120,  # in sec
    tuning_data=None,  # Do not provide your evaluation test data here!
    ag_args_fit={"num_cpu": 1}
) 

No path specified. Models will be saved in: "AutogluonModels/ag-20210806_144622/"
Beginning AutoGluon training ... Time limit = 120s
AutoGluon will save models to "AutogluonModels/ag-20210806_144622/"
AutoGluon Version:  0.2.0
Train Data Rows:    916
Train Data Columns: 7
Preprocessing data ...
NumExpr defaulting to 2 threads.
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
	Note: For your binary classification, AutoGluon arbitrarily selected which label-value represents positive (1) vs negative (0) class.
	To explicitly set the positive_class, either rename classes to 1 and 0, or specify positive_class in Predictor init.
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    12745.89 MB
	Train Data (Original)  Memory Usage: 0.04 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	St

CPU times: user 9.76 s, sys: 1.09 s, total: 10.8 s
Wall time: 14 s


In [None]:
# Accuracy on the training rows (in-sample, so expected to look optimistic).
y_train_pred = model.predict(X_train)
train_accuracy = sklearn.metrics.accuracy_score(y_train, y_train_pred)
print("Accuracy score", train_accuracy)

Accuracy score 0.8831877729257642


In [None]:
# Name of the trained model with the best validation score.
print(model.get_model_best())

WeightedEnsemble_L2


In [None]:
# Evaluate all of the models AutoGluon has previously trained on our
# held-out test data; the returned table is displayed as the cell output.
model.leaderboard(test, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.806616,0.86413,0.292077,0.250349,2.104567,0.006339,0.002169,0.471375,2,True,13
1,LightGBM,0.793893,0.847826,0.011366,0.010176,0.242149,0.011366,0.010176,0.242149,1,True,4
2,XGBoost,0.788804,0.858696,0.044463,0.00767,0.217703,0.044463,0.00767,0.217703,1,True,11
3,ExtraTreesGini,0.783715,0.766304,0.226202,0.108245,0.773222,0.226202,0.108245,0.773222,1,True,8
4,ExtraTreesEntr,0.783715,0.771739,0.230779,0.107245,0.759059,0.230779,0.107245,0.759059,1,True,9
5,LightGBMLarge,0.78117,0.826087,0.0121,0.009993,0.418242,0.0121,0.009993,0.418242,1,True,12
6,CatBoost,0.778626,0.826087,0.008013,0.005006,0.439785,0.008013,0.005006,0.439785,1,True,7
7,LightGBMXT,0.776081,0.831522,0.009614,0.009683,0.543886,0.009614,0.009683,0.543886,1,True,3
8,RandomForestEntr,0.763359,0.798913,0.223315,0.107516,0.847117,0.223315,0.107516,0.847117,1,True,6
9,RandomForestGini,0.755725,0.788043,0.12062,0.108225,0.750338,0.12062,0.108225,0.750338,1,True,5


## Predict and Evaluate the Final Ensemble

In [None]:
# Accuracy of the fitted predictor on the held-out test rows.
y_pred = model.predict(X_test)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
print("Accuracy score", test_accuracy)

Accuracy score 0.806615776081425


## Model Explanation

Ref: https://auto.gluon.ai/dev/tutorials/tabular_prediction/tabular-indepth.html#interpretability-feature-importance

In [None]:
# Interpretability (feature importance)
# Computed on the held-out test table (features plus the "survive" label);
# the log output reports that permutation shuffling is used.
model.feature_importance(test)

Computing feature importance via permutation shuffling for 7 features using 393 rows with 3 shuffle sets...
	7.11s	= Expected runtime (2.37s per shuffle set)
	1.35s	= Actual runtime (Completed 3 of 3 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
sex,0.167091,0.010594,0.000669,3,0.227794,0.106388
pclass,0.051739,0.005297,0.001738,3,0.08209,0.021387
age,0.045802,0.009174,0.006556,3,0.098372,-0.006769
embarked,0.022901,0.004407,0.006061,3,0.048155,-0.002353
fare,0.022053,0.007774,0.019506,3,0.066597,-0.022491
sibsp,0.01866,0.003887,0.007078,3,0.040932,-0.003612
parch,0.001696,0.003887,0.264298,3,0.023968,-0.020576
