# EDA

In [1]:
import pandas as pd
import numpy as np
import sweetviz
from sklearn.model_selection import train_test_split
from flaml import AutoML

# titanic data

In [2]:
path = '../../data/titanic.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Sweetviz

In [3]:
# # analyze
# eda_report = sweetviz.analyze([df, 'Data'], target_feat='Survived')

Done! Use 'show' commands to display/save.   |██████████| [100%]   00:00 -> (00:00 left)


In [4]:
# # show html
# eda_report.show_html("eda_report1.html")

Report eda_report1.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Train/test split

In [5]:
train, test = train_test_split(df, test_size = 0.2, random_state =42)

In [6]:
# ## compare train, test using sweetviz
# compare_report = sweetviz.compare([train, 'Train'], [test, 'Test'], 'Survived')
# compare_report.show_html("compare_report.html")

In [7]:
# # comparing sub-populations
# compare_sex = sweetviz.compare_intra(df, df["Sex"]== "male", ["Male", "Female"], 'Survived')
# compare_sex.show_html("compare_sex.html")

# Baseline model

In [8]:
# X, y split
X = df.drop('Survived', axis=1)
y = df['Survived']

In [9]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from flaml import AutoML
automl = AutoML()

In [11]:
# 모델 튜닝하기
settings = {
    "time_budget": 60,  # total running time in seconds
    "metric": 'rmse',  # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
                           # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'
    "task": 'classification',  # task type    
    "log_file_name": 'titanic.log',  # flaml log file
    "seed": 421266,    # random seed
}

# 파라미터 참고 링크
# https://microsoft.github.io/FLAML/

In [12]:
automl.fit(X_train=X_train, y_train=y_train, **settings)

[flaml.automl: 11-12 21:23:27] {1485} INFO - Data split method: stratified
[flaml.automl: 11-12 21:23:27] {1489} INFO - Evaluation method: cv
[flaml.automl: 11-12 21:23:27] {1540} INFO - Minimizing error metric: rmse
[flaml.automl: 11-12 21:23:27] {1577} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 11-12 21:23:27] {1826} INFO - iteration 0, current learner lgbm
[flaml.automl: 11-12 21:23:27] {1943} INFO - Estimated sufficient time budget=1407s. Estimated necessary time budget=23s.
[flaml.automl: 11-12 21:23:27] {2023} INFO -  at 0.2s,	estimator lgbm's best error=0.4557,	best estimator lgbm's best error=0.4557
[flaml.automl: 11-12 21:23:27] {1826} INFO - iteration 1, current learner lgbm
[flaml.automl: 11-12 21:23:27] {2023} INFO -  at 0.3s,	estimator lgbm's best error=0.4557,	best estimator lgbm's best error=0.4557
[flaml.automl: 11-12 21:23:27] {1826} INFO - iteration 2, current learner lgbm
[flaml.automl: 11-12 21:23:27] {20

# Flaml 결과

In [13]:
''' retrieve best config and best learner'''
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 7, 'num_leaves': 5, 'min_child_samples': 21, 'learning_rate': 0.32556254798353695, 'log_max_bin': 8, 'colsample_bytree': 1.0, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.4786212688867904}
Best accuracy on validation data: 0.5974
Training duration of best run: 0.009416 s


In [14]:
automl.model.estimator

LGBMClassifier(learning_rate=0.32556254798353695, max_bin=255,
               min_child_samples=21, n_estimators=7, num_leaves=5,
               reg_alpha=0.0009765625, reg_lambda=1.4786212688867904,
               verbose=-1)

In [15]:
''' compute predictions of testing dataset ''' 
y_pred = automl.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
y_pred_proba = automl.predict_proba(X_test)[:,1]


Predicted labels [0 0 0 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 1 1 1 0 1
 0 0 1 1 1 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 1 1]
True labels 709    1
439    0
840    0
720    1
39     1
      ..
433    0
773    0
25     1
84     1
10     1
Name: Survived, Length: 179, dtype: int64


In [16]:
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))


accuracy = 0.8100558659217877
roc_auc = 0.8755469755469756
log_loss = 0.44168372830689157
