In [None]:
#!pip install pycaret chainladder optuna

# Machine Learning in Python

Machine learning


## Python basics

## Scikit-Learn
Estimators and their properties
 * hyperparameters vs fitted parameters_
 * `fit`, `predict`, `transform`, `score` methods

Classification Example - Determine LOB from triangle

SKLEARN
  * Simple train-test split
  * kfold Cross-validation
  * Gridsearch parameter tuning

AutoML (pycaret)


Automated parameter tuning
Optuna


In [4]:
import chainladder as cl
import pandas as pd


In [28]:
clrd = cl.load_sample('clrd')
clrd['CaseIncurLoss'] = clrd['IncurLoss'] - clrd['BulkLoss']


In [29]:
paid_link_ratios = clrd['CumPaidLoss'].link_ratio.mean(axis='origin').to_frame()
inc_link_ratios = clrd['CaseIncurLoss'].link_ratio.mean(axis='origin').to_frame()
paid_pct =(clrd['CumPaidLoss'].cum_to_incr() / clrd['CumPaidLoss']).mean(axis='origin').to_frame()
inc_pct = (clrd['CaseIncurLoss'].cum_to_incr() / clrd['CaseIncurLoss']).mean(axis='origin').to_frame()
paid_to_inc = (clrd['CumPaidLoss'] / clrd['CaseIncurLoss']).mean(axis='origin').to_frame()

In [30]:
paid_link_ratios.columns = [c + ' Paid Link Ratio' for c in paid_link_ratios.columns]
inc_link_ratios.columns = [c + ' Incurred Link Ratio' for c in inc_link_ratios.columns]
paid_pct.columns = [str(c) + ' Paid Percent' for c in paid_pct.columns]
inc_pct.columns = [str(c) + ' Incurred Percent' for c in inc_pct.columns]
paid_to_inc.columns = [str(c) + ' Paid to Incurred' for c in paid_to_inc.columns]

In [48]:
data = pd.concat((paid_link_ratios, inc_link_ratios, paid_pct, inc_pct, paid_to_inc), axis=1).reset_index().drop('GRNAME', axis=1).set_index('LOB')
data.head()

Unnamed: 0_level_0,12-24 Paid Link Ratio,24-36 Paid Link Ratio,36-48 Paid Link Ratio,48-60 Paid Link Ratio,60-72 Paid Link Ratio,72-84 Paid Link Ratio,84-96 Paid Link Ratio,96-108 Paid Link Ratio,108-120 Paid Link Ratio,12-24 Incurred Link Ratio,...,12 Paid to Incurred,24 Paid to Incurred,36 Paid to Incurred,48 Paid to Incurred,60 Paid to Incurred,72 Paid to Incurred,84 Paid to Incurred,96 Paid to Incurred,108 Paid to Incurred,120 Paid to Incurred
LOB,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
othliab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ppauto,1.357143,1.0,1.382353,1.0,1.0,1.0,1.0,1.0,1.0,0.640034,...,0.26577,0.742857,0.72973,1.0,1.0,1.0,1.0,1.0,1.0,1.0
comauto,2.625,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.833333,...,0.166667,0.525,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
othliab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ppauto,3.417789,1.222703,1.07027,1.023283,1.021659,1.006826,1.015241,1.0,1.0,1.060302,...,0.392211,0.736617,0.797944,0.942287,0.964349,0.980559,0.981533,0.993047,0.998812,1.0


# Use Case #1

Can we determine which Line of Business a Triangle belongs to.

In [49]:
y = LabelEncoder().fit_transform(data.index)

In [50]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
  data, y,
  test_size = 0.25,
  random_state = 1234)

In [51]:
import lightgbm
model = lightgbm.LGBMClassifier().fit(X=X_train.values, y=y_train)
cm = pd.DataFrame(
    confusion_matrix(y_test, model.predict(X_test)),
    index=LabelEncoder().fit(data.reset_index()['LOB']).classes_,
    columns=LabelEncoder().fit(data.reset_index()['LOB']).classes_)
(cm / cm.sum(1)).style.background_gradient(cmap='Blues', axis=1)

Unnamed: 0,comauto,medmal,othliab,ppauto,prodliab,wkcomp
comauto,0.341463,0.083333,0.169492,0.242424,0.058824,0.21875
medmal,0.02439,0.25,0.101695,0.030303,0.0,0.03125
othliab,0.219512,0.0,0.610169,0.121212,0.352941,0.125
ppauto,0.170732,0.0,0.033898,0.666667,0.0,0.0625
prodliab,0.073171,0.0,0.20339,0.0,0.058824,0.03125
wkcomp,0.04878,0.083333,0.101695,0.181818,0.0,0.53125


## AutoML

Abstraction on underlying techniques

In [52]:
from pycaret.classification import ClassificationExperiment
s = ClassificationExperiment()
s.setup(data.reset_index(), target = 'LOB', session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,LOB
2,Target type,Multiclass
3,Target mapping,"comauto: 0, medmal: 1, othliab: 2, ppauto: 3, prodliab: 4, wkcomp: 5"
4,Original data shape,"(775, 49)"
5,Transformed data shape,"(775, 49)"
6,Transformed train set shape,"(542, 49)"
7,Transformed test set shape,"(233, 49)"
8,Numeric features,48
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x7fdf91a2ace0>

In [53]:
best = s.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.4833,0.7537,0.4833,0.4669,0.4601,0.328,0.3344,2.8
et,Extra Trees Classifier,0.476,0.7448,0.476,0.4508,0.4473,0.3156,0.326,0.362
rf,Random Forest Classifier,0.4686,0.7672,0.4686,0.4567,0.4403,0.3055,0.315,0.478
lightgbm,Light Gradient Boosting Machine,0.4669,0.7515,0.4669,0.4444,0.4466,0.3106,0.3156,1.174
xgboost,Extreme Gradient Boosting,0.4649,0.7568,0.4649,0.4532,0.4503,0.3116,0.3154,1.053
knn,K Neighbors Classifier,0.3764,0.6692,0.3764,0.3459,0.3512,0.1964,0.2003,0.063
qda,Quadratic Discriminant Analysis,0.3653,0.5983,0.3653,0.2557,0.2954,0.1646,0.1769,0.132
dt,Decision Tree Classifier,0.3579,0.5918,0.3579,0.3618,0.3542,0.1854,0.1869,0.066
lr,Logistic Regression,0.3544,0.6575,0.3544,0.2899,0.2973,0.1551,0.1675,0.366
ada,Ada Boost Classifier,0.3269,0.6485,0.3269,0.3442,0.3064,0.1392,0.1432,0.261


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [57]:
print(best)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                           random_state=123, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
