# 学習曲線

scikit-learnを使用して学習曲線を描画する

## Data Load

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mlp
import seaborn as sns
import numpy as np

# Show up to 200 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 200)
# Apply matplotlib's 'ggplot' style sheet to the plots drawn in this notebook.
plt.style.use('ggplot')

電気通信事業者の解約データを読み込む  
(https://www.kaggle.com/blastchar/telco-customer-churn)

In [2]:
# Directory holding the Kaggle Telco customer-churn CSV.
input_path = '../data'
csv_file = os.path.join(input_path, 'WA_Fn-UseC_-Telco-Customer-Churn.csv')
df = pd.read_csv(csv_file)

# TotalCharges contains blank strings, so it is loaded as a string column.
# Replace the blanks with NaN first, then cast the column to float.
col = 'TotalCharges'
blank_as_nan = df[col].replace({' ': np.nan})
df[col] = blank_as_nan.astype(float)

# Preview the first rows.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data Partition

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set (shuffled, fixed seed for reproducibility).
train_df, test_df = train_test_split(
    df,
    train_size=0.8,
    shuffle=True,
    random_state=2021,
)

# Report the shape of the full data and of each partition.
for label, frame in (
    ('original_size:', df),
    ('train_size:', train_df),
    ('test_size:', test_df),
):
    print(label, frame.shape)

original_size: (7043, 21)
train_size: (5634, 21)
test_size: (1409, 21)


## Data Preparation

In [4]:
from sklearn.preprocessing import StandardScaler
from category_encoders import OrdinalEncoder

In [5]:
# Convert the target to binary labels (Yes -> 1, No -> 0).
churn_labels = {'Yes': 1, 'No': 0}
train_df['Churn'] = train_df['Churn'].map(churn_labels)
test_df['Churn'] = test_df['Churn'].map(churn_labels)

# Standardize the numeric features.
# Not required for the tree-based algorithm used later; done here for practice.
# The scaler is fitted on the training data only and then applied to the test data.
num_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# Ordinal-encode every categorical (object dtype) column except customerID.
cat_cols = [
    name for name in train_df.columns
    if train_df[name].dtype == 'object' and name != 'customerID'
]
encoder = OrdinalEncoder()
train_df[cat_cols] = encoder.fit_transform(train_df[cat_cols])
test_df[cat_cols] = encoder.transform(test_df[cat_cols])

# Impute missing values with the training-data medians.
# customerID is still a string column, so restrict the median to numeric
# columns explicitly: relying on pandas to silently drop non-numeric columns
# is deprecated in pandas 1.x and raises in pandas 2.x.
# The medians are computed once and reused so both frames get the same values.
train_medians = train_df.median(numeric_only=True)
train_df.fillna(train_medians, inplace=True)
test_df.fillna(train_medians, inplace=True)

# Preview the preprocessed data.
train_df.head()

  train_df.fillna(train_df.median(), inplace=True)
  test_df.fillna(train_df.median(), inplace=True)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
6125,0871-URUWO,1,0,1,1,-0.79199,1,1,1,1,1,1,1,1,1,1,1,1,1.255598,-0.403651,1
6958,3078-ZKNTS,2,0,1,2,-0.79199,1,2,2,2,2,2,2,2,2,2,1,2,-1.485131,-0.894834,0
4062,1915-IOFGU,2,0,2,1,-1.280574,1,2,1,1,1,3,1,3,3,1,2,3,0.200833,-0.972643,1
5298,5647-FXOTP,2,1,1,1,1.121629,1,1,1,1,3,1,1,1,1,1,1,3,1.376855,1.822967,0
1214,9866-QEVEE,1,0,2,1,-0.547698,1,1,1,1,1,3,1,3,1,1,1,2,0.715758,-0.327057,1


In [6]:
# Print the dtype and non-null count of each column of the preprocessed training data.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5634 entries, 6125 to 1140
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        5634 non-null   object 
 1   gender            5634 non-null   int64  
 2   SeniorCitizen     5634 non-null   int64  
 3   Partner           5634 non-null   int64  
 4   Dependents        5634 non-null   int64  
 5   tenure            5634 non-null   float64
 6   PhoneService      5634 non-null   int64  
 7   MultipleLines     5634 non-null   int64  
 8   InternetService   5634 non-null   int64  
 9   OnlineSecurity    5634 non-null   int64  
 10  OnlineBackup      5634 non-null   int64  
 11  DeviceProtection  5634 non-null   int64  
 12  TechSupport       5634 non-null   int64  
 13  StreamingTV       5634 non-null   int64  
 14  StreamingMovies   5634 non-null   int64  
 15  Contract          5634 non-null   int64  
 16  PaperlessBilling  5634 non-null   int64

## Learning Curve

LightGBMでモデルを作成して学習曲線を描画する 

In [29]:
# 学習曲線を描画するのに使用するライブラリを読み込み
import lightgbm as lgb
from sklearn.model_selection import learning_curve

In [32]:
# Separate the features from the target.
train_x = train_df.drop(['customerID', 'Churn'], axis=1)
train_y = train_df['Churn']
test_x = test_df.drop(['customerID', 'Churn'], axis=1)
test_y = test_df['Churn']

# Split the training data once more to get a validation set for early stopping.
# NOTE(review): this split is only used by the disabled model.fit(...) call below;
# learning_curve() receives the full train_x / train_y.
train_x_, val_x, train_y_, val_y = train_test_split(train_x, train_y, train_size=0.8, random_state=888, shuffle=True)

# Use LightGBM's scikit-learn wrapper API rather than its native API, because
# the learning curve is computed with scikit-learn.
model = lgb.LGBMClassifier(objective='binary', max_depth=-1, random_state=777, n_estimators=9999, verbose=0,
                        boosting_type='gbdt', importance_type='gain', learning_rate=0.02,
                        num_leaves=15, subsample=0.8, colsample_bytree=0.8,
                        reg_alpha=0, reg_lambda=0, min_child_weight=1e-3,
                        min_child_samples=1, subsample_freq=5, force_row_wise=True)

# Early-stopping fit, currently disabled:
# model.fit(train_x_, train_y_, early_stopping_rounds=200, eval_metric='logloss', eval_set=[(val_x, val_y)])

# Fit the model on 20%, 40%, 60%, 80% and 100% of the training data, each with
# 3-fold cross-validation, and collect the ROC AUC for every size and fold.
# scoring='roc_auc' is passed explicitly: without it learning_curve() falls back
# to the estimator's default score rather than AUC.
# NOTE(review): with n_estimators=9999 and no early stopping, each of the
# 5 sizes x 3 folds trains all 9999 trees, which is slow.
train_sizes, train_scores, valid_scores = learning_curve(estimator=model, X=train_x, y=train_y,
                                                         train_sizes=np.linspace(0.2, 1.0, 5), cv=3,
                                                         scoring='roc_auc')



In [35]:
# learning_curve()関数の戻り値を使用して学習曲線を描画する関数を定義
def plot_learning_curve(train_size, train_scores, valid_scores):
    """Plot mean training and cross-validation scores against training-set size.

    Each curve is drawn with a shaded band of one standard deviation around
    the mean, computed across axis 1 of the score arrays.

    Args:
        train_size: Training-set sizes used for the x-axis, one entry per row
            of the score arrays (e.g. the first value returned by
            ``learning_curve()``).
        train_scores: 2-D array of training scores; row i holds the scores
            obtained at ``train_size[i]``.
        valid_scores: 2-D array of validation scores with the same layout.

    Returns:
        The ``matplotlib.pyplot`` module, with the curves drawn on the
        current figure.
    """
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    valid_scores_mean = np.mean(valid_scores, axis=1)
    valid_scores_std = np.std(valid_scores, axis=1)

    # Request the grid explicitly: a bare plt.grid() toggles visibility and
    # would hide the grid when the active style already enables it.
    plt.grid(True)

    # Use the train_size argument for the x-axis instead of a global variable,
    # so the function works with whatever sizes the caller passes in.
    plt.fill_between(train_size, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_size, valid_scores_mean - valid_scores_std,
                     valid_scores_mean + valid_scores_std, alpha=0.1, color="g")
    plt.plot(train_size, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_size, valid_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")

    return plt

In [None]:
# Draw the learning curve from the values returned by learning_curve().
plot_learning_curve(train_sizes, train_scores, valid_scores)