In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

df_abalone = pd.read_csv('data/abalone.csv')
df_abalone.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole,Shucked,Viscera,Shell,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [2]:
a, b= 1, 5
X = np.random.random(size=100) * 10
y = X * a + b + np.random.normal(size=100)

In [3]:
pd.DataFrame(
    {'X': X, 'y': y}
).assign(
    y_hat = lambda x: X * a + b
).assign(
    error = lambda x: x['y'] - x['y_hat'] # error
).assign(
    absolute_error = lambda x: x['error'].abs(),
    squared_error = lambda x: x['error'] ** 2
)

Unnamed: 0,X,y,y_hat,error,absolute_error,squared_error
0,2.109345,7.286139,7.109345,0.176794,0.176794,0.031256
1,4.155836,8.142500,9.155836,-1.013336,1.013336,1.026851
2,8.482934,15.125248,13.482934,1.642313,1.642313,2.697194
3,7.719945,10.242150,12.719945,-2.477796,2.477796,6.139472
4,6.451890,12.377333,11.451890,0.925443,0.925443,0.856444
...,...,...,...,...,...,...
95,9.373093,14.810884,14.373093,0.437791,0.437791,0.191661
96,6.194684,10.188406,11.194684,-1.006278,1.006278,1.012595
97,7.476520,11.933136,12.476520,-0.543383,0.543383,0.295266
98,5.801961,10.696426,10.801961,-0.105535,0.105535,0.011138


In [5]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_abalone, test_size=0.2, stratify=df_abalone['Sex'], random_state=123)
len(df_train), len(df_test)

(3341, 836)

In [15]:
df_train['Sex'].value_counts()/len(df_train['Sex'])

M    0.365759
I    0.321161
F    0.313080
Name: Sex, dtype: float64

In [16]:
df_test['Sex'].value_counts()/len(df_test['Sex'])

M    0.366029
I    0.321770
F    0.312201
Name: Sex, dtype: float64

In [24]:
from sklearn.model_selection import StratifiedKFold, ShuffleSplit, cross_validate
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import accuracy_score 

y_target = 'Sex'
X_cols = [i for i in df_train.columns if i != y_target] # y_target 빼고 모든 columns
# 5-fold stratified cv  
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

def eval_model(clf):
    # clf: model
    result = cross_validate(clf, df_train[X_cols], df_train[y_target], cv=cv, scoring='accuracy', return_train_score=True)
    return np.mean(result['train_score']), np.std(result['train_score']), np.mean(result['test_score']), np.std(result['test_score'])

def test_model(clf):
    prd = clf.predict(df_test[X_cols])
    return accuracy_score(df_test[y_target], prd)

In [25]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import mean_squared_error 

clf_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='lbfgs', multi_class='multinomial') # 다중 클래스 확률을 softmax로 계산토록 합니다
)
eval_model(clf_lr)

(0.5701132247289932,
 0.0075453299897091,
 0.5603067407785326,
 0.01647066710990199)

In [26]:
clf_lr.fit(df_train[X_cols], df_train[y_target])
test_model(clf_lr)

0.5299043062200957

In [28]:
from sklearn.ensemble import RandomForestClassifier 

clf_rf = RandomForestClassifier(n_estimators=300, max_depth=7, random_state=123)
eval_model(clf_rf)

(0.7577070046215089,
 0.0047905876891228315,
 0.5627068732490177,
 0.009384357819423742)

In [29]:
from sklearn.ensemble import VotingClassifier

# Voting Classifier를 clf_lr(Logistic Regression)과 clf_rf(Random Forest)로 구성합니다
clf_vt = VotingClassifier([
    ('lr', clf_lr),
    ('rf', clf_rf)
], voting='soft')
eval_model(clf_vt)

(0.6715058099289636,
 0.006159987110256223,
 0.5645001476866893,
 0.015567872193416429)

In [30]:
clf_vt.fit(df_train[X_cols], df_train[y_target])
test_model(clf_vt)

0.5358851674641149

[Ex.5]

In [44]:
df_train2 = df_abalone.loc[df_abalone['Sex'] != 'I']
df_test2 = df_abalone.loc[df_abalone['Sex'] == 'I']

from sklearn.model_selection import GroupKFold
y_target2 = 'Rings'
grp = 'Sex'
X_cols2 = [i for i in df_train2.columns if i != y_target2 and i != grp]
cv2 = GroupKFold(n_splits=2)

def eval_model2(reg):
    result = cross_validate(
        reg, df_train2[X_cols], df_train2[y_target2], 
        cv=cv2, groups=df_train2[grp], scoring='neg_mean_squared_error', return_train_score=True
    )
    return np.mean(result['train_score']), np.std(result['train_score']), np.mean(result['test_score']), np.std(result['test_score'])

def test_model2(reg):
    y_preds = reg.predict(df_test2[X_cols2])
    return mean_squared_error(df_test2['Rings'], y_preds)

In [45]:
from IPython.display import display
from sklearn.model_selection import GroupKFold

df = pd.DataFrame({'X': X, 'y': y}, columns=['X', 'y'])
display(df.head())
df.loc[:int(len(df)*3/4), 'class'] = 'a'
df.loc[int(len(df)*3/4):, 'class'] = 'b'
display(df)

gkf = GroupKFold(n_splits=2)
for train_index, test_index in gkf.split(df, groups=df['class']):
     print("%s %s" % (train_index, test_index))

Unnamed: 0,X,y
0,1,2
1,2,4
2,3,6
3,4,8
4,5,10


Unnamed: 0,X,y,class
0,1,2,a
1,2,4,a
2,3,6,a
3,4,8,a
4,5,10,a
5,6,12,a
6,7,14,a
7,8,16,a
8,9,18,a
9,10,20,a


[12 13 14 15] [ 0  1  2  3  4  5  6  7  8  9 10 11]
[ 0  1  2  3  4  5  6  7  8  9 10 11] [12 13 14 15]


In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
reg_lr = make_pipeline(
    StandardScaler(),
    LinearRegression()
)
reg_rf = RandomForestRegressor(n_estimators=300, max_depth=7, random_state=123)
eval_model2(reg_lr), eval_model2(reg_rf)

((-2.1635972709489185e-30,
  5.301182761064568e-31,
  -2.0825053200980617e-30,
  2.138646448384962e-31),
 (-0.0017693922686852445,
  0.0007831793542000766,
  -0.015088361192117624,
  0.002397705287521921))

In [50]:
reg_lr.fit(df_train2[X_cols2], df_train2[y_target2])
reg_rf.fit(df_train2[X_cols2], df_train2[y_target2])
test_model2(reg_lr), test_model2(reg_rf)

(4.60917470896092, 4.044116723974344)

In [51]:
from sklearn.ensemble import VotingRegressor

reg_vt = VotingRegressor([
    ('lr', reg_lr),
    ('rf', reg_rf)
])
eval_model2(reg_vt)

(-0.0004423480671713111,
 0.00019579483855002348,
 -0.0037720902980294094,
 0.0005994263218804751)

In [54]:
reg_vt.fit(df_train2[X_cols2], df_train2[y_target2])
test_model2(reg_vt)

4.050880582683106