In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_iris()
train_x, test_x = train_test_split(data.data, train_size=0.8, shuffle=False)
train_y, test_y = train_test_split(data.target, train_size=0.8, shuffle=False)
train_x = pd.DataFrame(train_x, columns=data['feature_names'])
test_x = pd.DataFrame(test_x, columns=data['feature_names'])
train_y = pd.DataFrame(train_y)
test_y = pd.DataFrame(test_y)

print(train_x.shape)
print(train_y.shape)

(120, 4)
(120, 1)


## Stratified k-fold

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x, train_y):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    
    scaler = StandardScaler()
    tr_x = scaler.fit_transform(tr_x)
    va_x = scaler.fit_transform(va_x)
    test_x = scaler.fit_transform(test_x)
    
    model = LogisticRegression(C=1.0)
    model.fit(tr_x, np.ravel(tr_y.values))
    
    va_pred = model.predict_proba(va_x)
    score = log_loss(va_y, va_pred)
    print(f'logloss {score:.4f}')
    
    pred = model.predict(test_x)
    
    acc = accuracy_score(test_y.values, pred)
    print(f'acc {acc:.4f}')

logloss 0.1334
acc 0.0667
logloss 0.1410
acc 0.0333
logloss 0.1451
acc 0.0333
logloss 0.0918
acc 0.0000


## Leave-one-out

In [8]:
from sklearn.model_selection import LeaveOneOut

kf = LeaveOneOut()
i = 0
for tr_idx, va_idx in kf.split(train_x):
    i+=1
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
print(f'count: {i}')

count: 120


In [38]:
data = pd.read_csv('input/sample-data/train.csv')

data['date'] = pd.to_datetime(data['date'], format='%Y/%m/%d')
data = data.sort_values('date').reset_index()

train_x, test_x = train_test_split(data, train_size=0.8, shuffle=False)

train_y = train_x['target']
train_x = train_x.drop('target', axis=1)

test_y = test_x['target']
test_x = test_x.drop('target', axis=1)

## 時系列データのCV

In [41]:
from sklearn.model_selection import TimeSeriesSplit

tss = TimeSeriesSplit(n_splits=4)
for tr_idx, va_idx in tss.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

In [63]:
train_len = len(train_x)
period = 0
train_split = int(train_len/4)
train_x['period'] = 0
for i in range(0, train_len, train_split):
    train_x.loc[i:i+train_split, 'period'] = period
    period += 1
train_x['period'].value_counts()

va_period_list = [0, 1, 2, 3]
for va_period in va_period_list:
    is_tr = train_x['period'] != va_period
    is_va = train_x['period'] == va_period
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]