In [1]:
import pandas as pd
import numpy as np

df_wide = pd.read_csv('./input/ch03/time_series_wide.csv', index_col = 0)
df_wide.index = pd.to_datetime(df_wide.index)

In [2]:
print(df_wide.iloc[:5,:3])

              A     B     C
2016-07-01  532  3314  1136
2016-07-02  798  2461  1188
2016-07-03  823  3522  1711
2016-07-04  937  5451  1977
2016-07-05  881  4729  1975


In [3]:
df_long = df_wide.stack().reset_index(1)

In [4]:
print(df_long)

           level_1     0
2016-07-01       A   532
2016-07-01       B  3314
2016-07-01       C  1136
2016-07-02       A   798
2016-07-02       B  2461
...            ...   ...
2016-12-30       B  4243
2016-12-30       C  2069
2016-12-31       A   869
2016-12-31       B  4703
2016-12-31       C  2233

[552 rows x 2 columns]


In [5]:
df_long.columns = ['id', 'value']

In [6]:
print(df_long)

           id  value
2016-07-01  A    532
2016-07-01  B   3314
2016-07-01  C   1136
2016-07-02  A    798
2016-07-02  B   2461
...        ..    ...
2016-12-30  B   4243
2016-12-30  C   2069
2016-12-31  A    869
2016-12-31  B   4703
2016-12-31  C   2233

[552 rows x 2 columns]


In [7]:
df_wide = df_long.pivot(index=None, columns='id', values='value')

In [8]:
print(df_wide)

id            A     B     C
2016-07-01  532  3314  1136
2016-07-02  798  2461  1188
2016-07-03  823  3522  1711
2016-07-04  937  5451  1977
2016-07-05  881  4729  1975
...         ...   ...   ...
2016-12-27  840  4573  1850
2016-12-28  943  4511  1764
2016-12-29  978  4599  1787
2016-12-30  907  4243  2069
2016-12-31  869  4703  2233

[184 rows x 3 columns]


In [10]:
train = pd.read_csv('./input/sample-data/train_preprocessed.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
test_x = pd.read_csv('./input/sample-data/test_preprocessed.csv')

In [11]:
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True , random_state=71)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
tr_x.head()

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_5,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth
0,50,1,166.445608,65.016732,9,7000000,134,202,1,11,...,0,1,0,1,0,0,2015,2,3,24182
1,68,0,164.334615,56.544217,0,7000000,438,263,3,14,...,0,0,1,1,0,0,2015,5,9,24185
2,77,1,167.462917,54.242267,2,6000000,313,325,1,18,...,0,1,0,1,0,0,2016,2,13,24194
3,17,1,177.097725,71.147762,3,8000000,342,213,2,11,...,0,0,0,1,0,0,2015,7,6,24187
4,62,0,158.165788,65.240697,1,9000000,327,102,0,14,...,0,0,1,1,1,0,2016,9,17,24201


In [12]:
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
dtest = xgb.DMatrix(test_x)

params = {'objective':  'binary:logistic', 'silent': 1, 'random_state': 71}
num_round = 50

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, dtrain, num_round, evals=watchlist)

va_pred = model.predict(dvalid)
score=log_loss(va_y, va_pred)
print(f'logloss: {score:.4f}')

pred = model.predict(dtest)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-logloss:0.54088	eval-logloss:0.55003
[1]	train-logloss:0.45269	eval-logloss:0.47182
[2]	train-logloss:0.39482	eval-logloss:0.42026
[3]	train-logloss:0.35198	eval-logloss:0.38520
[4]	train-logloss:0.32021	eval-logloss:0.36150
[5]	train-logloss:0.29673	eval-logloss:0.34463
[6]	train-logloss:0.27610	eval-logloss:0.32900
[7]	train-logloss:0.25886	eval-logloss:0.31670
[8]	train-logloss:0.24363	eval-logloss:0.30775
[9]	train-logloss:0.23153	eval-logloss:0.30093
[10]	train-logloss:0.22016	eval-logloss:0.29413
[11]	train-logloss:0.20963	eval-logloss:0.28528
[12]	train-logloss:0.19951	eval-logloss:0.27912
[13]	train-logloss:0.19324	eval-logloss:0.27642
[14]	train-logloss:0.1854

In [18]:
params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71, 'eval_metric': 'logloss'}
num_round= 500
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params, dtrain, num_round, evals=watchlist, early_stopping_rounds=20)


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-logloss:0.54088	eval-logloss:0.55003
[1]	train-logloss:0.45269	eval-logloss:0.47182
[2]	train-logloss:0.39482	eval-logloss:0.42026
[3]	train-logloss:0.35198	eval-logloss:0.38520
[4]	train-logloss:0.32021	eval-logloss:0.36150
[5]	train-logloss:0.29673	eval-logloss:0.34463
[6]	train-logloss:0.27610	eval-logloss:0.32900
[7]	train-logloss:0.25886	eval-logloss:0.31670
[8]	train-logloss:0.24363	eval-logloss:0.30775
[9]	train-logloss:0.23153	eval-logloss:0.30093
[10]	train-logloss:0.22016	eval-logloss:0.29413
[11]	train-logloss:0.20963	eval-logloss:0.28528
[12]	train-logloss:0.19951	eval-logloss:0.27912
[13]	train-logloss:0.19324	eval-logloss:0.27642
[14]	train-logloss:0.1854

In [21]:
import lightgbm as lgb
from sklearn.metrics import log_loss

lgb_train = lgb.Dataset(tr_x, tr_y)
lgb_eval = lgb.Dataset(va_x, va_y)

params = {'objective': 'binary', 'seed': 71, 'verbose': 0, 'metrics': 'binary_logloss'}
num_round = 100

categorical_features = ['product', 'medical_info_b2', 'medical_info_b3']
model = lgb.train(params, lgb_train , num_boost_round=num_round, categorical_feature=categorical_features, valid_names=['train', 'valid'], valid_sets=[lgb_train, lgb_eval])

va_pred = model.predict(va_x)
score = log_loss(va_y, va_pred)
print(f'logloss: {score: .4f}')

pred = model.predict(test_x)

New categorical_feature is ['medical_info_b2', 'medical_info_b3', 'product']


You can set `force_col_wise=true` to remove the overhead.
[1]	train's binary_logloss: 0.454286	valid's binary_logloss: 0.4654
[2]	train's binary_logloss: 0.429348	valid's binary_logloss: 0.443537
[3]	train's binary_logloss: 0.409269	valid's binary_logloss: 0.425588
[4]	train's binary_logloss: 0.393109	valid's binary_logloss: 0.411213
[5]	train's binary_logloss: 0.379351	valid's binary_logloss: 0.399341
[6]	train's binary_logloss: 0.366138	valid's binary_logloss: 0.389055
[7]	train's binary_logloss: 0.35417	valid's binary_logloss: 0.378254
[8]	train's binary_logloss: 0.343782	valid's binary_logloss: 0.370131
[9]	train's binary_logloss: 0.334283	valid's binary_logloss: 0.362036
[10]	train's binary_logloss: 0.324802	valid's binary_logloss: 0.353452
[11]	train's binary_logloss: 0.316592	valid's binary_logloss: 0.346904
[12]	train's binary_logloss: 0.308484	valid's binary_logloss: 0.340248
[13]	train's binary_logloss: 0.301468	valid's binary_logloss: 0.335801
[14]	train's binary_logloss: 0.

In [27]:
from keras.layers import Dense, Dropout
from keras.models import Sequential
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler

train = pd.read_csv('./input/sample-data/train_preprocessed_onehot.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
test_x = pd.read_csv('./input/sample-data/test_preprocessed_onehot.csv')

# 学習データを学習データとバリデーションデータに分ける
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
scaler = StandardScaler()
tr_x = scaler.fit_transform(tr_x)
va_x = scaler.transform(va_x)
test_x = scaler.transform(test_x)

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(train_x.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 128
epochs = 10
history = model.fit(tr_x, tr_y, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(va_x, va_y))

va_pred = model.predict(va_x)
score = log_loss(va_y, va_pred, eps=1e-7)
print(f'logloss: {score: .4f}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
logloss:  0.3015


In [29]:
from keras.callbacks import EarlyStopping

epochs  = 50
early_stopping = EarlyStopping(monitor='val_loss', patience = 20, restore_best_weights=True)

history = model.fit(tr_x, tr_y, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(va_x, va_y), callbacks=[early_stopping])
pred=model.predict(test_x)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50


In [30]:

train = pd.read_csv('./input/sample-data/train_preprocessed_onehot.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
test_x = pd.read_csv('./input/sample-data/test_preprocessed_onehot.csv')

from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
tr_x = scaler.fit_transform(tr_x)
va_x = scaler.transform(va_x)
test_x = scaler.transform(test_x)

model = LogisticRegression(C=1)
model.fit(tr_x, tr_y)
va_pred = model.predict_proba(va_x)
score = log_loss(va_y, va_pred)
print(f'logloss: {score:.4f}')

pred = model.predict(test_x)

logloss: 0.3720


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').