In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Input tables, read from the working directory in the same order as before.
train, train_label, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_features.csv',
        'train_labels.csv',
        'test_features.csv',
        'sample_submission.csv',
    )
)
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Group the rows by 'id' after removing the 'time' column, so that later
# aggregations only see the remaining columns.
gb_train = train.drop(columns='time').groupby('id')
gb_test = test.drop(columns='time').groupby('id')

In [2]:
def percentile(n):
    """Return a one-argument function that computes the ``n``-th percentile.

    The returned callable forwards its input to ``np.percentile(values, n)``.
    Its ``__name__`` is set to ``'percentile_<n>'`` so that each percentile
    level yields a distinctly named function.
    """
    def percentile_(values):
        nth_value = np.percentile(values, n)
        return nth_value

    label = 'percentile_%s' % n
    percentile_.__name__ = label
    return percentile_

In [3]:
# Per-window aggregation: adds percentile features computed on each block of `Times` rows
def splitTimes(df, Times, n_rows=600):
    """Compute decile features over consecutive row windows of each frame.

    For every frame in ``df`` the first ``n_rows`` rows are cut into
    ``n_rows // Times`` consecutive windows of ``Times`` rows. In each window
    the 'time' column is dropped, the rows are grouped by 'id', and the
    0th, 10th, ..., 100th percentiles of every remaining column are computed.
    The windows of one frame are placed side by side (so the same column
    labels repeat once per window), and the frames are stacked row-wise.

    Parameters
    ----------
    df : sequence of pandas.DataFrame
        Indexable collection accessed as ``df[0] .. df[len(df) - 1]``; each
        frame must contain 'id' and 'time' columns. (Presumably each frame
        holds the rows of a single id -- confirm against the caller.)
    Times : int
        Window length in rows.
    n_rows : int, default 600
        Number of leading rows of each frame that are split into windows.
        The default reproduces the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame
        One row per 'id' group, indexed by 'id'. An empty DataFrame is
        returned when ``df`` is empty.
    """
    # Build the aggregators once instead of on every window.
    aggregations = [percentile(q) for q in range(0, 101, 10)]
    n_windows = n_rows // Times

    per_frame = []
    for j in range(len(df)):
        frame = df[j]
        windows = [
            frame.iloc[w * Times:(w + 1) * Times]
            .drop('time', axis=1)
            .groupby('id')
            .agg(aggregations)
            for w in range(n_windows)
        ]
        # pd.concat rejects an empty list, so fall back to an empty frame
        # when no window fits (Times > n_rows).
        per_frame.append(pd.concat(windows, axis=1) if windows else pd.DataFrame())

    if not per_frame:
        return pd.DataFrame()

    # Single concatenation at the end: DataFrame.append no longer exists in
    # pandas 2.x, and appending inside the loop copied the data every time.
    return pd.concat(per_frame)

In [4]:
# One aggregator per decile (0th, 10th, ..., 100th percentile), applied to
# every non-key column of each id group in both tables.
decile_aggs = [percentile(q) for q in range(0, 101, 10)]
per_train = gb_train.agg(decile_aggs)
per_test = gb_test.agg(decile_aggs)

In [5]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [6]:
# Target vector: the 'label' column of the labels table.
y = train_label.loc[:, 'label']

In [18]:
# Hold out 20% of the per-id percentile features for validation (fixed seed).
split_parts = train_test_split(per_train, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Number of distinct labels present in each partition.
n_train_labels, n_valid_labels = y_train.nunique(), y_test.nunique()
print(n_train_labels, n_valid_labels)

61 61


In [19]:
# Validation data handed to XGBoost: the feature matrix as a NumPy array,
# paired with its labels.
validation_pair = (X_test.values, y_test)
evals = [validation_pair]

In [20]:
# Hyperparameters of the multi-class classifier, gathered in one mapping.
xgb_params = {
    'n_estimators': 400,
    'learning_rate': 0.05,
    'max_depth': 8,
    'objective': 'multi:softprob',
    'random_state': 0,
}
xgb = XGBClassifier(**xgb_params)

In [10]:
import time
import datetime

In [13]:
# Train the classifier while reporting multi-class log loss on `evals`;
# training stops early once that loss has not improved for 100 rounds.
start = time.time()

xgb.fit(X_train.values, y_train, eval_set = evals, 
       eval_metric = 'mlogloss',
       early_stopping_rounds = 100)

sec = time.time() - start
# split('.') returns a list, so printing it directly showed
# ['H:MM:SS', '<microseconds>']; keep only the H:MM:SS part.
print('학습 소요 시간 : ', str(datetime.timedelta(seconds = sec)).split('.')[0])

[0]	validation_0-mlogloss:3.42662
[1]	validation_0-mlogloss:3.15436
[2]	validation_0-mlogloss:2.95078
[3]	validation_0-mlogloss:2.78745
[4]	validation_0-mlogloss:2.65497
[5]	validation_0-mlogloss:2.53836
[6]	validation_0-mlogloss:2.43605
[7]	validation_0-mlogloss:2.34584
[8]	validation_0-mlogloss:2.26476
[9]	validation_0-mlogloss:2.19176
[10]	validation_0-mlogloss:2.12488
[11]	validation_0-mlogloss:2.06390
[12]	validation_0-mlogloss:2.00595
[13]	validation_0-mlogloss:1.95421
[14]	validation_0-mlogloss:1.90423
[15]	validation_0-mlogloss:1.85911
[16]	validation_0-mlogloss:1.81500
[17]	validation_0-mlogloss:1.77539
[18]	validation_0-mlogloss:1.73823
[19]	validation_0-mlogloss:1.70351
[20]	validation_0-mlogloss:1.67096
[21]	validation_0-mlogloss:1.63888
[22]	validation_0-mlogloss:1.60983
[23]	validation_0-mlogloss:1.58130
[24]	validation_0-mlogloss:1.55440
[25]	validation_0-mlogloss:1.52931
[26]	validation_0-mlogloss:1.50437
[27]	validation_0-mlogloss:1.48148
[28]	validation_0-mlogloss:1.4

In [21]:
from imblearn.over_sampling import SMOTE
# SMOTE oversampler with a fixed seed.
smote_seed = 0
smote = SMOTE(random_state=smote_seed)

In [22]:
# `fit_resample` is the supported name; the `fit_sample` alias used before was
# deprecated and later removed from imbalanced-learn.
# NOTE(review): oversampling happens before the train/validation split, so
# synthetic rows derived from a sample can end up on the other side of the
# split. The validation loss measured on X_test is therefore likely optimistic.
X_over, y_over = smote.fit_resample(per_train, y)

# Hold out 20% of the oversampled data for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, test_size = 0.2, 
                                                    random_state = 42)

In [28]:
# Validation data for the oversampled run: feature matrix as a NumPy array,
# paired with its labels.
validation_pair = (X_test.values, y_test)
evals = [validation_pair]

# Second classifier: same settings as a keyword mapping, with 100 boosting rounds.
xgb2_params = {
    'n_estimators': 100,
    'learning_rate': 0.05,
    'max_depth': 8,
    'objective': 'multi:softprob',
    'random_state': 0,
}
xgb2 = XGBClassifier(**xgb2_params)

In [29]:
# Train the second classifier on the oversampled training partition while
# reporting multi-class log loss on `evals`.
start = time.time()
xgb2.fit(X_train.values, y_train, eval_set = evals, 
       eval_metric = 'mlogloss',
       early_stopping_rounds = 100)

sec = time.time() - start
# split('.') returns a list, so printing it directly showed
# ['H:MM:SS', '<microseconds>']; keep only the H:MM:SS part.
print('학습 소요 시간 : ', str(datetime.timedelta(seconds = sec)).split('.')[0])

[0]	validation_0-mlogloss:2.94391
[1]	validation_0-mlogloss:2.56188
[2]	validation_0-mlogloss:2.29923
[3]	validation_0-mlogloss:2.09627
[4]	validation_0-mlogloss:1.93107
[5]	validation_0-mlogloss:1.79154
[6]	validation_0-mlogloss:1.67113
[7]	validation_0-mlogloss:1.56513
[8]	validation_0-mlogloss:1.47067
[9]	validation_0-mlogloss:1.38573
[10]	validation_0-mlogloss:1.30844
[11]	validation_0-mlogloss:1.23765
[12]	validation_0-mlogloss:1.17269
[13]	validation_0-mlogloss:1.11272
[14]	validation_0-mlogloss:1.05716
[15]	validation_0-mlogloss:1.00539
[16]	validation_0-mlogloss:0.95700
[17]	validation_0-mlogloss:0.91191
[18]	validation_0-mlogloss:0.86919
[19]	validation_0-mlogloss:0.82944
[20]	validation_0-mlogloss:0.79159
[21]	validation_0-mlogloss:0.75619
[22]	validation_0-mlogloss:0.72263
[23]	validation_0-mlogloss:0.69091
[24]	validation_0-mlogloss:0.66089
[25]	validation_0-mlogloss:0.63254
[26]	validation_0-mlogloss:0.60564
[27]	validation_0-mlogloss:0.58010
[28]	validation_0-mlogloss:0.5

In [31]:
# Re-fit the second classifier on the complete oversampled data set.
# NOTE(review): `evals` was built from X_test, which train_test_split carved
# out of X_over, so the evaluation rows are also training rows here. The
# logged mlogloss is not a held-out estimate.
start = time.time()
xgb2.fit(X_over.values, y_over, eval_set = evals, 
       eval_metric = 'mlogloss',
       early_stopping_rounds = 100)

sec = time.time() - start
# split('.') returns a list, so printing it directly showed
# ['H:MM:SS', '<microseconds>']; keep only the H:MM:SS part.
print('학습 소요 시간 : ', str(datetime.timedelta(seconds = sec)).split('.')[0])

[0]	validation_0-mlogloss:2.88839
[1]	validation_0-mlogloss:2.50517
[2]	validation_0-mlogloss:2.24024
[3]	validation_0-mlogloss:2.03636
[4]	validation_0-mlogloss:1.87002
[5]	validation_0-mlogloss:1.72939
[6]	validation_0-mlogloss:1.60773
[7]	validation_0-mlogloss:1.50142
[8]	validation_0-mlogloss:1.40631
[9]	validation_0-mlogloss:1.32066
[10]	validation_0-mlogloss:1.24335
[11]	validation_0-mlogloss:1.17266
[12]	validation_0-mlogloss:1.10796
[13]	validation_0-mlogloss:1.04793
[14]	validation_0-mlogloss:0.99236
[15]	validation_0-mlogloss:0.94046
[16]	validation_0-mlogloss:0.89238
[17]	validation_0-mlogloss:0.84731
[18]	validation_0-mlogloss:0.80501
[19]	validation_0-mlogloss:0.76545
[20]	validation_0-mlogloss:0.72822
[21]	validation_0-mlogloss:0.69316
[22]	validation_0-mlogloss:0.66004
[23]	validation_0-mlogloss:0.62865
[24]	validation_0-mlogloss:0.59896
[25]	validation_0-mlogloss:0.57093
[26]	validation_0-mlogloss:0.54443
[27]	validation_0-mlogloss:0.51924
[28]	validation_0-mlogloss:0.4

In [32]:
# Class probabilities for the per-id test features.
test_matrix = per_test.values
y_pred = xgb2.predict_proba(test_matrix)

# Overwrite every submission column after the first with the probabilities,
# then save the table without its index.
submission.iloc[:, 1:] = y_pred
output_path = 'submission파일/over_xgb_per.csv'
submission.to_csv(output_path, index=False)

### 제출 결과 : 1.1600021379

In [14]:
# Cut both tables into consecutive 600-row pieces, one piece per distinct id.
# NOTE(review): presumably every id owns exactly 600 contiguous rows -- confirm.
rows_per_id = 600
train_ids = train.id.nunique()
test_ids = test.id.nunique()

train_df = [
    train.iloc[offset:offset + rows_per_id]
    for offset in range(0, rows_per_id * train_ids, rows_per_id)
]

test_df = [
    test.iloc[offset:offset + rows_per_id]
    for offset in range(0, rows_per_id * test_ids, rows_per_id)
]

In [16]:
# Percentile features computed on 200-row windows of every per-id frame.
window_rows = 200
train_200 = splitTimes(train_df, window_rows)
test_200 = splitTimes(test_df, window_rows)

In [17]:
# Hold out 20% of the windowed percentile features for validation (fixed seed).
X_train, X_test , y_train, y_test = train_test_split(train_200, y, test_size = 0.2, 
                                                    random_state = 42)
# Number of distinct labels present in each partition.
print( y_train.nunique(), y_test.nunique())

evals = [(X_test.values, y_test)]


# Fit the existing `xgb` estimator on the windowed features while reporting
# multi-class log loss on `evals`.
start = time.time()
xgb.fit(X_train.values, y_train, eval_set = evals, 
       eval_metric = 'mlogloss',
       early_stopping_rounds = 100)

sec = time.time() - start
# split('.') returns a list, so printing it directly showed
# ['H:MM:SS', '<microseconds>']; keep only the H:MM:SS part.
print('학습 소요 시간 : ', str(datetime.timedelta(seconds = sec)).split('.')[0])

61 61
[0]	validation_0-mlogloss:3.48345
[1]	validation_0-mlogloss:3.21176
[2]	validation_0-mlogloss:3.01345
[3]	validation_0-mlogloss:2.85457
[4]	validation_0-mlogloss:2.72340
[5]	validation_0-mlogloss:2.60933
[6]	validation_0-mlogloss:2.50974
[7]	validation_0-mlogloss:2.42234
[8]	validation_0-mlogloss:2.34375
[9]	validation_0-mlogloss:2.27158
[10]	validation_0-mlogloss:2.20393
[11]	validation_0-mlogloss:2.14248
[12]	validation_0-mlogloss:2.08699
[13]	validation_0-mlogloss:2.03585
[14]	validation_0-mlogloss:1.98827
[15]	validation_0-mlogloss:1.94347
[16]	validation_0-mlogloss:1.90260
[17]	validation_0-mlogloss:1.86461
[18]	validation_0-mlogloss:1.82804
[19]	validation_0-mlogloss:1.79369
[20]	validation_0-mlogloss:1.76179
[21]	validation_0-mlogloss:1.73104
[22]	validation_0-mlogloss:1.70121
[23]	validation_0-mlogloss:1.67347
[24]	validation_0-mlogloss:1.64745
[25]	validation_0-mlogloss:1.62270
[26]	validation_0-mlogloss:1.59962
[27]	validation_0-mlogloss:1.57624
[28]	validation_0-mloglo