# Raw Data Training

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from matplotlib import rc

# Plot text setup: draw minus signs with the plain ASCII hyphen (axes.unicode_minus = False)
# and render all text with the NanumMyeongjo font family.
plt.rcParams['axes.unicode_minus'] = False
rc('font', family='NanumMyeongjo')

In [19]:
# Read the four CSV inputs from the working directory, in this order:
# training features, test features, training labels, sample submission.
train, test, train_label, submission = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_features.csv',
        'test_features.csv',
        'train_labels.csv',
        'sample_submission.csv',
    )
)

In [4]:
# Attach each id's label columns to every feature row of that id (outer join on `id`).
df = train.merge(train_label, how='outer', on='id')

In [5]:
# Display the merged frame (the notebook renders a truncated preview).
df

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,label,label_desc
0,0,0,1.206087,-0.179371,-0.148447,-0.591608,-30.549010,-31.676112,37,Shoulder Press (dumbbell)
1,0,1,1.287696,-0.198974,-0.182444,0.303100,-39.139103,-24.927216,37,Shoulder Press (dumbbell)
2,0,2,1.304609,-0.195114,-0.253382,-3.617278,-44.122565,-25.019629,37,Shoulder Press (dumbbell)
3,0,3,1.293095,-0.230366,-0.215210,2.712986,-53.597843,-27.454013,37,Shoulder Press (dumbbell)
4,0,4,1.300887,-0.187757,-0.222523,4.286707,-57.906561,-27.961234,37,Shoulder Press (dumbbell)
...,...,...,...,...,...,...,...,...,...,...
1874995,3124,595,-0.712530,-0.658357,0.293707,-29.367857,-104.013664,-76.290437,2,Bicep Curl
1874996,3124,596,-0.683037,-0.658466,0.329223,-30.149089,-101.796809,-76.625087,2,Bicep Curl
1874997,3124,597,-0.664730,-0.666625,0.364114,-27.873095,-98.776072,-79.365125,2,Bicep Curl
1874998,3124,598,-0.630534,-0.682565,0.373696,-23.636550,-99.139495,-80.259478,2,Bicep Curl


In [6]:
# Model inputs: the six raw signal columns (presumably accelerometer and gyroscope axes).
X = df[[
    'acc_x', 'acc_y', 'acc_z',
    'gy_x', 'gy_y', 'gy_z',
]]
# Target: the integer class label carried on every row.
y = df['label']

In [7]:
# Number of target rows.
y.shape

(1875000,)

In [8]:
# (rows, feature columns) of the feature frame.
X.shape

(1875000, 6)

In [9]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, accuracy_score

In [10]:
# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
# NOTE(review): the split is over individual rows, so rows sharing an `id` can land on
# both sides; the validation score may be optimistic. Consider a split grouped by `id`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Count the distinct labels present in the training split.
y_train.nunique()

61

In [13]:
# Count the distinct labels present in the held-out split.
y_test.nunique()

61

In [14]:
# Held-out (features, labels) pair passed as `eval_set` to the boosters' fit calls below.
evals = [(X_test, y_test)]

In [15]:
# Multi-class XGBoost on the raw rows. The held-out pair in `evals` is scored with
# multi-class log loss each boosting round, with early stopping after 100 rounds.
# NOTE(review): the fitted estimator reports tree_method='exact'; on roughly 1.5M rows
# that is presumably a major reason this cell is so slow. Consider tree_method='hist'.
# NOTE(review): newer XGBoost releases expect `eval_metric` / `early_stopping_rounds`
# on the constructor rather than in fit(). Confirm against the installed version.
xgb_wrapper = XGBClassifier(
    objective='multi:softprob',
    max_depth=6,
    learning_rate=0.05,
    n_estimators=400,
)

xgb_wrapper.fit(
    X_train,
    y_train,
    eval_set=evals,
    eval_metric='mlogloss',
    early_stopping_rounds=100,
)

[0]	validation_0-mlogloss:3.59973
[1]	validation_0-mlogloss:3.36773
[2]	validation_0-mlogloss:3.20345
[3]	validation_0-mlogloss:3.07198
[4]	validation_0-mlogloss:2.96342
[5]	validation_0-mlogloss:2.87158
[6]	validation_0-mlogloss:2.79093
[7]	validation_0-mlogloss:2.71861
[8]	validation_0-mlogloss:2.65434
[9]	validation_0-mlogloss:2.59587
[10]	validation_0-mlogloss:2.54217
[11]	validation_0-mlogloss:2.49350
[12]	validation_0-mlogloss:2.44845
[13]	validation_0-mlogloss:2.40638
[14]	validation_0-mlogloss:2.36765
[15]	validation_0-mlogloss:2.33109
[16]	validation_0-mlogloss:2.29715
[17]	validation_0-mlogloss:2.26527
[18]	validation_0-mlogloss:2.23497
[19]	validation_0-mlogloss:2.20665
[20]	validation_0-mlogloss:2.17963
[21]	validation_0-mlogloss:2.15419
[22]	validation_0-mlogloss:2.12989
[23]	validation_0-mlogloss:2.10738
[24]	validation_0-mlogloss:2.08547
[25]	validation_0-mlogloss:2.06475
[26]	validation_0-mlogloss:2.04503
[27]	validation_0-mlogloss:2.02624
[28]	validation_0-mlogloss:2.0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

집계함수 값을 활용한 데이터셋을 사용했을 땐 XGBoost 모델이 효과적이었으나,  
Raw data 전체를 사용할 경우, 데이터양이 600배 증가함에 따라 수행시간도 크게 늘어남.  
따라서 XGBoost 모델을 사용할 때 n_estimators를 줄이거나 학습률을 높이는 등 파라미터 조정이 필요함.  
해당 셀은 실행에 28시간이 넘게 걸림.  
stop 누르고 싶은 마음이 가득하다. 

In [16]:
# Class probabilities for the held-out rows, then their multi-class log loss.
y_pred = xgb_wrapper.predict_proba(
    X_test
)
print(
    'log loss : ',
    log_loss(y_test, y_pred),
)

log loss :  1.2840915686146008


In [24]:
# Shape of the held-out probability matrix.
y_pred.shape

(375000, 61)

In [28]:
# Show the merged training frame again.
df

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z,label,label_desc
0,0,0,1.206087,-0.179371,-0.148447,-0.591608,-30.549010,-31.676112,37,Shoulder Press (dumbbell)
1,0,1,1.287696,-0.198974,-0.182444,0.303100,-39.139103,-24.927216,37,Shoulder Press (dumbbell)
2,0,2,1.304609,-0.195114,-0.253382,-3.617278,-44.122565,-25.019629,37,Shoulder Press (dumbbell)
3,0,3,1.293095,-0.230366,-0.215210,2.712986,-53.597843,-27.454013,37,Shoulder Press (dumbbell)
4,0,4,1.300887,-0.187757,-0.222523,4.286707,-57.906561,-27.961234,37,Shoulder Press (dumbbell)
...,...,...,...,...,...,...,...,...,...,...
1874995,3124,595,-0.712530,-0.658357,0.293707,-29.367857,-104.013664,-76.290437,2,Bicep Curl
1874996,3124,596,-0.683037,-0.658466,0.329223,-30.149089,-101.796809,-76.625087,2,Bicep Curl
1874997,3124,597,-0.664730,-0.666625,0.364114,-27.873095,-98.776072,-79.365125,2,Bicep Curl
1874998,3124,598,-0.630534,-0.682565,0.373696,-23.636550,-99.139495,-80.259478,2,Bicep Curl


In [47]:
# Show the raw test features (same signal columns as training, without labels).
test

Unnamed: 0,id,time,acc_x,acc_y,acc_z,gy_x,gy_y,gy_z
0,3125,0,-0.628100,-0.160155,0.151487,49.665357,88.435961,13.597668
1,3125,1,-0.462548,0.012462,-0.053726,56.953059,96.185341,16.278458
2,3125,2,-0.363481,-0.091789,-0.130004,29.557396,93.836453,13.329043
3,3125,3,-0.351750,-0.239870,-0.193053,23.686172,88.608721,13.449771
4,3125,4,-0.312934,-0.123762,-0.318621,20.410071,85.327707,13.884912
...,...,...,...,...,...,...,...,...
469195,3906,595,0.104191,-0.784979,0.639513,-10.475346,14.095361,-190.358982
469196,3906,596,0.103297,-0.758954,0.615687,-25.360272,-8.523018,-180.393291
469197,3906,597,0.128294,-0.749389,0.586184,-27.917723,-23.186245,-162.624160
469198,3906,598,0.104130,-0.692731,0.573397,-27.847980,-30.407555,-138.761676


In [43]:
# Select the same six signal columns, in the same order, that the model was trained on.
test_f = test[[
    'acc_x', 'acc_y', 'acc_z',
    'gy_x', 'gy_y', 'gy_z',
]]

In [44]:
# Class probabilities for every row of the test features.
# NOTE: this rebinds `y_pred`, which previously held the held-out split's probabilities.
y_pred = xgb_wrapper.predict_proba(test_f)

submission을 위해 해당 모델로 테스트 데이터를 predict 한 결과, 각 id 별로 600개의 예측값이 존재해 제공된 submission 데이터 프레임과 형태가 맞지 않음.   
따라서 id 별 예측값의 평균을 submission 값으로 저장.  

In [56]:
# Inspect the per-row test probabilities.
y_pred

array([[1.36316940e-03, 8.01303613e-05, 1.69148995e-03, ...,
        1.72061585e-02, 1.86940050e-03, 1.67856843e-03],
       [1.45742204e-03, 4.36491508e-04, 3.31648887e-04, ...,
        2.92640878e-04, 2.34261993e-03, 9.76187119e-04],
       [2.53158039e-03, 4.88976657e-04, 5.23612252e-04, ...,
        1.39111688e-03, 4.41191136e-04, 1.38119713e-03],
       ...,
       [1.42004574e-03, 3.58801824e-03, 2.04276876e-03, ...,
        1.68267852e-05, 1.30686015e-02, 2.97697261e-03],
       [1.36319303e-03, 3.64622404e-03, 5.53244073e-03, ...,
        2.14922457e-05, 1.48219028e-02, 1.18271215e-02],
       [2.44486541e-03, 5.26721543e-03, 8.26635398e-03, ...,
        3.71879833e-05, 1.25481291e-02, 7.56804505e-03]], dtype=float32)

In [53]:
# Label each per-row probability vector with its test `id`, then average the rows of
# each id so there is one probability row per id.
pred = (
    pd.DataFrame(y_pred, index=test['id'])
    .groupby(level='id')
    .mean()
)

In [54]:
# Inspect the per-id averaged probabilities.
pred

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3125,0.003449,0.000062,0.000931,0.003083,0.018060,0.000421,0.000006,0.000297,0.000594,0.069489,...,0.015747,0.004294,0.004336,0.001438,0.001958,0.000068,0.004157,0.054684,0.000227,0.010679
3126,0.004263,0.029233,0.000049,0.007901,0.012441,0.013548,0.000055,0.006757,0.004620,0.014349,...,0.002799,0.000047,0.003560,0.007881,0.004894,0.008646,0.000882,0.001692,0.005981,0.009625
3127,0.002431,0.024255,0.000027,0.008347,0.000927,0.004695,0.001717,0.030282,0.006211,0.000107,...,0.000446,0.000008,0.000233,0.007008,0.001529,0.017098,0.000042,0.000016,0.018437,0.002558
3128,0.009688,0.003575,0.019961,0.005380,0.004720,0.004631,0.000006,0.004719,0.000946,0.002391,...,0.004174,0.006508,0.001120,0.002669,0.019398,0.002181,0.012559,0.015291,0.001488,0.014054
3129,0.002873,0.002965,0.002997,0.006552,0.009898,0.005457,0.000004,0.000524,0.004553,0.004095,...,0.002565,0.000031,0.001171,0.000094,0.005614,0.000292,0.010269,0.001200,0.003573,0.004279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3902,0.003295,0.000713,0.000028,0.009505,0.020922,0.003621,0.000011,0.006978,0.000489,0.024125,...,0.003018,0.000114,0.010128,0.002708,0.002319,0.003902,0.001180,0.000231,0.006244,0.008141
3903,0.004704,0.005731,0.000255,0.005819,0.016970,0.010324,0.001720,0.004348,0.001552,0.007717,...,0.006917,0.000110,0.006811,0.001821,0.003734,0.011956,0.004399,0.000790,0.013011,0.006611
3904,0.003112,0.001699,0.002125,0.005098,0.010835,0.005355,0.000004,0.002891,0.010699,0.012627,...,0.005926,0.000195,0.002120,0.000140,0.005200,0.001639,0.010588,0.004103,0.001225,0.006921
3905,0.002165,0.006213,0.002139,0.009750,0.000892,0.001644,0.026070,0.006721,0.000747,0.000070,...,0.001280,0.000033,0.000067,0.024909,0.023789,0.013866,0.000067,0.000123,0.048659,0.000220


In [55]:
# Copy the per-id mean probabilities into the submission frame and save it.
#
# `pred` is indexed by test id and has integer column labels (it was built from a NumPy
# array). Assigning the DataFrame itself lets pandas align it against `submission`'s own
# row/column labels, which, depending on the pandas version, can leave NaN instead of the
# predictions. Assigning the bare array makes the copy purely positional.
# NOTE(review): this relies on `submission` rows being in the same ascending-id order as
# `pred` (groupby sorts its keys). Confirm against sample_submission.csv.
prob_columns = submission.columns[1:]
pred_values = pred.to_numpy()
expected_shape = (len(submission), len(prob_columns))
if pred_values.shape != expected_shape:
    raise ValueError(
        f'prediction shape {pred_values.shape} does not match '
        f'submission shape {expected_shape}'
    )
submission[prob_columns] = pred_values
submission.to_csv('submission파일/raw_xgb400.csv', index = False)

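제출 결과값 : 4.11    
참고: 4.11은 61개 클래스를 모두 같은 확률로 예측했을 때의 log loss(ln 61 ≈ 4.11)와 같은 값이므로, 예측값이 제출 파일에 제대로 반영되지 않았을 가능성이 있음.  
전체 데이터셋으로 할 경우, submission 의 데이터셋과 맞지 않음.  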

In [None]:
from lightgbm import LGBMClassifier
import time
import datetime

In [None]:
# Fit a LightGBM baseline on the same split and report the wall-clock time it took.
# NOTE(review): newer LightGBM releases take early stopping via a callback rather than
# the `early_stopping_rounds` fit argument. Confirm against the installed version.
start = time.time()
lgbm_wrapper = LGBMClassifier(n_estimators = 100, learning_rate = 0.1, random_state = 0)
lgbm_wrapper.fit(X_train, y_train, eval_set = evals,
                 early_stopping_rounds = 30)

sec = time.time() - start
# timedelta's keyword is `seconds` (plural); `second=` raises TypeError.
# Splitting on '.' drops the fractional seconds from the H:MM:SS string.
times = str(datetime.timedelta(seconds = sec)).split('.')[0]
print('수행 시간 : ', times)