In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams

In [2]:
# Widen pandas' display limits so wide/long frames are not truncated.
pd.set_option("display.max_columns", 10000)
pd.set_option("display.max_rows", 500)
# Use a larger default figure size so plots are not cut off.
rcParams["figure.figsize"] = (8, 8)

In [3]:
# Load the training data from the CSV file next to the notebook.
df_train = pd.read_csv("df_train.csv")
# Preview the first five rows (author's note: there are no categorical variables).
df_train.head(n=5)

Unnamed: 0,ID,ACTIVITY,TIME,SL,EEG,BP,HR,CIRCLUATION
0,0,4,1954.23,55.5372,-368.0,97,33,5
1,1,0,17450.0,120164.0,-7539.0,66,347,4535
2,2,0,12024.5,19634.0,-3361.0,86,167,973
3,3,4,13413.0,46643.3,-5735.43,65,273,2061
4,4,2,5325.98,3491.26,-809.996,50,51,132


In [4]:
# Check the dimensions of the frame as (rows, columns).
df_train.shape

(14744, 8)

In [5]:
# Missing values: count the nulls in each column.
df_train.isnull().sum(axis=0)

ID             0
ACTIVITY       0
TIME           0
SL             0
EEG            0
BP             0
HR             0
CIRCLUATION    0
dtype: int64

In [6]:
# Summary statistics for the rows where ACTIVITY == 0.
# Author's observation: the maximum of HR is large.
df_train.loc[df_train["ACTIVITY"].eq(0)].describe()

Unnamed: 0,ID,ACTIVITY,TIME,SL,EEG,BP,HR,CIRCLUATION
count,4144.0,4144.0,4144.0,4144.0,4144.0,4144.0,4144.0,4144.0
mean,7431.656612,0.0,12591.616006,115457.3,-11992.26,72.125241,258.43195,4047.658542
std,4261.905552,0.0,5841.193613,186134.1,213848.9,68.109948,148.467966,5003.893847
min,1.0,0.0,3294.91,409.0,-12626000.0,0.0,35.0,35.0
25%,3716.75,0.0,8370.6175,16487.75,-8076.738,27.0,146.0,890.0
50%,7427.5,0.0,11325.6,51905.5,-5279.5,50.0,224.5,2325.0
75%,11144.5,0.0,15438.3,134770.0,-3209.75,91.0,333.0,5322.0
max,14735.0,0.0,50895.5,2426140.0,-90.0,506.0,986.0,52210.0


In [7]:
# Summary statistics for the rows where ACTIVITY == 1.
df_train.loc[df_train["ACTIVITY"].eq(1)].describe()

Unnamed: 0,ID,ACTIVITY,TIME,SL,EEG,BP,HR,CIRCLUATION
count,458.0,458.0,458.0,458.0,458.0,458.0,458.0,458.0
mean,7178.146288,1.0,6589.688799,17781.711444,-2267.538325,40.644105,118.072052,918.229258
std,4358.347644,0.0,3161.580207,25923.767237,1767.509162,46.899002,77.282491,1236.274594
min,13.0,1.0,2202.39,46.0085,-5880.0,0.0,33.0,5.0
25%,3457.5,1.0,4150.305,2263.4125,-4196.9825,13.0,54.0,165.0
50%,7128.0,1.0,5711.195,4767.42,-1460.0,21.0,85.0,343.0
75%,10992.0,1.0,9210.4475,17210.625,-917.5,35.0,183.0,931.0
max,14715.0,1.0,18117.0,143758.0,-119.0,150.0,449.0,5537.0


In [8]:
# Summary statistics for the rows where ACTIVITY == 5.
# Author's observation: the mean of SL is high.
# NOTE(review): the recorded ACTIVITY == 0 table shows a higher SL mean -- confirm which class this refers to.
df_train.loc[df_train["ACTIVITY"].eq(5)].describe()

Unnamed: 0,ID,ACTIVITY,TIME,SL,EEG,BP,HR,CIRCLUATION
count,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0,1524.0
mean,7356.812992,5.0,10339.166385,51015.052655,-3640.910748,56.026247,192.870735,2177.274934
std,4215.15336,0.0,4350.68919,73701.184359,2235.958982,35.20954,101.408374,2450.85519
min,15.0,5.0,2017.36,48.09,-13400.0,0.0,33.0,5.0
25%,3646.0,5.0,7914.935,9368.2125,-4912.5,30.0,130.0,517.0
50%,7505.0,5.0,9470.5,30388.0,-3251.9,46.0,179.0,1747.0
75%,10904.5,5.0,12443.825,53891.325,-2088.57,75.0,262.0,2599.0
max,14731.0,5.0,27054.1,482830.0,4950.0,186.0,540.0,15505.0


In [9]:
# Summary statistics for the rows where ACTIVITY == 2.
# Author's observation: the maximum of EEG is large.
df_train.loc[df_train["ACTIVITY"].eq(2)].describe()

Unnamed: 0,ID,ACTIVITY,TIME,SL,EEG,BP,HR,CIRCLUATION
count,2261.0,2261.0,2261.0,2261.0,2261.0,2261.0,2261.0,2261.0
mean,7442.908448,2.0,9176.841827,49866.161553,-3049.441965,50.708978,168.885449,2047.480318
std,4260.345169,0.0,5068.905955,85192.388157,2550.66122,35.397493,121.661843,2992.65594
min,4.0,2.0,2120.99,45.7832,-14500.0,0.0,33.0,5.0
25%,3677.0,2.0,5320.82,4217.6,-4120.0,20.0,79.0,292.0
50%,7564.0,2.0,7569.59,9877.02,-2210.0,42.0,128.0,587.0
75%,11107.0,2.0,12253.9,60244.4,-1200.0,76.0,235.0,2904.0
max,14741.0,2.0,26195.7,476183.0,9640.0,170.0,537.0,15505.0


In [10]:
# Summary statistics for the rows where ACTIVITY == 3.
df_train.loc[df_train["ACTIVITY"].eq(3)].describe()

Unnamed: 0,ID,ACTIVITY,TIME,SL,EEG,BP,HR,CIRCLUATION
count,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0,3218.0
mean,7327.045991,3.0,11275.797887,72844.889598,-3372.357,51.098508,215.197949,2902.607831
std,4240.369392,0.0,5105.104854,95974.274839,25172.72,34.086469,123.020704,3195.029661
min,6.0,3.0,2275.3,42.2242,-15200.0,0.0,33.0,5.0
25%,3700.25,3.0,7465.59,12017.35,-5050.0,25.0,128.0,689.0
50%,7197.5,3.0,10075.6,39116.95,-3107.055,42.0,196.0,1885.0
75%,10958.25,3.0,13818.275,87562.55,-2230.805,74.0,254.0,3630.0
max,14743.0,3.0,26626.3,480315.0,1410000.0,207.0,540.0,15505.0


In [11]:
# Separate the explanatory variables (x) from the target variable (t).
# ID is dropped together with the target so it is not used as a feature.
x = df_train.drop(columns=["ID", "ACTIVITY"])
t = df_train.loc[:, "ACTIVITY"]
# Show the container types of both pieces.
tuple(map(type, (x, t)))

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [12]:
# Split into training and validation data (30% held out; fixed seed).
from sklearn.model_selection import train_test_split

x_train, x_val, t_train, t_val = train_test_split(
    x,
    t,
    test_size=0.3,
    random_state=1,
)
# Row counts: training split, validation split, full data.
tuple(len(part) for part in (x_train, x_val, x))

(10320, 4424, 14744)

In [12]:
# Try a linear regression as a first experiment.
from sklearn.linear_model import LinearRegression

In [13]:
# Fit the linear-regression baseline on the training split.
# Later cells fit a classifier on the same target, so this is only a rough first try.
model = LinearRegression().fit(x_train, t_train)
# Scores on the training and validation splits, in that order.
# Author's verdict on the recorded values: a very poor score.
tuple(
    model.score(features, target)
    for features, target in ((x_train, t_train), (x_val, t_val))
)

(0.05750703536412072, 0.05167337512448644)

## 改善

In [13]:
# Declare the model: bring in the classifier class used by the later cells.
# NOTE: the recorded run of this cell raised ModuleNotFoundError, so the xgboost
# package must be installed in the kernel's environment before this import works.
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Module-level alias for xgboost. The visible cells only use XGBClassifier
# directly; `xgb` itself is not referenced in the part of the notebook shown here.
import xgboost as xgb

In [None]:
# Build the classifier; random_state is fixed to keep runs reproducible.
model1 = XGBClassifier(random_state=0, max_depth=10)
# Train the model on the training split.
model1.fit(x_train, t_train)
# Validate the model: print the training score first, then the validation score.
train_score = model1.score(x_train, t_train)
val_score = model1.score(x_val, t_val)
print(train_score)
print(val_score)

# 重みの調整

In [None]:
# Tabulate the learned weight of each explanatory variable.
#
# The original cell read `model0.coef_` and `df2.drop(labels=['result'], axis=1)`,
# but neither `model0`, `df2` nor a 'result' column exists in this notebook, so
# it raised NameError on a fresh kernel. The only fitted estimator here that has
# `coef_` is the LinearRegression stored in `model`, and it was fitted on
# `x_train`, so both are used instead.
# NOTE(review): confirm the linear model is the one whose weights are wanted.

# Fitted weights, one per feature column.
coef = model.coef_

# Feature names, in the same column order the model was fitted on.
x_col = x_train.columns

# Build the table in one step rather than filling an empty frame column by column.
df_w = pd.DataFrame({'index': x_col, 'coef': coef})

# Show the five largest weights, sorted in descending order.
df_w.sort_values('coef', ascending=False).head(5)
