In [1]:
import sklearn
import pandas as pd
import numpy as np
import mglearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from IPython.display import display

import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
train_id = train.Id
test_id = test.Id
# Id 삭제 
train.drop('Id', axis = 1, inplace=True)
test.drop('Id', axis = 1, inplace=True)

print('train_data shape', train.shape)
print('test_data shape', test.shape)

train_data shape (15120, 55)
test_data shape (565892, 54)


In [4]:
train.Cover_Type

0        5
1        5
2        2
3        2
4        5
5        2
6        5
7        5
8        5
9        5
10       5
11       2
12       2
13       5
14       5
15       5
16       5
17       5
18       5
19       5
20       5
21       2
22       5
23       5
24       5
25       5
26       5
27       2
28       2
29       5
        ..
15090    3
15091    3
15092    3
15093    6
15094    6
15095    6
15096    6
15097    3
15098    3
15099    6
15100    3
15101    6
15102    3
15103    6
15104    3
15105    6
15106    3
15107    3
15108    3
15109    6
15110    6
15111    6
15112    6
15113    3
15114    3
15115    3
15116    3
15117    3
15118    3
15119    3
Name: Cover_Type, dtype: int64

## 변수 설명
- Elevation - Elevation in meters (높이 /meter 단위)
- Aspect - Aspect in degrees azimuth (방위각)
- Slope - Slope in degrees (기울기 각도 /도 단위)
- Horizontal_Distance_To_Hydrology - Horz Dist to nearest surface water features (수원과의 수평거리)
- Vertical_Distance_To_Hydrology - Vert Dist to nearest surface water features (수원과의 수직거리)
- Horizontal_Distance_To_Roadways - Horz Dist to nearest roadway (길가와의 수평거리)
- Hillshade_9am (0 to 255 index) - Hillshade index at 9am, summer solstice (오전 9시의 차양 / 0~255)
- Hillshade_Noon (0 to 255 index) - Hillshade index at noon, summer solstice (정오시의 차양/ 0~255)
- Hillshade_3pm (0 to 255 index) - Hillshade index at 3pm, summer solstice (오후 9시의 차양/ 0~255)
- Horizontal_Distance_To_Fire_Points - Horz Dist to nearest wildfire ignition points (야생 산불 발화지점과의 수평거리)
- Wilderness_Area (4 binary columns, 0 = absence or 1 = presence) - Wilderness area designation (황야 지대 /4종류 ) in Roosevelt National Forest of northern Colorado
- Soil_Type (40 binary columns, 0 = absence or 1 = presence) - Soil Type designation (토양 종류 / 40종류)

- 토양종류와 황야 지대 카테고리별 설명은 https://www.kaggle.com/c/forest-cover-type-prediction/data 참조


## 종속 변수
- Cover_Type (7 types, integers 1 to 7) - Forest Cover Type designation- (산림 유형 / 7종류) (the predominant kind of tree cover)
- 1 - Spruce/Fir
- 2 - Lodgepole Pine
- 3 - Ponderosa Pine
- 4 - Cottonwood/Willow
- 5 - Aspen
- 6 - Douglas-fir
- 7 - Krummholz

In [5]:
# 독립변수와 종속변수 나누기
train_data = train.iloc[:, :-1]
train_target = train.iloc[:, -1]
# train 데이터 갯수 저장
ntrain = len(train)

# train, test데이터 합쳐서 같이 feature engineering
all_data = pd.concat([train_data, test])
print('all_data shape', all_data.shape)

all_data shape (581012, 54)


# type 변수들 관찰
- Soil_type7, 8, 15, 25는 훈련데이터에서 한 카테고리가 1 이하로 나왔으므로 의미가 없다. 

In [6]:
train.loc[:, 'Wilderness_Area1':'Cover_Type'].sum(axis = 0)

Wilderness_Area1     3597
Wilderness_Area2      499
Wilderness_Area3     6349
Wilderness_Area4     4675
Soil_Type1            355
Soil_Type2            623
Soil_Type3            962
Soil_Type4            843
Soil_Type5            165
Soil_Type6            650
Soil_Type7              0
Soil_Type8              1
Soil_Type9             10
Soil_Type10          2142
Soil_Type11           406
Soil_Type12           227
Soil_Type13           476
Soil_Type14           169
Soil_Type15             0
Soil_Type16           114
Soil_Type17           612
Soil_Type18            60
Soil_Type19            46
Soil_Type20           139
Soil_Type21            16
Soil_Type22           345
Soil_Type23           757
Soil_Type24           257
Soil_Type25             1
Soil_Type26            54
Soil_Type27            15
Soil_Type28             9
Soil_Type29          1291
Soil_Type30           725
Soil_Type31           332
Soil_Type32           690
Soil_Type33           616
Soil_Type34            22
Soil_Type35 

In [7]:
drop_columns = ['Soil_Type7', 'Soil_Type8', 'Soil_Type15', 'Soil_Type25']

In [None]:
train.loc[:'Wilderness_Area1']

In [8]:
all_data.drop(drop_columns, axis = 1, inplace = True)

- feature engineering이 끝났으면 다시 test, train 데이터로 나눔

In [9]:
train_data = all_data[:ntrain]
test = all_data[ntrain:]
print(train.shape)
print(test.shape)


(15120, 55)
(565892, 50)


# 모델

In [None]:
from sklearn.cross_validation import cross_val_score, ABCMeta
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.pipeline import make_pipeline
import tensorflow as tf

In [11]:
#test
encoder = OneHotEncoder()
train_target_onehot = encoder.fit_transform(train_target.reshape(-1, 1)).toarray()
train_target.shape

  app.launch_new_instance()


(15120,)

In [12]:
def cv_score(model):
    cv = KFold(n_splits=5, shuffle = True, random_state = 42).get_n_splits(train_data.values)
    return cross_val_score(model, train_data, train_target, scoring='accuracy', cv= cv)

In [13]:
svc = make_pipeline(StandardScaler(), SVC(C=1000, gamma = 0.1))

score = cv_score(svc)
print(score)
print('SVC : {:.4f}({:.4f})'.format(score.mean(), score.std()))

[ 0.71494709  0.72685185  0.70634921  0.72519841  0.8098545 ]
SVC : 0.7366(0.0374)


In [14]:
GBoost = GradientBoostingClassifier(n_estimators=3000, max_depth = 4, learning_rate =0.01, min_samples_leaf=15,
                                   min_samples_split = 10, random_state = 5)

RF = RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_leaf=2, random_state=5, n_jobs=-1)
score = cv_score(GBoost)
print(score)
print('GBoost : {:.4f}({:.4f})'.format(score.mean(), score.std()))

score = cv_score(RF)
print(score)
print('RandomForest : {:.4f}({:.4f})'.format(score.mean(), score.std()))

[ 0.74768519  0.71990741  0.75892857  0.77612434  0.83796296]
GBoost : 0.7681(0.0394)
[ 0.71957672  0.71593915  0.74272487  0.7853836   0.8260582 ]
RandomForest : 0.7579(0.0421)


In [15]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_target, random_state = 42)

print(svc.fit(X_train,y_train).score(X_test, y_test))
print(GBoost.fit(X_train,y_train).score(X_test, y_test))
print(RF.fit(X_train,y_train).score(X_test, y_test))

0.820105820106
0.851587301587
0.842592592593


In [16]:
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(train_data, train_target_onehot, random_state = 42)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

nb_classes = 7
X = tf.placeholder(tf.float32, [None, X_train.shape[1]])
Y = tf.placeholder(tf.float32, [None, nb_classes])
keep_prob = tf.placeholder(tf.float32)

W1 = tf.get_variable("WWi111", shape =[X_train.shape[1], 200], initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.Variable(tf.random_normal([200]), name = 'bias1')
L1 = tf.nn.relu(tf.matmul(X, W1) + b1)
L1 = tf.nn.dropout(L1, keep_prob=keep_prob)

W2 = tf.get_variable("WWi222", shape =[200, 300], initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.Variable(tf.random_normal([300]), name = 'bias2')
L2 = tf.nn.relu(tf.matmul(L1, W2) + b2)
L2 = tf.nn.dropout(L2, keep_prob=keep_prob)

W3 = tf.get_variable("WWi333", shape =[300, 200], initializer=tf.contrib.layers.xavier_initializer())
b3 = tf.Variable(tf.random_normal([200]), name = 'bias3')
L3 = tf.nn.relu(tf.matmul(L2, W3) + b3)
L3 = tf.nn.dropout(L3, keep_prob=keep_prob)

W4 = tf.get_variable("WWi444", shape =[200, nb_classes], initializer=tf.contrib.layers.xavier_initializer())
b4 = tf.Variable(tf.random_normal([nb_classes]), name = 'bias4')
hypothesis = tf.matmul(L3, W4) + b4

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = hypothesis, labels = Y))

optimizer = tf.train.AdamOptimizer(learning_rate = 0.001).minimize(cost)
prediction = tf.armax(hypothesis, 1)
is_correct = tf.equal(prediction, tf.argmax(Y, 1))

accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(2000):
        
        c , _ = sess.run([cost, optimizer], feed_dict = {X: X_train, Y: y_train, keep_prob: 0.7})
        if step  % 100 == 0:
            print('step : {}, cost = {}'.format(step, c))
    print('train set : ', sess.run(accuracy, feed_dict = {X: X_train, Y: y_train, keep_prob: 1.0}))
    acc = sess.run(accuracy, feed_dict = {X: X_test, Y: y_test, keep_prob: 1.0})    
    print('test set : ', acc)
    

NameError: name 'scaler' is not defined

In [22]:
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# 그리드 서치
- for xgboost
- 다른 모델도 추후 시도

In [26]:
param_test = {
 'max_depth':[4,5,6, 7],
 'min_child_weight':range(4, 10, 2)
}
cv = KFold(5, shuffle = True, random_state=42)
model_xgb = xgb.XGBClassifier(n_estimators=200, objective='multi:softmax', random_state =7, nthread = -1)
grid = GridSearchCV(model_xgb, param_test, scoring='accuracy', n_jobs=-1, cv = cv)

grid.fit(train_data, train_target)




NameError: name 'gsearch2' is not defined

In [28]:
grid.grid_scores_, grid.best_params_, grid.best_score_




([mean: 0.81012, std: 0.00651, params: {'max_depth': 4, 'min_child_weight': 4},
  mean: 0.81290, std: 0.00758, params: {'max_depth': 4, 'min_child_weight': 6},
  mean: 0.81138, std: 0.00772, params: {'max_depth': 4, 'min_child_weight': 8},
  mean: 0.83280, std: 0.00710, params: {'max_depth': 5, 'min_child_weight': 4},
  mean: 0.83036, std: 0.00866, params: {'max_depth': 5, 'min_child_weight': 6},
  mean: 0.82996, std: 0.00806, params: {'max_depth': 5, 'min_child_weight': 8},
  mean: 0.84405, std: 0.00911, params: {'max_depth': 6, 'min_child_weight': 4},
  mean: 0.84008, std: 0.00701, params: {'max_depth': 6, 'min_child_weight': 6},
  mean: 0.83862, std: 0.00864, params: {'max_depth': 6, 'min_child_weight': 8},
  mean: 0.85126, std: 0.00684, params: {'max_depth': 7, 'min_child_weight': 4},
  mean: 0.85112, std: 0.00687, params: {'max_depth': 7, 'min_child_weight': 6},
  mean: 0.84663, std: 0.00982, params: {'max_depth': 7, 'min_child_weight': 8}],
 {'max_depth': 7, 'min_child_weight': 4