In [1]:
# Importing Classifier Modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier


import numpy as np
import pandas as pd
import matplotlib.pylab as plt

#import warnings
#warnings.filterwarnings('ignore')



In [2]:
# Load the competition CSVs from the working directory.
# The first column of train.csv is dropped positionally (presumably the row
# identifier — TODO confirm); test keeps all of its columns because `Id` is
# needed later when the submission file is written.
train = pd.read_csv('./train.csv').iloc[:, 1:]
test = pd.read_csv('./test.csv')

# Split features / target by column NAME rather than by position, and keep the
# target 1-D: a single-column DataFrame makes scikit-learn emit a
# DataConversionWarning (column_or_1d) on every fit.
dfX = train.drop(columns='Cover_Type')
dfy = train['Cover_Type']

## 데이터 탐색

In [None]:
# Preview the last rows of the test frame.
# (Uncomment the next line to look at the training frame instead.)
#train.tail()
test.tail()

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Number of training rows per Cover_Type class.
train.groupby('Cover_Type').size()

* train datasets 15120개
* test datasets 565892개
* 독립변수 12개
* 종속변수 1개 Cover_Type

## 변수 설명

* Elevation - Elevation in meters (높이 /meter 단위)
* Aspect - Aspect in degrees azimuth (방위각)
* Slope - Slope in degrees (기울기 각도 /도 단위)
* Horizontal_Distance_To_Hydrology - Horz Dist to nearest surface water features (수원과의 수평거리)
* Vertical_Distance_To_Hydrology - Vert Dist to nearest surface water features (수원과의 수직거리)
* Horizontal_Distance_To_Roadways - Horz Dist to nearest roadway (길가와의 수평거리)
* Hillshade_9am (0 to 255 index) - Hillshade index at 9am, summer solstice (오전 9시의 차양 / 0~255)
* Hillshade_Noon (0 to 255 index) - Hillshade index at noon, summer solstice (정오시의 차양/ 0~255)
* Hillshade_3pm (0 to 255 index) - Hillshade index at 3pm, summer solstice (오후 3시의 차양/ 0~255)
* Horizontal_Distance_To_Fire_Points - Horz Dist to nearest wildfire ignition points (야생 산불 발화지점과의 수평거리)
* Wilderness_Area (4 binary columns, 0 = absence or 1 = presence) - Wilderness area designation (황야 지대 /4종류 ) in Roosevelt National Forest of northern Colorado
* Soil_Type (40 binary columns, 0 = absence or 1 = presence) - Soil Type designation (토양 종류 / 40종류)
* 토양종류와 황야 지대 카테고리별 설명은 https://www.kaggle.com/c/forest-cover-type-prediction/data 참조

## 종속 변수

* Cover_Type (7 types, integers 1 to 7) - Forest Cover Type designation- (산림 유형 / 7종류) (the predominant kind of tree cover)
* 1 - Spruce/Fir
* 2 - Lodgepole Pine
* 3 - Ponderosa Pine
* 4 - Cottonwood/Willow
* 5 - Aspen
* 6 - Douglas-fir
* 7 - Krummholz

In [None]:
# Summary statistics for the first 20 columns of train (positions 0-19).
train.iloc[:,:20].describe()

In [None]:
# Summary statistics for train columns at positions 20-39.
train.iloc[:,20:40].describe()

In [None]:
# Summary statistics for train columns at positions 40-54.
train.iloc[:,40:55].describe()

## data cleaning

In [3]:
def add_distance_features(df):
    """Return a copy of ``df`` with the engineered distance features added.

    The same seven features are computed for train and test:

    * ``Distanse_to_Hydrolody`` - Euclidean distance to the nearest water
      feature, from the horizontal and vertical distances (column name kept
      exactly as spelled in the original notebook).
    * ``<A>_<B>_1`` - sum of two horizontal distances.
    * ``<A>_<B>_2`` - absolute difference of two horizontal distances.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the hydrology, fire-point and roadway distance columns.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified. Re-applying the function
        overwrites the seven columns in place instead of duplicating them.
    """
    hydro_h = df['Horizontal_Distance_To_Hydrology']
    hydro_v = df['Vertical_Distance_To_Hydrology']
    fire = df['Horizontal_Distance_To_Fire_Points']
    road = df['Horizontal_Distance_To_Roadways']
    return df.assign(
        Distanse_to_Hydrolody=(hydro_h**2 + hydro_v**2)**0.5,
        Hydro_Fire_1=hydro_h + fire,
        Hydro_Fire_2=abs(hydro_h - fire),
        Hydro_Road_1=abs(hydro_h + road),
        Hydro_Road_2=abs(hydro_h - road),
        Fire_Road_1=abs(fire + road),
        Fire_Road_2=abs(fire - road),
    )


train = add_distance_features(train)
test = add_distance_features(test)

# New columns are appended at the end, i.e. AFTER Cover_Type. Later cells split
# X / y by position (last column = target), so move the target back to the
# last position; otherwise the label leaks into the features and
# 'Fire_Road_2' is used as the target.
if 'Cover_Type' in train.columns:
    feature_cols = [col for col in train.columns if col != 'Cover_Type']
    train = train[feature_cols + ['Cover_Type']]

In [4]:
# From both train and test data
train = train.drop(['Soil_Type7', 'Soil_Type15'], axis = 1)
test = test.drop(['Soil_Type7', 'Soil_Type15'], axis = 1)

# split data into X and y
dfX = train.iloc[:,:-1]
dfy = train.iloc[:,[-1]]

## scale

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# The first ten columns of train are scaled; everything from column 10 onward
# (including the target) is carried over untouched.
numeric_cols = train.columns[:10]
category_data = train.iloc[:, 10:]


def _scaled_frame(scaler):
    """Fit ``scaler`` on the first ten train columns.

    Returns a ``(scaled_array, frame)`` pair, where ``frame`` is the scaled
    columns joined with the untouched remaining columns. The scaled block
    reuses ``train.index`` so the column-wise concat aligns row for row even
    when the index is not a plain 0..n-1 range.
    """
    scaled = scaler.fit_transform(train[numeric_cols])
    scaled_df = pd.DataFrame(scaled, columns=numeric_cols, index=train.index)
    return scaled, pd.concat([scaled_df, category_data], axis=1)


standard_scaler = StandardScaler()
robust_scaled, robust_data = _scaled_frame(RobustScaler())
standard_scaled, standard_data = _scaled_frame(standard_scaler)
minmax_scaled, minmax_data = _scaled_frame(MinMaxScaler())

scaled_data_list = [robust_data, standard_data, minmax_data]

# Split each scaled frame by target NAME (not "last column"), so the split
# stays correct regardless of where Cover_Type sits among the columns.
# NOTE(review): the scalers are fitted on train only and `test` is not
# transformed in this cell — apply the fitted scalers to test before
# predicting with a model trained on any of these frames.
roX, roy = robust_data.drop(columns='Cover_Type'), robust_data['Cover_Type']
stanX, stany = standard_data.drop(columns='Cover_Type'), standard_data['Cover_Type']
minX, miny = minmax_data.drop(columns='Cover_Type'), minmax_data['Cover_Type']


## Cross Validation (K-fold)

In [7]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
#k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

def cv_score(model, train_data, train_target, n_splits=5, random_state=42):
    """Return the per-fold accuracy of ``model`` under shuffled K-fold CV.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.
    train_data : array-like of shape (n_samples, n_features)
        Feature matrix.
    train_target : array-like
        Class labels. A single-column 2-D target (e.g. ``df.iloc[:, [-1]]``)
        is flattened to 1-D so the estimators do not emit a
        DataConversionWarning on every fold.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 42
        Seed for the fold shuffling, so repeated calls use the same folds.

    Returns
    -------
    numpy.ndarray
        One accuracy value per fold.
    """
    target = train_target
    if getattr(target, 'ndim', 1) == 2 and target.shape[1] == 1:
        target = np.ravel(target)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    return cross_val_score(model, train_data, target, scoring='accuracy', cv=cv, n_jobs=-1)

In [10]:
# Candidate classifiers. Every model here is stochastic, so each gets a fixed
# seed to make scores and feature importances reproducible across runs.
#
# XGBoost uses 'multi:softprob' rather than 'multi:softmax': these models are
# combined with voting='soft' later in the notebook, which needs per-class
# probabilities from predict_proba. predict() still returns the class label.
# NOTE(review): recent xgboost releases expect 0-based class labels, while
# Cover_Type is 1..7 — confirm against the installed version.
xgb_clf = XGBClassifier(n_estimators=500, max_depth=10, learning_rate=0.03,
                        objective='multi:softprob', random_state=42)
gboost_clf = GradientBoostingClassifier(n_estimators=200, random_state=42)
dt_clf = DecisionTreeClassifier(max_depth=10, random_state=42)
extra_clf = ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)


In [None]:
# Per-fold cross-validated accuracy for each candidate, printed in the order
# XGBoost, gradient boosting, decision tree, extra trees, random forest.
for estimator in (xgb_clf, gboost_clf, dt_clf, extra_clf, rf_clf):
    print(cv_score(estimator, dfX, dfy))

In [None]:
%%time
xgb_clf.fit(X_train, y_train)
#gboost_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
extra_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [None]:
# Pair each legend label with its fitted estimator, then split the pairs into
# the two parallel lists consumed by the plotting cell.
labelled_models = [
    ('XGB', xgb_clf),
    ('Gradient', gboost_clf),
    ('Decision', dt_clf),
    ('ExtraTree', extra_clf),
    ('RandomForest', rf_clf),
]
model_names = [label for label, _ in labelled_models]
importances = [fitted.feature_importances_ for _, fitted in labelled_models]

In [None]:
# Feature importance of every fitted model, one line per model.
# The x-axis is derived from dfX instead of a hard-coded 54, so the plot keeps
# working when features are added or dropped. dfX must be the frame the models
# were fitted on, otherwise the importance vectors and the tick labels will
# not have matching lengths.
feature_names = list(dfX.columns)
positions = list(range(len(feature_names)))

fig, ax = plt.subplots(figsize=(15, 7))
for color, label, importance in zip(['r', 'y', 'b', 'm', 'g'], model_names, importances):
    ax.plot(positions, importance, c=color, label=label)

# Axis decoration does not depend on the model, so it is done once here.
ax.set_xticks(positions)
ax.set_xticklabels(feature_names, fontsize=9, rotation='vertical')
ax.set_ylabel('Importance')
ax.set_title('Feature importance by model')
ax.legend(loc='best')
plt.show()

## Voting

In [None]:
# Soft-voting ensemble: averages the predicted class probabilities of the
# four member models. Gradient boosting is left out of this ensemble.
voting_members = [
    ('xgb', xgb_clf),
    ('dt', dt_clf),
    ('extra', extra_clf),
    ('rf', rf_clf),
]
s_voting_clf = VotingClassifier(estimators=voting_members, voting='soft')

# Hard-voting variant over all five models, kept for reference:
#h_voting_clf = VotingClassifier(estimators=[('xgb', xgb_clf),('gdboost',gboost_clf),('dt',dt_clf),
#                                          ('extra', extra_clf),('rf',rf_clf)], voting='hard')

In [None]:
#print(cv_score(s_voting_clf, dfX, dfy))
#print(cv_score(h_voting_clf, dfX, dfy))

## 데이터 셋 분리

In [6]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split.
# * random_state pins the split so the reported scores are reproducible.
# * stratify keeps the class proportions equal in both parts, so every
#   Cover_Type is present in the evaluation set.
X_train, X_test, y_train, y_test = train_test_split(
    dfX, dfy, test_size=0.3, random_state=42, stratify=dfy)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Display names for the seven cover types, in label order 1..7.
target = ['class{}'.format(label) for label in range(1, 8)]

# Fit each candidate on the training part and report hold-out accuracy plus
# the per-class precision / recall table.
candidates = (xgb_clf, dt_clf, extra_clf, rf_clf, s_voting_clf)
for clf in candidates:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    holdout_accuracy = accuracy_score(y_test, y_pred)
    print(type(clf).__name__, holdout_accuracy)
    report = classification_report(y_test, y_pred, target_names=target)
    print(report)

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging over 200 copies of the depth-limited decision tree.
# The base estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# first positional argument works with both spellings.
bag_clf = BaggingClassifier(dt_clf, n_estimators=200, n_jobs=-1, random_state=0)

# Flatten the target so a single-column DataFrame does not trigger the
# column-vector DataConversionWarning.
bag_clf.fit(X_train, np.ravel(y_train))
y_pred = bag_clf.predict(X_test)
print(bag_clf.__class__.__name__, accuracy_score(y_test, y_pred))
print (classification_report(y_test, y_pred, target_names=target))

## 데이터 오버샘플링

In [5]:
def merge(df, count, source=None, limit=10):
    """Oversample cover types 1 and 2 by prepending duplicated rows to ``df``.

    For every step from ``count`` up to (but excluding) ``limit``, one copy of
    all Cover_Type == 1 rows followed by all Cover_Type == 2 rows of
    ``source`` is placed in front of ``df``. ``limit - count`` therefore
    decides how many extra copies of those two classes are generated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the duplicates are prepended to.
    count : int
        Starting step; ``count >= limit`` returns ``df`` itself, unchanged.
    source : pandas.DataFrame, optional
        Frame the class-1 / class-2 rows are taken from. Defaults to the
        notebook-level ``train`` frame, which is what the original
        implementation always used.
    limit : int, default 10
        Step at which duplication stops.

    Returns
    -------
    pandas.DataFrame
        ``df`` when nothing is added, otherwise a new frame. Original index
        labels are kept, so the result contains duplicate index values.
    """
    if count >= limit:
        return df
    if source is None:
        source = train
    add_1 = source.loc[source['Cover_Type'] == 1]
    add_2 = source.loc[source['Cover_Type'] == 2]

    # Collect all pieces first and concatenate once, instead of recursing and
    # re-copying the growing frame at every level.
    pieces = []
    while count < limit:
        pieces.extend((add_1, add_2))
        count += 1
    pieces.append(df)
    return pd.concat(pieces, axis=0)

# Prepend ten extra copies of the Cover_Type 1 and 2 rows to the training frame.
new_train = merge(train, 0)

# Split features / target by column NAME, so the split stays correct wherever
# Cover_Type sits among the columns; the target is kept 1-D.
# NOTE(review): this overwrites dfX / dfy with the oversampled data. Any
# cross-validation or hold-out split run afterwards can place copies of the
# same row on both sides of the split, which inflates the measured accuracy.
dfX = new_train.drop(columns='Cover_Type')
dfy = new_train['Cover_Type']

## 오버샘플링

In [None]:
# Explicit names instead of `import *`, so it is clear where they come from.
from imblearn.over_sampling import RandomOverSampler, SMOTE
from collections import Counter

# `fit_sample` was removed from imbalanced-learn; `fit_resample` is the
# supported spelling of the same call.
ros = RandomOverSampler(random_state=42)
#X_res, y_res = ros.fit_resample(dfX, dfy)   # alternative: plain random oversampling
X_res, y_res = SMOTE(random_state=0).fit_resample(dfX, dfy)

# Count class labels over the flattened target values. Calling Counter on a
# DataFrame directly would iterate over its column labels instead of its rows.
print('Resampled dataset shape {}'.format(Counter(np.ravel(y_res).tolist())))

## submission

In [None]:
# Final soft-voting ensemble, trained on whatever dfX / dfy currently hold.
# NOTE(review): several cells above reassign dfX / dfy — confirm which version
# is meant to be used for the submission.
final_clf = VotingClassifier(estimators=[('xgb', xgb_clf),('dt',dt_clf),
                                          ('extra', extra_clf),('rf',rf_clf)], voting='soft')
final_clf.fit(dfX, np.ravel(dfy))  # 1-D target avoids the column-vector warning

# Feed the model exactly the training feature columns, in training order.
# Merely dropping "Id" would silently pass mismatched columns whenever train
# and test have diverged; fail loudly here instead.
missing_cols = [col for col in dfX.columns if col not in test.columns]
if missing_cols:
    raise ValueError('test is missing training feature columns: {}'.format(missing_cols))
test_data = test[list(dfX.columns)].copy()
prediction = final_clf.predict(test_data)

In [None]:
# Two-column submission frame: the test row ids next to the predicted class.
# NOTE(review): the target column in train is spelled 'Cover_Type'; confirm
# which header spelling the competition expects before uploading.
submission_columns = {"Id": test["Id"], "Cover_type": prediction}
submission = pd.DataFrame(submission_columns)

# Write without the DataFrame index so the file holds only the two columns.
submission.to_csv('submission5.csv', index=False)

In [None]:
# Read the written file back and preview its first rows as a sanity check.
submission = pd.read_csv('submission5.csv')
submission.head()

In [None]:
# Number of submitted rows per predicted Cover_type value.
submit_group = submission.groupby('Cover_type')
submit_group.count()