In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

feature_name_df = pd.read_csv('./UCI HAR Dataset/features.txt', sep='\s+',
                              header=None, names=['column_index', 'column_name'])
print(feature_name_df.head())

   column_index        column_name
0             1  tBodyAcc-mean()-X
1             2  tBodyAcc-mean()-Y
2             3  tBodyAcc-mean()-Z
3             4   tBodyAcc-std()-X
4             5   tBodyAcc-std()-Y


In [2]:
feature_name = feature_name_df.iloc[:, 1].values.tolist()
print(feature_name[:10])

['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X']


In [3]:
feature_dup_df = feature_name_df.groupby('column_name').count()
print(feature_dup_df[feature_dup_df['column_index'] > 1 ].count())

column_index    42
dtype: int64


In [4]:
def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(), 
                                  columns = ['dup_cnt'])
    feature_dup_df = feature_dup_df.reset_index()
    
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) if x[1] > 0 else x[0], axis=1)
    new_feature_name_df = new_feature_name_df.drop(['index'], axis=1)
    return new_feature_name_df

def get_human_dataset():
    feature_name_df = pd.read_csv('./UCI HAR Dataset/features.txt', sep='\s+',
                              header=None, names=['column_index', 'column_name'])
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()
    
    X_train = pd.read_csv('./UCI HAR Dataset/train/X_train.txt', sep='\s+', names=feature_name)
    X_test = pd.read_csv('./UCI HAR Dataset/test/X_test.txt', sep='\s+', names=feature_name)
    
    y_train = pd.read_csv('./UCI HAR Dataset/train/y_train.txt', sep='\s+', header=None, names=['action'])
    y_test = pd.read_csv('./UCI HAR Dataset/test/y_test.txt', sep='\s+', header=None, names=['action'])
    
    return X_train, X_test, y_train, y_test

In [5]:
X_train, X_test, y_train, y_test = get_human_dataset()
print('학습 피처 데이터셋 정보')
print(X_train.info())

학습 피처 데이터셋 정보
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 561 entries, tBodyAcc-mean()-X to angle(Z,gravityMean)
dtypes: float64(561)
memory usage: 31.5 MB
None


In [6]:
print(y_train['action'].value_counts())

6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: action, dtype: int64


## RandomForest

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(rf_pred)
print('랜덤 포레스트 정확도: {:.4f}'.format(rf_accuracy))

[5 5 5 ... 2 2 2]
랜덤 포레스트 정확도: 0.9253


In [8]:
rf_clf.predict_proba(X_test)

array([[0.  , 0.  , 0.  , 0.1 , 0.89, 0.01],
       [0.  , 0.  , 0.  , 0.15, 0.85, 0.  ],
       [0.  , 0.  , 0.  , 0.08, 0.92, 0.  ],
       ...,
       [0.19, 0.73, 0.08, 0.  , 0.  , 0.  ],
       [0.36, 0.54, 0.1 , 0.  , 0.  , 0.  ],
       [0.43, 0.44, 0.13, 0.  , 0.  , 0.  ]])

## AdaBoost

In [9]:
from sklearn.ensemble import AdaBoostClassifier
import time

start_time = time.time()
ab_clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.01, random_state=0)
ab_clf.fit(X_train, y_train)
ab_pred = ab_clf.predict(X_test)
ab_accuracy = accuracy_score(y_test, ab_pred)

print('에이다부스트 정확도: {:.4f}'.format(ab_accuracy))
print('에이다부스트 수행시간: {:.1f} 초'.format(time.time() - start_time))

에이다부스트 정확도: 0.7234
에이다부스트 수행시간: 35.6 초


In [10]:
ab_clf.predict_proba(X_test)

array([[5.45881046e-09, 1.78909812e-06, 2.42621209e-09, 3.73303942e-01,
        4.04795874e-01, 2.21898387e-01],
       [5.45881046e-09, 1.78909812e-06, 2.42621209e-09, 3.73303942e-01,
        4.04795874e-01, 2.21898387e-01],
       [8.84563631e-09, 1.14034127e-06, 3.17287659e-09, 3.77538849e-01,
        3.72485584e-01, 2.49974415e-01],
       ...,
       [4.05642676e-01, 3.43655925e-01, 2.45657315e-01, 5.20985789e-07,
        1.86499032e-09, 5.04356104e-03],
       [4.05642676e-01, 3.43655925e-01, 2.45657315e-01, 5.20985789e-07,
        1.86499032e-09, 5.04356104e-03],
       [4.05642676e-01, 3.43655925e-01, 2.45657315e-01, 5.20985789e-07,
        1.86499032e-09, 5.04356104e-03]])

## logistic regressor

In [11]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)

print('로지스틱 회귀 정확도: {:.4f}'.format(lr_acc))

로지스틱 회귀 정확도: 0.9586


In [12]:
lr.predict_proba(X_test)

array([[2.48042103e-08, 2.00975109e-08, 5.19835663e-09, 8.76441196e-05,
        9.99856544e-01, 5.57618556e-05],
       [2.95201722e-08, 2.45473674e-09, 1.42412778e-11, 6.13820127e-02,
        9.38361941e-01, 2.56013955e-04],
       [3.79294431e-09, 6.93463370e-12, 6.14480616e-12, 3.93228724e-02,
        9.60555196e-01, 1.21927867e-04],
       ...,
       [1.80662485e-03, 9.96918277e-01, 1.27287565e-03, 1.00258552e-07,
        2.10703401e-06, 1.48141803e-08],
       [1.05395103e-02, 9.86998666e-01, 2.45114919e-03, 6.35020981e-06,
        4.19440573e-06, 1.30325738e-07],
       [9.51361344e-02, 8.98298943e-01, 6.53102353e-03, 4.71699150e-06,
        2.88473716e-05, 3.34567788e-07]])

## Ensemble

In [13]:
class EnsembleClassifier:
    def __init__(self, models, voting='soft', weights=None):
        
        self.models = models
        self.voting = voting
        self.weights = weights
        
    def predict(self, X_test):
        rf_clf = self.models['rf']
        ab_clf = self.models['ab']
        lr = self.models['lr']
        
        rf_pred = rf_clf.predict_proba(X_test)
        ab_pred = ab_clf.predict_proba(X_test)
        lr_pred = lr.predict_proba(X_test)
        
        preds = np.array([rf_pred, ab_pred, lr_pred])
        
        if self.voting == 'soft':
            prediction = np.average(preds, axis=0, weights=self.weights)
        elif self.voting == 'hard':
            pass

        return prediction

In [14]:
models = {'rf' : rf_clf, 'ab' : ab_clf, 'lr' : lr}

ensemble = EnsembleClassifier(models, voting='soft', weights=[0.25, 0.28, 0.47])
preds = ensemble.predict(X_test)

In [15]:
accuracy = accuracy_score(y_test, preds.argmax(axis=1)+1)
print('앙상블 모델 정확도: {:.4f}'.format(accuracy))

앙상블 모델 정확도: 0.9593


In [16]:
import torch

num = 1000
w = torch.softmax(torch.rand(num, 3), axis=1)

acc = []
for epoch in range(num):
    weights = list(w[epoch].numpy())
    ensemble = EnsembleClassifier(models, voting='soft', weights=weights)
    preds = ensemble.predict(X_test)
    accuracy = accuracy_score(y_test, preds.argmax(axis=1)+1)
    acc.append(accuracy)
    
    if epoch % 100 == 0:
        print('{:.2f}, {:.2f}, {:.2f}'.format(weights[0], weights[1], weights[2]))
        print('{} 앙상블 모델 정확도: {:.4f}'.format(epoch, accuracy))
        print()

0.41, 0.35, 0.24
0 앙상블 모델 정확도: 0.9505

0.42, 0.28, 0.30
100 앙상블 모델 정확도: 0.9532

0.32, 0.47, 0.21
200 앙상블 모델 정확도: 0.9501

0.35, 0.33, 0.32
300 앙상블 모델 정확도: 0.9539

0.25, 0.41, 0.34
400 앙상블 모델 정확도: 0.9569

0.36, 0.24, 0.40
500 앙상블 모델 정확도: 0.9569

0.25, 0.22, 0.53
600 앙상블 모델 정확도: 0.9589

0.42, 0.25, 0.33
700 앙상블 모델 정확도: 0.9542

0.31, 0.27, 0.41
800 앙상블 모델 정확도: 0.9576

0.21, 0.40, 0.39
900 앙상블 모델 정확도: 0.9583



In [17]:
result = pd.DataFrame(w.numpy(), columns=['w1', 'w2', 'w3'])
result['accuracy'] = acc
result.sort_values('accuracy', ascending=False)

Unnamed: 0,w1,w2,w3,accuracy
610,0.258671,0.259550,0.481779,0.959281
749,0.249549,0.252133,0.498318,0.959281
207,0.231672,0.262851,0.505477,0.959281
442,0.241792,0.286375,0.471832,0.959281
795,0.208162,0.321381,0.470458,0.959281
...,...,...,...,...
769,0.522354,0.270257,0.207388,0.947743
285,0.474847,0.335260,0.189892,0.947404
75,0.447734,0.353069,0.199196,0.947404
867,0.449196,0.351863,0.198941,0.947404
