In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

feature_name_df = pd.read_csv('./UCI HAR Dataset/features.txt', sep='\s+',
                              header=None, names=['column_index', 'column_name'])
print(feature_name_df.head())

   column_index        column_name
0             1  tBodyAcc-mean()-X
1             2  tBodyAcc-mean()-Y
2             3  tBodyAcc-mean()-Z
3             4   tBodyAcc-std()-X
4             5   tBodyAcc-std()-Y


In [2]:
feature_name = feature_name_df.iloc[:, 1].values.tolist()
print(feature_name[:10])

['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X']


In [3]:
feature_dup_df = feature_name_df.groupby('column_name').count()
print(feature_dup_df[feature_dup_df['column_index'] > 1 ].count())

column_index    42
dtype: int64


In [4]:
def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(), 
                                  columns = ['dup_cnt'])
    feature_dup_df = feature_dup_df.reset_index()
    
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) if x[1] > 0 else x[0], axis=1)
    new_feature_name_df = new_feature_name_df.drop(['index'], axis=1)
    return new_feature_name_df

def get_human_dataset():
    feature_name_df = pd.read_csv('./UCI HAR Dataset/features.txt', sep='\s+',
                              header=None, names=['column_index', 'column_name'])
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()
    
    X_train = pd.read_csv('./UCI HAR Dataset/train/X_train.txt', sep='\s+', names=feature_name)
    X_test = pd.read_csv('./UCI HAR Dataset/test/X_test.txt', sep='\s+', names=feature_name)
    
    y_train = pd.read_csv('./UCI HAR Dataset/train/y_train.txt', sep='\s+', header=None, names=['action'])
    y_test = pd.read_csv('./UCI HAR Dataset/test/y_test.txt', sep='\s+', header=None, names=['action'])
    
    return X_train, X_test, y_train, y_test

In [5]:
X_train, X_test, y_train, y_test = get_human_dataset()
print('학습 피처 데이터셋 정보')
print(X_train.info())

학습 피처 데이터셋 정보
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 561 entries, tBodyAcc-mean()-X to angle(Z,gravityMean)
dtypes: float64(561)
memory usage: 31.5 MB
None


In [6]:
print(y_train['action'].value_counts())

6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: action, dtype: int64


## RandomForest

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

rf_clf = RandomForestClassifier(n_estimators=400, max_depth=15, random_state=0)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(rf_pred)
print('랜덤 포레스트 정확도: {:.4f}'.format(rf_accuracy))

[5 5 5 ... 2 2 1]
랜덤 포레스트 정확도: 0.9247


In [8]:
rf_clf.predict_proba(X_test)

array([[2.50803859e-03, 0.00000000e+00, 0.00000000e+00, 8.20914828e-02,
        9.07893251e-01, 7.50722795e-03],
       [8.03858521e-06, 0.00000000e+00, 0.00000000e+00, 1.22813160e-01,
        8.72153717e-01, 5.02508509e-03],
       [8.03858521e-06, 1.08225108e-05, 0.00000000e+00, 1.39426569e-01,
        8.60547342e-01, 7.22794751e-06],
       ...,
       [2.46951287e-01, 6.65084016e-01, 8.79646968e-02, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [3.90440489e-01, 5.08666661e-01, 9.83928496e-02, 0.00000000e+00,
        2.50000000e-03, 0.00000000e+00],
       [4.69813872e-01, 3.84188168e-01, 1.43497961e-01, 0.00000000e+00,
        0.00000000e+00, 2.50000000e-03]])

## KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)
knn_pred = knn_clf.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)
print(knn_pred)
print('KNN 정확도: {:.4f}'.format(knn_acc))

[5 5 5 ... 2 2 1]
KNN 정확도: 0.9016


In [10]:
knn_clf.predict_proba(X_test)

array([[0. , 0. , 0. , 0. , 1. , 0. ],
       [0. , 0. , 0. , 0.2, 0.8, 0. ],
       [0. , 0. , 0. , 0. , 1. , 0. ],
       ...,
       [0. , 1. , 0. , 0. , 0. , 0. ],
       [0. , 0.8, 0.2, 0. , 0. , 0. ],
       [0.8, 0.2, 0. , 0. , 0. , 0. ]])

## LDA

In [11]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X_train, y_train)
lda_pred = lda_clf.predict(X_test)
lda_acc = accuracy_score(y_test, lda_pred)
print(lda_pred)
print('NB 정확도 : {:.4f}'.format(lda_acc))

[5 5 5 ... 2 2 2]
NB 정확도 : 0.9623


In [12]:
lda_clf.predict_proba(X_test)

array([[1.28498192e-160, 7.13634312e-159, 1.59893504e-178,
        1.69164738e-009, 9.99999998e-001, 3.32984584e-081],
       [4.82936289e-140, 2.25726792e-138, 8.31180827e-167,
        2.80294898e-007, 9.99999720e-001, 5.13668964e-090],
       [2.07804300e-149, 1.46045969e-148, 9.82771576e-173,
        4.68842692e-005, 9.99953116e-001, 2.02198310e-083],
       ...,
       [8.00712792e-015, 1.00000000e+000, 6.94058854e-027,
        3.15951200e-135, 1.08534188e-135, 3.39426433e-218],
       [1.99660406e-016, 1.00000000e+000, 1.19066188e-025,
        1.71221649e-124, 5.78733725e-126, 1.32932425e-207],
       [3.72453106e-016, 1.00000000e+000, 3.57974786e-028,
        4.40150515e-143, 9.62282449e-142, 1.22096449e-223]])

## logistic regressor

In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty = 'l1', solver = 'saga', random_state=0)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)

print('로지스틱 회귀 정확도: {:.4f}'.format(lr_acc))

로지스틱 회귀 정확도: 0.9596


In [14]:
lr.predict_proba(X_test)

array([[3.85277300e-06, 3.56057794e-05, 5.18868288e-05, 5.21082570e-04,
        9.99323585e-01, 6.39871270e-05],
       [6.97524926e-06, 4.65953757e-05, 4.06029643e-06, 6.03471335e-02,
        9.39512033e-01, 8.32028367e-05],
       [7.96386528e-06, 1.48084279e-06, 3.09580451e-06, 3.08428831e-02,
        9.69092069e-01, 5.25074462e-05],
       ...,
       [3.56123172e-03, 9.91928935e-01, 3.05871422e-03, 2.95998875e-04,
        1.09468652e-03, 6.04337596e-05],
       [1.09155795e-02, 9.79583450e-01, 5.45816194e-03, 2.82483998e-03,
        1.07107032e-03, 1.46898283e-04],
       [8.88225852e-02, 8.88970167e-01, 1.39521794e-02, 2.45691001e-03,
        5.46429169e-03, 3.33866919e-04]])

## Ensemble

In [15]:
import torch

class EnsembleClassifier:
    def __init__(self, models, voting='soft', weights=None):
        
        self.models = models
        self.voting = voting
        self.weights = weights
        
        self.pred_list = []
        
    def predict(self, X_test): 
        for model in models.values():
            self.pred_list.append(model.predict_proba(X_test))
        preds = np.array(self.pred_list)
            
        if self.voting == 'soft':
            prediction = np.average(preds, axis=0, weights=self.weights)
        elif self.voting == 'hard':
            pass

        return prediction
    
    def random_search(self, X_test, y_test, num_search):
        weights = torch.softmax(torch.rand(num_search, len(self.models)), axis=1).numpy()
        
        for model in models.values():
            self.pred_list.append(model.predict_proba(X_test))
        preds = np.array(self.pred_list)

        acc = []
        for i in range(num_search):
            prediction = np.average(preds, axis=0, weights=weights[i])
            accuracy = accuracy_score(y_test, prediction.argmax(axis=1)+1)
            acc.append(accuracy)

            if i % 100 == 0:
                print('{:.2f}, {:.2f}, {:.2f}'.format(weights[i,0], weights[i,1], weights[i,2]))
                print('{}번 앙상블 모델 정확도: {:.4f}'.format(i, accuracy))
                print()
        
        result = pd.DataFrame(weights, columns=['w1', 'w2', 'w3', 'w4'])
        result['accuracy'] = acc

        return result.sort_values('accuracy', ascending=False)

In [17]:
models = {'rf' : rf_clf, 'knn':knn_clf, 'lda': lda_clf,'lr' : lr}

ensemble = EnsembleClassifier(models, voting='soft', weights=None)
result = ensemble.random_search(X_test, y_test, 2000)
result.head(10)

0.25, 0.15, 0.33
0번 앙상블 모델 정확도: 0.9684

0.29, 0.16, 0.28
100번 앙상블 모델 정확도: 0.9664

0.21, 0.30, 0.22
200번 앙상블 모델 정확도: 0.9650

0.38, 0.18, 0.28
300번 앙상블 모델 정확도: 0.9657

0.18, 0.44, 0.19
400번 앙상블 모델 정확도: 0.9566

0.34, 0.20, 0.23
500번 앙상블 모델 정확도: 0.9657

0.35, 0.21, 0.22
600번 앙상블 모델 정확도: 0.9657

0.28, 0.16, 0.34
700번 앙상블 모델 정확도: 0.9674

0.19, 0.33, 0.14
800번 앙상블 모델 정확도: 0.9640

0.24, 0.31, 0.26
900번 앙상블 모델 정확도: 0.9657

0.23, 0.31, 0.24
1000번 앙상블 모델 정확도: 0.9647

0.17, 0.31, 0.18
1100번 앙상블 모델 정확도: 0.9647

0.27, 0.17, 0.19
1200번 앙상블 모델 정확도: 0.9667

0.25, 0.15, 0.28
1300번 앙상블 모델 정확도: 0.9674

0.34, 0.17, 0.31
1400번 앙상블 모델 정확도: 0.9667

0.25, 0.26, 0.29
1500번 앙상블 모델 정확도: 0.9678

0.31, 0.22, 0.23
1600번 앙상블 모델 정확도: 0.9657

0.30, 0.32, 0.13
1700번 앙상블 모델 정확도: 0.9610

0.22, 0.24, 0.34
1800번 앙상블 모델 정확도: 0.9674

0.15, 0.31, 0.22
1900번 앙상블 모델 정확도: 0.9664



Unnamed: 0,w1,w2,w3,w4,accuracy
286,0.174314,0.18981,0.36109,0.274786,0.969121
1374,0.16935,0.220571,0.367188,0.242891,0.969121
572,0.166897,0.196536,0.360871,0.275696,0.969121
1699,0.162605,0.208052,0.363127,0.266215,0.969121
1196,0.192606,0.208997,0.356288,0.242109,0.969121
931,0.170444,0.18042,0.357716,0.29142,0.969121
1956,0.202142,0.20622,0.348841,0.242797,0.969121
1606,0.153234,0.192522,0.348692,0.305552,0.968782
780,0.1795,0.19834,0.343541,0.278619,0.968782
1657,0.152675,0.190788,0.303739,0.352798,0.968782


In [18]:
ensemble = EnsembleClassifier(models, voting='soft', weights=[0.17, 0.19, 0.36, 0.28])
preds = ensemble.predict(X_test)
print(preds)

[[4.27445336e-04 9.96961822e-06 1.45283121e-05 1.41014558e-02
  9.84152456e-01 1.29414515e-03]
 [3.31962928e-06 1.30467052e-05 1.13688300e-06 7.57755354e-02
  9.23329400e-01 8.77561260e-04]
 [3.59644176e-06 2.25446282e-06 8.66825263e-07 3.23554023e-02
  9.67621949e-01 1.59308360e-05]
 ...
 [4.29788637e-02 9.40804385e-01 1.58104384e-02 8.28796849e-05
  3.06512227e-04 1.69214527e-05]
 [6.94312454e-02 8.72756698e-01 5.62550698e-02 7.90955196e-04
  7.24899689e-04 4.11315193e-05]
 [2.56738682e-01 7.12223635e-01 2.83012635e-02 6.87934803e-04
  1.53000167e-03 5.18482737e-04]]


In [19]:
accuracy = accuracy_score(y_test, preds.argmax(axis=1)+1)
print('앙상블 모델 정확도: {:.4f}'.format(accuracy))

앙상블 모델 정확도: 0.9691
