In [1]:
import urllib
import zipfile
import os

DOWNLOAD_ROOT = "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip"
DATASET_PATH = "dataset"

def fetch_human_activity_data(download_url = DOWNLOAD_ROOT, dataset_path = DATASET_PATH):
    os.makedirs(dataset_path, exist_ok = True)
    zip_path = os.path.join(dataset_path, "UCI_HAR_Dataset.zip")
    urllib.request.urlretrieve(download_url, zip_path)
    data_zip_file = zipfile.ZipFile(zip_path)
    extracted = data_zip_file.namelist()
    data_zip_file.extractall(dataset_path)
    extracted_path = os.path.join(dataset_path, extracted[0])
    data_zip_file.close()
    return extracted_path
    

In [2]:
extracted_path = fetch_human_activity_data()

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
feature_name_df = pd.read_csv(extracted_path+'features.txt', sep = '\s+', header = None, names = ['column_index', 'column_name'])

feature_name = feature_name_df.iloc[:, 1].values.tolist()
feature_name

['tBodyAcc-mean()-X',
 'tBodyAcc-mean()-Y',
 'tBodyAcc-mean()-Z',
 'tBodyAcc-std()-X',
 'tBodyAcc-std()-Y',
 'tBodyAcc-std()-Z',
 'tBodyAcc-mad()-X',
 'tBodyAcc-mad()-Y',
 'tBodyAcc-mad()-Z',
 'tBodyAcc-max()-X',
 'tBodyAcc-max()-Y',
 'tBodyAcc-max()-Z',
 'tBodyAcc-min()-X',
 'tBodyAcc-min()-Y',
 'tBodyAcc-min()-Z',
 'tBodyAcc-sma()',
 'tBodyAcc-energy()-X',
 'tBodyAcc-energy()-Y',
 'tBodyAcc-energy()-Z',
 'tBodyAcc-iqr()-X',
 'tBodyAcc-iqr()-Y',
 'tBodyAcc-iqr()-Z',
 'tBodyAcc-entropy()-X',
 'tBodyAcc-entropy()-Y',
 'tBodyAcc-entropy()-Z',
 'tBodyAcc-arCoeff()-X,1',
 'tBodyAcc-arCoeff()-X,2',
 'tBodyAcc-arCoeff()-X,3',
 'tBodyAcc-arCoeff()-X,4',
 'tBodyAcc-arCoeff()-Y,1',
 'tBodyAcc-arCoeff()-Y,2',
 'tBodyAcc-arCoeff()-Y,3',
 'tBodyAcc-arCoeff()-Y,4',
 'tBodyAcc-arCoeff()-Z,1',
 'tBodyAcc-arCoeff()-Z,2',
 'tBodyAcc-arCoeff()-Z,3',
 'tBodyAcc-arCoeff()-Z,4',
 'tBodyAcc-correlation()-X,Y',
 'tBodyAcc-correlation()-X,Z',
 'tBodyAcc-correlation()-Y,Z',
 'tGravityAcc-mean()-X',
 'tGravityA

In [5]:
feature_duplicate_df = feature_name_df.groupby('column_name').count()
print(feature_duplicate_df[feature_duplicate_df['column_index'] > 1].count())

column_index    42
dtype: int64


In [6]:
def get_new_feature_name_df(old_feature_name_df):
    feature_duplicate_df = pd.DataFrame(data = old_feature_name_df.groupby('column_name').cumcount(), columns = ['duplicate_count'])
    
    feature_duplicate_df = feature_duplicate_df.reset_index()
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_duplicate_df, how = 'outer')
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'duplicate_count']].apply(lambda x: x[0] + '_' + str(x[1]) if x[1] > 0 else x[0], axis = 1)
    new_feature_name_df = new_feature_name_df.drop(['index'], axis = 1)
    return new_feature_name_df

In [7]:
def get_har_dataset():
    feature_name_df = pd.read_csv(extracted_path+'features.txt', sep = '\s+', header = None, names = ['column_index', 'column_name'])
    
    new_feature_name_df = get_new_feature_name_df(feature_name_df)
    features_name = new_feature_name_df.iloc[:, 1].values.tolist()
    
    X_train = pd.read_csv(extracted_path+'train/X_train.txt', sep = '\s+', names = features_name)
    X_test = pd.read_csv(extracted_path+'test/X_test.txt', sep = '\s+', names = features_name)
    
    
    y_train = pd.read_csv(extracted_path+'train/y_train.txt', sep = '\s+', header = None, names = ['action'])
    y_test = pd.read_csv(extracted_path+'test/y_test.txt', sep = '\s+', header = None, names = ['action'])
    
    
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = get_har_dataset()

In [9]:
print('Training Dataset Info')
print(X_train.info())

Training Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7352 entries, 0 to 7351
Columns: 561 entries, tBodyAcc-mean()-X to angle(Z,gravityMean)
dtypes: float64(561)
memory usage: 31.5 MB
None


In [10]:
print(y_train['action'].value_counts())

6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: action, dtype: int64


In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt_clf = DecisionTreeClassifier(random_state = 156)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('Decision Tree Prediction Accuracy: {0:.4f}'.format(accuracy))

print('DecisionTreeClassifier Basic Parameter :\n', dt_clf.get_params())

Decision Tree Prediction Accuracy: 0.8548
DecisionTreeClassifier Basic Parameter :
 {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': 'deprecated', 'random_state': 156, 'splitter': 'best'}


In [16]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [6, 8, 10, 12, 16, 20, 24]
}

for depth in max_depths:
    dt_clf = DecisionTreeClassifier(max_depth = depth, random_state = 156)
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_train)
    accuracy = accuracy_score(y_test, pred)
    print('max_depth = {0}  Accuracy: {1:.4f}'.format(depth, accuracy))

NameError: name 'max_depths' is not defined