In [32]:
import pandas  as pd
import numpy   as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.tree        import DecisionTreeClassifier
from sklearn.ensemble    import RandomForestClassifier
from sklearn.svm         import SVC 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [55]:
""" READING THE DATA TRAINING DATA """

columns_names = [        'activityrecognition#1',
                         'android.sensor.accelerometer#mean',
                         'android.sensor.game_rotation_vector#mean',
                         'android.sensor.gravity#mean', 
                         'android.sensor.gyroscope#mean',
                         'android.sensor.gyroscope_uncalibrated#mean',
                         'android.sensor.linear_acceleration#mean',
                         'android.sensor.magnetic_field#mean',
                         'android.sensor.magnetic_field_uncalibrated#mean',
                         'android.sensor.orientation#mean',
                         'android.sensor.rotation_vector#mean', 
                         'sound#mean', 
                         'speed#mean',
                         'target'
                ]

dt_train       = pd.read_csv('train.csv')
dt_validate    = pd.read_csv('validation.csv')
dt_test        = pd.read_csv('test.csv')

data_train     = dt_train[columns_names]
data_validate  = dt_validate[columns_names]
data_test      = dt_test[columns_names]

X_train        = data_train.drop(['target'], axis=1)
y_train        = data_train['target']

X_validate     = data_validate.drop(['target'], axis=1)
y_validate     = data_validate['target']

X_test         = data_test.drop(['target'], axis=1)
y_test         = data_test['target']


# FIVE STEPS

In [None]:
"""
Step 1. Fit the Model with the TRAIN DATA 
Step 2. Find the BEST hyperparameteres with the VALIDATION DATA
Step 3. Test the model with the TEST DATA
Step 4. Fit the model again with the hyperparameters settings included with a combined Train and Validation Data
Step 5. Test the new model with the TEST data
"""

In [56]:
""" STEP 1 """


RandomForestClassifier()

In [79]:
""" STEP 1 and 2 """

best_score = 0

for n_estimator in [ 500, 400, 800, 200, 100]:

     """Step 1"""
     model_forest   = RandomForestClassifier(n_estimators=n_estimator)
     model_forest.fit(X_train, y_train)

     """Step 2"""
     score = model_forest.score(X_validate, y_validate)
     if score > best_score:
          best_score = score
          best_parameters = {'n_estimators' : n_estimator}

print('Best Validation Parameters : ', best_parameters)
print('Best Validation Score      : ', best_score)

""" Step 3"""
model_forest_paraSetting = RandomForestClassifier(n_estimators=best_parameters['n_estimators'])
model_forest_paraSetting.fit(X_train, y_train)
score_paraSetting = model_forest_paraSetting.score(X_test, y_test)
print('Best Test  Score           : ', score_paraSetting)



Best Validation Score      :  0.6342359361880773
Best Validation Parameters :  {'n_estimators': 400}
Best Test     Score      :  0.5469384505135759


0.555935982164185

0.5570507205987738

In [40]:
model_tree     = DecisionTreeClassifier()
model_forest   = RandomForestClassifier()

# val_score_tree = cross_val_score(model_tree, X_train, y_train)
# val_score_forest = cross_val_score(model_forest, X_train, y_train)

cross_val_tree = cross_validate(model_tree, X_train, y_train, return_train_score=True)
cross_val_forest = cross_validate(model_forest, X_train, y_train, return_train_score=True)


# cross_val_tree
cross_val_tree, cross_val_forest 

({'fit_time': array([0.56082368, 0.30614996, 0.41547418, 0.32618141, 0.51728725]),
  'score_time': array([0.        , 0.00320673, 0.        , 0.        , 0.0039506 ]),
  'test_score': array([0.81026304, 0.89792984, 0.86500863, 0.90914319, 0.97929845]),
  'train_score': array([1., 1., 1., 1., 1.])},
 {'fit_time': array([7.46812606, 9.32432199, 8.27504444, 9.05557489, 8.24660873]),
  'score_time': array([0.08271742, 0.14042664, 0.15616488, 0.13512206, 0.14576507]),
  'test_score': array([0.83038666, 0.89807361, 0.93257619, 0.956728  , 0.99511213]),
  'train_score': array([1., 1., 1., 1., 1.])})

In [37]:
model_tree     = DecisionTreeClassifier()
model_forest   = RandomForestClassifier()
# model_SVC           = SVC()

model_tree.fit(X_train, y_train)
model_forest.fit(X_train, y_train)
# model_SVC.fit(X_train, y_train)

accuracy_tree_validate   = round(model_tree.score(X_validate, y_validate) * 100, 1)
accuracy_forest_validate = round(model_forest.score(X_validate, y_validate) * 100, 1)
# accuracy_SVC_3         = round(model_SVC_3.score(X_validate, y_validate) * 100, 1)

accuracy_tree_test   = round(model_tree.score(X_test, y_test) * 100, 1)
accuracy_forest_test = round(model_forest.score(X_test, y_test) * 100, 1)
# accuracy_SVC_3     = round(model_SVC_3.score(X_validate, y_validate) * 100, 1)




In [None]:
"""
Splitting Data for Training, Validation, and Testing

Once the data for building the model has been procured, it needs to be split into data for training, parameter tuning, and go-live testing. 

Conceptually, the available data is to be used for three distinct purposes. 

The first purpose is to train the model—that is, the model will try to fit this data. 

The second purpose is to determine whether the model is overfitting the data; this dataset is called the validation set. This data will not be used 
for training but will drive the decision-making on hyperparameter tuning, regularization techniques, etc. (We will discuss these topics in greater 
detail later in this chapter.) 

The third purpose of the data is to determine whether the model is really good enough to take to production/go-live (referred to as the test set).

"""

In [39]:
print("Validation Accuracy Tree : ",    accuracy_tree_validate)
print("Validation Accuracy Forest : ",  accuracy_forest_validate)
# print(accuracy_SVC_3)


print("Test Accuracy Tree : ",     accuracy_tree_test)
print("Test Accuracy Forest : ",   accuracy_forest_test)
# print(accuracy_SVC_3)

# 48.6
# 63.1

Validation Accuracy Tree :  48.5
Validation Accuracy Forest :  63.2
Test Accuracy Tree :  54.2
Test Accuracy Forest :  46.7


In [6]:
""" ACCURACY WITH DEFAULT VALUES """

# accuracy_decision_tree_3 = round(model_decision_tree_3.score(X_validate, y_validate) * 100, 1)
# accuracy_random_forest_3 = round(model_random_forest_3.score(X_validate, y_validate) * 100, 1)
# accuracy_SVC_3           = round(model_SVC_3.score(X_validate, y_validate) * 100, 1)

# print(accuracy_decision_tree_3)
# print(accuracy_random_forest_3)
# print(accuracy_SVC_3)

# # 47.9
# # 58.4
# # 38.7

' ACCURACY WITH DEFAULT VALUES '

In [7]:
# param_DecisionTree  = {'max_depth'   : [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]}
# param_RandomForest  = {'n_estimators': [0.1, 1, 10, 100, 100, 1000],    'max_depth'     : [3, 4, 5, 6, 7, 8, 9]}
# param_SVC           = {'kernel'      : ['linear', 'rbf'],                            'C'           : [0.01, 0.1, 1, 10, 100, 1000]} 

# grid_search_DecisionTree = GridSearchCV(DecisionTreeClassifier(), param_DecisionTree, cv=5)
# grid_search_RandomForest = GridSearchCV(RandomForestClassifier(), param_RandomForest, cv=5)
# grid_search_SVC          = GridSearchCV(SVC(), param_SVC, cv=5)

# grid_search_DecisionTree.fit(X_validate, y_validate)
# grid_search_RandomForest.fit(X_validate, y_validate)
# grid_search_SVC.fit(X_validate, y_validate)

# print(f'Decision Tree    =>    Best Score : {round(grid_search_DecisionTree.best_score_ * 100, 2)}    Best Parameters :   {grid_search_DecisionTree.best_params_}  ')
# print(f'Random Forest    =>    Best Score : {round(grid_search_RandomForest.best_score_ * 100, 2)}    Best Parameters :   {grid_search_RandomForest.best_params_}  ')
# print(f'SVC              =>    Best Score : {round(grid_search_SVC.best_score_ * 100, 2)}    Best Parameters :   {grid_search_SVC.best_params_}  ')


In [12]:
# max_depth = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
# acc_train = []
# acc_validate = []


# for depth in max_depth:


#      model_decision_tree = DecisionTreeClassifier(max_depth=depth)


#      model_decision_tree.fit(X_train, y_train)

#      acc_tree_train      = round(model_decision_tree.score(X_train, y_train) * 100, 1)
#      acc_tree_validate   = round(model_decision_tree.score(X_validate, y_validate) * 100, 1)

#      acc_train.append(acc_tree_train)
#      acc_validate.append(acc_tree_validate)

# print(acc_train)
# print(acc_validate)
     

[71.5, 82.0, 86.8, 90.9, 93.5, 96.2, 97.9, 98.8, 99.4, 99.7, 99.8, 99.9]
[20.6, 32.2, 49.0, 49.0, 62.4, 49.6, 45.8, 49.4, 50.2, 46.4, 46.5, 47.2]


# THREE FEATURES

In [81]:
columns_features  =  ['android.sensor.magnetic_field#mean', 'android.sensor.gyroscope#mean',  'speed#mean']

dt_train       = pd.read_csv('train.csv')
dt_validate    = pd.read_csv('validation.csv')
dt_test        = pd.read_csv('test.csv')

data_train     = dt_train[columns_names]
data_validate  = dt_validate[columns_names]
data_test      = dt_test[columns_names]

X_train        = data_train[columns_features]
y_train        = data_train['target']

X_validate     = data_validate[columns_features]
y_validate     = data_validate['target']

X_test         = data_test[columns_features]
y_test         = data_test['target']

""" STEP 1 and 2 """

best_score = 0

for n_estimator in [ 500, 400, 800, 200, 100]:

     """Step 1"""
     model_forest   = RandomForestClassifier(n_estimators=n_estimator)
     model_forest.fit(X_train, y_train)

     """Step 2"""
     score = model_forest.score(X_validate, y_validate)
     if score > best_score:
          best_score = score
          best_parameters = {'n_estimators' : n_estimator}

print('Best Validation Parameters : ', best_parameters)
print('Best Validation Score      : ', best_score)

""" Step 3"""
model_forest_paraSetting = RandomForestClassifier(n_estimators=best_parameters['n_estimators'])
model_forest_paraSetting.fit(X_train, y_train)
score_paraSetting = model_forest_paraSetting.score(X_test, y_test)
print('Best Test  Score           : ', score_paraSetting)


Best Validation Parameters :  {'n_estimators': 100}
Best Validation Score      :  0.5066120906801007
Best Test  Score           :  0.5982960426785572


In [82]:
columns_features  = ['android.sensor.magnetic_field#mean', 'android.sensor.gyroscope#mean',   'android.sensor.game_rotation_vector#mean' ]


dt_train       = pd.read_csv('train.csv')
dt_validate    = pd.read_csv('validation.csv')
dt_test        = pd.read_csv('test.csv')

data_train     = dt_train[columns_names]
data_validate  = dt_validate[columns_names]
data_test      = dt_test[columns_names]

X_train        = data_train[columns_features]
y_train        = data_train['target']

X_validate     = data_validate[columns_features]
y_validate     = data_validate['target']

X_test         = data_test[columns_features]
y_test         = data_test['target']

""" STEP 1 and 2 """

best_score = 0

for n_estimator in [ 500, 400, 800, 200, 100]:

     """Step 1"""
     model_forest   = RandomForestClassifier(n_estimators=n_estimator)
     model_forest.fit(X_train, y_train)

     """Step 2"""
     score = model_forest.score(X_validate, y_validate)
     if score > best_score:
          best_score = score
          best_parameters = {'n_estimators' : n_estimator}

print('Best Validation Parameters : ', best_parameters)
print('Best Validation Score      : ', best_score)

""" Step 3"""
model_forest_paraSetting = RandomForestClassifier(n_estimators=best_parameters['n_estimators'])
model_forest_paraSetting.fit(X_train, y_train)
score_paraSetting = model_forest_paraSetting.score(X_test, y_test)
print('Best Test  Score           : ', score_paraSetting)


Best Validation Parameters :  {'n_estimators': 800}
Best Validation Score      :  0.542191435768262
Best Test  Score           :  0.2930965841229397


# FOUR PARAMETERS

In [83]:
columns_features  =['android.sensor.magnetic_field#mean', 'android.sensor.gyroscope#mean',  'speed#mean', 'android.sensor.game_rotation_vector#mean' ]

dt_train       = pd.read_csv('train.csv')
dt_validate    = pd.read_csv('validation.csv')
dt_test        = pd.read_csv('test.csv')

data_train     = dt_train[columns_names]
data_validate  = dt_validate[columns_names]
data_test      = dt_test[columns_names]

X_train        = data_train[columns_features]
y_train        = data_train['target']

X_validate     = data_validate[columns_features]
y_validate     = data_validate['target']

X_test         = data_test[columns_features]
y_test         = data_test['target']

""" STEP 1 and 2 """

best_score = 0

for n_estimator in [ 500, 400, 800, 200, 100]:

     """Step 1"""
     model_forest   = RandomForestClassifier(n_estimators=n_estimator)
     model_forest.fit(X_train, y_train)

     """Step 2"""
     score = model_forest.score(X_validate, y_validate)
     if score > best_score:
          best_score = score
          best_parameters = {'n_estimators' : n_estimator}

print('Best Validation Parameters : ', best_parameters)
print('Best Validation Score      : ', best_score)

""" Step 3"""
model_forest_paraSetting = RandomForestClassifier(n_estimators=best_parameters['n_estimators'])
model_forest_paraSetting.fit(X_train, y_train)
score_paraSetting = model_forest_paraSetting.score(X_test, y_test)
print('Best Test  Score           : ', score_paraSetting)


Best Validation Parameters :  {'n_estimators': 800}
Best Validation Score      :  0.5029387069689337
Best Test  Score           :  0.607134325981368


# TWELVE FEATURES

In [84]:
columns_features  =['android.sensor.accelerometer#mean', 'android.sensor.magnetic_field#mean', 'android.sensor.orientation#mean', 'android.sensor.gravity#mean', 'android.sensor.linear_acceleration#mean', 'android.sensor.gyroscope#mean', 'android.sensor.rotation_vector#mean', 'android.sensor.gyroscope_uncalibrated#mean', 'android.sensor.game_rotation_vector#mean', 'android.sensor.magnetic_field_uncalibrated#mean', 'sound#mean',  'speed#mean' ]

dt_train       = pd.read_csv('train.csv')
dt_validate    = pd.read_csv('validation.csv')
dt_test        = pd.read_csv('test.csv')

data_train     = dt_train[columns_names]
data_validate  = dt_validate[columns_names]
data_test      = dt_test[columns_names]

X_train        = data_train[columns_features]
y_train        = data_train['target']

X_validate     = data_validate[columns_features]
y_validate     = data_validate['target']

X_test         = data_test[columns_features]
y_test         = data_test['target']

""" STEP 1 and 2 """

best_score = 0

for n_estimator in [ 500, 400, 800, 200, 100]:

     """Step 1"""
     model_forest   = RandomForestClassifier(n_estimators=n_estimator)
     model_forest.fit(X_train, y_train)

     """Step 2"""
     score = model_forest.score(X_validate, y_validate)
     if score > best_score:
          best_score = score
          best_parameters = {'n_estimators' : n_estimator}

print('Best Validation Parameters : ', best_parameters)
print('Best Validation Score      : ', best_score)

""" Step 3"""
model_forest_paraSetting = RandomForestClassifier(n_estimators=best_parameters['n_estimators'])
model_forest_paraSetting.fit(X_train, y_train)
score_paraSetting = model_forest_paraSetting.score(X_test, y_test)
print('Best Test  Score           : ', score_paraSetting)


Best Validation Parameters :  {'n_estimators': 400}
Best Validation Score      :  0.5993912678421495
Best Test  Score           :  0.5618281710327255


In [52]:

# """ Shuffle all DataFrame"""
# dt_train = dt_train.sample(frac=1)
# dt_validate = dt_validate.sample(frac=1)
# dt_test = dt_test.sample(frac=1)

# model_tree     = DecisionTreeClassifier()
# model_forest   = RandomForestClassifier()

# val_score_tree = cross_val_score(model_tree, X_train, y_train)
# val_score_forest = cross_val_score(model_forest, X_train, y_train)

# cross_val_tree   = cross_validate(model_tree, X_train, y_train, return_train_score=True, cv=5)
# cross_val_forest = cross_validate(model_forest, X_train, y_train, return_train_score=True, cv=5)



X_train        = data_train[columns_features]
y_train        = data_train['target']

X_validate     = data_validate[columns_features]
y_validate     = data_validate['target']

X_test         = data_test[columns_features]
y_test         = data_test['target']

model_tree     = DecisionTreeClassifier()
model_forest   = RandomForestClassifier()


model_tree.fit(X_train, y_train)
model_forest.fit(X_train, y_train)

test_acc_tree   = model_tree.score(X_test, y_test)
test_acc_forest = model_forest.score(X_test, y_test)

validate_acc_tree = model_tree.score(X_validate, y_validate)
validate_acc_forest = model_forest.score(X_validate, y_validate)


In [53]:
print('Test Accuracy Tree   : ', test_acc_tree)
print('Test Accuracy Forest : ', test_acc_forest)
print( )
print('Validate Accuracy Tree   : ', validate_acc_tree)
print('Validate Accuracy Forest : ', validate_acc_forest)

Test Accuracy Tree   :  0.5480531889481647
Test Accuracy Forest :  0.5932000955490087

Validate Accuracy Tree   :  0.4922334172963896
Validate Accuracy Forest :  0.4862510495382032
