## Classification

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

In [3]:
# For randomization and re-producability of results
random.seed(123)
np.random.seed(123)

In [4]:
run_results=pd.DataFrame(columns=['Classifier', 'Mean Fit Time(s)', 'Mean Test Time(s)', 
                'Mean Train Score', 'Mean CV Score', 'Best Train Score','Test Score','F1 Score'])

In [5]:
df_data = pd.read_csv('../resources/data/expanded_data_v2.csv')

In [6]:
df_data = df_data.sample(frac=1, random_state=123).reset_index(drop=True)

In [7]:
# check for features which are highly co-related and remove

corr_matrix = df_data.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
cols_to_remove=[column for column in upper.columns if any(upper[column] > 0.97)]

In [8]:
cols_to_remove

[]

In [9]:
df_data = df_data.drop(columns=cols_to_remove)

In [10]:
df_data.shape

(3500, 34)

In [11]:
df_data.head()

Unnamed: 0,d_core_nose,d_core_lelbow,d_core_relbow,d_core_lwrist,d_core_rwrist,d_core_lknee,d_core_rknee,d_core_lankle,d_core_rankle,d_lshoulder_lwrist,d_rshoulder_rwrist,d_lhip_lelbow,d_rhip_relbow,d_lshoulder_lknee,d_rshoulder_rknee,d_lhip_lankle,d_rhip_rankle,d_lknee_lfidx,d_rknee_rfidx,d_lwrist_rwrist,d_lelbow_relbow,d_lshoulder_rshoulder,d_lhip_rhip,d_lknee_rknee,a_elbows_neck,a_knees_hip,a_spine,a_core_ground,v_left_up,v_left_down,v_right_up,v_right_down,Class,SubClass
0,0.821917,0.240235,0.145781,0.275192,0.268853,1.256654,1.208919,2.133646,1.895589,0.837798,0.741616,0.650276,0.606798,1.786838,1.683875,1.551726,1.340691,0.894373,0.834819,0.006022,0.091019,0.090447,0.031091,0.04389,0.033414,0.970826,0.630864,0.560907,0.998451,0.952736,0.99221,0.623866,lunges,start
1,0.605611,0.914863,0.605855,0.859605,0.777918,1.072089,1.142277,1.143797,1.088999,0.361755,0.338954,1.412602,1.053164,1.605436,1.50084,0.623412,0.60238,1.059116,0.987218,0.147369,0.37724,0.178048,0.10124,0.016627,0.852375,0.00142,0.769855,0.375635,1.011535,1.015288,0.993032,0.671219,crunches,end
2,0.993892,0.667778,0.854203,1.036318,1.179891,1.344774,1.364955,2.155819,2.371929,0.753406,0.68645,1.097686,1.180817,1.777668,1.899362,1.649074,1.835984,0.900753,0.984986,0.13801,0.170951,0.177578,0.098258,0.073676,0.991378,0.996908,0.442946,0.215625,0.999175,0.621688,0.998014,0.987327,planks,planks
3,0.831321,0.115402,0.092763,0.352271,0.275765,1.259532,1.235923,2.029439,1.801466,0.756151,0.73095,0.496015,0.519265,1.647928,1.701989,1.520186,1.284016,0.958697,0.744694,0.099603,0.067411,0.071644,0.041806,0.098106,0.004488,0.008623,0.610965,0.56252,0.979851,0.96322,0.997037,0.702067,squats,start
4,1.033904,0.810301,0.593412,1.112249,0.961799,1.311834,1.293691,2.123221,2.03852,0.696007,0.686419,1.145276,1.009841,1.855232,1.758999,1.680237,1.539646,0.966781,0.884496,0.140984,0.208352,0.126165,0.060408,0.020436,0.9254,0.996623,0.494231,0.73743,0.999171,0.983033,0.998399,0.65804,planks,planks


In [11]:
df_data['SubClass'] = df_data['Class']+'-'+df_data['SubClass']

In [12]:
df_data['SubClass'].value_counts()

random-random          1000
planks-planks           500
lunges-start            250
crunches-end            250
squats-start            250
lunges-end              250
jumping_jacks-end       250
crunches-start          250
squats-end              250
jumping_jacks-start     250
Name: SubClass, dtype: int64

In [13]:
y_train = df_data['SubClass']
# x_train = df_data.drop(columns=['Class','SubClass'])
x_train = df_data.drop(columns=['SubClass'])

In [15]:
x_train, x_test, y_train, y_test = train_test_split( x_train, 
                        y_train, train_size=0.7, random_state=123, stratify=y_train)

In [14]:
subclas_encoder = LabelEncoder()
subclas_encoder.fit(y_train)

LabelEncoder()

In [15]:
clas_encoder = LabelEncoder()
clas_encoder.fit(x_train['Class'])

LabelEncoder()

In [16]:
x_train.loc[:,'Class'] = clas_encoder.transform(x_train['Class'])

In [None]:
x_test.loc[:,'Class'] = clas_encoder.transform(x_test['Class'])

In [18]:
y_train = subclas_encoder.transform(y_train)

In [26]:
y_test = subclas_encoder.transform(y_test)

In [27]:
def classifier_analyzer(classifier, params):
  ss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=123)
  # we are explicitly passing StratifiedShuffleSplit because we want the CV data to
  # shuffles in each split which is not the default behaviour of GridSearchCV 
  gsCV = GridSearchCV(classifier, params, scoring='roc_auc_ovr', n_jobs=-1, refit=True, 
                                cv=ss, return_train_score=True, error_score='raise')
  gscv_result = gsCV.fit(x_train, y_train).cv_results_
  print("Mean fit time : %.3fs" % gscv_result['mean_fit_time'].mean())
  print("Mean test time : %.3fs" % gscv_result['mean_score_time'].mean())
  print("Mean train score : %.3f" % gscv_result['mean_train_score'].mean())
  print("Mean CV score : %.3f" % gscv_result['mean_test_score'].mean())

  # Get the train score on the best estimator
  print("Best Train Score : %.3f" % accuracy_score(y_train, gsCV.predict(x_train)))

  # Get the test score on the best estimator
  y_pred = gsCV.predict(x_test)
  print("Best Test Score  : %.3f" % accuracy_score(y_test, y_pred))

  print("Best params : ", gsCV.best_params_)
  return y_pred

In [28]:
# Logistic Regression

lr_clf = LogisticRegression(class_weight='balanced', random_state=123, n_jobs=-1)
# not all the combination of penalty and solver will be compatible so we define
# a list of params dict. First we fix the solver param, then go to fix C
lr_params = [{'penalty' : ['l2'], 
              'solver':['newton-cg', 'sag', 'lbfgs'] }, 
             {'penalty' : ['elasticnet'], 
              'solver':['saga'],
              'l1_ratio':[0, 0.25, 0.5, 0.75, 1]}]
y_pred = classifier_analyzer(lr_clf, lr_params)

Mean fit time : 0.726s
Mean test time : 0.013s
Mean train score : 1.000
Mean CV score : 1.000
Best Train Score : 0.998
Best Test Score  : 0.991
Best params :  {'penalty': 'l2', 'solver': 'newton-cg'}


In [29]:
print("Classification Report for the best params : ")
print(classification_report(y_test, y_pred, target_names=subclas_encoder.classes_))

Classification Report for the best params : 
                     precision    recall  f1-score   support

       crunches-end       1.00      1.00      1.00        75
     crunches-start       1.00      1.00      1.00        75
  jumping_jacks-end       1.00      1.00      1.00        75
jumping_jacks-start       1.00      1.00      1.00        75
         lunges-end       1.00      1.00      1.00        75
       lunges-start       1.00      1.00      1.00        75
      planks-planks       0.96      1.00      0.98       150
      random-random       1.00      0.97      0.98       300
         squats-end       0.99      1.00      0.99        75
       squats-start       0.99      1.00      0.99        75

           accuracy                           0.99      1050
          macro avg       0.99      1.00      0.99      1050
       weighted avg       0.99      0.99      0.99      1050



In [30]:
run_results.loc[run_results.shape[0]]=['Logistic Reg', 0.726, 0.013, 1.0, 
                                       1.0, 0.998, 0.991, 0.99]

In [31]:
# KNN 

knn_clf = KNeighborsClassifier(n_jobs=-1)
knn_params = {'n_neighbors':[3,5,7,10,15], 
              'weights':['uniform','distance'], 
             'metric':['cosine','minkowski','euclidean']}

y_pred = classifier_analyzer(knn_clf, knn_params)

Mean fit time : 0.007s
Mean test time : 0.121s
Mean train score : 1.000
Mean CV score : 1.000
Best Train Score : 1.000
Best Test Score  : 1.000
Best params :  {'metric': 'cosine', 'n_neighbors': 3, 'weights': 'uniform'}


In [32]:
print("Classification Report for the best params : ")
print(classification_report(y_test, y_pred, target_names=subclas_encoder.classes_))

Classification Report for the best params : 
                     precision    recall  f1-score   support

       crunches-end       1.00      1.00      1.00        75
     crunches-start       1.00      1.00      1.00        75
  jumping_jacks-end       1.00      1.00      1.00        75
jumping_jacks-start       1.00      1.00      1.00        75
         lunges-end       1.00      1.00      1.00        75
       lunges-start       1.00      1.00      1.00        75
      planks-planks       1.00      1.00      1.00       150
      random-random       1.00      1.00      1.00       300
         squats-end       1.00      1.00      1.00        75
       squats-start       1.00      1.00      1.00        75

           accuracy                           1.00      1050
          macro avg       1.00      1.00      1.00      1050
       weighted avg       1.00      1.00      1.00      1050



In [33]:
run_results.loc[run_results.shape[0]]=['kNN', 0.007, 0.121, 1.0, 1.0, 
                                       1.00, 1.0, 1.0]

In [34]:
# SVM 

svm_clf = SVC(probability=True, class_weight='balanced', random_state=123)
svm_params = {'C':[0.001, 0.01, 0.1, 1, 10], 
              'kernel':['rbf', 'poly', 'sigmoid']}

y_pred = classifier_analyzer(svm_clf, svm_params)

Mean fit time : 2.284s
Mean test time : 0.112s
Mean train score : 0.814
Mean CV score : 0.814
Best Train Score : 1.000
Best Test Score  : 1.000
Best params :  {'C': 1, 'kernel': 'poly'}


In [35]:
print("Classification Report for the best params : ")
print(classification_report(y_test, y_pred, target_names=subclas_encoder.classes_))

Classification Report for the best params : 
                     precision    recall  f1-score   support

       crunches-end       1.00      1.00      1.00        75
     crunches-start       1.00      1.00      1.00        75
  jumping_jacks-end       1.00      1.00      1.00        75
jumping_jacks-start       1.00      1.00      1.00        75
         lunges-end       1.00      1.00      1.00        75
       lunges-start       1.00      1.00      1.00        75
      planks-planks       1.00      1.00      1.00       150
      random-random       1.00      1.00      1.00       300
         squats-end       1.00      1.00      1.00        75
       squats-start       1.00      1.00      1.00        75

           accuracy                           1.00      1050
          macro avg       1.00      1.00      1.00      1050
       weighted avg       1.00      1.00      1.00      1050



In [36]:
run_results.loc[run_results.shape[0]]=['SVM', 2.284, 0.112, 0.814, 
                                       0.814, 1.0, 1.0, 1.0]

In [37]:
# SGD Classifier 

sgd_clf = SGDClassifier(class_weight='balanced', early_stopping=False, n_jobs=-1, random_state=123)

sgd_params = {'loss': ['log', 'modified_huber'], 
              'penalty': ['l2', 'elasticnet'], 
              'max_iter': [100, 300, 500, 700],
              'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
              'epsilon': [0.01, 0.05, 0.1]}

y_pred = classifier_analyzer(sgd_clf, sgd_params)

Mean fit time : 0.108s
Mean test time : 0.014s
Mean train score : 0.992
Mean CV score : 0.992
Best Train Score : 0.984
Best Test Score  : 0.974
Best params :  {'alpha': 0.0001, 'epsilon': 0.01, 'loss': 'log', 'max_iter': 100, 'penalty': 'elasticnet'}


In [38]:
print("Classification Report for the best params : ")
print(classification_report(y_test, y_pred, target_names=subclas_encoder.classes_))

Classification Report for the best params : 
                     precision    recall  f1-score   support

       crunches-end       1.00      1.00      1.00        75
     crunches-start       1.00      1.00      1.00        75
  jumping_jacks-end       1.00      1.00      1.00        75
jumping_jacks-start       1.00      1.00      1.00        75
         lunges-end       1.00      1.00      1.00        75
       lunges-start       1.00      1.00      1.00        75
      planks-planks       0.88      1.00      0.93       150
      random-random       1.00      0.91      0.95       300
         squats-end       0.95      1.00      0.97        75
       squats-start       0.99      0.99      0.99        75

           accuracy                           0.97      1050
          macro avg       0.98      0.99      0.98      1050
       weighted avg       0.98      0.97      0.97      1050



In [40]:
run_results.loc[run_results.shape[0]]=['SGD Classifier', 0.108, 0.014, 0.992, 
                                       0.992, 0.984, 0.974, 0.97]

In [44]:
# XGBoost 

xgb_clf = XGBClassifier(random_state=123, n_jobs=-1, use_label_encoder=False)

# First we fix the objective param then, others
xgb_params = [{'objective': ['binary:logistic', 'binary:hinge', 
                            'multi:softprob','multi:softmax']
              },{
                  'objective' : ['multi:softmax'],
                  'num_class' : [len(set(y_train))]
              }]

y_pred = classifier_analyzer(xgb_clf, xgb_params)

Mean fit time : 8.288s
Mean test time : 0.091s
Mean train score : 1.000
Mean CV score : 1.000
Best Train Score : 1.000
Best Test Score  : 0.999
Best params :  {'objective': 'binary:logistic'}


In [45]:
xgb_clf = XGBClassifier(objective='binary:logistic', random_state=123, n_jobs=-1)

# First we fix the objective param then, others
xgb_params = {
              'max_depth' : [3, 5, 7],
              'n_estimators':[5,10,20,35,60],
              'learning_rate' : [0.1, 0.2, 0.3, 0.5, 0.7]
            }
y_pred = classifier_analyzer(xgb_clf, xgb_params)



Mean fit time : 4.729s
Mean test time : 0.125s
Mean train score : 1.000
Mean CV score : 1.000
Best Train Score : 1.000
Best Test Score  : 0.999
Best params :  {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 60}


In [51]:
print("Classification Report for the best params : ")
print(classification_report(y_test, y_pred, target_names=subclas_encoder.classes_))

Classification Report for the best params : 
                     precision    recall  f1-score   support

       crunches-end       0.99      1.00      0.99        75
     crunches-start       1.00      0.99      0.99        75
  jumping_jacks-end       1.00      1.00      1.00        75
jumping_jacks-start       1.00      1.00      1.00        75
         lunges-end       1.00      1.00      1.00        75
       lunges-start       1.00      1.00      1.00        75
      planks-planks       1.00      1.00      1.00       150
      random-random       1.00      1.00      1.00       300
         squats-end       1.00      1.00      1.00        75
       squats-start       1.00      1.00      1.00        75

           accuracy                           1.00      1050
          macro avg       1.00      1.00      1.00      1050
       weighted avg       1.00      1.00      1.00      1050



In [52]:
run_results.loc[run_results.shape[0]]=['XGBoost', 4.729, 0.125, 1.0, 1.0, 
                                            1.0, 0.99, 1.0]

In [18]:
xgb_clf = XGBClassifier(n_estimators=60, max_depth=3, objective='binary:logistic', 
                        learning_rate=0.5, class_weight='balanced', 
                        random_state=123, n_jobs=-1, use_label_encoder=False)

xgb_clf.fit(x_train, y_train)

Parameters: { "class_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', class_weight='balanced',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              enable_categorical=False, gamma=0, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=60, n_jobs=-1, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=123,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)

In [53]:
mnb_clf = MultinomialNB()
mnb_params = {'alpha': [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]}
y_pred = classifier_analyzer(mnb_clf, mnb_params)

Mean fit time : 0.020s
Mean test time : 0.016s
Mean train score : 0.977
Mean CV score : 0.978
Best Train Score : 0.719
Best Test Score  : 0.730
Best params :  {'alpha': 1.0}


In [54]:
print("Classification Report for the best params : ")
print(classification_report(y_test, y_pred, target_names=subclas_encoder.classes_))

Classification Report for the best params : 
                     precision    recall  f1-score   support

       crunches-end       0.54      0.49      0.51        75
     crunches-start       0.53      0.57      0.55        75
  jumping_jacks-end       1.00      1.00      1.00        75
jumping_jacks-start       0.84      1.00      0.91        75
         lunges-end       0.80      1.00      0.89        75
       lunges-start       0.88      0.92      0.90        75
      planks-planks       0.67      0.73      0.70       150
      random-random       0.65      0.66      0.66       300
         squats-end       1.00      0.28      0.44        75
       squats-start       0.84      0.83      0.83        75

           accuracy                           0.73      1050
          macro avg       0.78      0.75      0.74      1050
       weighted avg       0.74      0.73      0.72      1050



In [55]:
run_results.loc[run_results.shape[0]]=['MultiNomial NB', 0.020, 0.016, 0.977, 
                                                0.978, 0.719, 0.730, 0.73]

In [56]:
# Results on class 
run_results.sort_values(by=['F1 Score', 'Test Score'], ascending=False)

Unnamed: 0,Classifier,Mean Fit Time(s),Mean Test Time(s),Mean Train Score,Mean CV Score,Best Train Score,Test Score,F1 Score
1,kNN,0.007,0.121,1.0,1.0,1.0,1.0,1.0
2,SVM,2.284,0.112,0.814,0.814,1.0,1.0,1.0
4,XGBoost,4.729,0.125,1.0,1.0,1.0,0.99,1.0
0,Logistic Reg,0.726,0.013,1.0,1.0,0.998,0.991,0.99
3,SGD Classifier,0.108,0.014,0.992,0.992,0.984,0.974,0.97
5,MultiNomial NB,0.02,0.016,0.977,0.978,0.719,0.73,0.73


In [22]:
from joblib import dump, load

In [21]:
dump(xgb_clf , '../resources/models/cascade/xgb_clf.model')

['../resources/models/cascade/xgb_clf.model']

In [24]:
dump(knn_clf, '../resources/models/cascade/knn_clf.model')

['../resources/models/cascade/knn_clf.model']

In [25]:
dump(subclas_encoder, '../resources/models/cascade/subclas_encoder.model')

['../resources/models/cascade/subclas_encoder.model']

In [26]:
dump(clas_encoder, '../resources/models/cascade/clas_encoder.model')

['../resources/models/cascade/clas_encoder.model']