In [538]:
import os
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import bootstrap

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_selection import RFECV
from sklearn.metrics import confusion_matrix, roc_curve, auc
import statsmodels.api as sm
from sklearn.utils import resample
import re

import warnings
warnings.filterwarnings("ignore")

from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [539]:
# Data-cleaning notes:
# - the extra comma in cycling/dataset9.csv and cycling/dataset14.csv was
#   removed manually, as suggested on Piazza;
# - bending2/dataset4.csv is space-separated (different delimiter / sep).
# https://www.geeksforgeeks.org/how-to-use-glob-function-to-find-files-recursively-in-python/
path = '../../data/AReM/'
actv_dir = ['bending1', 'bending2']
actv_dir2 = ['bending1','bending2','cycling', 'lying', 'sitting', 'standing', 'walking']

# Labels for the seven raw CSV columns. `columns` is kept as an independent
# copy of `column_names` for any cell that still refers to the older name.
column_names = ['# Columns: time', 'avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13', 'avg_rss23', 'var_rss23']
columns = list(column_names)

# One summary-statistic name per entry; `features` holds these for series 1..6
# (e.g. 'min1', ..., 'third_quartile6') followed by the 'activity' label column.
# (An earlier assignment to `features` was a dead store and has been removed.)
feature_names = ['min','max','mean','median','sd','first_quartile','third_quartile']
features = [name+str(i) for i in range(1,7) for name in feature_names]
features.append('activity')

In [540]:


def train_test_data(data_dir=None):
    """Split the CSV files under a data directory into train and test lists.

    The directory tree is walked bottom-up and every ``*.csv`` file is
    classified by the name of the directory that directly contains it
    (the activity):

    * activity name contains ``'bending'``: files whose name contains
      ``dataset1.csv`` or ``dataset2.csv`` are test files;
    * any other activity: files whose name contains ``dataset1.csv``,
      ``dataset2.csv`` or ``dataset3.csv`` are test files;
    * every remaining CSV file is a training file.

    Parameters
    ----------
    data_dir : str, optional
        Root directory to scan. When omitted, the notebook-level ``path``
        is used, so existing ``train_test_data()`` calls keep working.

    Returns
    -------
    tuple of (list, list, list)
        ``(train_set, test_set, all_set)`` lists of file paths, in the order
        ``os.walk`` yields them (not sorted).
    """
    if data_dir is None:
        data_dir = path

    train_set = []
    test_set = []
    all_set = []
    # scanning bottom-to-up
    for root, dirs, files in os.walk(data_dir, topdown=False):
        # Only the containing directory's own name decides the activity, so a
        # parent folder that happens to contain 'bending' cannot leak in.
        activity = os.path.basename(os.path.normpath(root))
        n_test_files = 2 if 'bending' in activity else 3
        test_names = ['dataset{}.csv'.format(i) for i in range(1, n_test_files + 1)]

        for name in files:
            if os.path.splitext(name)[1] != '.csv':
                continue
            file_path = os.path.join(root, name)
            if any(test_name in name for test_name in test_names):
                test_set.append(file_path)
            else:
                train_set.append(file_path)
            all_set.append(file_path)
    return train_set, test_set, all_set
# Build the train / test / combined file lists once; later cells read these
# notebook-level names.
train_set, test_set, all_set = train_test_data()

In [541]:
# Display the training file paths to sanity-check the split.
train_set

['../../data/AReM/bending1/dataset7.csv',
 '../../data/AReM/bending1/dataset6.csv',
 '../../data/AReM/bending1/dataset4.csv',
 '../../data/AReM/bending1/dataset5.csv',
 '../../data/AReM/bending1/dataset3.csv',
 '../../data/AReM/walking/dataset7.csv',
 '../../data/AReM/walking/dataset6.csv',
 '../../data/AReM/walking/dataset4.csv',
 '../../data/AReM/walking/dataset5.csv',
 '../../data/AReM/walking/dataset10.csv',
 '../../data/AReM/walking/dataset11.csv',
 '../../data/AReM/walking/dataset13.csv',
 '../../data/AReM/walking/dataset12.csv',
 '../../data/AReM/walking/dataset15.csv',
 '../../data/AReM/walking/dataset14.csv',
 '../../data/AReM/walking/dataset8.csv',
 '../../data/AReM/walking/dataset9.csv',
 '../../data/AReM/bending2/dataset6.csv',
 '../../data/AReM/bending2/dataset4.csv',
 '../../data/AReM/bending2/dataset5.csv',
 '../../data/AReM/bending2/dataset3.csv',
 '../../data/AReM/standing/dataset7.csv',
 '../../data/AReM/standing/dataset6.csv',
 '../../data/AReM/standing/dataset4.csv'

In [542]:
# Summary-statistic names computed for every time series.
feature_names = ['min', 'max', 'mean', 'median', 'sd', 'first_quartile', 'third_quartile']

# Column names for series 1..6 (statistic name + series number), with the
# 'activity' label as the final column.
features = [
    '{}{}'.format(stat_name, series_id)
    for series_id in range(1, 7)
    for stat_name in feature_names
] + ['activity']

# Labels of the raw CSV columns.
column_names = [
    '# Columns: time',
    'avg_rss12', 'var_rss12',
    'avg_rss13', 'var_rss13',
    'avg_rss23', 'var_rss23',
]

In [543]:
def load_data(data, features, folds=1):
    """Summarise each time-series CSV file as one row of statistics.

    For every file the delimiter is sniffed from the file's 7th line, the
    file is read while skipping its first 5 lines, and column 0 is dropped.
    Each remaining column is cut into ``folds`` consecutive row segments
    (the last segment absorbs any remainder). For each column, and for each
    of its segments in order, seven values are emitted: min, max, mean,
    median, standard deviation, 25th percentile and 75th percentile. The name
    of the directory containing the file is appended as the activity label.

    Parameters
    ----------
    data : iterable of str
        Paths of the CSV files to summarise.
    features : list of str
        Column names of the result; its length must equal the number of
        values produced per file (7 * columns * folds, plus the label).
    folds : int, default 1
        Number of consecutive segments each column is split into.

    Returns
    -------
    pandas.DataFrame
        One row per file, with ``features`` as columns. Empty (but with the
        requested columns) when ``data`` is empty.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1, or if the number of values computed
        for a file does not match ``len(features)``.
    """
    if folds < 1:
        raise ValueError("folds must be a positive integer, got {!r}".format(folds))

    sniffer = csv.Sniffer()
    records = []

    for file in data:
        # Activity label = name of the directory that holds the file.
        file_activity = os.path.basename(os.path.dirname(file))

        # Sniff the delimiter from one data line; the context manager makes
        # sure the handle is closed again.
        with open(file) as handle:
            sample_line = handle.readlines()[6]
        delimiter = sniffer.sniff(sample_line).delimiter

        # Read data using delimiter
        if delimiter == ',':
            file_data = pd.read_csv(file, sep=delimiter, skiprows=5, header=None,
                                    on_bad_lines='skip')
        else:
            file_data = pd.read_csv(file, sep=delimiter, skiprows=5, header=None,
                                    index_col=False, on_bad_lines='skip')
            # Files with a non-comma delimiter are expected to produce an
            # extra column 7, which is discarded.
            file_data = file_data.drop([7], axis=1)

        file_data = file_data.drop([0], axis=1)

        # Divide into folds
        n_rows = len(file_data)
        fold_size = n_rows // folds

        row = []
        for col in file_data.columns:
            for fold in range(folds):
                start = fold * fold_size
                # The last segment runs to the end so no rows are lost.
                stop = n_rows if fold == folds - 1 else start + fold_size
                segment = file_data[col].iloc[start:stop]
                row.extend([np.min(segment), np.max(segment), np.mean(segment),
                            np.median(segment), np.std(segment),
                            np.percentile(segment, 25), np.percentile(segment, 75)])

        row.append(file_activity)
        if len(row) != len(features):
            raise ValueError(
                "{}: computed {} values but {} feature names were given".format(
                    file, len(row), len(features)))
        records.append(row)

    # Build the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    return pd.DataFrame(records, columns=features)

In [544]:
# Summarise the train, test and combined file lists (one segment per series).
train_new_data, test_new_data, all_new_data = [
    load_data(file_list, features)
    for file_list in (train_set, test_set, all_set)
]

In [545]:
# Display the per-file statistics computed from every file in `all_set`.
all_new_data

Unnamed: 0,min1,max1,mean1,median1,sd1,first_quartile1,third_quartile1,min2,max2,mean2,...,first_quartile5,third_quartile5,min6,max6,mean6,median6,sd6,first_quartile6,third_quartile6,activity
0,36.25,48.00,43.969125,44.50,1.616677,43.3100,44.67,0.0,1.50,0.413125,...,20.5000,23.7500,0.0,2.96,0.555312,0.490,0.487318,0.0000,0.8300,bending1
1,37.00,48.00,43.454958,43.25,1.384653,42.5000,45.00,0.0,1.58,0.378083,...,22.2500,24.0000,0.0,5.26,0.679646,0.500,0.621885,0.4300,0.8700,bending1
2,33.00,47.75,42.179812,43.50,3.666840,39.1500,45.00,0.0,3.00,0.696042,...,30.4575,36.3300,0.0,2.18,0.613521,0.500,0.523771,0.0000,1.0000,bending1
3,33.00,45.75,41.678063,41.75,2.241152,41.3300,42.75,0.0,2.83,0.535979,...,28.4575,31.2500,0.0,1.79,0.383292,0.430,0.388759,0.0000,0.5000,bending1
4,37.25,45.00,40.624792,40.50,1.475428,39.2500,42.00,0.0,1.30,0.358604,...,33.0000,36.0000,0.0,1.92,0.570583,0.430,0.582308,0.0000,1.3000,bending1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,18.50,44.25,35.752354,36.00,4.609992,33.0000,39.33,0.0,12.60,3.328104,...,14.0000,18.0625,0.0,9.39,3.069667,2.770,1.746503,1.7975,4.0600,cycling
84,24.25,45.00,37.177042,36.25,3.577569,34.5000,40.25,0.0,8.58,2.374208,...,17.9500,21.7500,0.0,9.34,2.921729,2.500,1.850669,1.5000,3.9000,cycling
85,23.33,43.50,36.244083,36.75,3.818032,33.4575,39.25,0.0,9.71,2.736021,...,15.7500,21.0000,0.0,11.15,3.530500,3.110,1.961639,2.1700,4.6175,cycling
86,26.25,44.25,36.957458,36.29,3.431283,34.5000,40.25,0.0,8.64,2.420083,...,14.0000,18.2500,0.0,8.34,2.934625,2.525,1.629680,1.6600,4.0300,cycling


In [546]:
# Standard deviation of each numeric feature, taken from the describe() summary.
all_new_data.describe().loc['std']

min1               9.569975
max1               4.394362
mean1              5.335718
median1            5.440054
sd1                1.770306
first_quartile1    6.153590
third_quartile1    5.138925
min2               0.000000
max2               5.062729
mean2              1.574164
median2            1.412244
sd2                0.883184
first_quartile2    0.946386
third_quartile2    2.125266
min3               2.956462
max3               4.875137
mean3              4.008380
median3            4.036396
sd3                0.945724
first_quartile3    4.220658
third_quartile3    4.171628
min4               0.000000
max4               2.183625
mean4              1.166114
median4            1.145586
sd4                0.457764
first_quartile4    0.843620
third_quartile4    1.552504
min5               6.124001
max5               5.741238
mean5              5.675593
median5            5.813782
sd5                1.023830
first_quartile5    6.096465
third_quartile5    5.531720
min6               0

In [547]:
def bootstrap_features(data, random_state=None):
    """Bootstrap a 90% confidence interval for the standard deviation of each column.

    Every column of ``data`` is passed on its own to ``scipy.stats.bootstrap``
    with ``np.std`` as the statistic, ``confidence_level=0.9`` and the
    percentile method.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric columns to analyse.
    random_state : optional
        Forwarded to ``scipy.stats.bootstrap`` when given, to make the
        intervals reproducible. When ``None`` (the default) the argument is
        not passed at all and SciPy's default generator is used.

    Returns
    -------
    pandas.DataFrame
        One row per input column with the columns ``feature``, ``low`` and
        ``high``.
    """
    extra_kwargs = {} if random_state is None else {'random_state': random_state}

    records = []
    for feature in data.columns:
        # The (n, 1)-shaped sample yields length-1 bounds, hence the [0].
        res = bootstrap((data[[feature]].values,), np.std, confidence_level=0.9,
                        method='percentile', **extra_kwargs)
        interval = res.confidence_interval
        records.append({'feature': feature, 'low': interval.low[0], 'high': interval.high[0]})

    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=['feature', 'low', 'high'])

In [548]:
# Bootstrap only the numeric feature columns. The label columns are dropped by
# name rather than by position, so the cell still works when it is re-run
# after a 'bending' column has been added to `all_new_data`.
bootstrap_data = bootstrap_features(
    all_new_data.drop(columns=['activity', 'bending'], errors='ignore')
)
bootstrap_data

Unnamed: 0,feature,low,high
0,min1,8.225308,10.725088
1,max1,3.292972,5.233575
2,mean1,4.689868,5.841104
3,median1,4.761833,5.958941
4,sd1,1.557874,1.936576
5,first_quartile1,5.534596,6.612814
6,third_quartile1,4.312042,5.827362
7,min2,0.0,0.0
8,max2,4.589207,5.377247
9,mean2,1.386902,1.695859


In [549]:
# Binary target for question 2(a): bending = 1 when the activity name contains
# 'bending' ('bending1' / 'bending2'), otherwise 0.
# assign() returns a new frame, so `all_new_data` keeps its original columns
# (the plain `activity_all_data = all_new_data` alias used to modify it too).
activity_all_data = (
    all_new_data
    .assign(bending=[int('bending' in label)
                     for label in all_new_data['activity'].to_list()])
    .drop(columns=['activity'])
)
activity_all_data

Unnamed: 0,min1,max1,mean1,median1,sd1,first_quartile1,third_quartile1,min2,max2,mean2,...,first_quartile5,third_quartile5,min6,max6,mean6,median6,sd6,first_quartile6,third_quartile6,bending
0,36.25,48.00,43.969125,44.50,1.616677,43.3100,44.67,0.0,1.50,0.413125,...,20.5000,23.7500,0.0,2.96,0.555312,0.490,0.487318,0.0000,0.8300,1
1,37.00,48.00,43.454958,43.25,1.384653,42.5000,45.00,0.0,1.58,0.378083,...,22.2500,24.0000,0.0,5.26,0.679646,0.500,0.621885,0.4300,0.8700,1
2,33.00,47.75,42.179812,43.50,3.666840,39.1500,45.00,0.0,3.00,0.696042,...,30.4575,36.3300,0.0,2.18,0.613521,0.500,0.523771,0.0000,1.0000,1
3,33.00,45.75,41.678063,41.75,2.241152,41.3300,42.75,0.0,2.83,0.535979,...,28.4575,31.2500,0.0,1.79,0.383292,0.430,0.388759,0.0000,0.5000,1
4,37.25,45.00,40.624792,40.50,1.475428,39.2500,42.00,0.0,1.30,0.358604,...,33.0000,36.0000,0.0,1.92,0.570583,0.430,0.582308,0.0000,1.3000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,18.50,44.25,35.752354,36.00,4.609992,33.0000,39.33,0.0,12.60,3.328104,...,14.0000,18.0625,0.0,9.39,3.069667,2.770,1.746503,1.7975,4.0600,0
84,24.25,45.00,37.177042,36.25,3.577569,34.5000,40.25,0.0,8.58,2.374208,...,17.9500,21.7500,0.0,9.34,2.921729,2.500,1.850669,1.5000,3.9000,0
85,23.33,43.50,36.244083,36.75,3.818032,33.4575,39.25,0.0,9.71,2.736021,...,15.7500,21.0000,0.0,11.15,3.530500,3.110,1.961639,2.1700,4.6175,0
86,26.25,44.25,36.957458,36.29,3.431283,34.5000,40.25,0.0,8.64,2.420083,...,14.0000,18.2500,0.0,8.34,2.934625,2.525,1.629680,1.6600,4.0300,0


In [550]:
# Binary target for question (d): bending = 1 when the activity name contains
# 'bending' ('bending1' / 'bending2'), otherwise 0.
# assign() returns a new frame, so `all_new_data` itself is not modified by
# this cell and the cell can be re-run safely.
activity_all_data = (
    all_new_data
    .assign(bending=[int('bending' in label)
                     for label in all_new_data['activity'].to_list()])
    .drop(columns=['activity'])
)
activity_all_data

Unnamed: 0,min1,max1,mean1,median1,sd1,first_quartile1,third_quartile1,min2,max2,mean2,...,first_quartile5,third_quartile5,min6,max6,mean6,median6,sd6,first_quartile6,third_quartile6,bending
0,36.25,48.00,43.969125,44.50,1.616677,43.3100,44.67,0.0,1.50,0.413125,...,20.5000,23.7500,0.0,2.96,0.555312,0.490,0.487318,0.0000,0.8300,1
1,37.00,48.00,43.454958,43.25,1.384653,42.5000,45.00,0.0,1.58,0.378083,...,22.2500,24.0000,0.0,5.26,0.679646,0.500,0.621885,0.4300,0.8700,1
2,33.00,47.75,42.179812,43.50,3.666840,39.1500,45.00,0.0,3.00,0.696042,...,30.4575,36.3300,0.0,2.18,0.613521,0.500,0.523771,0.0000,1.0000,1
3,33.00,45.75,41.678063,41.75,2.241152,41.3300,42.75,0.0,2.83,0.535979,...,28.4575,31.2500,0.0,1.79,0.383292,0.430,0.388759,0.0000,0.5000,1
4,37.25,45.00,40.624792,40.50,1.475428,39.2500,42.00,0.0,1.30,0.358604,...,33.0000,36.0000,0.0,1.92,0.570583,0.430,0.582308,0.0000,1.3000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,18.50,44.25,35.752354,36.00,4.609992,33.0000,39.33,0.0,12.60,3.328104,...,14.0000,18.0625,0.0,9.39,3.069667,2.770,1.746503,1.7975,4.0600,0
84,24.25,45.00,37.177042,36.25,3.577569,34.5000,40.25,0.0,8.58,2.374208,...,17.9500,21.7500,0.0,9.34,2.921729,2.500,1.850669,1.5000,3.9000,0
85,23.33,43.50,36.244083,36.75,3.818032,33.4575,39.25,0.0,9.71,2.736021,...,15.7500,21.0000,0.0,11.15,3.530500,3.110,1.961639,2.1700,4.6175,0
86,26.25,44.25,36.957458,36.29,3.431283,34.5000,40.25,0.0,8.64,2.420083,...,14.0000,18.2500,0.0,8.34,2.934625,2.525,1.629680,1.6600,4.0300,0


In [551]:
# Columns kept for plotting: every entry of `features` except the per-series
# minimums and the 'activity' label.
features_graph = [
    column
    for column in features
    if column not in {'min1','min2', 'min3', 'min4', 'min5', 'min6', 'activity'}
]

In [552]:
# Features chosen in 1(c)iv, restricted to time series 1, 2 and 6.
binary_feature_names = ['max', 'mean', 'third_quartile']
binary_features = [
    '{}{}'.format(stat_name, series_id)
    for series_id in (1, 2, 6)
    for stat_name in binary_feature_names
]

#print(binary_feature_names)
#print(binary_features)

In [553]:
# Training frame with the binary target: bending = 1 when the activity name
# contains 'bending', otherwise 0; the text label is dropped afterwards.
# assign() returns a new frame, so `train_new_data` is left unmodified.
activity_train_set = (
    train_new_data
    .assign(bending=[int('bending' in label)
                     for label in train_new_data['activity'].to_list()])
    .drop(columns=['activity'])
)

In [554]:
def plots(data, features, nrows, ncols, row_size, col_size):
    """Draw a grid of pairwise scatter plots coloured by the ``bending`` label.

    All ordered pairs ``(x, y)`` of the given feature columns are formed
    (including each feature against itself) and drawn one per subplot. The
    grid is filled column by column: down the rows of the first grid column,
    then the second, and so on. If the grid has more cells than pairs, the
    unused axes are hidden; if it has fewer, only the first
    ``nrows * ncols`` pairs are drawn.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column in ``features`` plus ``'bending'``.
    features : list of str
        Feature columns to plot. The list is not modified; ``'bending'`` is
        added to an internal copy when it is not already the last entry.
    nrows, ncols : int
        Shape of the subplot grid.
    row_size, col_size : float
        Passed to matplotlib as ``figsize=(row_size, col_size)``, i.e. figure
        width and figure height respectively.
    """
    # Work on a copy so the caller's feature list is never mutated.
    plot_columns = list(features)
    if not plot_columns or plot_columns[-1] != 'bending':
        plot_columns.append('bending')

    data = data[plot_columns]

    predictors = plot_columns[:-1]
    pairs = [(x_name, y_name) for x_name in predictors for y_name in predictors]

    plt.close()

    # squeeze=False keeps `axs` two-dimensional even for a single row/column.
    fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(row_size, col_size),
                            squeeze=False)

    remaining = iter(pairs)
    for i in range(ncols):
        for j in range(nrows):
            pair = next(remaining, None)
            if pair is None:
                # More grid cells than feature pairs: leave the cell blank.
                axs[j][i].set_visible(False)
                continue
            sns.scatterplot(data=data, x=pair[0], y=pair[1], hue="bending", ax=axs[j][i])

    plt.show()

In [555]:
# plots(activity_train_set, binary_features, 9, 9, 30, 40)

In [556]:
# Column names for the two-segment split: 6 series x 2 segments give 12 groups
# of the seven statistics, followed by the 'activity' label.
feature_names_fold2 = ['min','max','mean','median','sd','first_quartile','third_quartile']
# Use this cell's own name list (it previously read the global `feature_names`).
features_fold2 = [name+str(i) for i in range(1,13) for name in feature_names_fold2]
features_fold2.append('activity')

In [557]:
# Recompute the statistics for every file with each series split into two
# consecutive segments (folds=2), then display the result.
all_new_data_fold2 = load_data(all_set, features_fold2, 2)
all_new_data_fold2

Unnamed: 0,min1,max1,mean1,median1,sd1,first_quartile1,third_quartile1,min2,max2,mean2,...,first_quartile11,third_quartile11,min12,max12,mean12,median12,sd12,first_quartile12,third_quartile12,activity
0,36.67,45.00,43.486208,43.71,1.279302,42.3300,44.50,36.25,48.00,44.452042,...,0.0000,0.8300,0.0,2.96,0.585750,0.50,0.486900,0.4225,0.8300,bending1
1,39.00,48.00,44.117042,45.00,1.357000,43.5000,45.00,37.00,46.50,42.792875,...,0.4300,1.1200,0.0,4.06,0.543875,0.50,0.472458,0.4300,0.7100,bending1
2,33.75,47.75,43.278875,45.00,3.466111,42.0000,45.25,33.00,46.00,41.080750,...,0.0000,1.2200,0.0,2.18,0.586083,0.47,0.490847,0.0000,0.8700,bending1
3,33.00,45.75,41.621208,42.33,3.112140,39.6525,44.25,39.25,43.67,41.734917,...,0.0000,0.7100,0.0,1.50,0.347500,0.43,0.362386,0.0000,0.5000,bending1
4,38.00,42.33,40.946958,41.25,1.100662,40.3100,42.00,37.25,45.00,40.302625,...,0.0000,1.3000,0.0,1.92,0.552167,0.43,0.509430,0.0000,0.9400,bending1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,20.00,44.25,35.692167,35.50,4.633910,33.0000,39.25,18.50,44.00,35.812542,...,1.7900,4.3500,0.0,8.81,2.966917,2.59,1.708491,1.8525,3.7400,cycling
84,24.25,45.00,36.954208,36.00,3.764968,34.2500,40.25,29.00,44.67,37.399875,...,1.7525,3.9375,0.0,9.34,2.838083,2.45,1.886811,1.3825,3.8325,cycling
85,27.00,42.67,36.342750,36.75,3.640886,33.6700,39.00,23.33,43.50,36.145417,...,2.2525,4.7025,0.0,11.15,3.513250,3.08,1.974270,2.1700,4.5000,cycling
86,26.25,43.40,36.865417,36.25,3.455571,34.3100,40.00,29.25,44.25,37.049500,...,1.6550,4.0375,0.0,8.34,2.904458,2.57,1.606436,1.6900,3.7700,cycling


In [558]:
# Selected statistics for feature groups 1, 2, 3, 4, 11 and 12 of the
# two-segment split.
binary_features_fold2 = [
    '{}{}'.format(stat_name, group_id)
    for group_id in (1, 2, 3, 4, 11, 12)
    for stat_name in binary_feature_names
]

In [559]:
# Two-segment frame with the binary target: bending = 1 when the activity
# name contains 'bending', otherwise 0; the text label is dropped afterwards.
# assign() returns a new frame, so `all_new_data_fold2` is left unmodified.
# NOTE(review): despite its name this is built from `all_new_data_fold2`
# (every file), not from a training-only subset - confirm that is intended.
activity_train_set_fold2 = (
    all_new_data_fold2
    .assign(bending=[int('bending' in label)
                     for label in all_new_data_fold2['activity'].to_list()])
    .drop(columns=['activity'])
)
activity_train_set_fold2

Unnamed: 0,min1,max1,mean1,median1,sd1,first_quartile1,third_quartile1,min2,max2,mean2,...,first_quartile11,third_quartile11,min12,max12,mean12,median12,sd12,first_quartile12,third_quartile12,bending
0,36.67,45.00,43.486208,43.71,1.279302,42.3300,44.50,36.25,48.00,44.452042,...,0.0000,0.8300,0.0,2.96,0.585750,0.50,0.486900,0.4225,0.8300,1
1,39.00,48.00,44.117042,45.00,1.357000,43.5000,45.00,37.00,46.50,42.792875,...,0.4300,1.1200,0.0,4.06,0.543875,0.50,0.472458,0.4300,0.7100,1
2,33.75,47.75,43.278875,45.00,3.466111,42.0000,45.25,33.00,46.00,41.080750,...,0.0000,1.2200,0.0,2.18,0.586083,0.47,0.490847,0.0000,0.8700,1
3,33.00,45.75,41.621208,42.33,3.112140,39.6525,44.25,39.25,43.67,41.734917,...,0.0000,0.7100,0.0,1.50,0.347500,0.43,0.362386,0.0000,0.5000,1
4,38.00,42.33,40.946958,41.25,1.100662,40.3100,42.00,37.25,45.00,40.302625,...,0.0000,1.3000,0.0,1.92,0.552167,0.43,0.509430,0.0000,0.9400,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,20.00,44.25,35.692167,35.50,4.633910,33.0000,39.25,18.50,44.00,35.812542,...,1.7900,4.3500,0.0,8.81,2.966917,2.59,1.708491,1.8525,3.7400,0
84,24.25,45.00,36.954208,36.00,3.764968,34.2500,40.25,29.00,44.67,37.399875,...,1.7525,3.9375,0.0,9.34,2.838083,2.45,1.886811,1.3825,3.8325,0
85,27.00,42.67,36.342750,36.75,3.640886,33.6700,39.00,23.33,43.50,36.145417,...,2.2525,4.7025,0.0,11.15,3.513250,3.08,1.974270,2.1700,4.5000,0
86,26.25,43.40,36.865417,36.25,3.455571,34.3100,40.00,29.25,44.25,37.049500,...,1.6550,4.0375,0.0,8.34,2.904458,2.57,1.606436,1.6900,3.7700,0


In [560]:
# plots(activity_train_set_fold2, binary_features_fold2, 27, 9, 30, 160)

In [561]:
def downsampling(data, majority_ratio=3, random_state=7):
    """Rebalance a binary ``bending`` frame by resampling its majority class.

    The class (``bending == 1`` or ``bending == 0``) with more rows is treated
    as the majority; on a tie, class 0 is. The majority rows are redrawn
    *with replacement* to ``majority_ratio`` times the minority count, and
    the minority rows are kept as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``bending`` column.
    majority_ratio : int, default 3
        Number of majority rows drawn per minority row.
    random_state : int, default 7
        Seed passed to ``sklearn.utils.resample``.

    Returns
    -------
    pandas.DataFrame
        Minority rows followed by the resampled majority rows, re-indexed
        from 0.
    """
    n_bending = int((data['bending'] == 1).sum())
    n_other = int((data['bending'] == 0).sum())
    min_count = min(n_bending, n_other)

    # Finding the major sample class
    max_bending = 1 if n_bending > n_other else 0

    # Dividing into minor and major samples
    major_sample = data.loc[data['bending'] == max_bending]
    minor_sample = data.loc[data['bending'] != max_bending]

    # Resampling the major_sample
    major_sample = resample(major_sample, n_samples=min_count * majority_ratio,
                            replace=True, random_state=random_state)

    return pd.concat([minor_sample, major_sample], ignore_index=True)

In [562]:
def logisticRegressionCV(train_files, k, L, overSample=False, penalty='none'):
    """Run cross-validated recursive feature elimination for a given split count.

    The files are summarised with each series cut into ``L`` segments, a
    binary ``bending`` target is derived from the activity name, and (when
    ``overSample`` is true) the frame is rebalanced with ``downsampling``.
    Only the max / mean / third-quartile columns are offered to an ``RFECV``
    selector wrapping a logistic regression, scored with a shuffled
    stratified ``k``-fold split (seed 7).

    Returns
    -------
    tuple
        ``(feature_selector, lr_model)``: the fitted ``RFECV`` object and the
        ``LogisticRegression`` instance that was handed to it.
    """
    group_ids = range(1, 6 * L + 1)

    # Columns produced by load_data: seven statistics per group plus the label.
    every_stat = ['min','max','mean','median','sd','first_quartile','third_quartile']
    load_columns = [stat + str(group) for group in group_ids for stat in every_stat]
    load_columns.append('activity')

    train_frame = load_data(train_files, load_columns, L)
    activity_labels = train_frame['activity'].to_list()
    train_frame['bending'] = [0 if label.find('bending') == -1 else 1
                              for label in activity_labels]
    train_frame = train_frame.drop(columns=['activity'])

    # Optional class rebalancing
    if overSample:
        train_frame = downsampling(train_frame)

    # Candidate predictors: three statistics per group.
    kept_stats = ['max','mean','third_quartile']
    predictor_columns = [stat + str(group) for group in group_ids for stat in kept_stats]

    train_frame = train_frame[predictor_columns + ['bending']]
    train_X = train_frame.iloc[:, :-1]
    train_Y = train_frame.iloc[:, -1]

    lr_model = LogisticRegression(penalty=penalty, solver='lbfgs')
    feature_selector = RFECV(
        estimator=lr_model,
        cv=StratifiedKFold(n_splits=k, shuffle=True, random_state=7),
    )
    feature_selector.fit(train_X, train_Y)

    return feature_selector, lr_model

In [563]:
def get_selected_features(lcv_result, overSample=False):
    """Pick the best split count and the features RFECV selects for it.

    Parameters
    ----------
    lcv_result : pandas.DataFrame
        Needs the columns ``L`` and ``score``; the first row with the highest
        score determines the best ``L``.
    overSample : bool, default False
        Forwarded to ``logisticRegressionCV``. When true, the intercept of the
        selector's fitted estimator is additionally prior-corrected.

    Returns
    -------
    tuple
        ``(selected_features, best_L, lr_model)``: names of the selected
        columns, the best split count, and the ``LogisticRegression``
        instance returned by ``logisticRegressionCV``.
    """
    # Best L
    best_rows = lcv_result.loc[lcv_result['score'] == lcv_result['score'].max()]
    best_L = int(best_rows['L'].iloc[0])

    # Feature names for best L
    feature_names_best_lcv = ['max','mean','third_quartile']
    features_best_lcv = [name+str(i) for i in range(1,6*best_L+1) for name in feature_names_best_lcv]

    # Selected features for best L
    feature_selector, lr_model = logisticRegressionCV(train_set, 5, best_L, overSample=overSample)
    selected_features = [name for name, keep in zip(features_best_lcv, feature_selector.support_)
                         if keep]

    if overSample:
        # Local statistic names (a typo used to leave this unused and made the
        # next line fall back to the notebook-level `feature_names`).
        feature_names = ['min','max','mean','median','sd','first_quartile','third_quartile']
        features_load = [name+str(i) for i in range(1,6*best_L+1) for name in feature_names]
        features_load.append('activity')
        data_lcv = load_data(train_set, features_load, best_L)
        is_bending = [int('bending' in label) for label in data_lcv['activity'].to_list()]

        # pi: share of class 1 in the original training data.
        pi = sum(is_bending) / len(is_bending)
        # pi1: share of class 1 after rebalancing - assumes class 1 is the
        # minority and the majority was resampled 3:1; TODO confirm.
        pi1 = 1 / 4
        # Prior correction of the intercept: add the true log-odds and remove
        # the log-odds of the rebalanced sample.
        feature_selector.estimator_.intercept_ += np.log(pi/(1-pi)) - np.log(pi1/(1-pi1))
        # NOTE(review): the corrected estimator belongs to the local
        # `feature_selector`, which is not returned, so callers never see
        # this adjustment - confirm whether it should be returned instead.

    return selected_features, best_L, lr_model

In [564]:
def get_data_splits(data, best_L, overSample=False, feature_subset=None):
    """Build the predictor frame and binary target for a list of files.

    Parameters
    ----------
    data : iterable of str
        Paths of the CSV files to load.
    best_L : int
        Number of segments each series is split into.
    overSample : bool, default False
        When true, the frame is rebalanced with ``downsampling``.
    feature_subset : list of str, optional
        Predictor columns to keep. When omitted, the notebook-level
        ``selected_features`` is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    tuple
        ``(X, y)``: the selected predictor columns and the ``bending`` target.
    """
    if feature_subset is None:
        # Backward-compatible fallback to the notebook-level variable.
        feature_subset = selected_features

    feature_names = ['min','max','mean','median','sd','first_quartile','third_quartile']
    features_load = [name+str(i) for i in range(1,6*best_L+1) for name in feature_names]
    features_load.append('activity')

    data_lcv = load_data(data, features_load, best_L)
    data_lcv['bending'] = [1 if label.find('bending') != -1 else 0
                           for label in data_lcv['activity'].to_list()]
    data_lcv = data_lcv.drop(['activity'], axis=1)

    if overSample:
        data_lcv = downsampling(data_lcv)

    # 'bending' is the last column: everything before it is a predictor.
    data_lcv_X = data_lcv.iloc[:,:-1]
    data_lcv_X = data_lcv_X[feature_subset]
    data_lcv_Y = data_lcv.iloc[:,-1]

    return data_lcv_X, data_lcv_Y

In [565]:
def c_matrix(lr_model, data_X, data_Y, fit=True):
    """Return the confusion matrix of ``lr_model`` on ``(data_X, data_Y)``.

    Parameters
    ----------
    lr_model : estimator
        Object with ``fit`` and ``predict`` methods.
    data_X, data_Y
        Predictors and true labels to evaluate on.
    fit : bool, default True
        When true (the original behaviour) the model is first refitted on
        this same data, so the matrix describes in-sample performance. Pass
        ``False`` to score an already fitted model, for example on held-out
        test data, without training on it.

    Returns
    -------
    The result of ``confusion_matrix(data_Y, predictions)``.
    """
    if fit:
        lr_model.fit(data_X, data_Y)
    predict_Y = lr_model.predict(data_X)
    return confusion_matrix(data_Y, predict_Y)

In [566]:
def roc_auc(data_X, data_Y, lr_model, data_type):
    """Plot the ROC curve of ``lr_model`` on the given data.

    The positive-class probabilities (second column of ``predict_proba``) are
    turned into a ROC curve; the area under it is shown in the legend and
    ``data_type`` is inserted into the plot title. Nothing is returned.
    """
    # Probability estimates for the positive label (column 1)
    positive_scores = lr_model.predict_proba(data_X)[:, 1]

    false_pos_rate, true_pos_rate, _ = roc_curve(data_Y, positive_scores, pos_label=1)
    area = auc(false_pos_rate, true_pos_rate)
    curve_label = "Area Under Curve(AUC): {}".format(area)

    plt.close()
    plt.figure(figsize=(10,6))

    # Model curve, then the diagonal of a random classifier for reference
    plt.plot(false_pos_rate, true_pos_rate, 'b', label=curve_label)
    plt.plot([0,1], [0,1], 'k--')

    plt.title('ROC Curve for {} data'.format(data_type))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    plt.legend()
    plt.grid()
    plt.show()

In [567]:
# lcv_result = pd.DataFrame(columns=['L', 'p','score'])

# for i in tqdm(range(1,21)):
#     lst=[]
#     features_selected, lr_model = logisticRegressionCV(train_set, 5, i)
#     lst.extend([i, features_selected.n_features_, max(features_selected.cv_results_["mean_test_score"])])
#     lst_data = pd.Series(lst, index = ['L', 'p', 'score'])
#     lcv_result = lcv_result.append(lst_data, ignore_index=True)
#     lcv_result['L'] = lcv_result['L'].astype(int)
#     lcv_result['p'] = lcv_result['p'].astype(int)

# lcv_result