In [1]:
#numeric: pandas and numpy
import numpy as np
import pandas as pd
# graphics
%matplotlib inline 

import matplotlib.pyplot as plt 
import matplotlib.gridspec as gridspec 
import seaborn as sns

from sklearn import linear_model, svm, preprocessing, tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix, classification_report, f1_score
from matplotlib.colors import ListedColormap

In [2]:
# Load the prepared sensor features; the first CSV row holds the column names.
df = pd.read_csv('data/sensoringData_feature_prepared_20_19.0_2.csv', header=0)

# Discard the bookkeeping columns (record id, user and timestamp) in one call.
df.drop(columns=['id', 'user', 'timestamp'], inplace=True)

# Feature names: every remaining column except the trailing two.
feature_list = df.columns[:-2].tolist()
print(len(feature_list))

90


In [3]:
# Show which activity classes occur in the data.
print(df['activity'].unique())

# Encoder instance kept available for integer-encoding the 'activity' column;
# no encoding is applied in this cell, so the labels remain strings.
label_encoder = preprocessing.LabelEncoder()

print(df['activity'].unique())

# Target vector: the last column of the frame.
y = df.to_numpy()[:, -1]

# Feature matrix: every column except the last two.
X = df.to_numpy()[:, :-2]
print(f"Features: {X.shape[1]}")
print(f"Examples: {X.shape[0]}")

['Walking' 'Inactive' 'Active' 'Driving']
['Walking' 'Inactive' 'Active' 'Driving']
Features: 90
Examples: 499276


In [4]:
# Distinct class names in the target and the number of samples of each.
labels = np.unique(y)
quantity = [(y == label).sum() for label in labels]

# Open a large figure; nothing is drawn on it in this cell (the pie chart of
# `quantity` by `labels` is not plotted).
fig = plt.figure(figsize=(20, 10))

<Figure size 1440x720 with 0 Axes>

In [5]:
# Per-activity subsets of the data, used to compare feature distributions by class.
dfW = df[df['activity'] == "Walking"]
dfA = df[df['activity'] == "Active"]
dfI = df[df['activity'] == "Inactive"]
dfD = df[df['activity'] == "Driving"]


def plot_feature_histograms(features, nrows=4, ncols=3, n_bins=40):
    """Draw stacked, density-normalised histograms of features, split by activity.

    One subplot is drawn per feature, filling an ``nrows`` x ``ncols`` grid in
    row-major order; features beyond the grid capacity are ignored. Plotting is
    opt-in: nothing is drawn until this function is called, e.g.
    ``plot_feature_histograms(feature_list[:12])``.

    Parameters
    ----------
    features : iterable of str
        Column names of ``df`` to plot.
    nrows, ncols : int
        Shape of the subplot grid.
    n_bins : int
        Number of equal-width bins spanning each feature's full value range.

    Notes
    -----
    Reads the module-level frames ``df``, ``dfW``, ``dfI``, ``dfA`` and ``dfD``
    and changes the global matplotlib font size to 8.
    """
    plt.rcParams.update({'font.size': 8})
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(7, 10), squeeze=False)
    per_class = [dfW, dfI, dfA, dfD]
    for feature, ax in zip(features, axes.ravel()):
        lowest, highest = df[feature].min(), df[feature].max()
        # A constant column has zero range; let matplotlib pick the edges then
        # instead of building degenerate bins.
        bins = np.linspace(lowest, highest, n_bins + 1) if highest > lowest else n_bins
        ax.hist([part[feature] for part in per_class], bins=bins, alpha=0.8,
                stacked=True, density=True,
                label=['W', 'I', 'A', 'D'], color=['b', 'g', 'orange', 'r'])
        ax.legend(loc='upper right')
        ax.set_title(feature)
    fig.tight_layout()
    plt.show()

"\n#Creating the window with 10 subplots.\nplt.rcParams.update({'font.size': 8})\nfig, axes = plt.subplots(nrows=4, ncols=3, figsize=(7,10))\naxes = axes.ravel()\n#Creating histograms with 50 bins\nfor idx,ax in enumerate(axes):\n    ax.figure\n    binwidth= (max(df[feature_list[idx]]) - min(df[feature_list[idx]]))/40\n    ax.hist([dfW[feature_list[idx]],dfI[feature_list[idx]],dfA[feature_list[idx]],dfD[feature_list[idx]]], bins=np.arange(min(df[feature_list[idx]]), max(df[feature_list[idx]]) + binwidth, binwidth) , alpha=0.8,stacked=True, density= True, label=['W','I','A','D'],color=['b','g','orange','r'])\n    ax.legend(loc='upper right')\n    ax.set_title(feature_list[idx])\nplt.tight_layout()\nplt.show()"

In [6]:
# Anova selection

def get_best_x_features(X, y, num_features=50):
    """Keep the columns of ``X`` with the highest ANOVA F-score against ``y``.

    Parameters
    ----------
    X : 2-D array-like
        Feature matrix, one row per example.
    y : 1-D array-like
        Class label of each row of ``X``.
    num_features : int, default 50
        Number of columns to keep. Values larger than the number of columns
        in ``X`` are clamped, so all columns are returned in that case.

    Returns
    -------
    The reduced feature matrix produced by ``SelectKBest.fit_transform``.

    Notes
    -----
    Scores are computed from whatever rows are passed in. Call this on
    training rows only if the selection must not see held-out data.
    """
    # Asking for more columns than exist either raises or warns depending on
    # the scikit-learn version; clamping makes the behaviour uniform.
    k = min(num_features, np.shape(X)[-1])
    selector = SelectKBest(score_func=f_classif, k=k)
    return selector.fit_transform(X, y)
        

In [7]:
# Reduce the feature matrix to its 10 best-scoring columns and report the new size.
X_best = get_best_x_features(X, y, num_features=10)
print(f"Features: {X_best.shape[1]}")
print(f"Examples: {X_best.shape[0]}")

Features: 10
Examples: 499276


## Train_test_split

#### Split X and y into training and test sets in order to evaluate the models

In [8]:
# test_size=0.8 holds out 80% of the rows for testing, so only 20% is used for
# training; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_best, y, test_size=0.8,random_state=109) # 20% training and 80% test
print("Examples to train: {}".format(len(X_train)))

Examples to train: 99855


# Models Evaluation

#### We use the confusion matrix to evaluate the model: our dataset is imbalanced, so accuracy alone is not recommended.

### Cross Validation

In [9]:
# Shared 4-fold stratified splitter; run_kFold() reads this global to generate
# its train/test index pairs.
k_fold = StratifiedKFold(n_splits=4)

In [10]:
# Cross Validation
# Explain f1_score micro

def run_kFold(model, X, y, cv=None):
    """Cross-validate ``model`` and return its mean micro-averaged F1 score.

    For every fold the features are standardised with a ``StandardScaler``
    fitted on the training part only, the model is refitted on the scaled
    training part and scored on the scaled test part.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold, so on return it holds the last fold's fit.
    X : numpy.ndarray
        Feature matrix; must support integer-array row indexing.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``.
    cv : splitter, optional
        Object with a ``split(X, y)`` method yielding train/test index arrays.
        Defaults to the module-level ``k_fold``.

    Returns
    -------
    float
        Mean of the per-fold ``f1_score(..., average='micro')`` values.

    Notes
    -----
    Micro-averaging pools the true/false positives of all classes before
    computing F1; for single-label multi-class targets this equals accuracy.
    """
    splitter = k_fold if cv is None else cv
    f1_results = []
    for train_indices, test_indices in splitter.split(X, y):
        # Fit the scaler on the training fold only so that no statistics of
        # the test fold leak into training.
        scaler = StandardScaler()
        X_fold_train = scaler.fit_transform(X[train_indices])
        X_fold_test = scaler.transform(X[test_indices])

        model.fit(X_fold_train, y[train_indices])
        y_pred = model.predict(X_fold_test)

        f1_results.append(f1_score(y[test_indices], y_pred, average='micro'))

    return np.mean(f1_results)
    

### Models Functions

In [11]:
# Test

def plot_decision_regions(X, y, classifier, resolution=0.02):
    """Plot a classifier's decision regions over the first two columns of ``X``.

    The classifier is evaluated on a regular grid spanning the data range
    (padded by 1 on each side); the predicted regions are drawn as filled
    contours and the samples are overlaid, one marker/colour per class.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; only columns 0 and 1 are used.
    y : numpy.ndarray
        Class label of each row of ``X``.
    classifier : estimator
        Fitted object whose ``predict`` accepts an ``(n, 2)`` array.
    resolution : float
        Grid step along both axes. The grid has roughly
        ``(range1 / resolution) * (range2 / resolution)`` points, so small
        steps on wide ranges can be expensive.

    Notes
    -----
    Draws on the current pyplot figure; the caller is responsible for the
    legend, the axis labels and ``plt.show()``.
    """
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Decision surface: predict on every point of a grid covering the data.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    grid_pred = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    # contourf needs numeric values: replace each predicted label by its
    # position in the sorted class list so that string labels work as well.
    Z = np.searchsorted(classes, grid_pred).reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Class samples. `color=` is used because a single RGBA tuple passed as
    # `c=` is ambiguous with a sequence of scalar values; markers are cycled
    # so that more than five classes do not raise an IndexError.
    for idx, cl in enumerate(classes):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=cmap(idx),
                    marker=markers[idx % len(markers)], label=cl)


In [12]:
#Create a linear svm Classifier
def linear_svm():
    """Cross-validate a linear-kernel SVM on the training split.

    Reads the module-level ``X_train`` and ``y_train``.

    Returns
    -------
    float
        The score returned by ``run_kFold`` for an ``SVC(kernel="linear", C=1)``.

    Notes
    -----
    ``SVC`` handles multi-class problems with a one-vs-one scheme internally.
    """
    # `gamma` only affects the 'rbf', 'poly' and 'sigmoid' kernels, so it is
    # not set for the linear kernel.
    l_svm = svm.SVC(kernel="linear", C=1)

    return run_kFold(l_svm, X_train, y_train)

In [13]:
#Create a RBF svm Classifier
def rbf_svm():
    """Cross-validate an RBF-kernel SVM on the module-level training split.

    The solver is capped at 100 iterations with a tolerance of 1e-05.
    Returns the score produced by ``run_kFold``.
    """
    svc_options = dict(C=1.0, kernel='rbf', max_iter=100, tol=1e-05, verbose=0)
    return run_kFold(svm.SVC(**svc_options), X_train, y_train)

In [14]:
# Logistic regression
def lr(X_train, y_train):
    """Cross-validate a default ``LogisticRegression`` on the given data.

    Unlike the other model helpers in this notebook, the data is passed in
    explicitly; the parameters shadow the module-level names of the same
    spelling. Returns the score produced by ``run_kFold``.
    """
    return run_kFold(linear_model.LogisticRegression(), X_train, y_train)

In [15]:
# K-Nearest Neighbors
def knn(n_neigh, metric='euclidean'):
    """Cross-validate a k-nearest-neighbours classifier on the training split.

    ``n_neigh`` is the number of neighbours and ``metric`` the distance
    metric name handed to ``KNeighborsClassifier``. Reads the module-level
    ``X_train`` / ``y_train`` and returns the score from ``run_kFold``.
    """
    classifier = KNeighborsClassifier(metric=metric, n_neighbors=n_neigh)
    score = run_kFold(classifier, X_train, y_train)
    return score

In [16]:
# Decision Tree
def dtree(criterion, max_depth):
    """Cross-validate a decision tree on the module-level training split.

    ``criterion`` and ``max_depth`` are forwarded unchanged to
    ``DecisionTreeClassifier``. Returns the score from ``run_kFold``.
    """
    tree_options = {'criterion': criterion, 'max_depth': max_depth}
    return run_kFold(tree.DecisionTreeClassifier(**tree_options), X_train, y_train)

In [17]:
# Random Forest
def rf(n_estimators, max_depth, random_state=None):
    """Cross-validate a random forest on the module-level training split.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the forest.
    max_depth : int or None
        Maximum depth of each tree.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestClassifier``. The default keeps the
        forest unseeded; pass an integer to make repeated runs comparable.

    Returns
    -------
    float
        The score returned by ``run_kFold``.
    """
    r_forest = RandomForestClassifier(n_estimators=n_estimators,
                                      max_depth=max_depth,
                                      random_state=random_state)

    return run_kFold(r_forest, X_train, y_train)

### Decision Tree

In [81]:
# Decision-tree grid search: score every (criterion, max_depth) pair with
# dtree() and remember the pair with the highest score.
hp_candidates = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4, 8, 15, 20, 25],
}

best_hp = {'f1': 0, 'candidates': (' ', 0)}
for cri in hp_candidates['criterion']:
    for max_ in hp_candidates['max_depth']:
        dt_f1 = dtree(cri, max_)
        print(dt_f1)
        # Only a strictly better score replaces the current best, so ties
        # keep the pair that was tried first.
        if not dt_f1 > best_hp['f1']:
            continue
        candidates = (cri, max_)
        best_hp.update(f1=dt_f1, candidates=candidates)

print("Best: ", best_hp)

0.786750782762398
0.9066646553097912
0.9280356318153623
0.9420359549213433
0.9440789220313328
0.9404736904385516
0.7885133693588351
0.9068249133216439
0.9269640819735112
0.9420659973812998
0.9409543750130741
0.9392318826062219
Best:  {'f1': 0.9440789220313328, 'candidates': ('gini', 20)}


### Random Forest

In [19]:
# Random-forest grid search: score every (n_estimators, max_depth) pair with
# rf() and remember the pair with the highest score.
hp_candidates = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [3, 10, 80, 90, 110],
}

best_hp = {'f1': 0, 'candidates': (0, 0)}
for est in hp_candidates['n_estimators']:
    for max_ in hp_candidates['max_depth']:
        rf_f1 = rf(est, max_)
        print(rf_f1, est, max_)
        # Only a strictly better score replaces the current best, so ties
        # keep the pair that was tried first.
        if not rf_f1 > best_hp['f1']:
            continue
        candidates = (est, max_)
        best_hp.update(f1=rf_f1, candidates=candidates)

print("Best: ", best_hp)

0.9038305830587932 10 3
0.9378098224211764 10 10
0.9545641076272454 10 80
0.9552250581942834 10 90
0.9546141721088341 10 110
0.9110309804737577 50 3
0.9392118477471316 50 10
0.9582394345490002 50 80
0.9576886398021933 50 90
0.958479787868452 50 110
0.9096189535644945 100 3
0.9391617784514962 100 10
0.9585699096319338 100 80
0.958439724970171 100 90
0.9588302901882352 100 110
0.909258431608728 200 3
0.9388913834744289 200 10
0.959030574190677 200 80
0.9587802200902584 200 90
0.9589805085055765 200 110
Best:  {'f1': 0.959030574190677, 'candidates': (200, 80)}


### K-Nearest Neighbors

In [75]:
# KNN grid search: score every (n_neighbors, metric) pair with knn() and
# remember the pair with the highest score.
# NOTE(review): 'minkowski' with scikit-learn's default p=2 should coincide
# with 'euclidean', making one of the two candidates redundant - confirm.
hp_candidates = {
    'n_neighbors': [3, 7],
    'metrics': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
}

best_hp = {'f1': 0, 'candidates': (3, ' ')}
for nei in hp_candidates['n_neighbors']:
    for met in hp_candidates['metrics']:
        knn_f1 = knn(nei, met)
        # Only a strictly better score replaces the current best, so ties
        # keep the pair that was tried first.
        if not knn_f1 > best_hp['f1']:
            continue
        candidates = (nei, met)
        best_hp.update(f1=knn_f1, candidates=candidates)

print("Best: ", best_hp)

Best:  {'f1': 0.9497871779732483, 'candidates': (3, 'manhattan')}


### Linear SVM 

In [18]:
# Despite its name, `svm_y_pred` holds a single score rather than predictions:
# linear_svm() returns the mean of the per-fold micro-averaged F1 values
# computed by run_kFold().
svm_y_pred = linear_svm()
print("Mean Accuracy:",svm_y_pred)

Train:  74891
test:  24964
0.8530283608396091
Train:  74891
test:  24964
0.8520669764460823
Train:  74891
test:  24964
0.8548710142605351
Train:  74892
test:  24963
0.853743540439851
Mean Accuracy: 0.8534274729965194
