In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import RFECV, RFE
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import metrics
import seaborn as sn
import matplotlib.pyplot as plt

In [101]:
def normConf(conf):
    """Row-normalise a confusion matrix and report the mean per-class accuracy.

    Each row is divided by its own sum, so every entry becomes the fraction of
    that row's samples assigned to each column. The reported accuracy is the
    mean of the diagonal of the normalised matrix.

    Parameters
    ----------
    conf : 2-D array-like
        Square confusion matrix of counts.

    Returns
    -------
    outputConf : numpy.ndarray
        Row-normalised copy of ``conf`` (float). Rows whose sum is zero are
        left as all zeros instead of producing NaN.
    acc : float
        Mean of the diagonal of ``outputConf``.

    Raises
    ------
    ValueError
        If ``conf`` is empty or is not a square 2-D matrix.
    """
    counts = np.asarray(conf, dtype=float)
    if counts.ndim != 2 or counts.shape[0] == 0 or counts.shape[0] != counts.shape[1]:
        raise ValueError("conf must be a non-empty square 2-D confusion matrix")

    row_totals = counts.sum(axis=1, keepdims=True)
    # Divide only where the row has samples; empty rows keep the zeros from `out`.
    outputConf = np.divide(
        counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0
    )

    acc = np.trace(outputConf) / len(outputConf)
    print("Acc: ", acc)
    return outputConf, acc

def plotConf(conf, labels="ABCDE"):
    """Show a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    conf : 2-D array-like
        Confusion matrix; rows are labelled 'Actual' and columns 'Predicted'.
    labels : sequence, default "ABCDE"
        One tick label per class, used for both rows and columns. Its length
        must match the size of ``conf``; otherwise the DataFrame constructor
        raises ``ValueError``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    class_labels = list(labels)
    df_cm = pd.DataFrame(conf, index=class_labels, columns=class_labels)

    # Explicit figure/axes so the heatmap and the axis labels target the same axes.
    _, ax = plt.subplots(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True, fmt='g', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()
    return

def kcrossfold(X, y, cv=10):
    """Evaluate a decision tree with k-fold cross-validated predictions.

    Prints the accuracy and the classification report, then plots the
    confusion matrix via ``plotConf``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    cv : int or cross-validation splitter, default 10
        Passed straight to ``cross_val_predict``.

    Returns
    -------
    float
        Accuracy of the cross-validated predictions against ``y``.
    """
    model = tree.DecisionTreeClassifier()
    ypred = model_selection.cross_val_predict(model, X, y, cv=cv)

    # Compute the score once so the printed and the returned value agree.
    acc = metrics.accuracy_score(y, ypred)
    print(acc)
    print(metrics.classification_report(y, ypred))
    plotConf(metrics.confusion_matrix(y, ypred))
    return acc


def leaveOneOut(X, y, username, n_splits=6):
    """Evaluate a decision tree with group-wise cross-validated predictions.

    Samples sharing a value in ``username`` are kept in the same fold
    (``GroupKFold``). Prints the accuracy and the classification report, then
    plots the confusion matrix via ``plotConf``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    username : array-like
        Group label for every sample, passed as ``groups``.
    n_splits : int, default 6
        Number of group folds.
        NOTE(review): this is leave-one-group-out only when the data has
        exactly ``n_splits`` distinct groups; confirm against the dataset.

    Returns
    -------
    float
        Accuracy of the cross-validated predictions against ``y``.
    """
    group_folds = model_selection.GroupKFold(n_splits=n_splits)
    model = tree.DecisionTreeClassifier()
    ypred = model_selection.cross_val_predict(
        model, X, y, cv=group_folds, groups=username
    )

    # Compute the score once so the printed and the returned value agree.
    acc = metrics.accuracy_score(y, ypred)
    print(acc)
    print(metrics.classification_report(y, ypred))
    plotConf(metrics.confusion_matrix(y, ypred))
    return acc


def experimentFS(X, y, names, cval, n_estimators=100, n_splits=6,
                 n_features_to_select=10, random_state=None):
    """Select features by recursive feature elimination with a random forest.

    Two modes, chosen by ``cval``:

    * ``cval == False``: ``RFECV`` scored by accuracy over ``GroupKFold``
      folds, with ``names`` as the groups. The number of kept features is
      chosen by cross-validation.
    * otherwise: plain ``RFE`` that keeps a fixed ``n_features_to_select``
      features. No cross-validation is run, and ``names`` is unused.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Class labels.
    names : array-like
        Group label per sample (used only in the ``RFECV`` mode).
    cval : bool
        Mode switch, see above.
    n_estimators : int, default 100
        Number of trees in the random forest used as the ranking estimator.
    n_splits : int, default 6
        Number of ``GroupKFold`` folds in the ``RFECV`` mode.
    n_features_to_select : int, default 10
        Number of features kept in the ``RFE`` mode.
    random_state : int or None, default None
        Seed for the random forest; ``None`` keeps the previous unseeded
        behaviour. Pass an int for a reproducible selection.

    Returns
    -------
    numpy.ndarray
        Boolean mask over the columns of ``X`` (``selector.support_``). This
        is a single value, not a ``(scores, support)`` pair.
    """
    estimator = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )

    # Equality test kept on purpose so non-bool arguments behave as before.
    if cval == False:  # noqa: E712
        group_folds = model_selection.GroupKFold(n_splits=n_splits)
        selector = RFECV(estimator=estimator, step=1, cv=group_folds,
                         scoring='accuracy')
        selector.fit(X, y, groups=names)
    else:
        selector = RFE(estimator=estimator, step=1,
                       n_features_to_select=n_features_to_select)
        selector.fit(X, y)

    print("--------")
    return selector.support_

def plotAcc(acclist, pcntlist):
    """Plot accuracy (as a percentage) against the share of features used.

    Parameters
    ----------
    acclist : sequence of float
        Accuracies as fractions; each is multiplied by 100 for plotting.
    pcntlist : sequence
        Tick labels for the x axis, one per plotted point position.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    _, ax = plt.subplots(1, 1)
    ax.set_xlabel('Percent of Original Features Used')
    ax.set_ylabel('Accuracy (%)')

    # One tick per label: a fixed count of 20 mismatches any other list length.
    ax.set_xticks(np.arange(len(pcntlist)))
    ax.set_xticklabels(pcntlist)

    ax.plot([x * 100 for x in acclist])
    plt.show()

In [93]:
# Read the statistics dataset and split it into group labels, class labels
# and the feature matrix (every column except the two label columns).
filename = "normalized_stats.csv"
df = pd.read_csv(filename, header=0)
names = df["user_name"]
y = df["classe"]
X = df.drop(columns=["user_name", "classe"], errors="ignore")

In [72]:
# experimentFS returns only the boolean support mask, so bind a single name;
# unpacking it into two variables raises ValueError unless X has exactly 2 features.
support = experimentFS(X, y, names, cval=False)

Optimal number of features : 47
--------


In [80]:
# Keep the columns flagged in `support`, put user_name first and classe last,
# then write the reduced table to disk.
finalX_logo = X.loc[:, X.columns[support]]
finalX_logo.insert(0, 'user_name', names)
finalX_logo.insert(finalX_logo.shape[1], 'classe', y)
finalX_logo.to_csv("fs_stats_logo.csv", header=True, index=False)

In [76]:
# experimentFS returns only the boolean support mask, so bind a single name;
# unpacking it into two variables raises ValueError unless X has exactly 2 features.
supportCV = experimentFS(X, y, names, cval=True)

Optimal number of features : 24
--------


In [81]:
# Keep the columns flagged in `supportCV`, put user_name first and classe
# last, then write the reduced table to disk.
finalX_cv = X.loc[:, X.columns[supportCV]]
finalX_cv.insert(0, 'user_name', names)
finalX_cv.insert(finalX_cv.shape[1], 'classe', y)
finalX_cv.to_csv("fs_stats_cv.csv", header=True, index=False)

In [104]:
# Read the raw dataset and split it into group labels, window ids, class
# labels and the feature matrix (every column except those three).
# Note: this rebinds filename, df, names, y and X from the earlier load cell.
filename = "normalized_raw.csv"
df = pd.read_csv(filename, header=0)
names = df["user_name"]
num_window = df["num_window"]
y = df["classe"]
X = df.drop(columns=["user_name", "num_window", "classe"], errors="ignore")

In [95]:
# experimentFS returns only the boolean support mask, so bind a single name;
# unpacking it into two variables raises ValueError unless X has exactly 2 features.
raw_support_logo = experimentFS(X, y, names, cval=False)

Optimal number of features : 30
--------


In [110]:
# Keep the columns flagged in `raw_support_logo`, put user_name and num_window
# first and classe last, then write the reduced table to disk.
finalX_raw_logo = X.loc[:, X.columns[raw_support_logo]]
finalX_raw_logo.insert(0, 'user_name', names)
finalX_raw_logo.insert(1, 'num_window', num_window)
finalX_raw_logo.insert(finalX_raw_logo.shape[1], 'classe', y)
finalX_raw_logo.to_csv("fs_raw_logo.csv", header=True, index=False)

In [108]:
# Fixed-size recursive feature elimination on the current X/y: a 100-tree
# random forest ranks the features and the 10 best are kept.
# Note: this rebinds `support`, which an earlier cell also assigns.
estimator = RandomForestClassifier(n_estimators=100)
selector = RFE(estimator=estimator, n_features_to_select=10, step=1).fit(X, y)
support = selector.support_

In [112]:
# Keep the columns flagged in `support`, put user_name and num_window first
# and classe last, then write the reduced table to disk.
finalX_raw_cv = X.loc[:, X.columns[support]]
finalX_raw_cv.insert(0, 'user_name', names)
finalX_raw_cv.insert(1, 'num_window', num_window)
finalX_raw_cv.insert(finalX_raw_cv.shape[1], 'classe', y)
finalX_raw_cv.to_csv("fs_raw_cv.csv", header=True, index=False)