In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Random train/test split shared by the exercises.
def train_test_split_local(X, y):
    """Split ``X`` and ``y`` into random train and test subsets.

    One third (``test_size=0.33``) of the samples goes to the test set and the
    shuffle is seeded with ``random_state=42`` so the split is reproducible.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels, one entry per row of ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # The former extra call ``train_test_split(y, shuffle=False)`` discarded its
    # result and had no effect on the returned split, so it has been removed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    return X_train, X_test, y_train, y_test

def read(file_name, fheader, fuser, ftrial):
    """Load one recording file and return its selected columns and labels.

    The file is read as headerless, space-separated text. The first 243
    columns are named with ``fheader`` and reduced to a fixed positional
    subset; column 243 is returned as the label series.

    Parameters
    ----------
    file_name : str
        Path of the data file.
    fheader : sequence of str
        Names for the first 243 columns of the file.
    fuser, ftrial
        Values written to the ``'user'`` and ``'trial'`` columns of every row.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``fdata`` (selected columns plus ``'user'`` and ``'trial'``) and
        ``flabels``; rows whose selected columns are all null are removed
        from both.
    """
    raw = pd.read_csv(file_name, sep=' ', header=None)

    # Work on a copy so that renaming the columns never touches ``raw``.
    fdata = raw.iloc[:, :243].copy()
    fdata.columns = fheader
    # Keep a fixed subset of the named columns, chosen by position.
    fdata = fdata[fdata.columns[np.r_[0:45, 50:58, 63:71, 76:84, 89:97, 102:133]]]
    flabels = raw.iloc[:, 243]

    # Drop the rows in which every selected column is null, together with
    # their labels, so both outputs stay aligned.
    all_null = fdata.isnull().all(axis=1)
    fdata = fdata[~all_null]
    flabels = flabels[~all_null]

    # Fill the remaining gaps. ``ffill`` replaces the deprecated
    # ``fillna(method='ffill')`` and produces the same result.
    # NOTE(review): axis=1 copies the value of the column to the left within
    # the same row. If the intent was to carry the previous sample of the same
    # column forward in time, this should be axis=0 — confirm.
    fdata = fdata.ffill(axis=1)

    fdata['user'] = fuser
    fdata['trial'] = ftrial
    return fdata, flabels
    
def windowing(fdata, window_number, window_text, porcentage, flabels, frol):
    """Median-filter the sensor columns and extract per-window features.

    Parameters
    ----------
    fdata : pandas.DataFrame
        Samples with a ``'MILLISEC'`` column, ``'user'`` and ``'trial'``
        columns, and any number of sensor columns.
    window_number : number
        Divisor applied to the microsecond component of each timestamp.
    window_text : str
        Time-based rolling window (for example ``'1S'``) used for the mean,
        the variance and the label mode.
    porcentage : number
        Modulus applied after the division; a row is kept whenever the
        resulting value is smaller than on the previous row.
    flabels : pandas.Series
        One label per row of ``fdata``. It is not modified.
    frol : int
        Number of samples in the median filter.

    Returns
    -------
    pandas.DataFrame
        One row per kept timestamp with ``<col>_mean`` and ``<col>_var`` for
        every sensor column, plus ``'label'``, ``'user'`` and ``'trial'``.
    """
    # Sensor columns are everything except the bookkeeping columns. Deriving
    # them from ``fdata`` removes the dependency on a notebook-level global.
    columns = fdata.columns[~fdata.columns.isin(['user', 'trial', 'MILLISEC'])]

    # Median filter over ``frol`` consecutive samples.
    ffiltered_data = fdata[columns].rolling(frol).median()
    ffiltered_data['MILLISEC'] = fdata.MILLISEC

    # Index by timestamp so that time-based rolling windows can be used.
    ffiltered_data['time'] = pd.to_datetime(fdata.MILLISEC, unit='ms')
    ffiltered_data.index = ffiltered_data.time

    # Keep the rows on which the scaled sub-second position wraps around.
    # NOTE(review): ``dt.microsecond`` only holds the sub-second part of the
    # timestamp, so this value wraps at least once per second whatever
    # ``window_text`` is — confirm that is the intended stride for windows
    # longer than one second.
    keep = ffiltered_data.time.dt.microsecond / window_number % porcentage
    keep = keep - keep.shift() < 0

    means = ffiltered_data[columns].rolling(window_text).mean()[keep]
    means.columns = [str(col) + '_mean' for col in means.columns]
    variances = ffiltered_data[columns].rolling(window_text).var()[keep]
    variances.columns = [str(col) + '_var' for col in variances.columns]

    def _most_frequent(values):
        # scipy returns a scalar or a one-element array depending on its
        # version; ravel makes both cases yield a plain scalar.
        return np.ravel(mode(values)[0])[0]

    # Build a time-indexed copy of the labels instead of overwriting the
    # caller's index, so the same Series can be reused for other window sizes.
    labels_by_time = pd.Series(flabels.to_numpy(), index=ffiltered_data.index)
    mode_labels = labels_by_time.rolling(window_text).apply(
        _most_frequent, raw=True)[keep]

    fall_features = pd.concat([means, variances], axis=1)
    fall_features['label'] = mode_labels
    # Take user/trial from the rows that were kept rather than from globals.
    selected = keep.to_numpy()
    fall_features['user'] = fdata['user'].to_numpy()[selected]
    fall_features['trial'] = fdata['trial'].to_numpy()[selected]

    return fall_features

def plot_confusion_matrix(cm, names, title='MATRIZ DE CONFUSIÓN', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a heat map on the current pyplot figure.

    Parameters
    ----------
    cm : array-like
        Matrix to display.
    names : sequence
        Tick labels, used for both axes.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colour map of the heat map.
    """
    # Heat map, title and colour scale.
    plt.imshow(cm, cmap=cmap, interpolation='nearest')
    plt.title(title)
    plt.colorbar()

    # One tick per entry of ``names``; the x labels are tilted by 45 degrees.
    positions = np.arange(len(names))
    plt.xticks(positions, names, rotation=45)
    plt.yticks(positions, names)

    plt.tight_layout()
    plt.xlabel('Clase predicha')
    plt.ylabel('Clase real')


# Bookkeeping columns that must never be fed to the classifier.
_NON_FEATURE_COLUMNS = ['label', 'user', 'trial']


def _split_features_labels(frame):
    """Return ``(X, y)`` arrays: all non-bookkeeping columns and the 'label' column."""
    y = np.array(frame['label'])
    X = np.array(frame.drop(columns=_NON_FEATURE_COLUMNS))
    return X, y


def _fit_and_report(X_train, X_test, y_train, y_test, estimators, suffix=''):
    """Train a random forest, print its evaluation and plot the confusion matrix.

    ``suffix`` is appended to the variable names shown in the shape printout
    (``''``, ``'_2'`` or ``'_3'``) so each exercise keeps its own output text.
    Labels are expected to be numeric.
    """
    print('X_train' + suffix + ':', X_train.shape)
    print('X_test' + suffix + ':', X_test.shape)
    print('y_train' + suffix + ':', y_train.shape)
    print('y_test' + suffix + ':', y_test.shape)

    classifier = RandomForestClassifier(n_estimators=estimators, random_state=0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Use the classes that actually occur as both the matrix order and the
    # tick names. A hard-coded list misaligns the ticks whenever a class is
    # missing from this particular split.
    class_values = np.unique(np.concatenate([y_test, y_pred]))
    cm = confusion_matrix(y_test, y_pred, labels=class_values)

    print("\n _______________________________________________________")
    print("confusion matrix: \n")
    print(cm)
    print("\n _______________________________________________________")
    print("classification report: \n")
    print(classification_report(y_test, y_pred))
    print("_______________________________________________________")
    print("accuracy score: " + str(accuracy_score(y_test, y_pred)))

    # Show whole-number classes without a trailing ".0".
    tick_names = [int(value) if value == int(value) else value
                  for value in class_values]
    plt.figure(figsize=(15, 8))
    plot_confusion_matrix(cm, tick_names)


def excercise_1(fall_data, estimators):
    """Exercise 1: random train/test split over all samples.

    Parameters
    ----------
    fall_data : pandas.DataFrame
        Feature table with 'label', 'user' and 'trial' columns.
    estimators : int
        Number of trees in the random forest.
    """
    features, labels = _split_features_labels(fall_data)
    X_train, X_test, y_train, y_test = train_test_split_local(features, labels)
    _fit_and_report(X_train, X_test, y_train, y_test, estimators)


def excercise_2(fall_data, estimators):
    """Exercise 2: train on users 1, 2 and 3, test on user 4.

    Parameters
    ----------
    fall_data : pandas.DataFrame
        Feature table with 'label', 'user' and 'trial' columns.
    estimators : int
        Number of trees in the random forest.
    """
    user123 = fall_data[fall_data['user'].isin([1, 2, 3])]
    user4 = fall_data[fall_data['user'].isin([4])]

    X_train_2, y_train_2 = _split_features_labels(user123)
    X_test_2, y_test_2 = _split_features_labels(user4)
    _fit_and_report(X_train_2, X_test_2, y_train_2, y_test_2, estimators,
                    suffix='_2')


def excercise_3(fall_data, estimators):
    """Exercise 3: train on trials 1, 2, 3 and 6, test on trials 4 and 5.

    Parameters
    ----------
    fall_data : pandas.DataFrame
        Feature table with 'label', 'user' and 'trial' columns.
    estimators : int
        Number of trees in the random forest.
    """
    trial1236 = fall_data[fall_data['trial'].isin([1, 2, 3, 6])]
    trial45 = fall_data[fall_data['trial'].isin([4, 5])]

    X_train_3, y_train_3 = _split_features_labels(trial1236)
    X_test_3, y_test_3 = _split_features_labels(trial45)
    _fit_and_report(X_train_3, X_test_3, y_train_3, y_test_3, estimators,
                    suffix='_3')

In [3]:
# Input locations
path = 'dataset/'  # folder that holds the .dat recordings
header_path = 'header.csv'  # file that holds the column names

# Column names: the first field of every line of the header file.
header = pd.read_csv(header_path, names=['column', '']).loc[:, 'column'].values

# Identifiers to iterate over: users 1-4 and trials 1-6.
users = range(1, 5)
trials = range(1, 7)

# One empty accumulator per window length.
all_data_1S, all_data_2S, all_data_5S, all_data_10S = (
    pd.DataFrame() for _ in range(4)
)

In [None]:

# window_text -> (window_number, porcentage): the arguments passed to
# windowing() for each of the four window lengths.
window_settings = {
    '1S': (1000, 500),
    '2S': (2000, 1000),
    '5S': (5000, 2500),
    '10S': (10000, 5000),
}

# Collect the per-file features in fresh lists and concatenate once at the
# end. Re-running this cell then rebuilds the tables instead of appending the
# same rows again, and avoids growing a DataFrame inside the loop.
features_by_window = {window_text: [] for window_text in window_settings}

for user in users:
    for trial in trials:
        # Trial 6 is stored in the "Drill" file, the others in "ADL<trial>".
        if trial == 6:
            file_name = path+'S'+str(user)+'-Drill'+'.dat'
        else:
            file_name = path+'S'+str(user)+'-ADL'+str(trial)+'.dat'
        data, labels = read(file_name, header, user, trial)
        columns = data.columns[~data.columns.isin(['user', 'trial','MILLISEC'])]

        # Median filter over 11 samples, then one feature table per window length.
        for window_text, (window_number, porcentage) in window_settings.items():
            features_by_window[window_text].append(
                windowing(data, window_number, window_text, porcentage, labels, 11))

all_data_1S = pd.concat(features_by_window['1S'])
all_data_2S = pd.concat(features_by_window['2S'])
all_data_5S = pd.concat(features_by_window['5S'])
all_data_10S = pd.concat(features_by_window['10S'])

    

In [None]:
# Summary statistics of `data`, i.e. the last file read by the loading loop.
data.describe()

In [None]:
# `all_data_0S` is never defined in this notebook (only the 1S, 2S, 5S and 10S
# tables are), so the cell raised NameError; preview the 1 s table instead.
all_data_1S.head()

In [None]:
# Correlation matrix of the Y-axis mean and variance features (1S table).
features_analisys = all_data_1S.filter(
    items=[
        'Accelerometer_RKN^_accY_mean',
        'Accelerometer_HIP_accY_mean',
        'Accelerometer_BACK_accY_mean',
        'Accelerometer_RKN__accY_mean',
        'InertialMeasurementUnit_BACK_accY_mean',
        'InertialMeasurementUnit_BACK_gyroY_mean',
        'InertialMeasurementUnit_BACK_magneticY_mean',
        'Accelerometer_RKN^_accY_var',
        'Accelerometer_HIP_accY_var',
        'Accelerometer_BACK_accY_var',
        'Accelerometer_RKN__accY_var',
        'InertialMeasurementUnit_BACK_accY_var',
        'InertialMeasurementUnit_BACK_gyroY_var',
        'InertialMeasurementUnit_BACK_magneticY_var',
    ],
    axis=1,
)
corr = features_analisys.corr()

# Heat map on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots()
cax = ax.matshow(corr, vmin=-1, vmax=1, cmap='coolwarm')
fig.colorbar(cax)

# One tick per feature, labelled with the feature name.
ticks = np.arange(len(features_analisys.columns))
ax.set_xticks(ticks)
plt.xticks(rotation=90)
ax.set_yticks(ticks)
ax.set_xticklabels(features_analisys.columns)
ax.set_yticklabels(features_analisys.columns)
plt.show()

In [None]:
# Correlation matrix of the reduced set of Y-axis acceleration features (1S table).
features_analisys = all_data_1S.filter(
    items=[
        'Accelerometer_RKN^_accY_mean',
        'Accelerometer_HIP_accY_mean',
        'Accelerometer_BACK_accY_mean',
        'InertialMeasurementUnit_BACK_accY_mean',
        'Accelerometer_RKN^_accY_var',
        'Accelerometer_HIP_accY_var',
        'Accelerometer_BACK_accY_var',
        'InertialMeasurementUnit_BACK_accY_var',
    ],
    axis=1,
)
corr = features_analisys.corr()

# Heat map on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots()
cax = ax.matshow(corr, vmin=-1, vmax=1, cmap='coolwarm')
fig.colorbar(cax)

# One tick per feature, labelled with the feature name.
ticks = np.arange(len(features_analisys.columns))
ax.set_xticks(ticks)
plt.xticks(rotation=90)
ax.set_yticks(ticks)
ax.set_xticklabels(features_analisys.columns)
ax.set_yticklabels(features_analisys.columns)
plt.show()

In [None]:
# Columns kept for modelling: eight features followed by the three
# bookkeeping columns. Defined once so the four tables cannot drift apart.
selected_columns = [
    'Accelerometer_RKN^_accY_mean',
    'Accelerometer_HIP_accY_mean',
    'Accelerometer_BACK_accY_mean',
    'InertialMeasurementUnit_BACK_accY_mean',
    'Accelerometer_RKN^_accY_var',
    'Accelerometer_HIP_accY_var',
    'Accelerometer_BACK_accY_var',
    'InertialMeasurementUnit_BACK_accY_var',
    'label',
    'user',
    'trial',
]

all_data_1S_filtered = all_data_1S.filter(selected_columns, axis=1)
all_data_2S_filtered = all_data_2S.filter(selected_columns, axis=1)
all_data_5S_filtered = all_data_5S.filter(selected_columns, axis=1)
all_data_10S_filtered = all_data_10S.filter(selected_columns, axis=1)

In [None]:
# First rows of every filtered table, separated by a ruler line.
for frame in (all_data_1S_filtered, all_data_2S_filtered,
              all_data_5S_filtered, all_data_10S_filtered):
    print("\n _______________________________________________________")
    print(frame.head())

In [None]:
# Number of rows per label in every filtered table.
for frame in (all_data_1S_filtered, all_data_2S_filtered,
              all_data_5S_filtered, all_data_10S_filtered):
    print("\n _______________________________________________________")
    print(frame['label'].value_counts())

In [None]:
# Number of rows per trial in every filtered table.
for frame in (all_data_1S_filtered, all_data_2S_filtered,
              all_data_5S_filtered, all_data_10S_filtered):
    print("\n _______________________________________________________")
    print(frame['trial'].value_counts())

In [None]:
# Number of rows per user in every filtered table.
for frame in (all_data_1S_filtered, all_data_2S_filtered,
              all_data_5S_filtered, all_data_10S_filtered):
    print("\n _______________________________________________________")
    print(frame['user'].value_counts())

In [None]:
# Split the 1S table by class to compare the distribution of one feature.
all_data_1S_0 = all_data_1S_filtered[all_data_1S_filtered.label==0]
all_data_1S_1 = all_data_1S_filtered[all_data_1S_filtered.label==1]
all_data_1S_2 = all_data_1S_filtered[all_data_1S_filtered.label==2]
all_data_1S_4 = all_data_1S_filtered[all_data_1S_filtered.label==4]
all_data_1S_5 = all_data_1S_filtered[all_data_1S_filtered.label==5]

# Feature to draw, selected by name. With the eleven columns kept in the
# filtered tables, the former positional index 10 is 'trial', not a feature.
draw_col = 'Accelerometer_RKN^_accY_mean'

# kdeplot draws the same density curve as the deprecated
# distplot(hist=False, kde=True).
for class_label, class_frame, colour in (
        (0, all_data_1S_0, 'red'),
        (1, all_data_1S_1, 'green'),
        (2, all_data_1S_2, 'yellow'),
        (4, all_data_1S_4, 'blue'),
        (5, all_data_1S_5, 'black')):
    sns.kdeplot(class_frame[draw_col], color=colour, label=str(class_label))
plt.legend(title='label');

In [None]:
# Split the 2S table by class to compare the distribution of one feature.
all_data_2S_0 = all_data_2S_filtered[all_data_2S_filtered.label==0]
all_data_2S_1 = all_data_2S_filtered[all_data_2S_filtered.label==1]
all_data_2S_2 = all_data_2S_filtered[all_data_2S_filtered.label==2]
all_data_2S_4 = all_data_2S_filtered[all_data_2S_filtered.label==4]
all_data_2S_5 = all_data_2S_filtered[all_data_2S_filtered.label==5]

# Feature to draw, selected by name. With the eleven columns kept in the
# filtered tables, the former positional index 10 is 'trial', not a feature.
draw_col = 'Accelerometer_RKN^_accY_mean'

# kdeplot draws the same density curve as the deprecated
# distplot(hist=False, kde=True).
for class_label, class_frame, colour in (
        (0, all_data_2S_0, 'red'),
        (1, all_data_2S_1, 'green'),
        (2, all_data_2S_2, 'yellow'),
        (4, all_data_2S_4, 'blue'),
        (5, all_data_2S_5, 'black')):
    sns.kdeplot(class_frame[draw_col], color=colour, label=str(class_label))
plt.legend(title='label');

In [None]:
# Split the 5S table by class to compare the distribution of one feature.
all_data_5S_0 = all_data_5S_filtered[all_data_5S_filtered.label==0]
all_data_5S_1 = all_data_5S_filtered[all_data_5S_filtered.label==1]
all_data_5S_2 = all_data_5S_filtered[all_data_5S_filtered.label==2]
all_data_5S_4 = all_data_5S_filtered[all_data_5S_filtered.label==4]
all_data_5S_5 = all_data_5S_filtered[all_data_5S_filtered.label==5]

# Feature to draw, selected by name. With the eleven columns kept in the
# filtered tables, the former positional index 10 is 'trial', not a feature.
draw_col = 'Accelerometer_RKN^_accY_mean'

# kdeplot draws the same density curve as the deprecated
# distplot(hist=False, kde=True).
for class_label, class_frame, colour in (
        (0, all_data_5S_0, 'red'),
        (1, all_data_5S_1, 'green'),
        (2, all_data_5S_2, 'yellow'),
        (4, all_data_5S_4, 'blue'),
        (5, all_data_5S_5, 'black')):
    sns.kdeplot(class_frame[draw_col], color=colour, label=str(class_label))
plt.legend(title='label');

In [None]:
# Split the 10S table by class to compare the distribution of one feature.
all_data_10S_0 = all_data_10S_filtered[all_data_10S_filtered.label==0]
all_data_10S_1 = all_data_10S_filtered[all_data_10S_filtered.label==1]
all_data_10S_2 = all_data_10S_filtered[all_data_10S_filtered.label==2]
all_data_10S_4 = all_data_10S_filtered[all_data_10S_filtered.label==4]
all_data_10S_5 = all_data_10S_filtered[all_data_10S_filtered.label==5]

# Feature to draw, selected by name. With the eleven columns kept in the
# filtered tables, the former positional index 10 is 'trial', not a feature.
draw_col = 'Accelerometer_RKN^_accY_mean'

# kdeplot draws the same density curve as the deprecated
# distplot(hist=False, kde=True).
for class_label, class_frame, colour in (
        (0, all_data_10S_0, 'red'),
        (1, all_data_10S_1, 'green'),
        (2, all_data_10S_2, 'yellow'),
        (4, all_data_10S_4, 'blue'),
        (5, all_data_10S_5, 'black')):
    sns.kdeplot(class_frame[draw_col], color=colour, label=str(class_label))
plt.legend(title='label');

In [None]:
# Exercise 1 on every window length, 50 trees each, framed by ruler lines.
for frame in (all_data_1S_filtered, all_data_2S_filtered,
              all_data_5S_filtered, all_data_10S_filtered):
    print("#############################")
    excercise_1(frame, 50)
print("#############################")

In [None]:
# Exercise 2 on every window length, 50 trees each, framed by ruler lines.
for frame in (all_data_1S_filtered, all_data_2S_filtered,
              all_data_5S_filtered, all_data_10S_filtered):
    print("#############################")
    excercise_2(frame, 50)
print("#############################")



In [None]:
# Exercise 3 on every window length, 50 trees each, framed by ruler lines.
for frame in (all_data_1S_filtered, all_data_2S_filtered,
              all_data_5S_filtered, all_data_10S_filtered):
    print("#############################")
    excercise_3(frame, 50)
print("#############################")

