In [1]:
import pandas as pd
import numpy as np
import datetime,os,sys,time
import matplotlib.pyplot as plt
import scipy as sc
import pandas_profiling
import seaborn as sns

# Notebook display configuration.
# `display` and `HTML` are imported once from the public `IPython.display`
# module; importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Stretch the classic-notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Do not truncate columns when a wide DataFrame is rendered.
pd.options.display.max_columns = None

# Let's explore the data

In [2]:
def list_files_2(startpath,form):
    """Recursively collect the files under ``startpath`` whose name ends with ``form``.

    Parameters
    ----------
    startpath : directory handed to ``os.walk`` as the root of the search.
    form : suffix tested with ``str.endswith`` on each file name (e.g. ``'.dat'``).

    Returns
    -------
    list of str
        ``os.path.join(folder, name)`` for every matching file, in the order
        ``os.walk`` yields them.
    """
    matches = []
    for folder, _subdirs, names in os.walk(startpath):
        for name in names:
            if name.endswith(form):
                matches.append(os.path.join(folder, name))
    return matches

In [3]:
# Current working directory of the kernel; used below as the root of the file search.
actual_path = os.getcwd()

In [4]:
# Discover every '.dat' file below the working directory and group the paths
# by the name of the folder that directly contains them.
files = list_files_2(actual_path,'.dat')
files.sort()

# Name of the immediate parent folder of each file. Using os.path keeps this
# independent of the path separator, and comparing the folder name exactly
# avoids false hits when the same text appears elsewhere in the path.
parent_dirs = [os.path.basename(os.path.dirname(item)) for item in files]
types = np.unique(parent_dirs)

print('Files available for this study: ')
# `optional` / `protocol` hold the paths found in the folders with those names;
# the loading cells further down iterate over them.
optional,protocol = [],[]
# Iterating over `types` handles any number of folders (including none or one)
# rather than assuming exactly two.
for kind in types:
    members = [item for item, parent in zip(files, parent_dirs) if parent == kind]
    print('   >> '+kind)
    for item in members:
        print('       ** '+item)
    if kind == 'Optional': optional.extend(members)
    if kind == 'Protocol': protocol.extend(members)

Files available for this study: 
   >> Optional
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Optional/subject101.dat
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Optional/subject105.dat
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Optional/subject106.dat
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Optional/subject108.dat
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Optional/subject109.dat
   >> Protocol
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject101.dat
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject102.dat
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject103.dat
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject104.dat
       ** /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject105.dat
       ** /home/ivan/Deskto

In [5]:
# Names of the 17 per-sensor columns; the same set is repeated for every sensor position.
IMU = [
    'Temp (°C)',
    'AX','AY','AZ',
    'AX2','AY2','AZ2',
    'Giros1','Giros2','Giros3',
    'MX','MY','MZ',
    'O1','O2','O3','O4',
]
# Full header: three leading columns followed by the IMU names prefixed with
# 'Hand', 'Chest' and 'Ankle' (in that order), 54 names in total.
cols = ['Time (s)','Activity ID','Heart Rate (bpm)'] + [
    place+' '+signal
    for place in ('Hand','Chest','Ankle')
    for signal in IMU
]

In [6]:
%%time
counter = 0
for item in protocol:
    print('Iteration '+str(counter)+' | File: '+item)
    data = pd.read_csv(item, sep=r'\s{1,}', engine='python', header=None)
    data.columns = cols
    data['User ID'] = int(''.join([e for e in item.split('/')[-1].split('.')[0] if e.isnumeric()]))
    if counter == 0:
        final_data = data
    else:
        final_data = pd.concat([final_data,data])
        final_data = final_data.reset_index(drop=True)
    del data
    counter += 1

Iteration 0 | File: /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject101.dat
Iteration 1 | File: /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject102.dat
Iteration 2 | File: /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject103.dat
Iteration 3 | File: /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject104.dat
Iteration 4 | File: /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject105.dat
Iteration 5 | File: /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject106.dat
Iteration 6 | File: /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject107.dat
Iteration 7 | File: /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject108.dat
Iteration 8 | File: /home/ivan/Desktop/IRONHACK_DATA_ANALYTICS/Final_project/Protocol/subject109.dat
CPU times: user 2min 13s, sys: 12.8 s, total: 2min 26s
Wall time: 2min 26s


In [None]:
%%time
counter = 0
for item in optional:
    print('Iteration '+str(counter)+' | File: '+item)
    data = pd.read_csv(item, sep=r'\s{1,}', engine='python', header=None)
    data.columns = cols
    data['User ID'] = int(''.join([e for e in item.split('/')[-1].split('.')[0] if e.isnumeric()]))
    if counter == 0:
        final_data_optional = data
    else:
        final_data_optional = pd.concat([final_data_optional,data])
        final_data_optional = final_data_optional.reset_index(drop=True)
    del data
    counter += 1

In [None]:
# Preview the first rows of the combined Protocol data.
display(final_data.head())

In [None]:
# Summary statistics of the columns, as produced by DataFrame.describe().
display(final_data.describe())

In [None]:
# Data type of every column.
# NOTE(review): `dtypes` is a Series, so `.T` looks like a no-op here -- confirm and drop if so.
display(final_data.dtypes.T)

In [None]:
# Number of missing (NaN) values in each column.
final_data.isna().sum()

# First step > Delete the O1, O2, O3 and O4 attributes

In [None]:
# Drop the O1..O4 columns of every sensor position.
# The columns are matched on their exact last token ('Hand O1', 'Chest O4', ...)
# rather than on "contains a capital O", which would silently drop any other
# column whose name happens to include that letter.
orientation_suffixes = ('O1','O2','O3','O4')
final_cols = [e for e in final_data.columns.tolist() if e.split(' ')[-1] not in orientation_suffixes]
df = final_data[final_cols]
# Release the full frame; `df` is used from here on.
# NOTE: this makes the cell non-re-runnable without reloading `final_data`.
del final_data

# Let's check the percentage of missing data by User ID

In [None]:
# Percentage of missing values in every column, computed separately per user.
# `ind` holds the column names (row labels of the summary table built next);
# it is taken once from `df` instead of inside the loop behind a counter that
# was never incremented, so it is defined even when the loop body never runs.
ind = df.columns.tolist()
# One array per user, in the order of df['User ID'].unique().
missing = []
for item in df['User ID'].unique():
    a = df[df['User ID'] == item]
    print('User ID: ',item)
    # NaN count per column as a percentage of this user's rows, rounded to 2 decimals.
    missing.append(np.round(a.isna().sum().values*100/len(a),2))

In [None]:
# Summary table: one row per attribute, one column per user, values in % missing.
user_ids = df['User ID'].unique()
pct_by_user = np.transpose(np.asarray(missing))
missing_df_user = pd.DataFrame(pct_by_user, index=ind, columns=user_ids)
display(missing_df_user)

# Let's check the percentage of missing values by Activity ID

In [None]:
# Percentage of missing values in every column, computed separately per activity.
# `ind` holds the column names (row labels of the summary table built next);
# it is taken once from `df` instead of inside the loop behind a counter that
# was never incremented, so it is defined even when the loop body never runs.
ind = df.columns.tolist()
# Activity IDs in ascending order; also used as the column labels of the summary table.
act_list = df['Activity ID'].unique()
act_list.sort()
# One array per activity, in the order of `act_list`.
missing = []
for item in act_list:
    a = df[df['Activity ID'] == item]
    print('Activity ID: ',item)
    # NaN count per column as a percentage of this activity's rows, rounded to 2 decimals.
    missing.append(np.round(a.isna().sum().values*100/len(a),2))

In [None]:
# Summary table: one row per attribute, one column per activity, values in % missing.
pct_by_activity = np.transpose(np.asarray(missing))
missing_df_act = pd.DataFrame(pct_by_activity, index=ind, columns=act_list)
display(missing_df_act)