In [1]:
## Imports
import pandas as pd

In [2]:
## Read the raw CSV into a DataFrame, then show its schema and first five rows
df = pd.read_csv(filepath_or_buffer='Data/fall_data.csv')
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2039 entries, 0 to 2038
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Distance       2039 non-null   float64
 1   Pressure       2039 non-null   float64
 2   HRV            2039 non-null   float64
 3   Sugar level    2039 non-null   float64
 4   SpO2           2039 non-null   float64
 5   Accelerometer  2039 non-null   float64
 6   Decision       2039 non-null   int64  
dtypes: float64(6), int64(1)
memory usage: 111.6 KB


Unnamed: 0,Distance,Pressure,HRV,Sugar level,SpO2,Accelerometer,Decision
0,25.54,1.0,101.396,61.08,87.77,1.0,1
1,2.595,2.0,110.19,20.207,65.19,1.0,2
2,68.067,0.0,87.412,79.345,99.345,0.0,0
3,13.09,1.0,92.266,36.18,81.545,1.0,1
4,69.43,0.0,89.48,80.0,99.99,0.0,0


> The data does not have any missing values. The integer-coded categorical features (`Pressure` and `Accelerometer`) are stored as floats and will be converted to 'int64' dtype.

In [3]:
## Lowercase every column label so the names are uniform
df.columns = df.columns.map(str.lower)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2039 entries, 0 to 2038
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   distance       2039 non-null   float64
 1   pressure       2039 non-null   float64
 2   hrv            2039 non-null   float64
 3   sugar level    2039 non-null   float64
 4   spo2           2039 non-null   float64
 5   accelerometer  2039 non-null   float64
 6   decision       2039 non-null   int64  
dtypes: float64(6), int64(1)
memory usage: 111.6 KB


In [4]:
## Add measurement units to the column names.
## The previous mapping also listed 'decision ' (with a trailing space). No column
## carries that label and rename() skips unknown keys by default, so the entry had
## no effect and is dropped. The result is reassigned instead of using inplace=True.
df = df.rename(columns={'distance': 'distance (cm)',
                        'hrv': 'hrv (bpm)',
                        'sugar level': 'blood sugar level (mg/dL)'})

In [5]:
## Confirm the column labels after the rename
df.columns

Index(['distance (cm)', 'pressure', 'hrv (bpm)', 'blood sugar level (mg/dL)',
       'spo2', 'accelerometer', 'decision'],
      dtype='object')

In [6]:
## Re-check dtypes and non-null counts under the new column names
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2039 entries, 0 to 2038
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   distance (cm)              2039 non-null   float64
 1   pressure                   2039 non-null   float64
 2   hrv (bpm)                  2039 non-null   float64
 3   blood sugar level (mg/dL)  2039 non-null   float64
 4   spo2                       2039 non-null   float64
 5   accelerometer              2039 non-null   float64
 6   decision                   2039 non-null   int64  
dtypes: float64(6), int64(1)
memory usage: 111.6 KB


> Every column has 2039 non-null values out of 2039 entries, so there are no missing values in the data set.

In [7]:
## Target dtypes: the categorical columns are stored as float64 and should be int
cols_change = dict.fromkeys(['pressure', 'accelerometer'], 'int64')

In [8]:
## Apply the dtype mapping and confirm the resulting dtypes
df = df.astype(dtype=cols_change)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2039 entries, 0 to 2038
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   distance (cm)              2039 non-null   float64
 1   pressure                   2039 non-null   int64  
 2   hrv (bpm)                  2039 non-null   float64
 3   blood sugar level (mg/dL)  2039 non-null   float64
 4   spo2                       2039 non-null   float64
 5   accelerometer              2039 non-null   int64  
 6   decision                   2039 non-null   int64  
dtypes: float64(4), int64(3)
memory usage: 111.6 KB


In [9]:
## Check statistical description for impossible values
## (compare each column's min, max and quartiles against what is plausible)
df.describe()

Unnamed: 0,distance (cm),pressure,hrv (bpm),blood sugar level (mg/dL),spo2,accelerometer,decision
count,2039.0,2039.0,2039.0,2039.0,2039.0,2039.0,2039.0
mean,28.694527,0.98872,95.657002,72.909243,83.563649,0.661599,0.98872
std,23.773644,0.815918,17.576499,46.94011,11.111592,0.473282,0.815918
min,0.0,0.0,60.0,10.0,60.0,0.0,0.0
25%,7.6425,0.0,82.418,40.23,75.285,0.0,0.0
50%,20.56,1.0,97.238,69.96,85.28,1.0,1.0
75%,55.2055,2.0,109.695,77.6125,92.6925,1.0,2.0
max,69.981,2.0,124.98,179.293,99.99,1.0,2.0


> The blood sugar level (mg/dL) column has a very wide range, from a minimum of 10 to a maximum of about 179. A reading of 10 mg/dL would be life-threateningly low. This will be explored further in the EDA process.

In [10]:
## Save cleaned dataframe
## index=False keeps the default 0..n-1 row index out of the file; otherwise it is
## written as an unnamed first column and comes back as 'Unnamed: 0' on reload.
df.to_csv('Data/clean_fall_data.csv', index=False)

# Data Cleaning Summary

- The data already had no missing values or inconsistencies.
- Column names were changed to all lowercase (with units added where applicable), and the integer-coded categorical features (`pressure` and `accelerometer`) were changed to 'int64' dtype for ease of work.
- Cleaned dataframe was saved to a .csv file.