In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [2]:
# Read every raw recording into its own dataframe (one CSV per recording)
walk1, walk2, run, stairs, relaxed = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/walk1.csv',
        './data/walk2.csv',
        './data/run.csv',
        './data/stairs.csv',
        './data/relaxed.csv',
    )
)


In [3]:
def clean_data(df, activity, relaxed_start_time='2024-05-17 15:34:25.868713140'):
    '''
    Clean one raw recording and label it with its activity.

    Steps:
    1. Drop columns with all missing values
    2. Split 'Actual System Time Text' into a Date column and a Time column
    3. Drop the original timestamp columns
    4. Drop the 'Time (s).<number>' columns
    5. Create a new column for activity
    6. For 'walking', drop the GPS columns
    7. For 'relaxed', shift the whole recording so that its first sample
       falls on relaxed_start_time (spacing between samples is kept)

    Parameters:
    df: pandas dataframe; must contain an 'Actual System Time Text' column.
        The input is not modified.
    activity: string
    relaxed_start_time: timestamp (string or pd.Timestamp) that the first
        'relaxed' sample is moved to. Only used when activity is 'relaxed'.

    Returns:
    df: cleaned pandas dataframe (a new object). For a non-empty 'relaxed'
        recording it additionally carries a 'DateTime' column holding the
        original, unshifted timestamps; it is kept so that existing callers
        which drop that column keep working.
    '''
    timestamp_columns = ['Actual System Time Text', 'Actual System Time', 'Time (s)']
    gps_columns = [
        'Latitude (°)', 'Longitude (°)', 'Height (m)', 'Velocity (m/s)',
        'Direction (°)', 'Horizontal Accuracy (m)', 'Vertical Accuracy (m)',
    ]

    # Drop columns with all missing values (returns a new frame, input untouched)
    cleaned = df.dropna(axis=1, how='all')

    # Parse the timestamps once and reuse them for Date, Time and the shift below
    stamps = pd.to_datetime(cleaned['Actual System Time Text'])
    if stamps.dt.tz is not None:
        # Keep wall-clock values only, so every derived column is timezone-naive
        stamps = stamps.dt.tz_localize(None)

    # Drop the original timestamp columns; the numeric ones may already be gone
    # if they were entirely empty, so their absence is tolerated
    cleaned = cleaned.drop(columns=timestamp_columns, errors='ignore')
    # Drop Time (s) columns followed by a dot and a number (raw string: the
    # pattern contains backslash escapes that are not valid string escapes)
    duplicate_time_columns = list(cleaned.filter(regex=r'Time \(s\)\.\d+').columns)
    cleaned = cleaned.drop(columns=duplicate_time_columns)

    cleaned = cleaned.assign(Date=stamps.dt.date, Time=stamps.dt.time, Activity=activity)

    # Walking recordings do not keep their GPS columns; some of them may be
    # missing or already removed as all-empty, hence errors='ignore'
    if activity == 'walking':
        cleaned = cleaned.drop(columns=gps_columns, errors='ignore')

    # Move the relaxed recording so that it starts at relaxed_start_time
    if activity == 'relaxed' and not cleaned.empty:
        new_start_time = pd.Timestamp(relaxed_start_time)
        # Work on the parsed timestamps directly instead of re-parsing
        # str(date) + ' ' + str(time): those strings have inconsistent
        # precision ('10:00:00' vs '10:00:01.500000')
        shifted = new_start_time + (stamps - stamps.iloc[0])
        cleaned = cleaned.assign(
            Date=shifted.dt.date,
            Time=shifted.dt.time,
            DateTime=stamps,
        )

    return cleaned

  df = df[df.columns.drop(list(df.filter(regex='Time \(s\)\.\d+')))]


In [4]:
# Clean each raw recording and tag it with its activity label
walk1_df, walk2_df, run_df, stairs_df, relaxed_df = (
    clean_data(recording, label)
    for recording, label in (
        (walk1, 'walking'),
        (walk2, 'walking'),
        (run, 'running'),
        (stairs, 'stairs'),
        (relaxed, 'relaxed'),
    )
)

In [5]:
#concatenate all dataframes and sort by date and time
df = pd.concat([walk1_df, walk2_df, run_df, stairs_df, relaxed_df])
#drop the DateTime helper column; clean_data only adds it for the relaxed
#recording, so tolerate its absence instead of raising a KeyError
df = df.drop(columns=['DateTime'], errors='ignore')
df = df.sort_values(by=['Date', 'Time'])
#reset index
df = df.reset_index(drop=True)
#save the cleaned data to a csv file
df.to_csv('./data/all_data.csv', index=False)

In [6]:
# Reload the combined dataset from disk and preview its first rows
all_data = pd.read_csv(filepath_or_buffer='./data/all_data.csv')
all_data.head(n=5)

Unnamed: 0,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (rad/s),Gyroscope y (rad/s),Gyroscope z (rad/s),Illuminance (lx),Date,Time,Activity
0,-3.141193,2.63841,8.540117,-0.043371,0.065973,0.091019,48.0,2024-05-17,13:59:37.034873,walking
1,-3.337517,2.734178,8.322245,-0.043371,0.065973,0.091019,47.0,2024-05-17,13:59:37.045433,walking
2,-3.445256,2.743755,8.243237,-0.043371,0.065973,0.091019,46.0,2024-05-17,13:59:37.055443,walking
3,-3.545813,2.695871,8.339005,-0.043371,0.065973,0.091019,45.0,2024-05-17,13:59:37.065453,walking
4,-3.591303,2.657564,8.640674,-1.258382,0.047647,0.166155,44.0,2024-05-17,13:59:37.075432,walking


In [7]:
def add_heart_rate(df, seed=None):
    '''
    Function that adds randomly generated heart rate data to the dataframe
    given the activity type.

    One value is drawn per (minute, activity) pair from a fixed range for
    that activity, and only the first row of each minute keeps its value;
    every other row gets a missing value. An existing 'Heart Rate (bpm)'
    column is replaced, so the function can be applied to its own output.

    Parameters:
    df: pandas dataframe with 'Date', 'Time' and 'Activity' columns. Date and
        Time may be strings (as read back from csv) or date/time objects.
        The input is not modified.
    seed: optional integer. When given, the values are drawn from a private
        np.random.RandomState(seed) and are therefore reproducible. When None
        (default) the global numpy random state is used, as before.

    Returns:
    df: pandas dataframe with heart rate data added

    Raises:
    ValueError: if the Activity column contains a label that has no
        heart rate range defined.
    '''
    heart_rate_column = 'Heart Rate (bpm)'
    # Half-open [low, high) bounds handed to randint for each activity
    heart_rate_ranges = {
        'walking': (80, 110),
        'running': (120, 150),
        'stairs': (100, 140),
        'relaxed': (60, 90),
    }
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Parse date and time separately: the time strings can come with or
    # without a fractional part, which a single inferred datetime format
    # does not accept. astype(str) also lets date/time objects through.
    timestamps = pd.to_datetime(df['Date'].astype(str)) + pd.to_timedelta(df['Time'].astype(str))

    # Work on a new frame; drop a previous heart rate column so the merge
    # below cannot produce suffixed duplicates
    work = (
        df.drop(columns=[heart_rate_column], errors='ignore')
        .assign(Minute=timestamps.dt.floor('min'))
    )

    # Generate one heart rate per (minute, activity) pair, visiting minutes
    # and activities in order of first appearance
    heart_rate_data = []
    activities_per_minute = work.groupby('Minute', sort=False)['Activity'].unique()
    for minute, activities_in_minute in activities_per_minute.items():
        for activity in activities_in_minute:
            if activity not in heart_rate_ranges:
                raise ValueError(
                    f'No heart rate range defined for activity {activity!r}; '
                    f'expected one of {sorted(heart_rate_ranges)}'
                )
            low, high = heart_rate_ranges[activity]
            heart_rate_data.append({
                'Minute': minute,
                'Activity': activity,
                heart_rate_column: rng.randint(low, high),
            })

    if heart_rate_data:
        # Merge the heart rate data back into the original rows
        heart_rate_df = pd.DataFrame(heart_rate_data)
        work = pd.merge(work, heart_rate_df, on=['Minute', 'Activity'], how='left')
    else:
        # Nothing to merge (e.g. empty input): still provide the column
        work[heart_rate_column] = np.nan

    # Keep the heart rate only on the first record of each minute
    first_of_minute = ~work['Minute'].duplicated()
    work[heart_rate_column] = work[heart_rate_column].where(first_of_minute)

    # Drop the helper column
    return work.drop(columns=['Minute'])

In [8]:
# Attach the generated heart rate column, then preview the result
all_data = all_data.pipe(add_heart_rate)
all_data.head(n=5)

Unnamed: 0,Acceleration x (m/s^2),Acceleration y (m/s^2),Acceleration z (m/s^2),Gyroscope x (rad/s),Gyroscope y (rad/s),Gyroscope z (rad/s),Illuminance (lx),Date,Time,Activity,Heart Rate (bpm)
0,-3.141193,2.63841,8.540117,-0.043371,0.065973,0.091019,48.0,2024-05-17,13:59:37.034873,walking,98.0
1,-3.337517,2.734178,8.322245,-0.043371,0.065973,0.091019,47.0,2024-05-17,13:59:37.045433,walking,
2,-3.445256,2.743755,8.243237,-0.043371,0.065973,0.091019,46.0,2024-05-17,13:59:37.055443,walking,
3,-3.545813,2.695871,8.339005,-0.043371,0.065973,0.091019,45.0,2024-05-17,13:59:37.065453,walking,
4,-3.591303,2.657564,8.640674,-1.258382,0.047647,0.166155,44.0,2024-05-17,13:59:37.075432,walking,


In [9]:
# Persist the dataset, including the heart rate column, without the index
all_data.to_csv(path_or_buf='./data/final_all_data.csv', index=False)