In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [19]:
# Read every recorded session from its CSV export into its own DataFrame.
walk1, walk2, run, stairs, relaxed = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/walk1.csv',
        './data/walk2.csv',
        './data/run.csv',
        './data/stairs.csv',
        './data/relaxed.csv',
    )
)


In [33]:
def clean_data(df, activity):
    '''
    Clean one recording and label it with its activity.

    The input frame is never modified; a new frame is returned. Steps:
    1. Drop columns whose values are all missing
    2. Parse 'Actual System Time Text' once and split it into a 'Date'
       column and a 'Time' column
    3. Drop the original timestamp columns ('Actual System Time Text',
       'Actual System Time') and the 'Time (s)' column
    4. Drop every column whose name contains 'Time (s).' followed by digits
       (e.g. 'Time (s).1')
    5. Add an 'Activity' column filled with `activity`
    6. If `activity` is exactly 'walking', drop the GPS columns
       (latitude, longitude, height, velocity, direction, accuracies)

    Columns listed in steps 3 and 6 that are not present (for example
    because step 1 already removed them) are skipped instead of raising.

    Parameters:
    df: pandas dataframe containing an 'Actual System Time Text' column
    activity: string

    Returns:
    df: cleaned pandas dataframe

    Raises:
    KeyError: if 'Actual System Time Text' is missing from `df`
    '''
    timestamp_columns = ['Actual System Time Text', 'Actual System Time', 'Time (s)']
    gps_columns = [
        'Latitude (°)', 'Longitude (°)', 'Height (m)', 'Velocity (m/s)',
        'Direction (°)', 'Horizontal Accuracy (m)', 'Vertical Accuracy (m)',
    ]

    # dropna without inplace returns a new frame, so the caller's data is untouched.
    cleaned = df.dropna(axis=1, how='all')

    # Parse the timestamp text a single time and reuse it for both columns.
    timestamps = pd.to_datetime(cleaned['Actual System Time Text'])
    cleaned = cleaned.assign(Date=timestamps.dt.date, Time=timestamps.dt.time)

    # Raw string: '\(' and '\d' are regex escapes, not Python string escapes.
    repeated_time_columns = list(cleaned.filter(regex=r'Time \(s\)\.\d+').columns)

    # errors='ignore' keeps the function usable when an optional column is absent.
    cleaned = cleaned.drop(columns=timestamp_columns + repeated_time_columns,
                           errors='ignore')

    # assign() builds a fresh frame, avoiding assignment into a column slice.
    cleaned = cleaned.assign(Activity=activity)

    if activity == 'walking':
        cleaned = cleaned.drop(columns=gps_columns, errors='ignore')
    return cleaned

  df = df[df.columns.drop(list(df.filter(regex='Time \(s\)\.\d+')))]


In [None]:
# Clean each raw recording and tag it with the activity it captures.
walk1_df = clean_data(df=walk1, activity='walking')
walk2_df = clean_data(df=walk2, activity='walking')
run_df = clean_data(df=run, activity='running')
stairs_df = clean_data(df=stairs, activity='stairs')
relaxed_df = clean_data(df=relaxed, activity='relaxed')

In [None]:
# Stack the per-activity frames, order the rows chronologically by
# date then time, and renumber the index from zero.
df = (
    pd.concat([walk1_df, walk2_df, run_df, stairs_df, relaxed_df])
    .sort_values(by=['Date', 'Time'])
    .reset_index(drop=True)
)
# Write the combined, cleaned data to a single CSV file (index omitted).
df.to_csv('./data/all_data.csv', index=False)