# Health Metrics - Data Cleaning

## Setup Packages and Config

In [27]:
import pandas as pd
import numpy as np

## Import Data

In [28]:
# Read the JSON export and keep its top-level "data" entry
data = pd.read_json('data.json')["data"]

# The "workouts" section becomes a DataFrame directly
wdf = pd.DataFrame(data["workouts"])

# Build one DataFrame per metric, tagging every row with the metric's
# name and units so the rows stay identifiable after stacking
metrics_list = [
    pd.DataFrame(entry['data']).assign(metric=entry['name'], units=entry['units'])
    for entry in data["metrics"]
]

# Stack the per-metric frames into one long DataFrame with a fresh index
mdf = pd.concat(metrics_list, ignore_index=True)

## Clean the Data

### Flatten Nested Columns

In [29]:
# Flatten the nested columns in the 'workouts' DataFrame
def extract_qty_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Add a ``<column_name>_qty`` column holding the flattened quantity.

    For every cell of ``df[column_name]``:
      * a dict contributes its ``'qty'`` entry (NaN when the dict has no
        ``'qty'`` key, so one malformed record does not abort the run);
      * any other value (number, None, NaN, ...) is copied through unchanged.

    When ``column_name`` is not a column of ``df`` at all, the new column is
    filled with NaN.

    Args:
        df: DataFrame to extend. It is modified in place.
        column_name: Name of the (possibly nested) source column.

    Returns:
        The same ``df`` object, with the ``<column_name>_qty`` column added.
    """
    qty_column = f'{column_name}_qty'

    # Source column absent: still create the target so downstream code can
    # rely on it being present.
    if column_name not in df.columns:
        df[qty_column] = np.nan
        return df

    df[qty_column] = df[column_name].apply(
        lambda cell: cell.get('qty', np.nan) if isinstance(cell, dict) else cell
    )
    return df

# Extract the qty from all relevant columns
columnns_to_extract = ['activeEnergyBurned', 'distance', 'lapLength', 'intensity', 'humidity', 'temperature']
for column_name in columnns_to_extract:
    wdf = extract_qty_column(wdf, column_name)
# Drop the original nested columns. extract_qty_column tolerates a source
# column being absent (it fills the *_qty column with NaN), so the drop must
# tolerate it too: errors='ignore' skips labels that are not present instead
# of raising KeyError.
wdf.drop(columns=columnns_to_extract, inplace=True, errors='ignore')

# Rename 'qty' for clarity
mdf.rename(columns={'qty': 'value'}, inplace=True)


### Convert Dates to DateTime

In [None]:
# Parse the timestamp columns with a fixed format that includes a UTC offset
# (e.g. "2024-01-31 07:15:00 -0500")
for _frame, _column in ((wdf, 'start'), (wdf, 'end'), (mdf, 'date')):
    _frame[_column] = pd.to_datetime(_frame[_column], format='%Y-%m-%d %H:%M:%S %z')

# Report how many values are missing in each workouts column
print(wdf.isnull().sum())

## Validate and Save Cleaned Data

In [None]:
# Eyeball the first rows of the cleaned workouts
print(wdf.head())

# Write both cleaned tables to CSV, leaving out the index column
for _frame, _path in ((wdf, 'cleaned_workouts.csv'), (mdf, 'cleaned_metrics.csv')):
    _frame.to_csv(_path, index=False)


## Data Analysis

In [None]:
# Bucket the metric rows by metric name for visualization/analysis
grouped = mdf.groupby(by='metric')

# Example: average of the 'value' column within each metric
mean_values = grouped['value'].agg('mean')
print(mean_values)
