In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import seaborn as sns
import unittest
from unittest import TestCase

# For testing in an ipynb:
# import importlib
# importlib.reload(FitBit)
# Run the second line in a cell every time you make changes to FitBit.py

In [13]:
class FitBit:
    """
    Container for the data exported from a FitBit account.

    On construction every known data type is globbed from the
    ``Global Export Data`` sub-folder of ``file_path`` and stored on the
    instance as one concatenated DataFrame per data type.
    """

    # Sub-folder of the export root that holds the per-day export files.
    EXPORT_SUBDIR = "Global Export Data"

    # Attribute name -> file name pattern inside EXPORT_SUBDIR.
    # NOTE(review): respiration_rate shares the distance pattern and
    # sleep_stage shares the sleep pattern; these look like placeholders --
    # confirm the real export file names before relying on those attributes.
    DATA_PATTERNS = {
        "sleep": "sleep-*.json",
        "energy": "calories-*.json",
        "steps": "steps-*.json",
        "distance": "distance-*.json",
        "oxygen": "estimated_oxygen_variation-*.json",
        "resting_heart_rate": "resting_heart_rate-*.json",
        "heart_rate": "heart_rate-*.json",
        "respiration_rate": "distance-*.json",
        "sleep_stage": "sleep-*.json",
        "floors_climbed": "altitude-*.json",
    }

    def __init__(self, file_path):
        """
        Initialize FitBit object and load all data.

        Parameters
        ----------
        file_path : str
            The path to the folder containing all data collected from FitBit
            (the folder that holds the ``Global Export Data`` directory).

        Returns
        -------
        FitBit
            An instance of the FitBit class with various DataFrames loaded.
        """

        # Root folder every pattern below is resolved against
        self.file_path = file_path

        # Each attribute holds a DataFrame with the respective type of data
        # (an empty DataFrame when no file matched its pattern).
        for attribute, pattern in self.DATA_PATTERNS.items():
            relative_pattern = os.path.join(self.EXPORT_SUBDIR, pattern)
            setattr(self, attribute, self.load_and_concat(relative_pattern))

    @staticmethod
    def load(file_path):
        """
        Load data from a specified file path.

        Callable both as ``FitBit.load(path)`` and ``instance.load(path)``;
        it never modifies the object's own ``file_path``.

        Parameters
        ----------
        file_path : str
            The path to the data file.

        Returns
        -------
        pd.DataFrame or None
            The loaded data as a pandas DataFrame, or None when the path does
            not exist or the file format is not supported.
        """

        # Check if the file exists
        if not os.path.exists(file_path):
            print(f"The path {file_path} does not exist.")
            return None

        # splitext only looks at the last extension, so names that contain
        # extra dots (e.g. "export.v2.csv") are handled correctly.
        file_format = os.path.splitext(file_path)[1].lstrip('.').lower()

        # Load data depending on file format
        if file_format == 'csv':
            print(f"CSV data loaded from {file_path}")
            return pd.read_csv(file_path)
        if file_format == 'json':
            print(f"JSON data loaded from {file_path}")
            return pd.read_json(file_path)

        print(f"Unsupported file format: {file_format}")
        return None

    def load_and_concat(self, pattern):
        """
        Load and concatenate multiple files that match the given file name pattern.

        Parameters
        ----------
        pattern : str
            The file name pattern to search for, relative to ``self.file_path``.
            An absolute pattern is used as-is because ``os.path.join`` discards
            the leading components in that case.

        Returns
        -------
        pd.DataFrame
            A DataFrame consisting of concatenated data from all matched files.
            Returns an empty DataFrame if no data was found.
        """

        # glob returns matches in arbitrary order; sorting keeps the row order
        # of the concatenated frame deterministic.
        matched_paths = sorted(glob.glob(os.path.join(self.file_path, pattern)))

        # Load every match, skipping files that could not be read
        loaded = (self.load(path) for path in matched_paths)
        data_frames = [df for df in loaded if df is not None]

        # Concatenate all DataFrames if the list is not empty
        if data_frames:
            return pd.concat(data_frames, ignore_index=True)

        # Return an empty DataFrame if no data was found
        return pd.DataFrame()

In [14]:
#####
# Unit Test
#####

# Folder that holds the "Global Export Data" directory. Set the
# FITBIT_DATA_DIR environment variable to run this cell on another machine;
# the original author's local path is kept as the fallback.
file_path = os.environ.get('FITBIT_DATA_DIR', '/Users/harrisonkane/Desktop/BME/fitbit_dbdp/data')
fitbit = FitBit(file_path)

# Every data attribute should be a DataFrame (possibly empty)
assert isinstance(fitbit.heart_rate, pd.DataFrame)

# Show only the first rows instead of dumping the whole frame
print(fitbit.heart_rate.head())

JSON data loaded from /Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/sleep-2023-09-19.json
JSON data loaded from /Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/calories-2023-09-19.json
JSON data loaded from /Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/steps-2023-09-19.json
JSON data loaded from /Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/distance-2023-09-19.json
JSON data loaded from /Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/resting_heart_rate-2023-09-19.json
JSON data loaded from /Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-25.json
JSON data loaded from /Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-24.json
JSON data loaded from /Users/harrisonkane/Desktop/BME/fitbit_dbdp/data/Global Export Data/heart_rate-2023-09-23.json
JSON data loaded from /Users/harrisonkane/Desktop/BME/fitbit_dbdp/data

In [None]:
class Preprocess:
    """
    Preprocessing helpers (imputation, sampling, time conversion) bound to
    one DataFrame and one column of interest.
    """

    def __init__(self, data, column):
        """
        Args:
            data (pd.DataFrame): Data to preprocess. It is stored by
                reference, so in-place steps also change the caller's frame.
            column (str): Name of the column that imputeData works on.
        """
        self.data = data
        self.column = column

    def imputeData(self, method='mean'):
        """
        Impute missing values in ``self.column`` of the data (in place).
        Args:
            method (str): 'mean', 'median' or 'zero'.
        Returns:
            pd.DataFrame: ``self.data`` with the column's missing values imputed.
        Raises:
            ValueError: If ``method`` is not one of the supported strategies.
        """
        column_values = self.data[self.column]

        if method == 'mean':
            fill_value = column_values.mean()
        elif method == 'median':
            fill_value = column_values.median()
        elif method == 'zero':
            fill_value = 0
        else:
            # An unknown strategy used to be ignored silently, leaving NaNs behind
            raise ValueError(
                f"Unknown imputation method '{method}'; expected 'mean', 'median' or 'zero'."
            )

        self.data[self.column] = column_values.fillna(fill_value)
        return self.data

    def sampleData(self, downsample=True, sample_rate=0):
        """
        Sample the data, optionally downsampling it.
        Args:
            downsample (bool): If True, draw a random fraction of the rows;
                otherwise resample on a '<sample_rate>D' grid, interpolate
                linearly and map the result back onto the original index.
            sample_rate (float | int): Fraction of rows to keep when
                downsampling (the default 0 yields an empty frame), or the
                number of days used as resampling rule otherwise.
        Returns:
            pd.DataFrame: Sampled data. ``self.data`` itself is not modified.
        """
        if downsample:
            # Fixed random_state keeps the sample reproducible between runs
            return self.data.sample(frac=sample_rate, random_state=1)

        # Work on a copy so the caller's index is not converted behind its back
        frame = self.data.copy()
        if not isinstance(frame.index, pd.DatetimeIndex):
            frame.index = pd.to_datetime(frame.index)
        original_index = frame.index

        resampled = frame.resample(rule=str(sample_rate) + 'D').asfreq()
        resampled = resampled.interpolate(method='linear')
        return resampled.reindex(original_index, method='nearest')

    def covertTime(self, time_col, timezone='UTC'):
        """
        Convert a specific column in the data to a timezone-aware time format.
        Args:
            time_col (str): Name of the time column to convert.
            timezone (str): Target timezone (default 'UTC').
        Returns:
            pd.DataFrame: ``self.data`` with the time column converted.
        Raises:
            ValueError: If ``time_col`` is not a column of the data.
        """
        # FOR FUTURE: with standard format, shouldn't need to ask for time_col
        # As all DataFrames will have same name for time column
        # Also won't need to convert to datetime because will already be converted

        # Ensure the specified column exists in the DataFrame
        if time_col not in self.data.columns:
            raise ValueError(f"Column '{time_col}' not found in the DataFrame.")

        # Convert column to DateTime format; unparsable entries become NaT
        timestamps = pd.to_datetime(self.data[time_col], errors='coerce')

        # tz_localize raises on values that already carry a timezone (e.g. when
        # this method is called twice), so convert those instead.
        if timestamps.dt.tz is None:
            timestamps = timestamps.dt.tz_localize(timezone)
        else:
            timestamps = timestamps.dt.tz_convert(timezone)

        self.data[time_col] = timestamps
        return self.data

    # Correctly spelled alias; the original name is kept for existing callers.
    convertTime = covertTime


In [None]:
#####
# Unit Test
#####

data = pd.DataFrame({'A': [1, 2, None, 4, 5], 'B': [5, 4, 3, None, 1]})

# imputeData fills the frame it was given, so every strategy gets its own copy;
# otherwise only the first strategy would ever see a missing value.
for method in ('mean', 'median', 'zero'):
    imputed = Preprocess(data.copy(), 'A').imputeData(method)
    # Only column 'A' is imputed; column 'B' keeps its missing value
    assert imputed['A'].isnull().sum() == 0, f"'{method}' left NaNs in column A"

preprocessor = Preprocess(data, 'A')

# Test sampleData method for downsampling
downsampled_data = preprocessor.sampleData(downsample=True, sample_rate=0.5)
assert downsampled_data.shape[0] < data.shape[0]

# Test sampleData method for upsampling: the result is reindexed onto the
# original index, so the number of rows must stay the same.
upsampled_data = preprocessor.sampleData(downsample=False, sample_rate=2)
assert upsampled_data.shape[0] == data.shape[0]
upsampled_data.head()


In [None]:
class EDA:
    """
    Exploratory data analysis helpers for one already-preprocessed,
    array-like collection of numeric values.
    """

    def __init__(self, data):
        """
        Args:
            data (array-like): Numeric values to analyse. Assumed to be
                already processed.
        """
        ## Assume it is already processed
        self.data = data
        # Summary statistics; filled in by describeData()
        self.interval = None
        self.mean = None
        self.median = None
        self.std = None

    def describeData(self):
        """
        Compute mean, median, interval (max - min) and standard deviation of
        the data, store them on the instance and print them.
        """
        self.mean = np.mean(self.data)
        self.median = np.median(self.data)
        self.interval = np.max(self.data) - np.min(self.data)
        self.std = np.std(self.data)
        print(f"Mean: {self.mean:.5f}")
        print(f"Median: {self.median:.5f}")
        print(f"Interval: {self.interval:.5f}")
        print(f"Standard Deviation: {self.std:.5f}")

    def dataDistribution(self):
        """
        Plot a 30-bin histogram of the data.
        """
        fig, ax = plt.subplots(figsize=(8, 6))
        # histplot replaces the deprecated distplot; with kde=False it draws
        # the same plain count histogram.
        sns.histplot(self.data, bins=30, kde=False, color='blue', ax=ax)
        ax.set_title('Data Distribution Plot')
        ax.set_xlabel('Values')
        ax.set_ylabel('Frequency')
        plt.show()

    def detectOutliers(self):
        """
        Find outliers with the 1.5 * IQR rule and print them.

        Returns:
            list: Values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
        """
        values = np.asarray(self.data).ravel()
        # Both quartiles in a single pass instead of four percentile calls
        q1, q3 = np.percentile(values, [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = values[(values < lower_bound) | (values > upper_bound)].tolist()
        print("Outliers:", outliers)
        return outliers

    def correlationAnalysis(self, other_data):
        """
        Placeholder for correlation analysis against ``other_data``.

        TODO: not implemented yet; currently does nothing and returns None.
        """
        # Add code to perform correlation analysis
        pass
    

In [None]:
#####
# Unit Test
#####

# Seeded generator so the printed statistics, the plot and the outlier list
# are identical on every run of the notebook.
rng = np.random.default_rng(42)
data = rng.normal(0, 1, 1000)

eda = EDA(data)
eda.describeData()
eda.dataDistribution()
eda.detectOutliers()