In [2]:
import pandas as pd
import glob

In [7]:
# Step 1: Load All Files into Python
# Collect every heart-rate CSV export in the directory (adjust the path accordingly).
# glob.glob() returns its matches in an arbitrary, filesystem-dependent order,
# so the list is sorted: the load order then stays the same from run to run,
# which matters because the later de-duplication keeps the *first* row seen
# for each timestamp.
file_paths = sorted(glob.glob("/Users/jenniferposada/Desktop/733/733 project/fitbit_data/heart_rate/heart_rate_*.csv"))

# Fail fast with a clear message when the pattern matched nothing
if not file_paths:
    raise ValueError("No files found in the specified directory.")

# Read each CSV into its own DataFrame; the list follows the sorted path order
dataframes = [pd.read_csv(file) for file in file_paths]


In [8]:
# Step 2: Concatenate All DataFrames
# Stack the per-file frames row-wise into a single table. ignore_index=True
# discards each file's own row labels and numbers the combined rows afresh.
combined_df = pd.concat(objs=dataframes, axis=0, ignore_index=True)

In [9]:
# Step 3: Handle Overlapping Data
# Keep only the first row seen for each timestamp so that readings repeated
# across files appear once. The comparison is made on the 'timestamp' values
# exactly as loaded -- the datetime conversion happens in a later step.
# The result is assigned back instead of using inplace=True, so the cell
# states its input and output explicitly; row labels of the surviving rows
# are left as they were (no re-indexing is done here).
combined_df = combined_df.drop_duplicates(subset='timestamp', keep='first')

In [10]:
# Step 4: Convert Timestamp to Datetime and Add Month Column
# 'timestamp' is assumed to be the name of the time column.
# Parse the column into pandas datetimes; with errors='coerce' any value that
# cannot be parsed becomes NaT instead of raising an exception.
combined_df['timestamp'] = combined_df['timestamp'].pipe(pd.to_datetime, errors='coerce')

In [11]:
# Derive a 'YYYY-MM' label from each timestamp and store it in a new 'month'
# column. The labels are plain strings produced by strftime, so they carry the
# year and month only -- no day, time, or timezone component.
combined_df['month'] = combined_df['timestamp'].dt.strftime(date_format='%Y-%m')


In [14]:
# Step 5: Sanity-check the result
# Show the column index, then the first five rows of the three columns of
# interest (including the newly added 'month' column).
print("Column Names:", combined_df.columns)
print(
    combined_df.loc[:, ['timestamp', 'beats per minute', 'month']].head(5)
)

Column Names: Index(['timestamp', 'beats per minute', 'month'], dtype='object')
                  timestamp  beats per minute    month
0 2024-06-27 00:00:03+00:00             100.0  2024-06
1 2024-06-27 00:00:08+00:00             101.0  2024-06
2 2024-06-27 00:00:23+00:00             102.0  2024-06
3 2024-06-27 00:00:33+00:00             103.0  2024-06
4 2024-06-27 00:00:38+00:00             104.0  2024-06


In [16]:
# Step 6: Save the Data with Month Column
# Write the combined frame (now including 'month') to a new CSV file.
# index=False keeps the DataFrame's row labels out of the file.
combined_df.to_csv(
    path_or_buf="/Users/jenniferposada/Desktop/733/733 project/fitbit_data/heart_rate/combined_heart_rate_with_month.csv",
    index=False,
)

# Confirmation message once the write has returned
print("Timestamp conversion complete. Saved to 'combined_heart_rate_with_month.csv'")


Timestamp conversion complete. Saved to 'combined_heart_rate_with_month.csv'


In [17]:
# Report the frame's dimensions as a (rows, columns) tuple
print("Number of Rows and Columns:", combined_df.shape)

# Report the dtype of every column: a heading line, then the dtypes listing
print("Data Types:", combined_df.dtypes, sep="\n")


Number of Rows and Columns: (3531579, 3)
Data Types:
timestamp           datetime64[ns, UTC]
beats per minute                float64
month                            object
dtype: object


In [19]:
# Summary of the heart-rate column: min, max, mean, median and the
# interquartile range (75th percentile minus 25th percentile).
# The IQR is computed by an anonymous function, which agg() labels
# '<lambda>'; that row label is renamed to 'IQR' afterwards.
summary_stats = (
    combined_df[['beats per minute']]
    .agg(['min', 'max', 'mean', 'median', lambda col: col.quantile(0.75) - col.quantile(0.25)])
    .rename(index={'<lambda>': 'IQR'})
)
print("Summary Statistics:\n", summary_stats)


Summary Statistics:
         beats per minute
min            45.000000
max           211.000000
mean           92.868203
median         93.000000
IQR            23.000000


In [20]:
# Locate the row holding the highest heart-rate reading: idxmax() gives the
# row label of the maximum, and .loc pulls that full row out of the frame.
max_value_row = combined_df.loc[combined_df['beats per minute'].idxmax(), :]

# Show when the peak occurred and what the reading was
print(
    "Max Heart Rate Value:\n",
    max_value_row[['timestamp', 'beats per minute']],
)


Max Heart Rate Value:
 timestamp           2024-03-25 04:53:58+00:00
beats per minute                        211.0
Name: 3184602, dtype: object
