In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [30]:

class smartwatch:
    """Cleaning and exploratory-analysis helpers for a smartwatch health CSV.

    The CSV is read once in the constructor and kept in ``self.df``. The
    cleaning methods modify that frame and return it; the analysis methods
    return derived tables or strings; the ``plot_*`` methods draw figures.

    Columns referenced by the methods: 'User ID', 'Heart Rate (BPM)',
    'Blood Oxygen Level (%)', 'Step Count', 'Sleep Duration (hours)',
    'Activity Level' and 'Stress Level'.
    """

    def __init__(self, csv_path_file):
        """Load the CSV at ``csv_path_file`` into ``self.df``."""
        self.df = pd.read_csv(csv_path_file)

    def get_data(self):
        """Return the working DataFrame (the same object, not a copy)."""
        return self.df

    def clean_data(self):
        """Drop duplicate rows in place.

        Returns:
            tuple: (per-column null counts, per-column dtypes) after the drop.
        """
        self.df.drop_duplicates(inplace=True)
        return self.df.isnull().sum(), self.df.dtypes

    def id_clean(self):
        """Fill missing 'User ID' values from neighbouring rows and cast to int.

        Returns:
            pandas.DataFrame: the working frame.
        """
        # Assign the result back instead of calling fillna(inplace=True) on the
        # column selection: the chained in-place form is deprecated and does
        # not update the frame when pandas Copy-on-Write is active.
        # bfill() covers a missing value in the very first row, which ffill()
        # cannot fill and which would make astype(int) raise.
        user_ids = self.df['User ID'].ffill().bfill()
        self.df['User ID'] = user_ids.astype(int)
        return self.df

    def _impute_mean(self, column, as_int=False):
        """Coerce ``column`` to numeric and replace missing values by the mean.

        Unparseable entries (for example the string 'ERROR') become NaN and
        are then imputed as well. The column is stored truncated to int when
        ``as_int`` is true, otherwise rounded to 2 decimals.
        """
        values = pd.to_numeric(self.df[column], errors='coerce')
        values = values.fillna(values.mean())
        self.df[column] = values.astype(int) if as_int else values.round(2)

    def fill_nans(self):
        """Impute missing values in the metric columns.

        Numeric columns get their mean; 'Activity Level' (if present) gets its
        most frequent value. The Stress Level label 'Very High' is mapped to 10
        before imputing.

        Returns:
            pandas.DataFrame: the working frame.
        """
        self._impute_mean('Sleep Duration (hours)')

        # Stress Level: 'Very High' counts as 10. Coerce first and patch the
        # label afterwards so no integer has to be written into a text column.
        raw_stress = self.df['Stress Level']
        very_high = raw_stress.isin(['Very High'])
        self.df['Stress Level'] = pd.to_numeric(raw_stress, errors='coerce').mask(very_high, 10)
        self._impute_mean('Stress Level', as_int=True)

        self._impute_mean('Blood Oxygen Level (%)')
        self._impute_mean('Step Count', as_int=True)
        self._impute_mean('Heart Rate (BPM)')

        # Activity Level is categorical text, so use the mode instead of a mean.
        if 'Activity Level' in self.df.columns:
            modes = self.df['Activity Level'].mode()
            if not modes.empty:  # empty when every value is missing
                self.df['Activity Level'] = self.df['Activity Level'].fillna(modes.iloc[0])

        return self.df

    def replace_activity_level(self):
        """Normalize misspelled / underscore variants of 'Activity Level'.

        Returns:
            pandas.DataFrame: the working frame.
        """
        replacement = {
            'Highly_Active': 'Highly Active',
            'Actve': 'Active',
            'Seddentary': 'Sedentary',
            'Moderately_Active': 'Moderately Active'
        }
        self.df['Activity Level'] = self.df['Activity Level'].replace(replacement)
        return self.df

    def data_describe(self):
        """Return ``DataFrame.describe()`` of the working frame."""
        return self.df.describe()

    def top_5_highest_step_count_heart_rate(self):
        """Return 'Step Count' and 'Heart Rate (BPM)' of the 5 rows with most steps."""
        return self.df.sort_values(by='Step Count', ascending=False).head(5)[['Step Count', 'Heart Rate (BPM)']]

    def most_common_stress_level(self):
        """Return a one-row Series: the most frequent Stress Level and its count."""
        return self.df.groupby('Stress Level').size().sort_values(ascending=False).head(1)

    def analyze_metrics_by_stress(self):
        """Return mean heart rate, sleep duration and step count per Stress Level."""
        stress_analysis = self.df.groupby('Stress Level')[[
            'Heart Rate (BPM)',
            'Sleep Duration (hours)',
            'Step Count'
        ]].mean()
        return stress_analysis

    def categorize_sleep_quality(self):
        """Add a 'Sleep Quality' column derived from 'Sleep Duration (hours)'.

        Runs ``fill_nans()`` first so the duration column is numeric. Buckets
        are left-closed: Poor = [0, 6), Good = [6, 8), Excellent = [8, inf).
        Durations below 0 fall outside every bucket and are left as NaN.

        Returns:
            pandas.DataFrame: first rows of User ID, sleep duration and quality.
        """
        self.fill_nans()
        bins = [0, 6, 8, float('inf')]
        labels = ['Poor', 'Good', 'Excellent']
        self.df['Sleep Quality'] = pd.cut(self.df['Sleep Duration (hours)'], bins=bins, labels=labels, right=False)
        return self.df[['User ID', 'Sleep Duration (hours)', 'Sleep Quality']].head()

    def analyze_by_sleep_quality(self):
        """Return mean Stress Level and Step Count per 'Sleep Quality' bucket.

        Creates the 'Sleep Quality' column first when it does not exist yet.
        """
        # Only the column creation is conditional. The aggregation and return
        # sit outside the `if` so the method also returns a result when the
        # column already exists, instead of falling through and returning None.
        if 'Sleep Quality' not in self.df.columns:
            self.categorize_sleep_quality()  # runs fill_nans() itself
        # observed=False keeps all three buckets in the result regardless of
        # the pandas version's default for categorical groupers.
        sleep_analysis = self.df.groupby('Sleep Quality', observed=False)[['Stress Level', 'Step Count']].mean()
        return sleep_analysis

    def _iqr_bounds(self, column):
        """Return the (lower, upper) 1.5 * IQR outlier fences for ``column``."""
        values = self.df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr

    def outliers_for_heart_rate(self):
        """Return the 1.5 * IQR outlier fences for 'Heart Rate (BPM)' as text."""
        lowerbound, higherbound = self._iqr_bounds('Heart Rate (BPM)')
        return f"Lower Bound = {lowerbound:.2f}, Higher Bound = {higherbound:.2f}"

    def outliers_for_Step_Count(self):
        """Return the 1.5 * IQR outlier fences for 'Step Count' as text."""
        lowerbound, higherbound = self._iqr_bounds('Step Count')
        return f"Lower Bound = {lowerbound:.2f}, Higher Bound = {higherbound:.2f}"

    def plot_distributions(self):
        """Draw histograms of heart rate, step count and sleep duration."""
        self.df[['Heart Rate (BPM)', 'Step Count', 'Sleep Duration (hours)']].hist(bins=30, figsize=(15, 5))
        plt.tight_layout()
        plt.show()

    def plot_correlation_heatmap(self):
        """Draw an annotated correlation heatmap of the numeric health metrics."""
        plt.figure(figsize=(10, 8))
        correlation_matrix = self.df[['Heart Rate (BPM)', 'Blood Oxygen Level (%)', 'Step Count', 'Sleep Duration (hours)', 'Stress Level']].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
        plt.title('Correlation Matrix of Health Metrics')
        plt.show()

    def plot_health_by_activity(self):
        """Draw side-by-side box plots of stress level and heart rate per activity level."""
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))
        sns.boxplot(ax=axes[0], x='Activity Level', y='Stress Level', data=self.df)
        axes[0].set_title('Stress Level by Activity Level')
        axes[0].tick_params(axis='x', rotation=45)

        sns.boxplot(ax=axes[1], x='Activity Level', y='Heart Rate (BPM)', data=self.df)
        axes[1].set_title('Heart Rate by Activity Level')
        axes[1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

    def summary(self):
        """Return a dict with the frame's shape, column names and null counts."""
        return {
            "Shape": self.df.shape,
            "Columns": self.df.columns.tolist(),
            "Missing Values": self.df.isnull().sum().to_dict()
        }






In [31]:
# Build the analysis wrapper; the constructor reads the raw CSV into s_w.df.
s_w = smartwatch('unclean_smartwatch_health_data.csv')

In [32]:
# Show the DataFrame as loaded, before any cleaning step has run.
s_w.get_data()


Unnamed: 0,User ID,Heart Rate (BPM),Blood Oxygen Level (%),Step Count,Sleep Duration (hours),Activity Level,Stress Level
0,4174.0,58.939776,98.809650,5450.390578,7.167235622316564,Highly Active,1
1,,,98.532195,727.601610,6.538239375570314,Highly_Active,5
2,1860.0,247.803052,97.052954,2826.521994,ERROR,Highly Active,5
3,2294.0,40.000000,96.894213,13797.338044,7.367789630207228,Actve,3
4,2130.0,61.950165,98.583797,15679.067648,,Highly_Active,6
...,...,...,...,...,...,...,...
9995,1524.0,78.819386,98.931927,2948.491953,7.402748595032027,Active,7
9996,4879.0,48.632659,95.773035,4725.623070,6.3821659358529015,Sedentary,2
9997,2624.0,73.834442,97.945874,2571.492060,6.91654920303435,Sedentary,4
9998,4907.0,,98.401058,3364.788855,5.691233932149209,Active,8


In [33]:
# Drop duplicate rows in place, then report per-column null counts and dtypes.
s_w.clean_data()


(User ID                   201
 Heart Rate (BPM)          400
 Blood Oxygen Level (%)    300
 Step Count                100
 Sleep Duration (hours)    150
 Activity Level            200
 Stress Level              200
 dtype: int64,
 User ID                   float64
 Heart Rate (BPM)          float64
 Blood Oxygen Level (%)    float64
 Step Count                float64
 Sleep Duration (hours)     object
 Activity Level             object
 Stress Level               object
 dtype: object)

In [34]:
# Forward-fill missing User IDs and cast the column to int.
s_w.id_clean()


Unnamed: 0,User ID,Heart Rate (BPM),Blood Oxygen Level (%),Step Count,Sleep Duration (hours),Activity Level,Stress Level
0,4174,58.939776,98.809650,5450.390578,7.167235622316564,Highly Active,1
1,4174,,98.532195,727.601610,6.538239375570314,Highly_Active,5
2,1860,247.803052,97.052954,2826.521994,ERROR,Highly Active,5
3,2294,40.000000,96.894213,13797.338044,7.367789630207228,Actve,3
4,2130,61.950165,98.583797,15679.067648,,Highly_Active,6
...,...,...,...,...,...,...,...
9995,1524,78.819386,98.931927,2948.491953,7.402748595032027,Active,7
9996,4879,48.632659,95.773035,4725.623070,6.3821659358529015,Sedentary,2
9997,2624,73.834442,97.945874,2571.492060,6.91654920303435,Sedentary,4
9998,4907,,98.401058,3364.788855,5.691233932149209,Active,8


In [35]:
# Impute missing values: column mean for the numeric metrics, mode for Activity Level.
s_w.fill_nans()


Unnamed: 0,User ID,Heart Rate (BPM),Blood Oxygen Level (%),Step Count,Sleep Duration (hours),Activity Level,Stress Level
0,4174,58.94,98.81,5450,7.17,Highly Active,1
1,4174,76.04,98.53,727,6.54,Highly_Active,5
2,1860,247.80,97.05,2826,6.51,Highly Active,5
3,2294,40.00,96.89,13797,7.37,Actve,3
4,2130,61.95,98.58,15679,6.51,Highly_Active,6
...,...,...,...,...,...,...,...
9995,1524,78.82,98.93,2948,7.40,Active,7
9996,4879,48.63,95.77,4725,6.38,Sedentary,2
9997,2624,73.83,97.95,2571,6.92,Sedentary,4
9998,4907,76.04,98.40,3364,5.69,Active,8


In [36]:
# Normalize misspelled / underscore variants of the Activity Level labels.
s_w.replace_activity_level()

Unnamed: 0,User ID,Heart Rate (BPM),Blood Oxygen Level (%),Step Count,Sleep Duration (hours),Activity Level,Stress Level
0,4174,58.94,98.81,5450,7.17,Highly Active,1
1,4174,76.04,98.53,727,6.54,Highly Active,5
2,1860,247.80,97.05,2826,6.51,Highly Active,5
3,2294,40.00,96.89,13797,7.37,Active,3
4,2130,61.95,98.58,15679,6.51,Highly Active,6
...,...,...,...,...,...,...,...
9995,1524,78.82,98.93,2948,7.40,Active,7
9996,4879,48.63,95.77,4725,6.38,Sedentary,2
9997,2624,73.83,97.95,2571,6.92,Sedentary,4
9998,4907,76.04,98.40,3364,5.69,Active,8


In [37]:
# Summary statistics (DataFrame.describe) of the cleaned frame.
s_w.data_describe()


Unnamed: 0,User ID,Heart Rate (BPM),Blood Oxygen Level (%),Step Count,Sleep Duration (hours),Stress Level
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,3010.0819,76.035662,97.841559,6985.1797,6.505656,5.477
std,1150.808174,19.020246,1.706693,6851.294392,1.479743,2.85403
min,1001.0,40.0,90.79,0.0,-0.19,1.0
25%,2000.75,65.325,96.7,2037.0,5.54,3.0
50%,3000.0,76.04,97.93,5023.5,6.51,5.0
75%,4006.0,84.79,99.33,9679.5,7.48,8.0
max,4999.0,296.59,100.0,62486.0,12.14,10.0


In [38]:
# Shape, column names and remaining per-column null counts.
s_w.summary()

{'Shape': (10000, 7),
 'Columns': ['User ID',
  'Heart Rate (BPM)',
  'Blood Oxygen Level (%)',
  'Step Count',
  'Sleep Duration (hours)',
  'Activity Level',
  'Stress Level'],
 'Missing Values': {'User ID': 0,
  'Heart Rate (BPM)': 0,
  'Blood Oxygen Level (%)': 0,
  'Step Count': 0,
  'Sleep Duration (hours)': 0,
  'Activity Level': 0,
  'Stress Level': 0}}

In [39]:
# Step Count and Heart Rate for the five rows with the highest Step Count.
s_w.top_5_highest_step_count_heart_rate()


Unnamed: 0,Step Count,Heart Rate (BPM)
4018,62486,85.36
7967,62040,65.92
3641,61299,73.33
540,60278,89.97
1125,57034,69.32


In [40]:
# The Stress Level value that occurs in the most rows, with its row count.
s_w.most_common_stress_level()

Stress Level
5    1145
dtype: int64

In [41]:
# Mean Heart Rate, Sleep Duration and Step Count for each Stress Level.
s_w.analyze_metrics_by_stress()


Unnamed: 0_level_0,Heart Rate (BPM),Sleep Duration (hours),Step Count
Stress Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,76.550661,6.543547,6845.678862
2,75.893952,6.515472,6768.682224
3,75.906935,6.484864,7002.670352
4,76.708913,6.431284,7345.742236
5,75.431939,6.531642,6858.624454
6,76.594785,6.495684,7168.544456
7,76.001272,6.523101,6987.21173
8,75.590611,6.475463,6671.331516
9,75.800072,6.51249,6880.183402
10,75.953799,6.534626,7323.329013


In [42]:
# Mean Stress Level and Step Count per Sleep Quality bucket; the 'Sleep Quality'
# column does not exist yet at this point, so the method creates it first.
s_w.analyze_by_sleep_quality()

Unnamed: 0_level_0,Stress Level,Step Count
Sleep Quality,Unnamed: 1_level_1,Unnamed: 2_level_1
Poor,5.47639,6933.993294
Good,5.487344,7004.055394
Excellent,5.444375,7042.72875


In [43]:
# (Re)compute 'Sleep Quality' (Poor: < 6 h, Good: 6 to < 8 h, Excellent: >= 8 h)
# and preview the first rows.
s_w.categorize_sleep_quality()

Unnamed: 0,User ID,Sleep Duration (hours),Sleep Quality
0,4174,7.17,Good
1,4174,6.54,Good
2,1860,6.51,Good
3,2294,7.37,Good
4,2130,6.51,Good


In [44]:
# 1.5 * IQR outlier fences for Heart Rate (BPM).
s_w.outliers_for_heart_rate()


'Lower Bound = 36.13, Higher Bound = 113.99'

In [45]:
# 1.5 * IQR outlier fences for Step Count.
s_w.outliers_for_Step_Count()

'Lower Bound = -9426.75, Higher Bound = 21143.25'

In [46]:
#s_w.plot_distributions()


In [47]:
#s_w.plot_correlation_heatmap()


In [48]:
#s_w.plot_health_by_activity()

In [49]:
# Show the frame after all cleaning steps, now including 'Sleep Quality'.
s_w.get_data()

Unnamed: 0,User ID,Heart Rate (BPM),Blood Oxygen Level (%),Step Count,Sleep Duration (hours),Activity Level,Stress Level,Sleep Quality
0,4174,58.94,98.81,5450,7.17,Highly Active,1,Good
1,4174,76.04,98.53,727,6.54,Highly Active,5,Good
2,1860,247.80,97.05,2826,6.51,Highly Active,5,Good
3,2294,40.00,96.89,13797,7.37,Active,3,Good
4,2130,61.95,98.58,15679,6.51,Highly Active,6,Good
...,...,...,...,...,...,...,...,...
9995,1524,78.82,98.93,2948,7.40,Active,7,Good
9996,4879,48.63,95.77,4725,6.38,Sedentary,2,Good
9997,2624,73.83,97.95,2571,6.92,Sedentary,4,Good
9998,4907,76.04,98.40,3364,5.69,Active,8,Poor


In [50]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# --- Your PostgreSQL Connection Details ---
# Taken from the environment (standard libpq variable names) so that no
# credentials are stored in the notebook. Non-secret settings fall back to the
# local defaults; the password is prompted for when PGPASSWORD is not set.
db_user = os.environ.get('PGUSER', 'postgres')
db_password = os.environ.get('PGPASSWORD')
db_host = os.environ.get('PGHOST', 'localhost')
db_port = os.environ.get('PGPORT', '5432')
db_name = os.environ.get('PGDATABASE', 'postgres')
table_name = 'Cleaned_smart_watch' # The name for your new table

# --- Load the CSV file ---
# NOTE: Cleaned_smart_watch.csv is written by the export cell at the end of
# this notebook; run that cell first, otherwise a stale or missing file is read.
try:
    df = pd.read_csv('Cleaned_smart_watch.csv')
except FileNotFoundError:
    # Do not call exit() here: inside a notebook it tears down the kernel
    # session instead of just stopping this cell. Skip the upload instead.
    df = None
    print("Error: Cleaned_smart_watch.csv not found.")
else:
    print("CSV file loaded successfully.")

if df is not None:
    # Only ask for the password once there is actually something to upload.
    if db_password is None:
        db_password = getpass(f"PostgreSQL password for '{db_user}': ")

    # --- Create the database connection engine ---
    # The connection string format is: "postgresql://user:password@host:port/database"
    # User and password are URL-escaped so characters such as '@' or '/'
    # cannot corrupt the URL.
    connection_string = (
        f"postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}"
        f"@{db_host}:{db_port}/{db_name}"
    )
    engine = create_engine(connection_string)

    # --- Upload the DataFrame to PostgreSQL ---
    try:
        # This command creates a new table (replacing an existing one of the
        # same name) and inserts all data from the DataFrame
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        print(f"Data successfully uploaded to the '{table_name}' table in PostgreSQL.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Release pooled connections so re-running the cell does not leak them.
        engine.dispose()

CSV file loaded successfully.
Data successfully uploaded to the 'Cleaned_smart_watch' table in PostgreSQL.


In [52]:
# Export the cleaned frame. `index` must be the boolean False: the string
# 'False' is truthy, so pandas still wrote the row index as an extra unnamed
# column, which then reappears as 'Unnamed: 0' when the file is read back.
s_w.get_data().to_csv('Cleaned_smart_watch.csv', index=False)