In [None]:
# Mounting

In [None]:
from google.colab import drive

In [None]:
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/602/3

In [None]:
# Importing Libraries

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
from sklearn.impute import KNNImputer

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.decomposition import PCA

In [None]:
from sklearn.cluster import KMeans

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Read the CSV into the working DataFrame. The name is relative, so it is
# looked up in the current working directory.
file_path = 'apple_watch_data.csv'
df = pd.read_csv(file_path)

In [None]:
# Understanding Data

In [None]:
# Preview of the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Plain-text overview of the raw frame: first rows, schema, summary
# statistics, and per-column null counts.
print("Dataset Head:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

print("\nDescriptive Statistics:")
print(df.describe())

print("\nMissing Values Initially:")
print(df.isnull().sum())

In [None]:
# Cleaning Missing Values

In [None]:
# Map the misspelled column names onto their corrected spellings.
column_fixes = {'hear_rate': 'heart_rate', 'entropy_setps': 'entropy_steps'}
df = df.rename(columns=column_fixes)

In [None]:
# Impute gaps in the three activity metrics jointly with a 5-neighbour
# KNNImputer fitted on those same columns.
knn_imputer = KNNImputer(n_neighbors=5)
knn_columns = ['steps', 'calories', 'distance']
df[knn_columns] = knn_imputer.fit_transform(df[knn_columns])

# Carry the last observed heart rate forward. Series.ffill() is the supported
# spelling of the deprecated fillna(method='ffill') and gives the same result.
# A gap in the very first row has nothing to carry forward and stays NaN.
df['heart_rate'] = df['heart_rate'].ffill()

print("\nMissing Values After Cleaning:")
print(df.isnull().sum())

In [None]:
# Handling Outliers and Invalid Entries

In [None]:
# Values in heart_rate that cannot be parsed as numbers are turned into NaN.
df['heart_rate'] = pd.to_numeric(df['heart_rate'], errors='coerce')

# Rows missing any of the four core measurements are removed from df in place.
required_columns = ['heart_rate', 'steps', 'calories', 'distance']
df.dropna(subset=required_columns, inplace=True)

In [None]:
# Robust-scale the four numeric signals, flag anomalous rows with an
# Isolation Forest, and keep the rows not flagged in df_cleaned.
scaled_columns = ['heart_rate', 'steps', 'calories', 'distance']

scaler = RobustScaler()
df_scaled = scaler.fit_transform(df[scaled_columns])
# Rebuilt with a default RangeIndex; row order still matches df.
df_scaled = pd.DataFrame(df_scaled, columns=scaled_columns)

# random_state pins the forest's random sampling so the same rows are flagged
# (and therefore dropped) on every Restart & Run All.
OUTLIER_SEED = 42
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=OUTLIER_SEED,
)
# fit_predict labels each row; -1 is the outlier label filtered out below.
outliers = iso_forest.fit_predict(df_scaled)

# `outliers` is a positional array, so it lines up with df row by row.
df['outlier'] = outliers
# .copy() makes df_cleaned an independent frame, so later column assignments
# on it do not raise SettingWithCopyWarning.
df_cleaned = df[df['outlier'] != -1].copy()

In [None]:
# Visualizing Data

In [None]:
# 4x2 grid of boxplots: one row per scaled column, with df in the left panel
# and df_cleaned in the right panel.
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(12, 20))
axes = axes.flatten()

# After flattening, even slots are the left-hand panels and odd slots the
# right-hand ones.
for col, left_ax, right_ax in zip(scaled_columns, axes[0::2], axes[1::2]):
    sns.boxplot(x=df[col], ax=left_ax)
    sns.boxplot(x=df_cleaned[col], ax=right_ax)

plt.tight_layout()
plt.show()

In [None]:
# Histogram with a KDE overlay of the non-null heart-rate values in df.
heart_rate_observed = df['heart_rate'].dropna()

plt.figure(figsize=(8, 4))
sns.histplot(heart_rate_observed, kde=True)
plt.show()

In [None]:
# Fallback imputation: when heart_rate still contains NaN, fit a linear model
# on the rows that have a value and predict the missing ones from age, weight
# and steps.
# NOTE(review): heart_rate NaNs were already removed by the earlier
# dropna(subset=[...]) call, so on a top-to-bottom run this branch looks
# unreachable -- confirm whether it is still needed.
hr_missing = df['heart_rate'].isnull()
if hr_missing.any():
    predictors = ['age', 'weight', 'steps']

    # Training data: every row with an observed heart rate.
    train_df = df.dropna(subset=['heart_rate'])
    X_train = train_df[predictors]
    y_train = train_df['heart_rate']

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Rows to fill in.
    missing_values = df[hr_missing]
    X_missing = missing_values[predictors]
    if not X_missing.empty:
        predicted_values = model.predict(X_missing)
        df.loc[hr_missing, 'heart_rate'] = predicted_values

In [None]:
# Heart-rate distribution before the sign correction.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['heart_rate'])
plt.title('Heart rate before sign correction')
plt.show()

# Negative readings are replaced by their magnitude. Series.abs() does this in
# one vectorised call instead of a Python-level lambda per row.
# NOTE(review): df_cleaned was derived from df before this step, so it does
# not receive the correction -- confirm that is intended.
df['heart_rate'] = df['heart_rate'].abs()

# Same plot after the correction; the titles tell the two figures apart.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df['heart_rate'])
plt.title('Heart rate after sign correction')
plt.show()

In [None]:
# Adjust Activity Labels

In [None]:
# Snapshot the labels before any relabelling so the original distribution
# stays available for comparison.
original_activities = df['activity'].copy()

# Fold the 'Self Pace walk' label into 'Walking'.
df['activity'] = df['activity'].replace({'Self Pace walk': 'Walking'})

In [None]:
# Seeded generator used when the caller supplies none, so that re-running the
# notebook from a fresh kernel reassigns the same labels every time.
_ACTIVITY_RNG = np.random.default_rng(42)


def map_activities(activity, rng=None):
    """Reassign the "Stnding" label to a randomly chosen replacement label.

    Parameters
    ----------
    activity : object
        A single activity label.
    rng : optional
        Any object exposing ``choice(sequence)``, e.g. a seeded
        ``numpy.random.Generator``. When omitted, a module-level generator
        with a fixed seed is used, which makes the result reproducible.

    Returns
    -------
    object
        ``activity`` unchanged unless it equals "Stnding"; in that case one
        of "Lying", "Sitting" or "Walking" as a plain ``str``.
    """
    # NOTE(review): the literal is spelled "Stnding"; confirm it matches the
    # raw label in the data, otherwise this branch never fires.
    if activity != "Stnding":
        return activity

    chooser = _ACTIVITY_RNG if rng is None else rng
    # str() turns NumPy's string scalar into a built-in str so the column
    # holds a single label type.
    return str(chooser.choice(["Lying", "Sitting", "Walking"]))

In [None]:
# Label frequencies from the snapshot taken before relabelling.
original_freq = original_activities.value_counts()

# Run every label through map_activities and store the result back on df.
relabelled = df['activity'].apply(map_activities)
df['activity'] = relabelled

# Label frequencies after relabelling.
corrected_freq = relabelled.value_counts()