# COS60008 Introduction to Data Science
## Assignment 1, 2025, Semester 1
## Student Details:
* Name: Deval Patel
* Student ID: 105698400
* Email: 105698400@student.swin.edu.au
* Submission Date: 4/11/2025
* Tutelab Class: monday (12:30 to 2:30)

# PART-2

# Data Loading

In [1]:
import pandas as pd
import numpy as np

# Step 1: Load the original dataset
df_original = pd.read_csv("exercise_dataset.csv")




## Making a subset of 200 rows and 8 columns

In [2]:
# Step 2: Subset to 200 rows and 8 columns (4 categorical, 4 numerical)
columns_to_keep = [
    'Exercise', 'Calories Burn', 'Actual Weight', 'Age',
    'Gender', 'Weather Conditions', 'Exercise Intensity', 'Duration'
]
df_subset = df_original.sample(n=200, random_state=42).reset_index(drop=True)
df_unclean = df_subset[columns_to_keep].copy()



## Making data Unclean

In [3]:
# Introduce random missing values (5%)
np.random.seed(42)
for col in df_unclean.columns:
    df_unclean.loc[df_unclean.sample(frac=0.05).index, col] = np.nan



In [4]:
# Function to introduce typos and inconsistent formatting
def introduce_typos(val):
    if isinstance(val, str):
        if np.random.rand() < 0.2:
            return val.lower() + " "
        elif np.random.rand() < 0.2:
            return val.replace("e", "3")
    return val



In [5]:
# Apply typos to categorical columns
for col in ['Exercise', 'Gender', 'Weather Conditions']:
    df_unclean[col] = df_unclean[col].apply(introduce_typos)

# Randomly swap some Duration and Intensity values
swap_indices = np.random.choice(df_unclean.index, size=10, replace=False)
df_unclean.loc[swap_indices, ['Duration', 'Exercise Intensity']] = \
    df_unclean.loc[swap_indices, ['Exercise Intensity', 'Duration']].values



In [6]:
# Save the uncleaned version for Step 3
df_unclean.to_csv("uncleaned_exercise_dataset.csv", index=False)

# Optional: Preview
df_unclean.head()

Unnamed: 0,Exercise,Calories Burn,Actual Weight,Age,Gender,Weather Conditions,Exercise Intensity,Duration
0,Ex3rcis3 8,204.141413,82.013483,46.0,Female,Cloudy,6.0,55.0
1,Exercise 4,274.705653,75.547431,46.0,Male,Cloudy,9.0,
2,Exercise 10,379.26028,85.564708,29.0,Male,Rainy,41.0,1.0
3,Exercise 3,154.854311,67.778809,29.0,female,,10.0,35.0
4,Exercise 8,338.049803,94.511105,39.0,female,Sunny,2.0,25.0


## Loading unclean data-set


In [7]:
# Load the uncleaned dataset
df_unclean = pd.read_csv("uncleaned_exercise_dataset.csv")
df_clean = df_unclean.copy()




# step by step  reconstruction

## Step 1: Strip whitespace and standardize case in categorical columns

In [8]:
for col in ['Exercise', 'Gender', 'Weather Conditions']:
    df_clean[col] = df_clean[col].astype(str).str.strip().str.title()

## Step 2: Fix known typo replacements

In [9]:
df_clean['Exercise'] = df_clean['Exercise'].str.replace(r"Ex3rcis3", "Exercise",case=False, regex=False)
df_clean['Gender'] = df_clean['Gender'].replace({"Female    ": "Female", "female": "Female", "male": "Male"})
df_clean['Weather Conditions'] = df_clean['Weather Conditions'].replace({"Rainy": "Rainy", "rainy": "Rainy"})

## Step 3: Handle missing values

In [10]:
# Fill numerical columns with median
num_cols = ['Calories Burn', 'Actual Weight', 'Age', 'Duration', 'Exercise Intensity']
for col in num_cols:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')  # Ensuring that they stay numeric
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Fill categorical columns with mode
cat_cols = ['Exercise', 'Gender', 'Weather Conditions']
for col in cat_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])

## Step 4: Fix swapped values: if Duration is unusually low and Intensity unusually high, swap them

In [11]:
# I'll define a reasonable rule: Duration < 10 and Intensity > 20 is likely swapped
swap_condition = (df_clean['Duration'] < 10) & (df_clean['Exercise Intensity'] > 20)
df_clean.loc[swap_condition, ['Duration', 'Exercise Intensity']] = \
    df_clean.loc[swap_condition, ['Exercise Intensity', 'Duration']].values

## Step 5: Final check for types and formatting

In [12]:
df_clean[num_cols] = df_clean[num_cols].astype(float)
df_clean[cat_cols] = df_clean[cat_cols].astype(str)

## Save the cleaned dataset and preview

In [13]:
df_clean.to_csv("reconstructed_exercise_dataset.csv", index=False)

# Optional: Preview
df_clean.head()

Unnamed: 0,Exercise,Calories Burn,Actual Weight,Age,Gender,Weather Conditions,Exercise Intensity,Duration
0,Exercise 8,204.141413,82.013483,46.0,Female,Cloudy,6.0,55.0
1,Exercise 4,274.705653,75.547431,46.0,Male,Cloudy,9.0,40.0
2,Exercise 10,379.26028,85.564708,29.0,Male,Rainy,1.0,41.0
3,Exercise 3,154.854311,67.778809,29.0,Female,Nan,10.0,35.0
4,Exercise 8,338.049803,94.511105,39.0,Female,Sunny,2.0,25.0
