# Experiment 2: Data Cleaning and Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Create dataset
# Column values are listed separately; income and date each contain one
# blank entry standing in for a missing value.
ages = [35, 41, 23, 32, 28, 36, 45, 39, 44, 29]
incomes = [
    '70000', '90000', '50000', '60000', '',
    '75000', '100000', '80000', '95000', '55000',
]
# Days 1-10 of January 2020, with day 8 left blank.
dates = ['' if day == 8 else f'2020-01-{day:02d}' for day in range(1, 11)]
marital_statuses = ['married', 'single'] * 5
genders = [
    'female', 'male', 'male', 'female', 'female',
    'male', 'female', 'male', 'male', 'female',
]

data = {
    'age': ages,
    'income': incomes,
    'date': dates,
    'marital_status': marital_statuses,
    'gender': genders,
}
df = pd.DataFrame(data)
# Persist without the index so the CSV holds only the five data columns.
df.to_csv('data.csv', index=False)
df.head()

In [None]:
# Load dataset
df = pd.read_csv('data.csv')
# Print the table first, then the number of missing entries per column.
for report in (df, df.isnull().sum()):
    print(report)

In [None]:
# Handle missing values: income with mean, date with forward fill
# Coerce income to numbers; anything that cannot be parsed becomes NaN.
df['income'] = pd.to_numeric(df['income'], errors='coerce')
# Assign the filled Series back rather than calling fillna(inplace=True) on
# df['income']: that chained in-place call operates on a temporary under
# pandas Copy-on-Write and would leave df unchanged.
df['income'] = df['income'].fillna(df['income'].mean())
# Series.ffill() replaces the deprecated fillna(method='ffill'); each missing
# date takes the value of the row above it.
df['date'] = df['date'].ffill()
df

In [None]:
# Drop rows with missing values (if any)
# Row-wise drop: a row is removed when any of its cells is missing.
df_cleaned = df.dropna(axis=0, how='any')
df_cleaned

In [None]:
# Remove and Insert Columns
# Drop gender, then append age_squared derived from the age column.
df1 = (
    df_cleaned
    .drop(columns='gender')
    .assign(age_squared=lambda frame: frame['age'] ** 2)
)
df1.head()

In [None]:
# Rename income to annual_income
column_renames = {'income': 'annual_income'}
df1 = df1.rename(column_renames, axis='columns')
df1.head()

In [None]:
# Feature Scaling using MinMaxScaler and StandardScaler
# Min-max scale the age column on its own.
age_frame = df1[['age']]
scaler = MinMaxScaler()
df1['age'] = scaler.fit_transform(age_frame)

# Standardize annual_income and age_squared together with one scaler.
standardized_cols = ['annual_income', 'age_squared']
scaler = StandardScaler()
df1[standardized_cols] = scaler.fit_transform(df1[standardized_cols])
df1.head()

In [None]:
# Encoding categorical values
# Fit one LabelEncoder per column and keep all of them. A single shared
# encoder is refit on every column, so only the last column's classes_
# survive and earlier columns can no longer be inverse-transformed.
categorical_cols = ['marital_status', 'gender']
encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in encoders.items():
    df[col] = encoder.fit_transform(df[col])
# Keep `le` bound to the encoder fitted on the last column (gender), which is
# the state the shared-encoder version left behind.
le = encoders[categorical_cols[-1]]
df.head()