In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


In [None]:
# Create the synthetic dataset: 20 rows with Age, Income, Gender, Education
# and a binary 'Purchased' target (0/1).
# The literals below contain missing entries (np.nan in Age, Income and Education)
# and extreme values (Age 200 and 180, Income 1000000) that the later
# preprocessing cells have to deal with.
data = {
    'Age': [25, 30, np.nan, 45, 50, 22, 35, 40, 200, 28, 33, np.nan, 55, 60, 180, 70, 42, 38, 29, 31],
    'Income': [50000, 60000, 70000, np.nan, 90000, 40000, 55000, 65000, 1000000, 52000, 61000, 72000, np.nan, 85000, 30000, 95000, 58000, 63000, 54000, 62000],
    'Gender': ['Male', 'Female', 'Others', 'Female', 'Male', 'Others', 'Male', 'Female', 'Male', 'Female', 'Others', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'Education': ['Bachelor', np.nan, 'High School', 'Bachelor', 'Master', 'High School', 'Bachelor', 'Master', 'High School', 'Bachelor', 'Master', 'High School', 'Bachelor', 'Master', 'High School', 'Bachelor', 'Master', 'High School', 'Bachelor', 'Master'],
    'Purchased': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0]
}

# One DataFrame column per dictionary key; every later cell works on `df`.
df = pd.DataFrame(data)


In [None]:
# Preview the first rows to confirm the frame was built as expected.
df.head()

Exploratory Data Analysis (EDA)

In [None]:
df.info()

# Gender and Education hold string values, so they must be encoded as numbers before modelling.

In [None]:
df.describe().T

# The maximum Age is 200, which is not a plausible age, so the column contains outliers.
# Age and Income are also on very different scales (compare their means),
# so feature scaling is needed as well.

In [None]:
# (number of rows, number of columns) of the raw dataset
df.shape

In [None]:
df.isnull().sum()
# Age, Income and Education contain null values, so missing-value handling is required.

In [None]:
df['Purchased'].value_counts()

# The target is imbalanced (16 zeros vs 4 ones), so imbalance treatment is needed before modelling.

In [None]:
# Side-by-side box plots of the raw numeric columns, to expose outliers visually.
# `plt` and `ticker` come from the import cell at the top of the notebook.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

for ax, column in zip(axes, ['Age', 'Income']):
    # Missing entries are dropped for plotting only; df itself is not modified.
    ax.boxplot(df[column].dropna())
    ax.set_title(f'{column} Boxplot')
    ax.set_ylabel(column)
    ax.set_xticks([1])
    ax.set_xticklabels([column])

# Force plain number formatting (no scientific notation) on the Income axis.
plain_formatter = ticker.ScalarFormatter()
plain_formatter.set_scientific(False)
axes[1].yaxis.set_major_formatter(plain_formatter)

# tight_layout() adjusts the spacing between subplots so nothing overlaps.
fig.tight_layout()
plt.show()


In [None]:
# Data preprocessing

In [None]:
# Handling Missing Values

In [None]:
# Impute missing values: the median for the numeric columns and the most
# frequent category for Education.
for numeric_col in ('Age', 'Income'):
    col_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(col_median)

most_common_education = df['Education'].mode()[0]
df['Education'] = df['Education'].fillna(most_common_education)

print("Dataset after Handling Missing Values:")
display(df)

In [None]:
# Encoding Categorical Features

In [None]:
# One-hot encoding for Gender (drop_first=True drops the first dummy column).
# Guarded so that re-running this cell after Gender has already been replaced
# by its dummy columns is a no-op instead of a KeyError.
if 'Gender' in df.columns:
    df = pd.get_dummies(df, columns=['Gender'], prefix='Gen', drop_first=True, dtype=int)

# Ordinal encoding for Education: each level is replaced by an integer code.
education_map = {
    'High School': 0,
    'Bachelor': 1,
    'Master': 2
}

# Apply the mapping only while the column still holds the raw labels.
# Series.map yields NaN for every value that is not a key of the mapping, so
# mapping an already-encoded column would silently turn it into all NaN.
if not pd.api.types.is_numeric_dtype(df['Education']):
    education_codes = df['Education'].map(education_map)
    # Labels that were present but have no code would otherwise become NaN unnoticed.
    unknown_labels = df.loc[education_codes.isna() & df['Education'].notna(), 'Education'].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unmapped Education values: {list(unknown_labels)}")
    df['Education'] = education_codes

print("Dataset after Encoding:")
display(df)

In [None]:
# Outlier handling

In [None]:
def cap_outliers(series, factor=1.5):
    """Cap values that lie outside the IQR fences of ``series``.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to cap. The input is not modified.
    factor : float, default 1.5
        Multiple of the IQR used to place the fences. Must be non-negative.

    Returns
    -------
    pandas.Series
        A new Series in which values below the lower fence are raised to it
        and values above the upper fence are lowered to it.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the fences could cross).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    # Series.clip keeps the index and returns a new object.
    return series.clip(lower=q1 - factor * iqr, upper=q3 + factor * iqr)

# Cap the extreme values of both numeric columns, one column at a time
for capped_col in ['Age', 'Income']:
    df[capped_col] = cap_outliers(df[capped_col])

In [None]:
# Box plots again, after outlier capping

In [None]:
# Side-by-side box plots of Age and Income as they stand at this point of the notebook.
# `plt` and `ticker` come from the import cell at the top of the notebook.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

for ax, column in zip(axes, ['Age', 'Income']):
    # dropna() only affects what is plotted; df itself is not modified.
    ax.boxplot(df[column].dropna())
    ax.set_title(f'{column} Boxplot')
    ax.set_ylabel(column)
    ax.set_xticks([1])
    ax.set_xticklabels([column])

# Force plain number formatting (no scientific notation) on the Income axis.
plain_formatter = ticker.ScalarFormatter()
plain_formatter.set_scientific(False)
axes[1].yaxis.set_major_formatter(plain_formatter)

# tight_layout() adjusts the spacing between subplots so nothing overlaps.
fig.tight_layout()
plt.show()


# Splitting Data into train and test

In [None]:
# Separate the feature columns from the target column.
X = df.drop('Purchased', axis=1)
y = df['Purchased']

# Hold out 20% of the rows for testing.
# stratify=y keeps the proportion of each Purchased class the same in the
# train and test splits; without it the number of positive rows that land in
# the training split depends on the random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [None]:
# Feature Scaling

In [None]:
# Standardise the two continuous columns. The scaler is fitted on the training
# rows only, and the same fitted scaler is then applied to the test rows.
continuous_cols = ['Age', 'Income']
scaler = StandardScaler()
scaler.fit(X_train[continuous_cols])
X_train[continuous_cols] = scaler.transform(X_train[continuous_cols])
X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])

In [None]:
# Imbalance Treatment

In [None]:
# Oversample the training split with SMOTE (2 nearest neighbours, fixed seed).
# Only the training data is resampled; X_test / y_test are left untouched.
smote = SMOTE(k_neighbors=2, random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_bal, y_train_bal = resampled

In [None]:
# Model Building