In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from scipy.stats import skew

# --- 1. Define Deterministic Data to Match Requirements ---

# Hand-picked toy dataset: one column per kind of feature the later
# encoding/scaling cells need (symmetric numeric, skewed numeric, ordinal
# categorical, nominal categorical). Fifteen rows, fully deterministic.
data = {
    # Age: symmetric values centred on 40, standing in for a Gaussian-like feature
    "Age": [38, 42, 39, 41, 40, 37, 43, 36, 44, 40, 38, 42, 39, 41, 40],

    # Income_k: strongly right-skewed -- most rows sit in the 40-70k range,
    # while two outliers (150k and 500k) form the long right tail
    "Income_k": [45, 50, 55, 60, 48, 52, 150, 65, 50, 42, 58, 47, 500, 70, 40],

    # Education: ordinal (the levels have a natural order)
    "Education": [
        "PhD", "Master's", "Bachelor's", "High School", "Master's",
        "PhD", "High School", "Bachelor's", "PhD", "Master's",
        "Bachelor's", "PhD", "High School", "Master's", "Bachelor's",
    ],

    # Region: nominal (no natural order); the four regions repeat in a fixed
    # cycle, truncated to the 15 rows of the other columns
    "Region": (["North", "East", "West", "South"] * 4)[:15],
}
df = pd.DataFrame(data)

print("Original Data Head:", df.head(), sep="\n")

print("\n--- Initial Data Properties ---")
# Report the sample skewness of both numeric columns with a short reading aid.
skew_notes = {"Age": "Close to 0 is Gaussian", "Income_k": "High positive skew"}
for column, note in skew_notes.items():
    print(f"{column} Skew: {skew(df[column]):.2f} ({note})")
print(50 * "-")

Original Data Head:
   Age  Income_k    Education Region
0   38        45          PhD  North
1   42        50     Master's   East
2   39        55   Bachelor's   West
3   41        60  High School  South
4   40        48     Master's  North

--- Initial Data Properties ---
Age Skew: 0.00 (Close to 0 is Gaussian)
Income_k Skew: 3.20 (High positive skew)
--------------------------------------------------


In [5]:
#------------------------------------------------Encoding------------------------------------------------------
# Label (ordinal) encoding: assigns a unique integer to each category of ordinal data.
# The mapping must reflect the order: High School < Bachelor's < Master's < PhD
education_order = {'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3}

# Series.map() turns every value that is missing from the mapping into NaN
# (and silently upcasts the column to float), so verify that all categories
# present in the data are covered before encoding. Genuinely missing values
# are ignored by the check and still pass through as NaN.
unmapped_levels = set(df['Education'].dropna().unique()) - set(education_order)
if unmapped_levels:
    raise ValueError(
        f"Education values missing from education_order: {sorted(unmapped_levels)}"
    )

df['Education_LE'] = df['Education'].map(education_order)

print("Label Encoding (Ordinal) Result:")
print(df[['Education', 'Education_LE']].head())

Label Encoding (Ordinal) Result:
     Education  Education_LE
0          PhD             3
1     Master's             2
2   Bachelor's             1
3  High School             0
4     Master's             2


In [7]:
# One-hot encoding: creates one new numeric indicator column per category of nominal data.
#
# The cell replaces 'Region' in `df`, so it is guarded to stay re-runnable:
# once 'Region' has been encoded there is nothing left to do, and an unguarded
# df.drop('Region') would raise KeyError on the second execution.
if 'Region' in df.columns:
    # drop='first' omits the indicator of the first (alphabetically sorted)
    # category -- 'East' for this data -- so a row of all zeros encodes it and
    # the remaining indicator columns are not perfectly collinear.
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    region_encoded_array = ohe.fit_transform(df[['Region']])

    # Let the fitted encoder name its own output columns ('Region_<category>')
    # instead of slicing categories_ by hand, which is only right for drop='first'.
    region_cols = list(ohe.get_feature_names_out(['Region']))

    # Reuse df's index so the column-wise concat aligns row-for-row even when
    # df does not carry a default RangeIndex.
    df_region_encoded = pd.DataFrame(
        region_encoded_array, columns=region_cols, index=df.index
    )
    df = pd.concat([df.drop(columns='Region'), df_region_encoded], axis=1)

print("\nOne-Hot Encoding (Nominal) Result:")
print(df.filter(like='Region_').head())


One-Hot Encoding (Nominal) Result:
   Region_North  Region_South  Region_West
0           1.0           0.0          0.0
1           0.0           0.0          0.0
2           0.0           0.0          1.0
3           0.0           1.0          0.0
4           1.0           0.0          0.0


In [None]:
#-----------------------------------------Feature Scaling---------------------------------------------------
