# EDA

In [5]:
# 1.Data Exploration and Preprocessing
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler

df = pd.read_csv("C:\\Users\\anwes\\OneDrive\\Desktop\\assignment\\EDA2\\EDA2\\adult_with_headers.csv") 
print(df.head())
print(df.info()) 
print(df.describe(include="all"))
print("Missing values:\n", df.isna().sum())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [8]:

num_cols = df.select_dtypes(include=['int64','float64']).columns 
cat_cols = df.select_dtypes(include=['object']).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median()) 
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

scaler_std = StandardScaler() 
df_standard_scaled = df.copy()
df_standard_scaled[num_cols] = scaler_std.fit_transform(df[num_cols])

scaler_mm = MinMaxScaler() 
df_minmax_scaled = df.copy()
df_minmax_scaled[num_cols] = scaler_mm.fit_transform(df[num_cols])

print(df_standard_scaled.head()) 
print(df_minmax_scaled.head())

        age          workclass    fnlwgt   education  education_num  \
0  0.030671          State-gov -1.063611   Bachelors       1.134739   
1  0.837109   Self-emp-not-inc -1.008707   Bachelors       1.134739   
2 -0.042642            Private  0.245079     HS-grad      -0.420060   
3  1.057047            Private  0.425801        11th      -1.197459   
4 -0.775768            Private  1.408176   Bachelors       1.134739   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0      0.148453      -0.21

In [13]:
# Encoding Techniques

cat_cols = df.select_dtypes(include="object").columns 
df_encoded = df.copy()

for col in cat_cols:
    unique_count = df[col].nunique()

if unique_count < 5:
    dummies = pd.get_dummies(df_encoded[col], prefix=col)
    df_encoded = pd.concat([df_encoded.drop(col, axis=1), dummies], axis=1) 
else:
    df_encoded[col] = df_encoded[col].astype("category").cat.codes 
print(df_encoded.head())


   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income_ <=50K  \
0          2174             0              40   United-State

In [14]:
# Feature Engineering

df["capital_net"] = df["capital_gain"] - df["capital_loss"] 
df["hours_per_week_per_age"] = df["hours_per_week"] / (df["age"] + 1e-6)

df["age_group"] = pd.cut(df["age"],
bins=[0, 25, 45, 65, 120],
labels=["young_<=25", "adult_26-45", "mid_46-65", "senior_>65"], include_lowest=True)

df["log_capital_gain"] = np.log1p(df["capital_gain"])

print(df.head())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  \
0          2174             0              40   United-States   <=5

In [18]:
!pip install ppscore


Defaulting to user installation because normal site-packages is not writeable
Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pandas<2.0.0,>=1.0.0
  Downloading pandas-1.5.3-cp39-cp39-win_amd64.whl (10.9 MB)
     ---------------------------------------- 10.9/10.9 MB 2.2 MB/s eta 0:00:00
Building wheels for collected packages: ppscore
  Building wheel for ppscore (setup.py): started
  Building wheel for ppscore (setup.py): finished with status 'done'
  Created wheel for ppscore: filename=ppscore-1.3.0-py2.py3-none-any.whl size=13167 sha256=74da50447e454b39e497fdc121eabdb0cba8022f5575dc4a3b214a71dfd2c657
  Stored in directory: c:\users\anwes\appdata\local\pip\cache\wheels\d8\2d\fc\c1699298a1241684a460b125835f7871ee8e3ab3afea9b5d6f
Successfully built ppscore
Installing collected packages: pandas, ppscore
  Attempting uninstall: pandas
    Found existing installati

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\anwes\\AppData\\Roaming\\Python\\Python39\\site-packages\\~andas.libs\\msvcp140-1a0962f2a91a74c6d7136a768987a591.dll'
Check the permissions.



In [19]:
#Feature Selection (Isolation Forest + PPS)

from sklearn.ensemble import IsolationForest 
import ppscore as pps

df = df.replace("?", np.nan)

num_cols = df.select_dtypes(include=["int64","float64"]).columns.tolist() 
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()

for c in num_cols:
    df[c] = df[c].fillna(df[c].median())

for c in cat_cols:
    df[c] = df[c].fillna(df[c].mode()[0])

X_num = (df[num_cols] - df[num_cols].mean()) / df[num_cols].std(ddof=0)

iso = IsolationForest(n_estimators=200, contamination=0.01, random_state=42) 
iso.fit(X_num)

df["iforest_pred"] = iso.predict(X_num) 
df_no_outliers = df[df["iforest_pred"] == 1]

pps_raw = pps.matrix(df)
pps_matrix = pps_raw.pivot(index="x", columns="y", values="ppscore").fillna(0)

corr_matrix = df[num_cols].corr() 
print(pps_matrix) 
print(corr_matrix)

ModuleNotFoundError: No module named 'ppscore'