# Name : Snehal shyam jagtap

## Assignement No 12


### DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

This assignment aims to equip you with practical skills in data preprocessing, feature engineering, and feature selection techniques, which are crucial for building efficient machine learning models. You will work with a provided dataset to apply various techniques such as scaling, encoding, and feature selection methods including isolation forest and PPS score analysis.

### Task 1. Data Exploration and Preprocessing

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

In [2]:
# Load the dataset
df = pd.read_csv('adult_with_headers.csv')

In [3]:

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [5]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [6]:
# Handle missing values
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

In [7]:
# Apply imputation
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

df[numerical_columns] = num_imputer.fit_transform(df[numerical_columns])
df[categorical_columns] = cat_imputer.fit_transform(df[categorical_columns])

In [8]:
# Scaling techniques
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

In [9]:
# Standard scaling (useful for normal distributions)
df_standard_scaled = pd.DataFrame(scaler_standard.fit_transform(df[numerical_columns]), columns=numerical_columns)

In [10]:
# Min-Max scaling (useful for uniform distributions)
df_minmax_scaled = pd.DataFrame(scaler_minmax.fit_transform(df[numerical_columns]), columns=numerical_columns)

### Task 2. Encoding Techniques

In [11]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [12]:
# One-Hot Encoding for categorical variables with less than 5 categories
one_hot_columns = [col for col in categorical_columns if df[col].nunique() < 5]
df_one_hot_encoded = pd.get_dummies(df, columns=one_hot_columns)

In [13]:
# Label Encoding for categorical variables with more than 5 categories
label_columns = [col for col in categorical_columns if df[col].nunique() >= 5]
label_encoder = LabelEncoder()

for col in label_columns:
    df[col] = label_encoder.fit_transform(df[col])

### Task 3. Feature Engineering

In [14]:
import numpy as np

In [15]:
# Example feature 1: Age groups (young, middle-aged, senior)
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 60, 100], labels=['young', 'middle-aged', 'senior'])

In [16]:
# Example feature 2: Work experience (calculated from age and education)
df['work_experience'] = df['age'] - df['education_num']

In [17]:
# Log transformation for skewed features (e.g., capital-gain)
df['capital_gain_log'] = np.log(df['capital_gain'] + 1)

### Task 4. Feature Selection

In [18]:
from sklearn.ensemble import IsolationForest
from ppscore import score

In [19]:
# Isolation Forest for outlier detection
iso_forest = IsolationForest(contamination=0.05)
outliers = iso_forest.fit_predict(df[numerical_columns])



In [20]:
# Remove outliers
df_no_outliers = df[outliers != -1]

In [21]:
import ppscore as pps

# PPS matrix for the entire dataset
pps_matrix = pps.matrix(df_no_outliers)

# Display the PPS matrix
print(pps_matrix)

                    x                 y   ppscore            case  \
0                 age               age  1.000000  predict_itself   
1                 age         workclass  0.000000      regression   
2                 age            fnlwgt  0.000000      regression   
3                 age         education  0.000000      regression   
4                 age     education_num  0.000000      regression   
..                ...               ...       ...             ...   
319  capital_gain_log    native_country  0.000000      regression   
320  capital_gain_log            income  0.225851  classification   
321  capital_gain_log         age_group  0.000000  classification   
322  capital_gain_log   work_experience  0.000000      regression   
323  capital_gain_log  capital_gain_log  1.000000  predict_itself   

     is_valid_score               metric  baseline_score   model_score  \
0              True                 None        0.000000      1.000000   
1              True  me

In [22]:
# Select only numeric columns for correlation matrix
numeric_columns = df_no_outliers.select_dtypes(include=['int64', 'float64']).columns

In [23]:
# Compute the correlation matrix
correlation_matrix = df_no_outliers[numeric_columns].corr()

In [24]:
# Display the correlation matrix
print(correlation_matrix)


                       age    fnlwgt  education_num  capital_gain  \
age               1.000000 -0.080272       0.031950      0.077290   
fnlwgt           -0.080272  1.000000      -0.042765     -0.017191   
education_num     0.031950 -0.042765       1.000000      0.104008   
capital_gain      0.077290 -0.017191       0.104008      1.000000   
capital_loss      0.009107 -0.027231       0.022770     -0.036722   
hours_per_week    0.094227 -0.017516       0.130770      0.066533   
work_experience   0.982877 -0.071486      -0.152764      0.057248   
capital_gain_log  0.078487 -0.016795       0.078768      0.877722   

                  capital_loss  hours_per_week  work_experience  \
age                   0.009107        0.094227         0.982877   
fnlwgt               -0.027231       -0.017516        -0.071486   
education_num         0.022770        0.130770        -0.152764   
capital_gain         -0.036722        0.066533         0.057248   
capital_loss          1.000000        0.031