# Data Exploration and Preprocessing:

In [2]:
# load the data set
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest

In [4]:
df=pd.read_csv(r"C:\Users\salah\OneDrive\Desktop\adult_with_headers.csv")

In [6]:
# Displaying summary statistics
summary_stats = df.describe()
print(summary_stats)

                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [8]:
# Checking for missing values
missing_values = df.isnull().sum()
print(missing_values)


age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [10]:
data_types = df.dtypes
print(data_types)

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object


In [12]:
print(df.shape)

(32561, 15)


In [14]:
print(df.columns)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')


In [16]:
# Separate thed  numerical features
numerical_features = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

In [25]:
# standard scaling
scaler_standard = StandardScaler()
df[numerical_features] = scaler_standard.fit_transform(df[numerical_features])

In [29]:
# Min-Max Scaling
scaler_minmax = MinMaxScaler()
df[numerical_features] = scaler_minmax.fit_transform(df[numerical_features])

## Encoding Techniques

In [35]:
# One-Hot Encoding for categorical variables 
categorical_features_onehot = ['workclass', 'education', 'marital_status', 'relationship', 'race', 'sex']
encoder_onehot = OneHotEncoder()
encoded_onehot = encoder_onehot.fit_transform(df[categorical_features_onehot])

In [37]:
# Label Encoding for categorical variables
categorical_features_label = ['occupation', 'native_country']
encoder_label = LabelEncoder()
for feature in categorical_features_label:
    df[feature] = encoder_label.fit_transform(df[feature])

One-Hot Encoding is preferred for nominal data and algorithms sensitive to distance metrics but can lead to high dimensionality.

Label Encoding is simpler and efficient for ordinal data but can introduce misleading ordinal relationships for nominal data.

## Feature Engineering:

In [41]:
# Creating new features
df['capital_diff'] = df['capital_gain'] - df['capital_loss']
df['age_hours_ratio'] = df['age'] / df['hours_per_week']


In [43]:

import numpy as np
df['capital_gain_log'] = np.log(df['capital_gain'] + 1)  # Adding 1 to avoid log(0)

## Feature Selection:

In [45]:
clf = IsolationForest(random_state=42, contamination=0.01)  # Contamination is the proportion of outliers


In [47]:
clf.fit(df[numerical_features])

In [50]:
outliers = clf.predict(df[numerical_features])

In [52]:
# Remove outliers
df_cleaned = df[outliers != -1]


In [54]:
# Print the shape of the cleaned dataset
print("Original dataset shape:", df.shape)
print("Cleaned dataset shape:", df_cleaned.shape)

Original dataset shape: (32561, 18)
Cleaned dataset shape: (32235, 18)


In [56]:
# Compute Pearson correlation coefficient manually
correlation_matrix = df_cleaned[numerical_features].corr()

In [58]:
pps_matrix = correlation_matrix.applymap(lambda x: np.square(abs(x)))

  pps_matrix = correlation_matrix.applymap(lambda x: np.square(abs(x)))


In [60]:
print(pps_matrix)

                     age    fnlwgt  education_num  capital_gain  capital_loss  \
age             1.000000  0.005976       0.000988      0.011704      0.001659   
fnlwgt          0.005976  1.000000       0.001922      0.000067      0.000176   
education_num   0.000988  0.001922       1.000000      0.020163      0.006228   
capital_gain    0.011704  0.000067       0.020163      1.000000      0.002273   
capital_loss    0.001659  0.000176       0.006228      0.002273      1.000000   
hours_per_week  0.005061  0.000467       0.020770      0.006858      0.002087   

                hours_per_week  
age                   0.005061  
fnlwgt                0.000467  
education_num         0.020770  
capital_gain          0.006858  
capital_loss          0.002087  
hours_per_week        1.000000  
