## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Task 1: Data Exploration and Preprocessing:

## Import Dataset

In [2]:
df = pd.read_csv("adult_with_headers.csv")

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [6]:
# Handle missing values
df.replace(' ?', np.nan, inplace=True)
df.dropna(inplace=True)

In [9]:
#Scaling techniques
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [10]:
#Standard Scaling
scaler = StandardScaler()
df[['age', 'fnlwgt', 'education_num']] = scaler.fit_transform(df[['age', 'fnlwgt', 'education_num']])

In [11]:
# Min -Max Scaling
scaler = MinMaxScaler()
df[['capital_gain', 'capital_loss']] = scaler.fit_transform(df[['capital_gain','capital_loss']])

## Task 2: Encoding Techniques

In [12]:
# One-Hot Encoding
pd.get_dummies(df, columns=['workclass', 'marital_status'])

Unnamed: 0,age,fnlwgt,education,education_num,occupation,relationship,race,sex,capital_gain,capital_loss,...,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,marital_status_ Divorced,marital_status_ Married-AF-spouse,marital_status_ Married-civ-spouse,marital_status_ Married-spouse-absent,marital_status_ Never-married,marital_status_ Separated,marital_status_ Widowed
0,0.042796,-1.062722,Bachelors,1.128918,Adm-clerical,Not-in-family,White,Male,0.021740,0.0,...,False,True,False,False,False,False,False,True,False,False
1,0.880288,-1.007871,Bachelors,1.128918,Exec-managerial,Husband,White,Male,0.000000,0.0,...,True,False,False,False,False,True,False,False,False,False
2,-0.033340,0.244693,HS-grad,-0.439738,Handlers-cleaners,Not-in-family,White,Male,0.000000,0.0,...,False,False,False,True,False,False,False,False,False,False
3,1.108695,0.425240,11th,-1.224066,Handlers-cleaners,Husband,Black,Male,0.000000,0.0,...,False,False,False,False,False,True,False,False,False,False
4,-0.794697,1.406658,Bachelors,1.128918,Prof-specialty,Wife,Black,Female,0.000000,0.0,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.870832,0.638972,Assoc-acdm,0.736754,Tech-support,Wife,White,Female,0.000000,0.0,...,False,False,False,False,False,True,False,False,False,False
32557,0.118931,-0.335252,HS-grad,-0.439738,Machine-op-inspct,Husband,White,Male,0.000000,0.0,...,False,False,False,False,False,True,False,False,False,False
32558,1.489374,-0.358575,HS-grad,-0.439738,Adm-clerical,Unmarried,White,Female,0.000000,0.0,...,False,False,False,False,False,False,False,False,False,True
32559,-1.251511,0.110705,HS-grad,-0.439738,Adm-clerical,Own-child,White,Male,0.000000,0.0,...,False,False,False,False,False,False,False,True,False,False


In [13]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

In [14]:
le = LabelEncoder()
df['education'] = le.fit_transform(df['education'])
df['occupation'] = le.fit_transform(df['occupation'])

## Task 3: Feature Engineering

In [15]:
# Create new features
df['age_squared'] = df['age'] ** 2
df['capital_gain_loss_ratio'] = df['capital_gain'] / df['capital_loss']

In [16]:
# Log transformation
df['capital_gain'] = np.log(df['capital_gain'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


## Task 4: Feature Selection

In [18]:
# Isolation Forest
from sklearn.ensemble import IsolationForest



In [25]:
# Remove outliers
df = df[df('outlier') != -1]

TypeError: 'DataFrame' object is not callable

In [26]:

# Remove outliers
df = df[df['outlier'] != -1]

KeyError: 'outlier'

In [27]:
# Predictive Power Score (PPS)
from sklearn.feature_selection import mutual_info_classif

pps = mutual_info_classif(df.drop('income', axis=1), df['income'])
print(pps)

# Correlation matrix
corr = df.corr()
print(corr)




ValueError: could not convert string to float: ' State-gov'

In [28]:

# Correlation matrix
corr = df.corr()
print(corr)




ValueError: could not convert string to float: ' State-gov'

In [29]:
# Predictive Power Score (PPS)
from sklearn.feature_selection import mutual_info_classif

pps = mutual_info_classif(df.drop('income', axis=1), df['income'])
print(pps)

ValueError: could not convert string to float: ' State-gov'