In [26]:
# Import necessary libraries



import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('adult_with_headers.csv')







In [27]:
# column names
print("Column Names:", data.columns)

# Standarding column names to lowercase and replace hyphens with underscores
data.columns = data.columns.str.lower().str.replace('-', '_')


Column Names: Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')


#  Data Exploration and Preprocessing

In [29]:
# Basic Data Exploration
print("Data Shape:", data.shape)
print("Data Types:\n", data.dtypes)
print("Summary Statistics:\n", data.describe(include='all'))
print("Missing Values:\n", data.isnull().sum())

# Handle Missing Values
# Assuming ' ?' represents missing values in the dataset
data.replace(' ?', np.nan, inplace=True)
missing_values = data.isnull().sum()
print("Missing Values after replacement:\n", missing_values)

# Impute missing values for categorical columns with the most frequent value
cat_cols = data.select_dtypes(include=['object']).columns
imputer = SimpleImputer(strategy='most_frequent')
data[cat_cols] = imputer.fit_transform(data[cat_cols])

#  Apply Scaling Techniques to Numerical Features
num_cols = data.select_dtypes(include=['int64', 'float64']).columns

# Standard Scaling
scaler_standard = StandardScaler()
data_standard_scaled = data.copy()
data_standard_scaled[num_cols] = scaler_standard.fit_transform(data_standard_scaled[num_cols])

# Min-Max Scaling
scaler_minmax = MinMaxScaler()
data_minmax_scaled = data.copy()
data_minmax_scaled[num_cols] = scaler_minmax.fit_transform(data_minmax_scaled[num_cols])

# Scenarios where each scaling technique is preferred
scaling_discussion = ""

print(scaling_discussion)


Data Shape: (32561, 15)
Data Types:
 age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object
Summary Statistics:
                  age workclass        fnlwgt education  education_num  \
count   32561.000000     32561  3.256100e+04     32561   32561.000000   
unique           NaN         9           NaN        16            NaN   
top              NaN   Private           NaN   HS-grad            NaN   
freq             NaN     22696           NaN     10501            NaN   
mean       38.581647       NaN  1.897784e+05       NaN      10.080679   
std        13.640433       NaN  1.055500e+05       NaN       2.572720   
min        17.000000       NaN  1.228500e+

#  Encoding Techniques


In [30]:


#  One-Hot Encoding for categorical variables with less than 5 categories
one_hot_cols = [col for col in cat_cols if data[col].nunique() < 5]
data_one_hot_encoded = pd.get_dummies(data, columns=one_hot_cols, drop_first=True)

#  Label Encoding for categorical variables with more than 5 categories
label_cols = [col for col in cat_cols if data[col].nunique() >= 5]
label_encoder = LabelEncoder()
data_label_encoded = data.copy()
for col in label_cols:
    data_label_encoded[col] = label_encoder.fit_transform(data_label_encoded[col])

# Pros and Cons of Encoding Techniques
encoding_discussion = ""

print(encoding_discussion)





# Feature Engineering


In [41]:


#  New Features
data['hours_per_week_squared'] = data['hours_per_week'] ** 2
data['age_hours_interaction'] = data['age'] * data['hours_per_week']

# Rationale:
# - 'hours_per_week_squared': Polynomial feature which might capture non-linear relationships.
# - 'age_hours_interaction': Interaction feature to capture the combined effect of age and hours worked per week.

# Transformation to Skewed Numerical Feature
# Check skewness
skewed_features = data[num_cols].skew()
print("Skewness of Numerical Features:\n", skewed_features)

# Applying log transformation to 'capital_gain' as an example
data['capital_gain'] = np.log1p(data['capital_gain'])
print("Skewness after log transformation of 'capital_gain':", data['capital_gain'].skew())


Skewness of Numerical Features:
 age               0.558743
fnlwgt            1.446980
education_num    -0.311676
capital_gain      3.096144
capital_loss      4.594629
hours_per_week    0.227643
dtype: float64
Skewness after log transformation of 'capital_gain': 3.0282191326348293
