In [None]:

# Import required libraries and dependencies
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the census income CSV into a DataFrame
df_census_income = pd.read_csv(filepath_or_buffer="adult.csv")

# Preview the first ten rows
df_census_income.head(n=10)


Unnamed: 0,Age,Workclass,Final Weight,Education,EducationNum,Marital Status,Occupation,Relationship,Race,Gender,Capital Gain,capital loss,Hours per Week,Native Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [None]:
# List the distinct raw Workclass labels (surrounding whitespace is still present here)
df_census_income["Workclass"].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [None]:
# Inspect how many rows fall into each Workclass category to spot rare or placeholder values
df_census_income.loc[:, "Workclass"].value_counts()

Unnamed: 0_level_0,count
Workclass,Unnamed: 1_level_1
Private,22696
Self-emp-not-inc,2541
Local-gov,2093
?,1836
State-gov,1298
Self-emp-inc,1116
Federal-gov,960
Without-pay,14
Never-worked,7


In [None]:
# Normalise the Workclass labels: trim surrounding whitespace, then lower-case
df_census_income["Workclass"] = (
    df_census_income["Workclass"].str.strip().str.lower()
)

In [None]:
# Filter out rows where 'Workclass' is 'never-worked' or 'without-pay'.
# .copy() makes the result an independent DataFrame, so later column
# assignments on it do not raise pandas' SettingWithCopyWarning.
df_census_income = df_census_income[
    ~df_census_income['Workclass'].isin(['never-worked', 'without-pay'])
].copy()


In [None]:
# Relabel the '?' placeholder in Workclass as 'Unknown'.
# assign() returns a new DataFrame instead of writing into the filtered
# slice, which avoids pandas' SettingWithCopyWarning.
df_census_income = df_census_income.assign(
    Workclass=df_census_income['Workclass'].replace('?', 'Unknown')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_census_income['Workclass'] = df_census_income['Workclass'].replace('?', 'Unknown')


In [None]:
# Re-check the Workclass category counts after the filtering and relabelling cells above
df_census_income["Workclass"].value_counts(sort=True)

Unnamed: 0_level_0,count
Workclass,Unnamed: 1_level_1
private,22696
self-emp-not-inc,2541
local-gov,2093
Unknown,1836
state-gov,1298
self-emp-inc,1116
federal-gov,960


In [None]:
# Drop the Education, Capital Gain and capital loss columns, then preview the result
df_census_income = df_census_income.drop(
    ["Education", "Capital Gain", "capital loss"],
    axis="columns",
)

df_census_income.head()

Unnamed: 0,Age,Workclass,Final Weight,EducationNum,Marital Status,Occupation,Relationship,Race,Gender,Hours per Week,Native Country,Income
0,39,state-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [None]:
# Count the rows in each Marital Status category
df_census_income["Marital Status"].value_counts()


Unnamed: 0_level_0,count
Marital Status,Unnamed: 1_level_1
Married-civ-spouse,14967
Never-married,10674
Divorced,4442
Separated,1025
Widowed,992
Married-spouse-absent,417
Married-AF-spouse,23


In [None]:
# Relabel the '?' placeholder in Occupation as 'Unknown'.
# The values are stripped first: the raw Workclass values carry leading
# whitespace, and an exact-match replace on the unstripped Occupation column
# left the '?' rows unchanged — presumably for the same reason.
df_census_income['Occupation'] = (
    df_census_income['Occupation'].str.strip().replace('?', 'Unknown')
)
df_census_income['Occupation'].value_counts()


Unnamed: 0_level_0,count
Occupation,Unnamed: 1_level_1
Prof-specialty,4140
Craft-repair,4098
Exec-managerial,4066
Adm-clerical,3767
Sales,3650
Other-service,3294
Machine-op-inspct,2001
?,1836
Transport-moving,1596
Handlers-cleaners,1369


In [None]:
# Count the rows in each Relationship category
df_census_income["Relationship"].value_counts()


Unnamed: 0_level_0,count
Relationship,Unnamed: 1_level_1
Husband,13189
Not-in-family,8304
Own-child,5058
Unmarried,3444
Wife,1564
Other-relative,981


In [None]:
# Count the rows per Native Country before any grouping is applied
df_census_income["Native Country"].value_counts()

Unnamed: 0_level_0,count
Native Country,Unnamed: 1_level_1
United-States,29150
Mexico,643
?,583
Philippines,197
Germany,137
Canada,121
Puerto-Rico,114
El-Salvador,106
India,100
Cuba,95


In [None]:
# Clean up any whitespace from the column
df_census_income['Native Country'] = df_census_income['Native Country'].str.strip()

# Collapse every value other than 'United-States' into 'Other'.
# Series.where keeps entries where the condition holds and substitutes
# 'Other' everywhere else, replacing the row-by-row apply/lambda.
# Note that the '?' placeholder rows are grouped into 'Other' as well.
df_census_income['Native Country'] = df_census_income['Native Country'].where(
    df_census_income['Native Country'] == 'United-States', 'Other'
)

df_census_income['Native Country'].value_counts()

Unnamed: 0_level_0,count
Native Country,Unnamed: 1_level_1
United-States,29150
Other,3390


In [None]:
# Preview the first five rows of the cleaned frame
df_census_income.head(n=5)

Unnamed: 0,Age,Workclass,Final Weight,EducationNum,Marital Status,Occupation,Relationship,Race,Gender,Hours per Week,Native Country,Income
0,39,state-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Other,<=50K


In [None]:
# TODO: use SQL or Spark (placeholder note; nothing is implemented in this cell yet)

In [None]:
# Build the modelling inputs before splitting.
# NOTE(review): X and y were not defined anywhere earlier in the notebook.
# Using 'Income' as the target and one-hot encoding the remaining columns is
# an assumption — confirm against the intended analysis.
y = df_census_income["Income"]
X = pd.get_dummies(df_census_income.drop(columns=["Income"]), dtype=float)

# Splitting into Train and Test sets (random_state fixed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Create the StandardScaler (it was never instantiated before this cell) and
# fit it on the training data only; the test split is transformed later with
# the statistics learned here.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)