In [56]:
import pandas as pd
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
import category_encoders as ce 

from sklearn.impute import SimpleImputer


import warnings

# Suppress every warning for the rest of the session (no category filter is given).
# NOTE(review): this also hides deprecation/FutureWarnings raised by pandas and
# scikit-learn calls below — confirm that is intended.
warnings.filterwarnings('ignore')

In [57]:
# Load the raw dataset from income.csv; index_col=None means no column is used as the index.
df = pd.read_csv("income.csv",index_col=None)

Transform the target to binary

In [58]:
# Inspect the raw label distribution; the output shows each class appearing both
# with and without a trailing '.'.
df['target'].value_counts()

 <=50K     24720
 <=50K.    12435
 >50K       7841
 >50K.      3846
Name: target, dtype: int64

In [59]:
# Labels appear both with and without a trailing '.' (" <=50K" vs " <=50K."),
# so remove the dot to collapse them into two classes.
# regex=False makes '.' a literal dot: interpreted as a regex it matches any
# character, and the default value of `regex` differs between pandas versions.
df['target'] = df['target'].str.replace('.', '', regex=False)

In [60]:
# Encoder used to map the string labels to integer class ids.
le = preprocessing.LabelEncoder()

In [61]:
# Replace the string labels with the integer codes learned by the LabelEncoder.
df['target']=le.fit_transform(df['target'])

Split the Data

In [62]:
# Count the rows whose workclass contains the literal "?" placeholder.
workclass_is_unknown = df['workclass'].str.contains("?", regex=False)
df.loc[workclass_is_unknown].shape

(2799, 15)

In [63]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('target', axis=1),
    df['target'],
    test_size=0.2,
    random_state=42,
)

Establish the column lists for each encoding strategy

In [64]:
# Column groups, one per preprocessing strategy applied in the cells below.

# Numeric columns: mean-imputed, then standardized.
num_cols = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Categorical columns that get one-hot encoded.
ohe_cols = [
    'workclass',
    'marital-status',
    'relationship',
    'race',
    'sex',
]

# Categorical columns that get target encoded.
target_transform_cols = [
    'education',
    'occupation',
    'native-country',
]

Impute missing values

In [65]:
# Numeric gaps (NaN) are filled with the column mean.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
# Categorical gaps are marked by the string " ?" and filled with the most frequent value.
imp_most = SimpleImputer(strategy='most_frequent', missing_values=" ?")


In [66]:
# Learn the column means from the training rows only, then reuse them on the test rows.
train_num_filled = imp_mean.fit_transform(X_train[num_cols])
X_train[num_cols] = train_num_filled

test_num_filled = imp_mean.transform(X_test[num_cols])
X_test[num_cols] = test_num_filled

In [67]:
# Every categorical column: the one-hot group followed by the target-encoded group.
combined = [*ohe_cols, *target_transform_cols]

In [68]:
# Learn the most frequent category per column from the training rows only,
# then apply the same fill values to the test rows.
train_cat_filled = imp_most.fit_transform(X_train[combined])
X_train[combined] = train_cat_filled

test_cat_filled = imp_most.transform(X_test[combined])
X_test[combined] = test_cat_filled

Encode variables

Target Encode

In [69]:
# Target-encode the columns listed in target_transform_cols.
# `smoothing` is a TargetEncoder constructor parameter. Passed to fit_transform it
# is only collected by **fit_params and never applied, so it is set here instead.
tenc = ce.TargetEncoder(smoothing=.1)
df_tenc = tenc.fit_transform(X_train[target_transform_cols], y_train)
# Put the encoded columns first, followed by the remaining untouched columns.
X_train = df_tenc.join(X_train.drop(target_transform_cols, axis=1))

# Reuse the encodings learned on the training rows for the test rows (no refit).
df_test = tenc.transform(X_test[target_transform_cols])
X_test = df_test.join(X_test.drop(target_transform_cols, axis=1))

One hot encode

In [70]:
# One-hot encode the columns in ohe_cols. With handle_unknown='ignore', a category
# not seen during fit is encoded as all zeros instead of raising.
# NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2 — confirm
# against the pinned scikit-learn version.
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training rows and rebuild a DataFrame aligned to the original index.
train_onehot = encoder.fit_transform(X_train[ohe_cols])
X_train_encoded = pd.DataFrame(
    train_onehot,
    columns=encoder.get_feature_names_out(),
    index=X_train.index,
)
X_train = X_train_encoded.join(X_train.drop(columns=ohe_cols))

# Apply the already-fitted encoder to the test rows.
test_onehot = encoder.transform(X_test[ohe_cols])
X_test_encoded = pd.DataFrame(
    test_onehot,
    columns=encoder.get_feature_names_out(),
    index=X_test.index,
)
X_test = X_test_encoded.join(X_test.drop(columns=ohe_cols))

Scale and standardize

In [71]:
# Standardize the numeric columns using statistics learned from the training set only.
stanscale = StandardScaler()
X_stanscale = pd.DataFrame(
    stanscale.fit_transform(X_train[num_cols]),
    columns=num_cols,
    index=X_train.index,
)
X_train = X_stanscale.join(X_train.drop(num_cols, axis=1))

# Use transform (not fit_transform) on the test rows: refitting here would scale
# the test set with its own mean/std, making it inconsistent with the training
# data and leaking test-set statistics into preprocessing.
X_test_stanscale = pd.DataFrame(
    stanscale.transform(X_test[num_cols]),
    columns=num_cols,
    index=X_test.index,
)
X_test = X_test_stanscale.join(X_test.drop(num_cols, axis=1))

In [74]:
# Write the processed training features to CSV without the index column.
# NOTE(review): y_train is not written here — confirm the labels are saved elsewhere.
X_train.to_csv('train_income.csv',index=False)

In [75]:
# Write the processed test features to CSV without the index column.
# NOTE(review): y_test is not written here — confirm the labels are saved elsewhere.
X_test.to_csv('test_income.csv',index=False)