In [None]:
# Class to colorize, bold, or underline output
class color:
    """ANSI escape sequences for styling text printed to a terminal or notebook.

    Prefix a string with one or more of these codes and finish it with
    ``color.END`` to restore the default style, e.g.
    ``print(color.BOLD + color.PURPLE + "text" + color.END)``.
    """

    # Foreground colors.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # Text attributes. BOLD is SGR code 1; it previously duplicated RED's code,
    # so "bold" headings were never actually rendered bold.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets every color and attribute set above.
    END = '\033[0m'
    
#

import pandas as pd
import numpy as np
# Define a header for our data; the UCI (Census Income) dataset does not have one.
headers = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income",
]

# Load the data from the GitHub repo; cells equal to the string "null" are read as NaN.
# NOTE(review): this comment used to say '?' is converted to NaN, but only "null" is
# registered as a missing-value marker here -- confirm which marker the CSV uses.
url = "https://raw.githubusercontent.com/vbloise3/whizLabsML/master/CensusIncome/CensusIncomeDataset.csv"
# on_bad_lines="warn" skips malformed rows and emits a warning, which is what
# error_bad_lines=False did; that keyword was deprecated in pandas 1.3 and
# removed in pandas 2.0, where it raises a TypeError.
df = pd.read_csv(url, on_bad_lines="warn", header=None, names=headers, na_values="null")
# Preview the first ten rows of the frame.
df.head(10)


In [None]:
# Separate the features: every column except the 'income' label.
feature_df = df.drop(columns="income")
# Separate the target: select the label column by name rather than dropping the
# other fourteen columns by hard-coded position, so the split does not depend
# on the column count or order. The result is still a one-column DataFrame.
target_df = df[["income"]].copy()
feature_df

In [None]:
# What data types are in the dataset?
# Displays the dtype of every column in feature_df.
feature_df.dtypes

In [None]:
# Build an independent copy of feature_df restricted to its object-dtype
# columns; these are treated as the categorical features.
categorical_featuresDF = feature_df.select_dtypes(include='object').copy()
categorical_featuresDF

In [None]:
# Collect the rows of the categorical features that hold at least one missing value.
categorical_featuresDF_NaN = categorical_featuresDF.loc[
    categorical_featuresDF.isna().any(axis="columns")
]
categorical_featuresDF_NaN

In [None]:
# List the names of the categorical columns that contain at least one NaN.
[
    feature_name
    for feature_name in categorical_featuresDF.columns
    if categorical_featuresDF[feature_name].isna().any()
]

In [None]:
# Impute the NaN values using the scikit-learn SimpleImputer class.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# The imputer is re-fitted on one column at a time, so every column is filled
# with its own most frequent value. feature_df is updated in place.
for column_to_impute in ('workclass', 'occupation', 'native-country'):
    imputer = imputer.fit(feature_df[[column_to_impute]])
    feature_df[column_to_impute] = imputer.transform(feature_df[[column_to_impute]]).ravel()

# Rebuild the categorical-only frame from the imputed features.
categorical_featuresDF = feature_df.select_dtypes(include='object').copy()

# Re-collect any rows that still contain a missing value.
categorical_featuresDF_NaN = categorical_featuresDF.loc[
    categorical_featuresDF.isna().any(axis="columns")
]
# Which features still have a NaN value? (Displayed as the cell output.)
categorical_featuresDF.columns[categorical_featuresDF.isnull().any()].tolist()


In [None]:
# Binary encoding of the sex feature: LabelBinarizer is imported here and used
# in the following cell.
from sklearn.preprocessing import LabelBinarizer

# Print a styled heading, then the number of rows per distinct sex value.
print(color.BOLD, color.PURPLE, "\nHow many different sex feature types?", color.END, sep="")
sex_counts = categorical_featuresDF["sex"].value_counts()
print(sex_counts)

In [None]:
# Binarize the sex feature and store the result as a new 'sex_code' column.
label_style = LabelBinarizer()
label_results = label_style.fit_transform(categorical_featuresDF["sex"])
print(color.BOLD + color.PURPLE + "\nLabelBinarizer of sex feature" + color.END)
# Assign the first (only) output column positionally. Wrapping it in a new
# DataFrame first makes pandas align on index labels, which silently produces
# NaN whenever categorical_featuresDF does not carry the default 0..n-1 index.
categorical_featuresDF["sex_code"] = label_results[:, 0]
# Show the original value next to its code for the first 15 rows.
categorical_featuresDF[["sex", "sex_code"]].head(15)

In [None]:
# Label-encode the workclass feature into a new 'workclass_code' column.
from sklearn.preprocessing import LabelEncoder

label_work_class = LabelEncoder()
# Learn the class labels, then map every row to its integer code.
label_work_class.fit(categorical_featuresDF["workclass"])
categorical_featuresDF["workclass_code"] = label_work_class.transform(
    categorical_featuresDF["workclass"]
)
# Show the original value next to its code for the first 15 rows.
categorical_featuresDF.loc[:, ["workclass", "workclass_code"]].head(15)

In [None]:
# One-hot encoding of the workclass feature, step 1: inspect its distinct values.
# Print a styled heading, then the number of rows per workclass value.
print(color.BOLD, color.PURPLE, "\nHow many different workclass feature types?", color.END, sep="")
workclass_counts = categorical_featuresDF["workclass"].value_counts()
print(workclass_counts)

In [None]:
# One-hot encode the workclass feature and preview the first rows.
# The encoded frame is only displayed, not assigned to a variable.
pd.get_dummies(categorical_featuresDF, columns=["workclass"]).head()

In [None]:
# One-hot encoding of the marital-status feature.

# Print a styled heading, then the number of rows per marital-status value.
print(color.BOLD, color.PURPLE, "\nHow many different marital-status feature types?", color.END, sep="")
marital_status_counts = categorical_featuresDF["marital-status"].value_counts()
print(marital_status_counts)

# One-hot encode the marital-status feature and preview the first five rows.
# The encoded frame is only displayed, not assigned to a variable.
pd.get_dummies(data=categorical_featuresDF, columns=["marital-status"]).head(5)

In [None]:
# One-hot encoding of the native-country feature.

# Print a styled heading, then the number of rows per native-country value.
print(color.BOLD, color.PURPLE, "\nHow many different native-country feature types?", color.END, sep="")
native_country_counts = categorical_featuresDF["native-country"].value_counts()
print(native_country_counts)

# One-hot encode the native-country feature and preview the first five rows.
# The encoded frame is only displayed, not assigned to a variable.
pd.get_dummies(data=categorical_featuresDF, columns=["native-country"]).head(5)