# 1.0 An end-to-end classification problem



## 1.1 Dataset description



We'll be looking at individual income in the United States. The **data** is from the **1994 census**, and contains information on an individual's **marital status**, **age**, **type of work**, and more. The **target column**, or what we want to predict, is whether individuals make less than or equal to 50k a year, or more than **50k a year**.

You can download the data from the [University of California, Irvine's website](http://archive.ics.uci.edu/ml/datasets/Adult).




## 1.2 Load Libraries, Train and Validation Sets

In [None]:
# Install the Weights & Biases client (imported below as `wandb`) into the notebook environment
!pip install wandb

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import wandb
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Ask scikit-learn to render estimators/pipelines as diagrams when they are displayed
from sklearn import set_config
set_config(display='diagram')

In [None]:
# Authenticate against Weights & Biases; --relogin forces a fresh login prompt
!wandb login --relogin

In [None]:
# Start a W&B run under the "Week08_Example_02" project.
# NOTE(review): the previous comment here talked about save_code tracking the
# notebook, but no save_code argument is passed to wandb.init -- confirm whether
# code tracking is actually wanted before relying on it.
run = wandb.init(project="Week08_Example_02")

In [None]:
# Declare the latest train_data.csv artifact as an input of this run and read it
# into a DataFrame (presumably .file() returns the local path of the downloaded file)
local_path = run.use_artifact("week_07_data_segregation/train_data.csv:latest").file()
df_train = pd.read_csv(local_path)

In [None]:
# Preview the first rows of the training data
df_train.head()

## 1.3 Train and Dev split

In [None]:
# Carve a stratified 30% validation set out of the training data.
# Features are every column except the "high_income" target.
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop(columns="high_income"),
    df_train["high_income"],
    test_size=0.30,
    random_state=41,
    shuffle=True,
    stratify=df_train["high_income"],
)

In [None]:
# Report the size of each split
print(f"x train: {x_train.shape}")
print(f"y train: {y_train.shape}")
print(f"x val: {x_val.shape}")
print(f"y val: {y_val.shape}")

## 1.4 Outlier Removal

In [None]:
# Look for outliers in the int64 columns using the TRAINING split only,
# so that nothing from the validation split leaks into this decision.

# numeric view of the training features
x = x_train.select_dtypes("int64").copy()

# fit_predict labels each row; rows labelled -1 are treated as outliers
lof = LocalOutlierFactor()
outlier = lof.fit_predict(x)
mask = ~(outlier == -1)

print(f"X_train shape [original]: {x_train.shape}")
print(f"X_train shape [outlier removal]: {x_train.loc[mask].shape}")

# keep only the non-outlier rows, in both the features and the target
x_train = x_train.loc[mask].copy()
y_train = y_train.loc[mask].copy()

## 1.5 Encoding target variable

If a categorical target variable needs to be encoded for a classification predictive modeling problem, then the [LabelEncoder class](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) can be used.

In [None]:
# Integer-encode the categorical target variable
le = LabelEncoder()

# learn the class -> integer mapping on the training target, then encode it
y_train = le.fit(y_train).transform(y_train)

# encode the validation target with the mapping learned above (avoiding data leakage)
y_val = le.transform(y_val)

print(f"Classes: {le.classes_}")

In [None]:
# just in case you need the inverse transformation:
# map the encoded values 0 and 1 back to the original class labels
le.inverse_transform([0, 1])

In [None]:
# sampling of transformed target variable: the first five training labels and
# five validation labels near the end (the slice -6:-1 leaves out the very last one)
print(y_train[:5],y_val[-6:-1])

## 1.6 Pipeline 

### 1.6.1 Column extractor

In [None]:
# Custom transformer that keeps only the columns named at construction time
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Select a fixed set of columns from the incoming data.

    Parameters
    ----------
    feature_names
        Column label(s) used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Learn nothing; the selector is stateless. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured ``feature_names``."""
        selected = X[self.feature_names]
        return selected

### 1.6.2 Categorical transformation

In [None]:
# Handling categorical features
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Regroup the categorical census columns to reduce their cardinality.

    Parameters
    ----------
    new_features : bool, default=True
        When True, ``transform`` rewrites ``native_country``, ``occupation``,
        ``workclass``, ``education`` and ``marital_status`` as described in
        :meth:`transform`. When False, ``transform`` returns an unmodified copy.

    Attributes
    ----------
    colnames
        Columns of the frame produced by the most recent ``transform`` call
        (``None`` until ``transform`` has been called).
    """

    # Class constructor method that takes one boolean as its argument
    def __init__(self, new_features=True):
        self.new_features = new_features
        self.colnames = None

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def get_feature_names(self):
        """Return the column names recorded by the last ``transform`` call as a list."""
        return self.colnames.tolist()

    def transform(self, X, y=None):
        """Return a regrouped copy of ``X``; the input frame is never modified.

        The raw category values carry a leading space (e.g. ``' HS-grad'``),
        which is why every literal below starts with one.
        """
        df = X.copy()

        # customize feature? (the groupings below were chosen during the EDA)
        if self.new_features:

            # replace ? with unknown.
            # This has to happen BEFORE native_country is collapsed; otherwise
            # ' ?' would already have been rewritten to 'non_usa' and the
            # 'unknown' label could never appear in that column.
            edit_cols = ['native_country', 'occupation', 'workclass']
            for col in edit_cols:
                df.loc[df[col] == ' ?', col] = 'unknown'

            # minimize the cardinality of native_country feature:
            # everything that is neither the USA nor unknown becomes 'non_usa'
            foreign = (df['native_country'] != ' United-States') & (df['native_country'] != 'unknown')
            df.loc[foreign, 'native_country'] = 'non_usa'

            # NOTE: the replacements below assign the result back to the column
            # instead of calling replace(..., inplace=True) on df[col]; the
            # chained in-place form does not update df under pandas Copy-on-Write.

            # decrease the cardinality of education feature
            hs_grad = [' HS-grad', ' 11th', ' 10th', ' 9th', ' 12th']
            elementary = [' 1st-4th', ' 5th-6th', ' 7th-8th']
            df['education'] = df['education'].replace(to_replace=hs_grad, value='HS-grad')
            df['education'] = df['education'].replace(to_replace=elementary, value='elementary_school')

            # adjust marital_status feature
            married = [' Married-spouse-absent', ' Married-civ-spouse', ' Married-AF-spouse']
            separated = [' Separated', ' Divorced']
            df['marital_status'] = df['marital_status'].replace(to_replace=married, value='Married')
            df['marital_status'] = df['marital_status'].replace(to_replace=separated, value='Separated')

            # adjust workclass feature
            self_employed = [' Self-emp-not-inc', ' Self-emp-inc']
            govt_employees = [' Local-gov', ' State-gov', ' Federal-gov']
            df['workclass'] = df['workclass'].replace(to_replace=self_employed, value='Self_employed')
            df['workclass'] = df['workclass'].replace(to_replace=govt_employees, value='Govt_employees')

        # update column names
        self.colnames = df.columns

        return df

#### 1.6.2.1 Evaluate

In [None]:
#
# for validation purposes: select only the object-dtype columns of x_train
# with FeatureSelector and preview the result
#
model = FeatureSelector(x_train.select_dtypes("object").columns.to_list())
df = model.fit_transform(x_train)
df.head()

In [None]:
#
# for validation purposes: apply the categorical regrouping to `df`
# (the object-dtype columns, when the cells are run in order) and preview it
#
model = CategoricalTransformer(new_features=True)
df_cat = model.fit_transform(df)
df_cat.head()

In [None]:
# cardinality BEFORE the transformation: distinct values per object-dtype column of x_train
x_train.select_dtypes("object").apply(pd.Series.nunique)

In [None]:
# cardinality AFTER the transformation: distinct values per column of df_cat
df_cat.apply(pd.Series.nunique)

### 1.6.3 Numerical transformation

In [None]:
# transform numerical features
class NumericalTransformer(BaseEstimator, TransformerMixin):
    """Optionally scale the numerical columns.

    Parameters
    ----------
    model : int, default=0
        Which scaling to apply:

        * 0 -- ``MinMaxScaler``
        * 1 -- ``StandardScaler``
        * any other value -- no scaling, the raw values are returned

    Attributes
    ----------
    colnames
        Column names of the frame passed to the most recent ``transform`` call
        (``None`` until ``transform`` has been called).
    scaler_
        The scaler fitted in ``fit`` (``None`` when no scaling is selected).
    """

    def __init__(self, model=0):
        self.model = model
        self.colnames = None

    def fit(self, X, y=None):
        """Fit the selected scaler on ``X`` and return ``self``.

        The scaling statistics are learned here -- and only here -- so that
        ``transform`` applies the training statistics to any later data
        (e.g. the validation split) instead of re-fitting on it.
        """
        if self.model == 0:
            self.scaler_ = MinMaxScaler()
        elif self.model == 1:
            self.scaler_ = StandardScaler()
        else:
            self.scaler_ = None

        if self.scaler_ is not None:
            self.scaler_.fit(X)
        return self

    # return columns names after transformation
    def get_feature_names(self):
        """Return the column names recorded by the last ``transform`` call."""
        return self.colnames

    def transform(self, X, y=None):
        """Return the (optionally scaled) values of ``X`` as an array.

        Raises
        ------
        AttributeError
            If ``transform`` is called before ``fit``.
        """
        if not hasattr(self, "scaler_"):
            raise AttributeError(
                "NumericalTransformer is not fitted yet; call fit() or "
                "fit_transform() before transform()."
            )

        df = X.copy()

        # update columns name
        self.colnames = df.columns.tolist()

        # no scaler selected: hand back the raw values
        if self.scaler_ is None:
            return df.values

        # apply the statistics learned in fit()
        return self.scaler_.transform(df)

#### 1.6.3.1 Evaluate

In [None]:
#
# for validation purposes: select only the int64 columns of x_train
# with FeatureSelector and preview the result
#
model = FeatureSelector(x_train.select_dtypes("int64").columns.to_list())
df = model.fit_transform(x_train)
df.head()

In [None]:
#
# for validation purposes: scale `df` (the int64 columns, when the cells are
# run in order) with the chosen option and display the result
#
# model 0: minmax
# model 1: standard
# model 2: without scaler
#
# NOTE: the result is stored in `df_cat`, overwriting the categorical frame
# created earlier under the same name
model = NumericalTransformer(model=1)
df_cat = model.fit_transform(df)
df_cat

### 1.6.4 Pipeline union (cat + num)

In [None]:
# Column groups routed to each branch of the preprocessing
categorical_features = x_train.select_dtypes("object").columns.tolist()
numerical_features = x_train.select_dtypes("int64").columns.tolist()

# Categorical branch: select columns -> regroup categories -> one-hot encode
# (replace the encoder with 'passthrough' to inspect the regrouped columns)
categorical_pipeline = Pipeline(
    steps=[
        ('cat_selector', FeatureSelector(categorical_features)),
        ('cat_transformer', CategoricalTransformer()),
        ('cat_encoder', OneHotEncoder(sparse=False, drop="first")),
    ]
)

# Numerical branch: select columns -> scale
numerical_pipeline = Pipeline(
    steps=[
        ('num_selector', FeatureSelector(numerical_features)),
        ('num_transformer', NumericalTransformer()),
    ]
)

# Run both branches on the same input and concatenate their outputs
# side by side (categorical columns first, then numerical ones)
full_pipeline_preprocessing = FeatureUnion(
    transformer_list=[
        ('cat_pipeline', categorical_pipeline),
        ('num_pipeline', numerical_pipeline),
    ]
)

#### 1.6.4.1 Evaluate

In [None]:
#
# for validation purposes: run the preprocessing on its own and rebuild a
# labelled DataFrame from the resulting array
#
new_data = full_pipeline_preprocessing.fit_transform(x_train)

# column labels: encoder output names first, then the numerical column names
catnames = (
    full_pipeline_preprocessing.get_params()["cat_pipeline"]
    .named_steps["cat_encoder"]
    .get_feature_names_out()
    .tolist()
)
numnames = (
    full_pipeline_preprocessing.get_params()["num_pipeline"]
    .named_steps["num_transformer"]
    .get_feature_names()
)

df = pd.DataFrame(new_data, columns=catnames + numnames)
df.head()

In [None]:
# (rows, columns) of the preprocessed frame built above
df.shape

## 1.7 Modeling and Training

In [None]:
# The full pipeline: preprocessing followed by a decision tree.
# random_state is fixed (same seed as the train/validation split) so that
# re-running the notebook grows the same tree and reports the same metrics.
pipe = Pipeline(steps = [('full_pipeline', full_pipeline_preprocessing),
                         ("classifier", DecisionTreeClassifier(random_state=41))])

# training
pipe.fit(x_train, y_train)

# predictions of the trained model on the validation split
predict = pipe.predict(x_val)

In [None]:
# confusion matrix (laid out to match the slides)
#             true label
#               1     0     
# predict  1    TP    FP
#          0    FN    TN
#
# NOTE: the predictions are deliberately passed as the FIRST argument so that
# rows are predictions and columns are true labels; labels=[1,0] puts the
# positive class first.

confusion_matrix(predict,y_val,
                 labels=[1,0])

In [None]:
# Accuracy on the validation split, followed by the classification report
print(accuracy_score(y_val, predict))
print(classification_report(y_val,predict))

In [None]:
# Draw the confusion matrix with rows = predictions and columns = true labels
fig, ax = plt.subplots(figsize=(7, 4))

val_confusion = confusion_matrix(predict, y_val, labels=[1, 0])
val_confusion_display = ConfusionMatrixDisplay(
    val_confusion,
    display_labels=[">50k", "<=50k"],
)
val_confusion_display.plot(values_format=".0f", ax=ax)

# the default axis titles assume the opposite orientation, so override them
ax.set_xlabel("True Label")
ax.set_ylabel("Predicted Label")
plt.show()

In [None]:
# ROC AUC is defined over scores, not thresholded labels: feeding it the hard
# 0/1 predictions collapses the curve to a single operating point.
# Use the predicted probability of the positive class (column 1) instead.
roc_auc_score(y_val, pipe.predict_proba(x_val)[:, 1], average="macro")

In [None]:
# preprocessing stage of the trained pipeline
features_full = pipe.named_steps['full_pipeline']

# names of the encoded categorical columns, read from the fitted encoder step
features_cat = features_full.get_params()["cat_pipeline"]
features_cat = features_cat.named_steps["cat_encoder"].get_feature_names_out().tolist()
features_cat

In [None]:
# names of the numerical columns, as recorded by the numerical transformer step
features_num = (
    features_full.get_params()["num_pipeline"]
    .named_steps["num_transformer"]
    .get_feature_names()
)
features_num

In [None]:
from sklearn.tree import plot_tree # to draw a classification tree

# Draw the fitted decision tree, labelling nodes with the preprocessed
# feature names (categorical first, then numerical)
fig, ax = plt.subplots(figsize=(15, 10))
plot_tree(
    pipe.named_steps["classifier"],
    filled=True,
    rounded=True,
    class_names=["<=50k", ">50k"],
    feature_names=features_cat + features_num,
    ax=ax,
)
plt.show()