# Model 4: Classification Tree

## Importing the libraries and dataset

In [1]:
### Importing the libraries
import numpy as np
import pandas as pd
#from sklearnex import patch_sklearn

#patch_sklearn()

#plotting lib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


#Sklearn Lib metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

# Pipelines : 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config


from sklearn.tree import DecisionTreeClassifier

#Missing values : 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
import missingno as msno

#Dummy
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn. preprocessing import StandardScaler


pd.set_option("display.max_columns",None)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
# Read the training and test CSVs from the local Data/ directory.
train_path, test_path = "Data/train.csv", "Data/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,occ_code_level2,age,stock_dividends,mig_chg_msa,tax_filer_stat,det_hh_summ,mig_prev_sunbelt,hisp_origin,education,wage_per_hour,capital_losses,vet_question,own_or_self,country_self,mig_move_reg,high_income,hs_college,class_worker,mig_same,unemp_reason,state_prev_res,ind_code_level2,race,country_mother,capital_gains,sex,ind_code_level1,citizenship,union_member,fam_under_18,marital_stat,region_prev_res,mig_chg_reg,country_father,occ_code_level1,full_or_part_emp,weeks_worked,det_hh_fam_stat,num_emp,vet_benefits
0,1,0,42.0,0.0,,Nonfiler,Householder,,All other,11th grade,0.0,,Not in universe,0,United-States,,0,Not in universe,Not in universe,Not in universe under 1 year old,Not in universe,Not in universe,0,Black,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Not in universe,,United-States,Not in universe,Not in labor force,0.0,Householder,0,2
1,2,18,56.0,,,,Householder,,All other,High school graduate,0.0,,Not in universe,2,United-States,,1,Not in universe,Self-employed-incorporated,Not in universe under 1 year old,,Not in universe,32,,United-States,,Male,Wholesale trade,Native- Born in the United States,Not in universe,Not in universe,Married-civilian spouse present,Not in universe,,United-States,Sales,Full-time schedules,,Householder,1,2
2,3,26,26.0,,,Joint both under 65,Householder,,All other,High school graduate,0.0,,Not in universe,0,Haiti,,0,Not in universe,Private,Not in universe under 1 year old,,Not in universe,41,,Haiti,,,Hospital services,Foreign born- Not a citizen of U S,Not in universe,Not in universe,,Not in universe,,Haiti,Adm support including clerical,Full-time schedules,,Householder,3,2
3,4,0,67.0,,MSA to MSA,Joint one under 65 & one 65+,Householder,No,All other,,0.0,,No,0,United-States,Same county,0,Not in universe,Not in universe,No,,North Carolina,0,Black,United-States,0.0,,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Northeast,Same county,United-States,Not in universe,Children or Armed Forces,0.0,Householder,0,1
4,5,0,,,Nonmover,Nonfiler,Child under 18 never married,Not in universe,All other,Children,0.0,0.0,Not in universe,0,United-States,Nonmover,0,Not in universe,Not in universe,Yes,Not in universe,Not in universe,0,White,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Mother only present,,Not in universe,Nonmover,United-States,Not in universe,Children or Armed Forces,,Child <18 never marr not in subfamily,0,0


## Cleaning the data

In [4]:
# Remove the `id` column from the training frame so it is not picked up as a feature.
# Rebinding instead of using `inplace=True`, together with errors="ignore", keeps this
# cell safe to re-run: the in-place version raised KeyError on a second execution
# because the column was already gone.
train_df = train_df.drop(columns="id", errors="ignore")

In [5]:
# Cast every object-dtype column to pandas' categorical dtype, first in the
# training frame and then in the test frame, so that categorical features can
# later be picked out by dtype.
for frame in (train_df, test_df):
    for col in frame.columns:
        if frame[col].dtypes == "object":
            frame[col] = frame[col].astype("category")

In [6]:
# Separate the target column from the predictors.
target_col = "high_income"
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

In [7]:
# Split the predictor names by dtype: columns with the categorical dtype go in one
# list, every other column is treated as numeric. Column order is preserved.
categorical_features = X.select_dtypes(include="category").columns.tolist()
numerical_features = X.select_dtypes(exclude="category").columns.tolist()

## Pipeline

In [8]:

# Numeric features: replace missing values with the column mean, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer_num", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old spelling. Try the current keyword first and fall back
# to the legacy one so this cell runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical features: fill gaps with the most frequent level, then one-hot
# encode into a dense matrix. Levels not seen during fit are encoded as all zeros
# (handle_unknown="ignore") instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer_cat", SimpleImputer(strategy="most_frequent")),
        ("encoder", onehot_encoder),
    ]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Full prediction pipeline: preprocessing followed by an entropy-criterion decision
# tree. The tree starts with ccp_alpha=0; the grid searches further down override it
# through the `classifier__ccp_alpha` parameter, so the step names must stay as they are.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", DecisionTreeClassifier(ccp_alpha=0, random_state=1, criterion="entropy")),
    ]
)

# Render estimators as a diagram, then display the assembled pipeline.
set_config(display="diagram")
clf

In [9]:
# Fit the preprocessing steps and the tree (still at ccp_alpha=0) on the full training data.
clf.fit(X=X, y=y)

In [16]:
# Display the final pipeline step, i.e. the fitted decision tree.
clf.named_steps["classifier"]

## Cost-complexity pruning

In [18]:
# The tree step works on the preprocessor's output, not on the raw frame, so run
# X through every step except the last before asking the tree for its
# cost-complexity pruning path.
fitted_tree = clf[-1]
X_preprocessed = clf[:-1].transform(X)
path = fitted_tree.cost_complexity_pruning_path(X_preprocessed, y)

In [49]:

# Candidate pruning strengths taken from the pruning path.
alphas = path["ccp_alphas"]

# Cross-validating every candidate would be too slow, so keep only every 450th
# one (positions 0, 450, 900, ...) and discard alpha == 0, which is the value the
# pipeline's tree was already built with.
alpha_stride = 450
alphas_new = [alpha for alpha in alphas[::alpha_stride] if alpha != 0]
len(alphas_new)

7

In [50]:
# Grid search over the sub-sampled pruning strengths. Folds are shuffled with a
# fixed seed so the split is reproducible; candidates are ranked by accuracy.
hyper_param = {"classifier__ccp_alpha": alphas_new}
cv_folds = KFold(n_splits=5, shuffle=True, random_state=1)
decision_tree_pipe_cv = GridSearchCV(
    estimator=clf,
    param_grid=hyper_param,
    scoring="accuracy",
    cv=cv_folds,
    n_jobs=-1,
    verbose=1,
)

In [51]:
# Run the cross-validated search; each candidate refits the whole pipeline per fold.
decision_tree_pipe_cv.fit(X=X, y=y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  9.3min finished


In [52]:
# Tabulate the per-candidate timings and fold scores of the search above.
coarse_search_results = pd.DataFrame(decision_tree_pipe_cv.cv_results_)
coarse_search_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__ccp_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,60.747044,1.238487,0.26487,0.006724,3.6e-05,{'classifier__ccp_alpha': 3.620981231694854e-05},0.811939,0.81258,0.807985,0.81064,0.808992,0.810427,0.001732,7
1,64.263975,1.201488,0.290639,0.008542,4.4e-05,{'classifier__ccp_alpha': 4.3991857282162366e-05},0.814045,0.813953,0.809999,0.811098,0.811189,0.812057,0.001641,6
2,65.549243,0.169792,0.303983,0.013287,5e-05,{'classifier__ccp_alpha': 5.0449347192914254e-05},0.816334,0.815418,0.812655,0.814303,0.813204,0.814383,0.001361,5
3,65.884993,0.262634,0.296547,0.008975,5.9e-05,{'classifier__ccp_alpha': 5.942667602755199e-05},0.818623,0.818074,0.8165,0.819522,0.816409,0.817826,0.001212,4
4,64.737191,0.214026,0.281799,0.011023,7e-05,{'classifier__ccp_alpha': 7.038927482465402e-05},0.823292,0.825398,0.823093,0.823551,0.816317,0.82233,0.003117,3
5,64.803123,0.356387,0.275144,0.009828,9e-05,{'classifier__ccp_alpha': 8.973606075186131e-05},0.832174,0.835927,0.836187,0.830418,0.828953,0.832732,0.002901,2
6,48.81394,12.60563,0.218506,0.053195,0.000239,{'classifier__ccp_alpha': 0.00023903453246295128},0.845266,0.842794,0.849647,0.847541,0.842963,0.845643,0.002649,1


In [59]:
# The previous search ranked its largest alpha (~0.000239) first, i.e. the optimum
# sat on the edge of the grid. Probe five evenly spaced values just above it.
alphas_new = np.linspace(start=0.00026, stop=0.0005, num=5)
hyper_param = {"classifier__ccp_alpha": alphas_new}

# Same search configuration as before: shuffled 5-fold CV with a fixed seed,
# accuracy as the ranking metric, all available cores.
cv_folds = KFold(n_splits=5, shuffle=True, random_state=1)
decision_tree_pipe_cv = GridSearchCV(
    estimator=clf,
    param_grid=hyper_param,
    scoring="accuracy",
    cv=cv_folds,
    n_jobs=-1,
    verbose=1,
)
decision_tree_pipe_cv.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  8.1min finished


In [60]:
# Tabulate the per-candidate timings and fold scores of the refined search.
refined_search_results = pd.DataFrame(decision_tree_pipe_cv.cv_results_)
refined_search_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__ccp_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,78.497126,0.680255,0.337523,0.015351,0.00026,{'classifier__ccp_alpha': 0.00026},0.845633,0.845083,0.850105,0.84745,0.842505,0.846155,0.002531,1
1,77.344055,2.901553,0.328314,0.013298,0.00032,{'classifier__ccp_alpha': 0.00031999999999999997},0.843802,0.843985,0.846626,0.845252,0.842231,0.844379,0.001478,2
2,76.642316,2.481824,0.463004,0.162874,0.00038,{'classifier__ccp_alpha': 0.00038},0.84252,0.841604,0.842688,0.842597,0.841864,0.842255,0.000436,3
3,82.271126,1.648139,0.436431,0.190597,0.00044,{'classifier__ccp_alpha': 0.00044},0.839132,0.842428,0.838202,0.841681,0.840308,0.84035,0.001561,4
4,58.277561,17.333712,0.257414,0.051038,0.0005,{'classifier__ccp_alpha': 0.0005},0.837576,0.841238,0.837927,0.839026,0.837561,0.838665,0.001393,5
