## Exploring the Data

In [1]:
# Import libraries 
import numpy as np
import pandas as pd
from time import time
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display # Allows the use of display() for DataFrames

# Import supplementary visualization code visuals.py
import visuals as vs
# Pretty display for notebooks
%matplotlib inline
# Read the census records from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="us census data.csv")
# Preview the first ten records
display(data.head(10))

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,capital,income
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,2174,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,0,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,0,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,0,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,0,<=50K
5,37,Private,284582,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,40,United-States,0,<=50K
6,49,Private,160187,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,16,Jamaica,0,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,0,>50K
8,31,Private,45781,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,50,United-States,14084,>50K
9,42,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,5178,>50K


In [6]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

(48842, 13)

In [None]:
# Boolean masks selecting each income class
earns_more_than_50k = data['income'] == '>50K'
earns_at_most_50k = data['income'] == '<=50K'

# Record counts: overall, and per income class
n_records = len(data)
n_greater_50k = len(data[earns_more_than_50k])
n_at_most_50k = len(data[earns_at_most_50k])

# Share of individuals above $50,000, expressed as a percentage of all records
greater_percent = (n_greater_50k / n_records) * 100

# Report every figure using its message template
summary_lines = [
    ("Total number of records: {}", n_records),
    ("Individuals making more than $50,000: {}", n_greater_50k),
    ("Individuals making at most $50,000: {}", n_at_most_50k),
    ("Percentage of individuals making more than $50,000: {}%", greater_percent),
]
for template, value in summary_lines:
    print(template.format(value))

## Data Preprocessing 

In [None]:
# '?' is treated as a missing-value marker: turn it into NaN ...
data = data.replace(to_replace='?', value=np.nan)
# ... and discard every row that contains at least one missing value
data.dropna(axis=0, how='any', inplace=True)

In [None]:
# Gain is the positive part of 'capital'; zero and negative values become 0.
# Vectorized with clip() instead of a Python-level loop over every row.
# NOTE(review): assumes 'capital' is integer-valued, as in the preview above — confirm.
data['capital-gain'] = data['capital'].clip(lower=0)

In [None]:
# Loss is the magnitude of the negative part of 'capital'; zero and positive values become 0.
# Negating first and clipping at 0 is the vectorized form of "abs(x) if x < 0 else 0".
# NOTE(review): assumes 'capital' is integer-valued, as in the preview above — confirm.
data['capital-loss'] = (-data['capital']).clip(lower=0)

In [None]:
# 'capital' has been split into 'capital-gain' / 'capital-loss'; remove the original column
data.drop(columns=['capital'], inplace=True)

In [None]:
# 'fnlwgt' is not used as a feature below; remove it from the frame
data.drop(columns=['fnlwgt'], inplace=True)

In [None]:
# Ordinal code for each education level, listed from lowest to highest.
# Assoc-voc (11) ranks just below Assoc-acdm (12), matching the education-num
# coding of the UCI Adult dataset, so every level has a distinct value.
education_to_num = {
    'Preschool': 1.0, '1st-4th': 2.0, '5th-6th': 3.0, '7th-8th': 4.0,
    '9th': 5.0, '10th': 6.0, '11th': 7.0, '12th': 8.0,
    'HS-grad': 9.0, 'Some-college': 10.0, 'Assoc-voc': 11.0, 'Assoc-acdm': 12.0,
    'Bachelors': 13.0, 'Masters': 14.0, 'Prof-school': 15.0, 'Doctorate': 16.0,
}
# map() always yields a float column; a label missing from the table becomes NaN
# instead of silently staying a string inside a "numeric" feature.
education_num = data['education'].map(education_to_num)
if education_num.isna().any():
    raise ValueError("Unmapped education levels: {}".format(
        list(data.loc[education_num.isna(), 'education'].unique())))
data['education-num'] = education_num

In [None]:
# Separate the predictors from the target label
features_raw = data.drop(columns=['income'])
income_raw = data['income']
# Plot the skewed continuous features before any transformation
vs.distribution(data)

In [None]:
# Log-transform the skewed features
skewed = ['capital-gain', 'capital-loss']
# Work on an explicit copy: pd.DataFrame(data=<DataFrame>) does not copy by default,
# so the transformed frame could share state with features_raw and a re-run of this
# cell could apply the logarithm a second time.
features_log_transformed = features_raw.copy()
# log(x + 1) keeps zero amounts at zero and is defined for every non-negative value
features_log_transformed[skewed] = np.log(features_raw[skewed] + 1)
# Visualize the new log distributions
vs.distribution(features_log_transformed, transformed = True)

In [None]:
# Encode the income labels as integers: '<=50K' -> 0, '>50K' -> 1.
# A new Series is built instead of replacing values in place, so income_raw (and the
# 'income' column of `data` it was taken from) keep their string labels and earlier
# cells that compare against those labels still work when re-run.
income_codes = {'<=50K': 0, '>50K': 1}
income = income_raw.map(income_codes)
# map() turns any label outside the table into NaN; fail here with a clear message
# rather than handing a partially encoded target to the classifiers.
if income.isna().any():
    raise ValueError("Unexpected income labels: {}".format(
        list(income_raw[income.isna()].unique())))
income = income.astype(int)

In [None]:
# Initialize a scaler, then apply it to the numerical features
scaler = MinMaxScaler() # default=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
# Categorical column names; passed to CatBoostClassifier further down
cat_features=['workclass', 'education','marital-status', 'occupation','relationship', 'race', 'sex','native-country']
# Scale an explicit copy: pd.DataFrame(data=<DataFrame>) does not copy by default,
# so the scaled frame could otherwise share state with features_log_transformed.
features_log_minmax_transform = features_log_transformed.copy()
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# done inside train_predict, so test rows influence the min/max — confirm this is intended.
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

# Show an example of a record with scaling applied
display(features_log_minmax_transform.head(n = 5))

## Generate Final Datasets
Here we generate all the datasets we need and store them in a dictionary. In the first dataset, all the categorical variables are one-hot encoded. For the second dataset, we first bin the native-country variable into two categories and then one-hot encode all the categorical variables. Finally, we generate a dataset in which the categorical variables are left unencoded.

In [None]:
# Collect every dataset variant under a descriptive key
data_final={}

# Variant 1: every categorical column one-hot encoded, native-country included
features_final = pd.get_dummies(features_log_minmax_transform)
data_final['native_country_one_hot']=features_final

# Variant 2: categorical columns left unencoded (independent copy of the scaled frame)
data_final['catboost_data']=features_log_minmax_transform.copy()

# Variant 3: native-country collapsed to 'United-States' vs 'other', then one-hot encoded.
# Built on a copy so features_log_minmax_transform keeps its 'native-country' column
# and this cell can be re-run without a KeyError.
features_country_binned = features_log_minmax_transform.copy()
features_country_binned['new_native-country'] = [
    country if country == 'United-States' else 'other'
    for country in features_country_binned['native-country']
]
features_country_binned = features_country_binned.drop(columns=['native-country'])
native_country_bin = pd.get_dummies(features_country_binned)
data_final['native_country_bin']=native_country_bin

## Creating a Training and Predicting Pipeline

In [None]:
# Import two metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.model_selection import train_test_split
def train_predict(learner,features_final):
    '''
    Fit a learner on a fixed 80/20 split and report timing and quality metrics.

    The target is the module-level `income` Series; the split uses
    test_size = 0.2 and random_state = 0.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - features_final: the final dataset

    returns:
       - dict with the keys 'train_time', 'pred_time', 'acc_train',
         'acc_test', 'f_train' and 'f_test' (F-scores use beta = 0.5)
    '''
    # Hold out 20% of the rows for testing
    features_train, features_test, labels_train, labels_test = train_test_split(
        features_final, income, test_size = 0.2, random_state = 0)

    # Time the fit on the training portion
    fit_began = time()
    learner = learner.fit(features_train, labels_train)
    fit_seconds = time() - fit_began

    # Time the predictions; the figure covers the test AND the training predictions
    predict_began = time()
    test_predictions = learner.predict(features_test)
    train_predictions = learner.predict(features_train)
    predict_seconds = time() - predict_began

    # Accuracy and F-beta on both portions, gathered with the timings
    results = {
        'train_time': fit_seconds,
        'pred_time': predict_seconds,
        'acc_train': accuracy_score(labels_train, train_predictions),
        'acc_test': accuracy_score(labels_test, test_predictions),
        'f_train': fbeta_score(labels_train, train_predictions, beta=0.5),
        'f_test': fbeta_score(labels_test, test_predictions, beta=0.5),
    }

    # Success
    print("{} trained.".format(learner.__class__.__name__))
    return results

## Performance of GradientBoostingClassifier and CatBoost 

In [None]:
# Initialize the two models
clf_A = GradientBoostingClassifier(n_estimators=500)
clf_B = CatBoostClassifier(verbose=0, n_estimators=500,cat_features=cat_features,one_hot_max_size=2)

# Pair each learner with the dataset it is trained on. The pairing is explicit so that
# no estimator objects have to be compared with == to pick the dataset.
learner_datasets = [
    (clf_A, 'native_country_one_hot'),  # one-hot encoded categorical columns
    (clf_B, 'catboost_data'),           # unencoded categorical columns
]

# Collect results on the learners, keyed by the estimator's class name
results = {}
for clf, dataset_key in learner_datasets:
    clf_name = clf.__class__.__name__
    results[clf_name] = train_predict(clf, data_final[dataset_key])

# Run metrics visualization for the two models
vs.evaluate(results,'Performance Metrics for GradientBoostingClassifier and CatBoost')

In [None]:
# Show the metrics dictionary collected for the two classifiers
results

### Observations
- First, as we see, CatBoostClassifier has much lower training and prediction times than GradientBoostingClassifier. This is because the two classifiers use different encoding methods for categorical variables. For GradientBoostingClassifier, we use one-hot encoding, which increases the dimensionality of the features significantly when there are high-cardinality variables. For CatBoostClassifier, we use its built-in transformation of categorical features into numerical features: one-hot encoding is used only for variables with no more than two categories (`one_hot_max_size=2`); otherwise, the Borders method is used. As a result, the computational complexity is much lower, and so is the computation time.
- Second, the performance of the two classifiers is almost the same.

## Performance of GradientBoostingClassifier with Different Encoding Methods for the Variable native-country 

In [None]:
# Initialize the model
clf = GradientBoostingClassifier(n_estimators=500)
# Collect results on the datasets
results = {}
for data_name, dataset in data_final.items():
    # The 'catboost_data' entry keeps its categorical columns unencoded and is not part
    # of this comparison. It is skipped rather than deleted, so data_final stays intact
    # and this cell can be re-run without a KeyError.
    if data_name == 'catboost_data':
        continue
    results[data_name] = train_predict(clf, dataset)
# Run metrics visualization on the two dataset
vs.evaluate(results,'Performance Metrics for GradientBoostingClassifier with Different Encoding Methods for the Variable native-country')

In [None]:
# Show the metrics dictionary collected for the two native-country encodings
results

In [None]:
# Column names of the fully one-hot encoded dataset, and how many there are
encoded_one_hot = data_final['native_country_one_hot'].columns.tolist()
n_one_hot_features = len(encoded_one_hot)
print("{} total features after one-hot encoding.".format(n_one_hot_features))

# Same for the dataset whose native-country was binned before encoding
encoded_native_country_bin_one_hot = data_final['native_country_bin'].columns.tolist()
n_binned_features = len(encoded_native_country_bin_one_hot)
print("{} total features after native-country bin and one-hot encoding.".format(n_binned_features))

### Observations
- First, as we see, native_country_bin has much lower training and prediction times than native_country_one_hot. This is because, for native_country_bin, we reduce forty categories to only two by merging every category other than 'United-States' into a single bin before one-hot encoding. As a result, the computational complexity is reduced.
- Second, the performance of native_country_bin is slightly higher than that of native_country_one_hot. Merging all the categories other than 'United-States' into one bin removes some information, which can have the effect of reducing overfitting. This does not mean we will always get higher performance this way: because information is also lost, the result will depend on the data.