# Binary Classification
- Algorithms & Methods
- Class Imbalances
- Model Selection

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
from Preprocessing import IncomePreprocess
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.utils import parallel_backend
from dask.distributed import Client, progress
import joblib
# Start the in-process Dask cluster that backs joblib's 'dask' backend.
# Re-executing this cell used to spin up a second cluster alongside the first
# (hence the "Perhaps you already have a cluster running?" port warning), so
# reuse the client that already exists in this kernel when there is one.
try:
    client = Client.current()
except ValueError:  # raised when no client has been created yet
    client = Client(processes=False, threads_per_worker=4,
                    n_workers=1, memory_limit='8GB')

Dask needs bokeh >= 2.4.2, < 3 for the dashboard.
You have bokeh==2.2.1.
Continuing without the dashboard.
Perhaps you already have a cluster running?
Hosting the HTTP server on port 65360 instead


In [3]:
# Load the raw census income train and test CSVs. The files are read with
# header=None, i.e. the first line is treated as data, not as column names.
train_data, test_data = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('../data/census_income_learn.csv',
                     '../data/census_income_test.csv')
)

In [4]:
# Preprocessing step
# IncomePreprocess is a project-local helper imported from the Preprocessing module.
ip  = IncomePreprocess()
# label_features receives the raw train/test frames and returns a pair of frames;
# presumably it assigns column names to the header-less CSV data — TODO confirm.
df, df_test = ip.label_features(train_data, test_data)
# preprocess returns the modelling frames. They still contain the response
# column 'y' at this point; it is separated from the features in a later cell.
X_train, X_test = ip.preprocess(df, df_test)

In [5]:
# Preview the first rows of the preprocessed training frame as a sanity check.
X_train.head()

Unnamed: 0,age,class_of_work,industry_code,occupation_code,education,wage_per_hour,enrolled_in_edu_inst_last_wk,marital_status,major_industry_code,major_occupation_code,...,own_business_or_self_employed,fill_inc_questionnaire_for_veterans_admin,veteran_benefits,weeks_worked_in_year,year,y,has_gains,has_losses,has_stock,origin
0,73,0,0,0,9,0,2,4,14,6,...,0,1,2,0,1,0,0.0,0.0,0.0,1
1,58,3,4,34,10,0,2,5,4,8,...,0,1,2,52,0,0,0.0,0.0,0.0,1
2,18,0,0,0,6,0,1,7,14,6,...,0,1,2,0,1,0,0.0,0.0,0.0,2
3,9,0,0,0,0,0,2,7,14,6,...,0,1,0,0,0,0,0.0,0.0,0.0,1
4,10,0,0,0,0,0,2,7,14,6,...,0,1,0,0,0,0,0.0,0.0,0.0,1


In [6]:
# Splitting the features from the response.
# DataFrame.pop removes the column in place, so an unguarded second run of this
# cell would raise KeyError('y'). The guards make the cell safe to re-execute:
# once 'y' has been popped, the previously extracted response is kept as is.
if 'y' in X_train.columns:
    Y_train = X_train.pop('y')
if 'y' in X_test.columns:
    Y_test = X_test.pop('y')

In [6]:
# Display the training response Series that was split off from X_train.
Y_train

0         0
1         0
2         0
3         0
4         0
         ..
199518    0
199519    0
199520    0
199521    0
199522    0
Name: y, Length: 196294, dtype: int32

In [8]:
# Initializing models
# A fixed seed keeps the stochastic estimators (random forest, liblinear
# logistic regression) reproducible across Restart & Run All.
# KNeighborsClassifier has no random_state parameter.
RANDOM_STATE = 42
kn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=RANDOM_STATE)
lgm = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)

In [9]:
# Tally how many training rows fall into each response class to gauge the
# class imbalance before modelling.
unique, counts = np.unique(Y_train, return_counts=True)
print(f"Classes of response: {unique}, Count per class: {counts}")

Classes of response: [0 1], Count per class: [183912  12382]


In [9]:
# parallel_backend(backend="threading")

In [10]:
# Fit the standalone logistic regression, with joblib's parallelism routed
# through the 'dask' backend for the duration of the fit.
# NOTE(review): the liblinear solver may not parallelise a binary fit, so the
# backend may have no effect here — confirm.
with joblib.parallel_backend('dask'):
    lgm.fit(X_train, Y_train)



In [11]:
# Train a stacked ensemble: KNN, random forest and logistic regression are the
# base learners, and a liblinear logistic regression is the meta-learner.
estimators = [('kn', kn), ('rfc', rf), ('lgm', lgm)]

# Use joblib's context manager directly, as the logistic-regression cell does.
# sklearn.utils.parallel_backend was only a re-export of it and is deprecated
# in recent scikit-learn releases.
with joblib.parallel_backend('dask'):
    stacked_model = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(solver="liblinear"),
    )
    stacked_model.fit(X_train, Y_train)

In [None]:
# Fitting to some sklearn binary classifiers
# from FeatureSelection import select_features
# estimator, score = select_features(X_train, Y_train)