# Binary Classification
- Algorithms & Methods
- Class Imbalances
- Model Selection

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.utils import parallel_backend
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from FeatureSelection import select_features
from Preprocessing import IncomePreprocess




In [2]:
# Load the raw train ("learn") and test census-income CSVs.
# header=None: the first row is read as data, so columns get integer labels.
train_data, test_data = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (
        '../data/census_income_learn.csv',
        '../data/census_income_test.csv',
    )
)

In [3]:
# Preprocessing: run the project's IncomePreprocess pipeline over both raw
# frames; it returns the processed train and test frames as a pair.
ip = IncomePreprocess()
# df, df_test = ip.label_features(train_data, test_data)
X_train, X_test = ip.preprocess(train_data, test_data)

In [4]:
# Sanity-check feature scales: per-column standard deviation of the training frame.
# wage_per_hour's std was huge in an earlier iteration (presumably before
# preprocessing rescaled it — TODO confirm).
# Only the last expression of a cell is displayed, so the former bare
# `X_train.head()` call above this line produced no output and was removed.
X_train.std(axis=0)

age                                          22.210001
class_of_work                                 0.867545
industry_code                                18.106401
occupation_code                              14.498128
education                                     4.675408
wage_per_hour                                 0.027708
enrolled_in_edu_inst_last_wk                  0.376869
marital_status                                2.798346
major_industry_code                           4.815376
major_occupation_code                         3.129627
race                                          0.866609
hispanic_origin                               2.074371
sex                                           0.499532
member_of_labor_union                         0.304504
reason_for_unemployment                       0.317726
full_or_part_time_employment_stat             1.202021
tax_filer_status                              1.392657
region of previous residence                  0.467167
state_of_p

In [5]:
# Split the response column 'y' off the feature frames.
# DataFrame.pop mutates the frame in place, so an unguarded second run of this
# cell would raise KeyError. The guards make the cell idempotent: once 'y' has
# been removed, the previously extracted Y_train / Y_test are left untouched.
if 'y' in X_train.columns:
    Y_train = X_train.pop('y')
if 'y' in X_test.columns:
    Y_test = X_test.pop('y')

In [6]:
# Initializing models
# Fixed seed so the stochastic estimators give the same result on every
# Restart & Run All. KNeighborsClassifier takes no random_state.
RANDOM_STATE = 42

kn = KNeighborsClassifier()
rf = RandomForestClassifier(random_state=RANDOM_STATE)
lgm = LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)

In [7]:
# Class-balance check: distinct response values in the training labels and
# how many rows carry each one.
unique, counts = np.unique(Y_train, return_counts=True)
print(f"Classes of response: {unique}, Count per class: {counts}")

Classes of response: [0 1], Count per class: [183912  12382]


In [8]:
# Fit the logistic-regression model on the training features and labels.
# As the last expression of the cell, the fitted estimator is also displayed.
lgm.fit(X_train, Y_train)

In [13]:
# Evaluate the fitted logistic regression on the held-out test split.
y_pred = lgm.predict(X_test)
print('LogisticRegression Results')
print('---------------------------')
print('Confusion matrix \n', confusion_matrix(Y_test, y_pred))
# Label typo fixed ("Classifcation" -> "Classification").
print('Classification report \n', classification_report(Y_test, y_pred))
# NOTE: the classes are heavily imbalanced, so overall accuracy is dominated by
# the majority class; read it together with the per-class precision/recall above.
print('Accuracy', accuracy_score(Y_test, y_pred))

LogisticRegression Results
---------------------------
Confusion matrix 
 [[91676  1017]
 [ 4144  2042]]
Classifcation report 
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     92693
           1       0.67      0.33      0.44      6186

    accuracy                           0.95     98879
   macro avg       0.81      0.66      0.71     98879
weighted avg       0.94      0.95      0.94     98879

Accuracy 0.9478048928488354


In [13]:
# # Train a stacked ensemble (KNN + random forest + logistic regression) -- currently disabled
# estimators= [('kn', kn), ('rfc',rf), ('lgm',lgm)]

# with parallel_backend(backend="dask"):
#     stacked_model = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(solver="liblinear"))
#     stacked_model.fit(X_train,Y_train)

In [15]:
# Feature selection via select_features (currently disabled); the call returns an estimator and a score
# from FeatureSelection import select_features
# estimator, score = select_features(X_train, Y_train)