# GOOD FAST CHEAP - 20 Feature Constraint - Modeling

---

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, \
ConfusionMatrixDisplay

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

from xgboost import XGBClassifier

---

## Reading cleaned data 

In [6]:
# Load the cleaned train and test CSVs; the first column of each file is used as the index
TRAIN_CSV = '../data/train_cleaned.csv'
TEST_CSV = '../data/test_cleaned.csv'

df = pd.read_csv(TRAIN_CSV, index_col=[0])
test = pd.read_csv(TEST_CSV, index_col=[0])

---

## Modeling

In [7]:
# Separate the 'wage' target from the feature columns
TARGET = 'wage'
y = df[TARGET]
X = df.drop(columns=[TARGET])

In [8]:
# Preview the first two feature rows (categorical columns are still un-encoded here)
X.head(2)

Unnamed: 0,age,workclass,education,education_num,marital_status,occupation,relationship,capital_gain,capital_loss,hours_per_week,native_country,is_male
0,39,Other,Other,13,Other,Other,2.0,2174,0,40,United-States,1
1,50,Other,Other,13,1.0,Group1,1.0,0,0,13,United-States,1


In [9]:
# Dummy-encode the categorical columns, dropping the first level of each.
# The encoding is rebuilt from `df` rather than from `X` itself so this cell
# is idempotent: `X = get_dummies(X, columns=...)` fails with a KeyError when
# re-run, because the listed columns have already been replaced by dummies.
X = pd.get_dummies(data=df.drop(columns=['wage']),
                   columns=['native_country', 'workclass', 'occupation',
                            'education', 'relationship', 'marital_status'],
                   drop_first=True)

In [10]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# seeded so it is repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [11]:
# Shapes of the split pieces, in the order X_train, X_test, y_train, y_test
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((26048, 15), (6513, 15), (26048,), (6513,))

In [12]:
# Check baseline: class proportions of the training target. The share of the
# most frequent class is the accuracy obtained by always predicting that class.
y_train.value_counts(normalize=True)

0    0.759175
1    0.240825
Name: wage, dtype: float64

In [13]:
# Standardize the features: fit the scaler on the training split only, then
# apply that same fitted scaler to both splits
sc = StandardScaler().fit(X_train)
Z_train, Z_test = sc.transform(X_train), sc.transform(X_test)

---

## Function to generate metrics

In [14]:
def predict_and_classify(model, train_split=None, test_split=None):
    """Print classification reports for `model` on a training and a testing split.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(features)``.
    train_split, test_split : tuple of (features, labels), optional
        Data to evaluate on. When omitted, the notebook-level
        ``(Z_train, y_train)`` and ``(Z_test, y_test)`` are used, which
        reproduces the original one-argument behaviour.

    Returns
    -------
    None
        The two reports are printed, not returned.
    """
    # Defaults are resolved at call time so the function always sees the
    # current notebook-level splits rather than a snapshot taken at definition.
    if train_split is None:
        train_split = (Z_train, y_train)
    if test_split is None:
        test_split = (Z_test, y_test)

    sections = (('Training', train_split), ('Testing', test_split))
    for position, (label, (features, labels)) in enumerate(sections):
        if position:
            print('')  # blank separator line between the two reports
        print('-----------------------')
        print(f'{label} Classification')
        print(classification_report(labels, model.predict(features)))

---

## Fitting LogReg

In [15]:
# Instantiate a logistic regression with all-default settings and fit it on
# the standardized training features
lr = LogisticRegression()
lr.fit(Z_train, y_train)

In [16]:
# Train/test classification reports for the logistic regression
predict_and_classify(lr)

-----------------------
Training Classification
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     19775
           1       0.73      0.58      0.65      6273

    accuracy                           0.85     26048
   macro avg       0.80      0.76      0.77     26048
weighted avg       0.84      0.85      0.84     26048


-----------------------
Testing Classification
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      4945
           1       0.75      0.60      0.66      1568

    accuracy                           0.85      6513
   macro avg       0.81      0.77      0.79      6513
weighted avg       0.85      0.85      0.85      6513



----

## Fitting XGBClassifier

In [17]:
# XGBoost classifier with all-default hyperparameters, fit on the same
# standardized training features as the logistic regression
xg_default = XGBClassifier()
xg_default.fit(Z_train,y_train)

In [18]:
# Train/test classification reports for the default XGBoost model
predict_and_classify(xg_default)

-----------------------
Training Classification
              precision    recall  f1-score   support

           0       0.90      0.95      0.93     19775
           1       0.82      0.67      0.74      6273

    accuracy                           0.88     26048
   macro avg       0.86      0.81      0.83     26048
weighted avg       0.88      0.88      0.88     26048


-----------------------
Testing Classification
              precision    recall  f1-score   support

           0       0.89      0.94      0.92      4945
           1       0.77      0.65      0.70      1568

    accuracy                           0.87      6513
   macro avg       0.83      0.79      0.81      6513
weighted avg       0.86      0.87      0.86      6513



---

## Fitting XGBClassifier (Fine Tuning)

In [19]:
# Hyperparameter grid for the XGBoost search: three candidate values for each
# of four parameters, i.e. 81 combinations in total
param_grid = dict(
    colsample_bytree=[0.3, 0.6, 0.9],
    learning_rate=[0.1, 0.05, 0.02],
    max_depth=[2, 3, 5],
    n_estimators=[100, 200, 500],
)

In [20]:
# Exhaustive grid search over `param_grid`, using `xg_default` as the
# estimator and all available cores (n_jobs=-1). `scoring` and `cv` are not
# passed, so scikit-learn's defaults apply.
# NOTE(review): the last cell of the notebook reports an F1 score for the
# submission. If F1 is the target metric, consider passing scoring='f1' here;
# confirm.
search_options = dict(param_grid=param_grid, n_jobs=-1, verbose=0)
gs = GridSearchCV(estimator=xg_default, **search_options)

In [21]:
# Run the grid search on the standardized training split (this is the
# expensive cell: every combination in `param_grid` is fitted)
gs.fit(Z_train, y_train)

In [22]:
# Print the hyperparameter combination the search selected and its best_score_
print(f"Best parameters : {gs.best_params_}")
print(f"Best score : {gs.best_score_}")

Best parameters : {'colsample_bytree': 0.3, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best score : 0.8681663841078245


In [23]:
# Build a classifier from the hyperparameters the grid search selected and fit
# it on the training split. `best_params_` holds exactly the four keys of
# `param_grid`, so unpacking it is equivalent to passing each one by name.
xg = XGBClassifier(**gs.best_params_)
xg.fit(Z_train,y_train)

In [24]:
# Train/test classification reports for the tuned XGBoost model
predict_and_classify(xg)

-----------------------
Training Classification
              precision    recall  f1-score   support

           0       0.89      0.95      0.92     19775
           1       0.80      0.64      0.71      6273

    accuracy                           0.87     26048
   macro avg       0.85      0.79      0.81     26048
weighted avg       0.87      0.87      0.87     26048


-----------------------
Testing Classification
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      4945
           1       0.79      0.64      0.71      1568

    accuracy                           0.87      6513
   macro avg       0.84      0.79      0.81      6513
weighted avg       0.87      0.87      0.87      6513



----

## Retrain on the whole dataset

In [38]:
# Fit a separate scaler on the full feature matrix X (no hold-out) for the final model
whole_sc = StandardScaler().fit(X)
Z = whole_sc.transform(X)

In [39]:
# Train the final model on the full scaled dataset using the hyperparameters
# selected by the grid search (the tuned values, not the XGBoost defaults).
# Unpacking `best_params_` keeps this in sync with whatever `param_grid` searched.
final_model = XGBClassifier(**gs.best_params_)
final_model.fit(Z,y)

---

## Generate Predictions

In [27]:
# Dummy-encode the test set using the same categorical columns as the training data
test = pd.get_dummies(data=test,
                      columns=['native_country', 'workclass', 'occupation',
                               'education', 'relationship', 'marital_status'],
                      drop_first=True)

# get_dummies only creates columns for the categories present in *this* frame,
# so the result can differ from the training features in membership or order.
# Align to the columns the scaler and model were fitted on: missing dummies
# are added as all-zero, unseen extras are dropped, and order is matched.
# NOTE(review): if the test set lacks the first level of a training category,
# drop_first removes a different level here; verify category sets match.
test = test.reindex(columns=X.columns, fill_value=0)

In [28]:
# Confirm the encoded test frame has the same number of columns as the training features
test.shape

(16281, 15)

In [31]:
# Scale the test features with the scaler that was fitted on the full training data.
# Despite its name, `test_sc` holds the scaled test data, not a scaler object.
test_sc = whole_sc.transform(test)

In [40]:
# Predict the wage class for every row of the scaled test set
y_pred = final_model.predict(test_sc)

In [41]:
# Wrap the predictions in a single-column frame named 'wage'
submission = pd.DataFrame(data=y_pred,columns=['wage'])

In [42]:
# Display the submission frame as a visual check before writing it out
submission

Unnamed: 0,wage
0,0
1,0
2,0
3,1
4,0
...,...
16276,0
16277,0
16278,1
16279,0


In [43]:
# Write the predictions to CSV without the row-index column
submission.to_csv('../data/kris-eye-predictions.csv',index=False)

In [44]:
# Final model is XGBoost with fine-tuned parameters
# Submission F1-score 91

---