# H-1B Visa Petition
## 05:Classical Model Tuning
**Author:** Prasoon Karmacharya

In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

from sklearn.metrics import (accuracy_score, confusion_matrix, plot_confusion_matrix,
                             ConfusionMatrixDisplay, classification_report, 
                             f1_score, recall_score, precision_score, 
                             roc_curve, roc_auc_score, hamming_loss, jaccard_score)

RANDOM_STATE = 42


from imblearn.datasets import make_imbalance
from imblearn.under_sampling import RandomUnderSampler

In [2]:
def evaluate_model(X_train, y_train, X_val, y_val, X_test, y_test, y_pred_val, y_pred_test, classifier, clf):
    print("-----------------------------------------------")
    print("Clf: ", classifier.__class__.__name__)
    model_results = {}
    
    model_results['Classifier'] = classifier.__class__.__name__
    
    model_results['Accuracy (train)'] = clf.score(X_train, y_train)
    
    # Validation Performance
    model_results['Accuracy (val)'] = clf.score(X_val, y_val)
    model_results['Accuracy (test)'] = clf.score(X_test, y_test)
    
    model_results['Recall (val)'] = recall_score(y_val, y_pred_val, average='weighted', labels=np.unique(y_pred_val))
    model_results['Precision (val)'] = precision_score(y_val, y_pred_val, average='weighted', labels=np.unique(y_pred_val))
    model_results['F1-score (val)'] = f1_score(y_val, y_pred_val, average='weighted', labels=np.unique(y_pred_val))
    model_results['AUC-ROC (val)'] = roc_auc_score(y_val, y_pred_val)
    model_results['Jacard score (val)'] = jaccard_score(y_val, y_pred_val,average='weighted', labels=np.unique(y_pred_test))
    model_results['Hamming Loss (val)'] = hamming_loss(y_val, y_pred_val)
    
    model_results['CV Accuracy'] = cross_val_score(clf, X_train, y_train, cv=5).mean()
    
    # Test Performance
    model_results['Recall (test)'] = recall_score(y_test, y_pred_test, average='weighted', labels=np.unique(y_pred_test))
    model_results['Precision (test)'] = precision_score(y_test, y_pred_test, average='weighted', labels=np.unique(y_pred_test))
    model_results['F1-score (test)'] = f1_score(y_test, y_pred_test, average='weighted', labels=np.unique(y_pred_test))
    model_results['AUC-ROC (test)'] = roc_auc_score(y_test, y_pred_test)
    model_results['Jacard score (test)'] = jaccard_score(y_test, y_pred_test, average='weighted', labels=np.unique(y_pred_test))
    model_results['Hamming Loss (test)'] = hamming_loss(y_test, y_pred_test)

    return model_results


def run_model(classifier, X_train, y_train, X_val, y_val, X_test, y_test):
    model = classifier
    model.fit(X_train, y_train)
    y_pred_val = model.predict(X_val)
    y_pred_test = model.predict(X_test)
    
    return model, y_pred_val, y_pred_test

In [3]:
# Read Data

In [4]:
data = pd.read_pickle("../assets/data/cleaned_h1b.pkl")

In [5]:
data.head()

Unnamed: 0,CASE_STATUS,EMPLOYER_NAME,SOC_NAME,JOB_TITLE,FULL_TIME_POSITION,PREVAILING_WAGE,YEAR,WORKSITE,YEAR_coded,SOC_NAME_coded
0,1,UNIVERSITY OF MICHIGAN,BIOCHEMISTS AND BIOPHYSICISTS,POSTDOCTORAL RESEARCH FELLOW,0,36067.0,2016.0,"ANN ARBOR, MICHIGAN",5,192
1,1,"GOODMAN NETWORKS, INC.",CHIEF EXECUTIVES,CHIEF OPERATING OFFICER,1,242674.0,2016.0,"PLANO, TEXAS",5,297
2,1,"PORTS AMERICA GROUP, INC.",CHIEF EXECUTIVES,CHIEF PROCESS OFFICER,1,193066.0,2016.0,"JERSEY CITY, NEW JERSEY",5,297
3,1,"GATES CORPORATION, A WHOLLY-OWNED SUBSIDIARY O...",CHIEF EXECUTIVES,"REGIONAL PRESIDEN, AMERICAS",1,220314.0,2016.0,"DENVER, COLORADO",5,297
5,1,BURGER KING CORPORATION,CHIEF EXECUTIVES,"EXECUTIVE V P, GLOBAL DEVELOPMENT AND PRESIDEN...",1,225000.0,2016.0,"MIAMI, FLORIDA",5,297


In [6]:
feature_cols = ['FULL_TIME_POSITION', 'PREVAILING_WAGE', 'YEAR_coded', 'SOC_NAME_coded']
target_col = ['CASE_STATUS']

* Split the data into dev, validation, and test set 60:10:10

In [7]:
# dev, test = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)
dev, test = train_test_split(data, test_size=0.15, random_state=RANDOM_STATE)

In [8]:
train, val = train_test_split(dev, test_size=0.15, random_state=RANDOM_STATE)

In [9]:
train.shape[0]/data.shape[0]

0.7224995916862134

In [10]:
val.shape[0]/data.shape[0]

0.1275002529301046

In [11]:
test.shape[0]/data.shape[0]

0.15000015538368203

In [12]:
X_train = train[feature_cols]
y_train = train[target_col]

X_val = val[feature_cols]
y_val = val[target_col]

X_test = test[feature_cols]
y_test = test[target_col]

In [13]:
y_train["CASE_STATUS"].value_counts(normalize=True)

1    0.967274
0    0.032726
Name: CASE_STATUS, dtype: float64

In [14]:
y_train["CASE_STATUS"].value_counts()

1    2023925
0      68475
Name: CASE_STATUS, dtype: int64

In [15]:
68475*(.75/.25)

205425.0

In [16]:
sampling_strategy = {0:68475, 1:205425}
X_train_balanced, y_train_balanced = make_imbalance(X_train, 
                                                    y_train, 
                                                    sampling_strategy=sampling_strategy, 
                                                    random_state=RANDOM_STATE)

In [17]:
y_train_balanced["CASE_STATUS"].value_counts(normalize=True)

1    0.75
0    0.25
Name: CASE_STATUS, dtype: float64

In [18]:
def evaluate_model(clf, X_train, y_train, X_val, y_val, X_test, y_test, y_pred_val, y_pred_test):
    print("-----------------------------------------------")

    model_results = {}
    
    model_results['Classifier'] = clf.__class__.__name__
    
    model_results['Best Parameters'] = clf.best_params_
    
    model_results['Accuracy (train)'] = clf.score(X_train, y_train)
    
    # Validation Performance
    model_results['Accuracy (val)'] = clf.score(X_val, y_val)
    model_results['Accuracy (test)'] = clf.score(X_test, y_test)
    
    model_results['Recall (val)'] = recall_score(y_val, y_pred_val, average='weighted', labels=np.unique(y_pred_val))
    model_results['Precision (val)'] = precision_score(y_val, y_pred_val, average='weighted', labels=np.unique(y_pred_val))
    model_results['F1-score (val)'] = f1_score(y_val, y_pred_val, average='weighted', labels=np.unique(y_pred_val))
    model_results['AUC-ROC (val)'] = roc_auc_score(y_val, y_pred_val)
    model_results['Jacard score (val)'] = jaccard_score(y_val, y_pred_val,average='weighted', labels=np.unique(y_pred_test))
    model_results['Hamming Loss (val)'] = hamming_loss(y_val, y_pred_val)
    
    model_results['CV Accuracy'] = cross_val_score(clf, X_train, y_train, cv=5).mean()
    
    # Test Performance
    model_results['Recall (test)'] = recall_score(y_test, y_pred_test, average='weighted', labels=np.unique(y_pred_test))
    model_results['Precision (test)'] = precision_score(y_test, y_pred_test, average='weighted', labels=np.unique(y_pred_test))
    model_results['F1-score (test)'] = f1_score(y_test, y_pred_test, average='weighted', labels=np.unique(y_pred_test))
    model_results['AUC-ROC (test)'] = roc_auc_score(y_test, y_pred_test)
    model_results['Jacard score (test)'] = jaccard_score(y_test, y_pred_test, average='weighted', labels=np.unique(y_pred_test))
    model_results['Hamming Loss (test)'] = hamming_loss(y_test, y_pred_test)   
    
    return model_results

## Model Tunning one at a time

### Model 1: Logistic Regression

In [23]:
from pathlib import Path
logistic_reg_model_benchmark_file_path = Path("../assets/model_performance/01_logistic_regression_model_performance.csv")

if logistic_reg_model_benchmark_file_path.is_file():
    model_benchmark = pd.read_csv(logistic_reg_model_benchmark_file_path)
else:
    model_benchmark = pd.DataFrame(columns=['Classifier', 'Best Parameters', 'Accuracy (train)', 'Accuracy (val)', 'Accuracy (test)', 'CV Accuracy', 
                                    'Recall (val)', 'Precision (val)', 'F1-score (val)', 'AUC-ROC (val)', 'Jacard score (val)', 'Hamming Loss (val)',
                                    'Recall (test)', 'Precision (test)', 'F1-score (test)', 'AUC-ROC (test)', 'Jacard score (test)', 'Hamming Loss (test)'
                                   ])

In [24]:
model_benchmark

Unnamed: 0,Classifier,Best Parameters,Accuracy (train),Accuracy (val),Accuracy (test),CV Accuracy,Recall (val),Precision (val),F1-score (val),AUC-ROC (val),Jacard score (val),Hamming Loss (val),Recall (test),Precision (test),F1-score (test),AUC-ROC (test),Jacard score (test),Hamming Loss (test)
0,GridSearchCV,"{'C': 0.0001, 'penalty': 'l2'}",0.859083,0.983884,0.983936,0.859082,0.968298,0.968145,0.953274,0.508848,0.937622,0.031702,0.968396,0.968465,0.953378,0.508269,0.937807,0.031604
1,GridSearchCV,"{'C': 1e-05, 'penalty': 'l2'}",0.859083,0.983884,0.983936,0.859082,0.968298,0.968145,0.953274,0.508848,0.937622,0.031702,0.968396,0.968465,0.953378,0.508269,0.937807,0.031604
2,GridSearchCV,"{'C': 1e-05, 'penalty': 'none'}",0.859083,0.983884,0.983936,0.859082,0.968298,0.968145,0.953274,0.508848,0.937622,0.031702,0.968396,0.968465,0.953378,0.508269,0.937807,0.031604
3,GridSearchCV,"{'C': 1e-05, 'class_weight': 'none', 'penalty'...",0.859083,0.983884,0.983936,0.859082,0.968298,0.968145,0.953274,0.508848,0.937622,0.031702,0.968396,0.968465,0.953378,0.508269,0.937807,0.031604
4,GridSearchCV,"{'C': 1e-05, 'class_weight': 'balanced', 'pena...",0.0,0.0,0.0,0.0,1.0,0.032252,0.062489,0.5,0.032252,0.967748,1.0,0.03212,0.06224,0.5,0.03212,0.96788
5,GridSearchCV,"{'C': 0.01, 'penalty': 'l1', 'solver': 'liblin...",0.861858,0.980864,0.981059,0.861751,0.962529,0.945757,0.952617,0.530259,0.933194,0.037471,0.962904,0.946179,0.952951,0.530515,0.933693,0.037096


In [25]:
# parameters for Gridsearch

lr_params = {
    'penalty' : ['l1', 'none'],
    'C': np.linspace(0.000001,0.001,5),
    'solver': ['liblinear', 'saga' ]
    
}

lr = LogisticRegression()

lr_GS = GridSearchCV(lr, lr_params, scoring='f1', cv=5, verbose=True, n_jobs=-1)

lr_GS.fit(X_train_balanced, y_train_balanced)

y_pred_val = lr_GS.predict(X_val)
y_pred_test = lr_GS.predict(X_test)


# Add to the history
evaluation_result = evaluate_model(clf=lr_GS, 
                       X_train = X_train_balanced, 
                       y_train = y_train_balanced,
                       X_val = X_val,
                       y_val = y_val,
                       X_test = X_test, 
                       y_test = y_test,
                       y_pred_val = y_pred_val, 
                       y_pred_test = y_pred_test)




Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.5min finished
  return f(**kwargs)


-----------------------------------------------
Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished
  return f(**kwargs)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished
  return f(**kwargs)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished
  return f(**kwargs)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished
  return f(**kwargs)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.2min finished
  return f(**kwargs)


In [26]:
model_benchmark = model_benchmark.append(evaluation_result, ignore_index=True)


In [27]:
model_benchmark

Unnamed: 0,Classifier,Best Parameters,Accuracy (train),Accuracy (val),Accuracy (test),CV Accuracy,Recall (val),Precision (val),F1-score (val),AUC-ROC (val),Jacard score (val),Hamming Loss (val),Recall (test),Precision (test),F1-score (test),AUC-ROC (test),Jacard score (test),Hamming Loss (test)
0,GridSearchCV,"{'C': 0.0001, 'penalty': 'l2'}",0.859083,0.983884,0.983936,0.859082,0.968298,0.968145,0.953274,0.508848,0.937622,0.031702,0.968396,0.968465,0.953378,0.508269,0.937807,0.031604
1,GridSearchCV,"{'C': 1e-05, 'penalty': 'l2'}",0.859083,0.983884,0.983936,0.859082,0.968298,0.968145,0.953274,0.508848,0.937622,0.031702,0.968396,0.968465,0.953378,0.508269,0.937807,0.031604
2,GridSearchCV,"{'C': 1e-05, 'penalty': 'none'}",0.859083,0.983884,0.983936,0.859082,0.968298,0.968145,0.953274,0.508848,0.937622,0.031702,0.968396,0.968465,0.953378,0.508269,0.937807,0.031604
3,GridSearchCV,"{'C': 1e-05, 'class_weight': 'none', 'penalty'...",0.859083,0.983884,0.983936,0.859082,0.968298,0.968145,0.953274,0.508848,0.937622,0.031702,0.968396,0.968465,0.953378,0.508269,0.937807,0.031604
4,GridSearchCV,"{'C': 1e-05, 'class_weight': 'balanced', 'pena...",0.0,0.0,0.0,0.0,1.0,0.032252,0.062489,0.5,0.032252,0.967748,1.0,0.03212,0.06224,0.5,0.03212,0.96788
5,GridSearchCV,"{'C': 0.01, 'penalty': 'l1', 'solver': 'liblin...",0.861858,0.980864,0.981059,0.861751,0.962529,0.945757,0.952617,0.530259,0.933194,0.037471,0.962904,0.946179,0.952951,0.530515,0.933693,0.037096
6,GridSearchCV,"{'C': 0.001, 'penalty': 'l1', 'solver': 'libli...",0.861506,0.98217,0.98219,0.861474,0.965029,0.947314,0.953521,0.525747,0.935432,0.034971,0.965065,0.947191,0.953564,0.524841,0.935538,0.034935


In [28]:
model_benchmark.to_csv("../assets/model_performance/01_logistic_regression_model_performance.csv", index=False)

### Model 2: Linear SVC

In [30]:
if Path("../assets/model_performance/02_linear_SVC_model_performance.csv").is_file():
    model_benchmark = pd.read_csv(linear_SVC_model_benchmark_file_path)
else:
    model_benchmark = pd.DataFrame(columns=['Classifier', 'Best Parameters', 'Accuracy (train)', 'Accuracy (val)', 'Accuracy (test)', 'CV Accuracy', 
                                    'Recall (val)', 'Precision (val)', 'F1-score (val)', 'AUC-ROC (val)', 'Jacard score (val)', 'Hamming Loss (val)',
                                    'Recall (test)', 'Precision (test)', 'F1-score (test)', 'AUC-ROC (test)', 'Jacard score (test)', 'Hamming Loss (test)'
                                   ])

In [31]:
model_benchmark

Unnamed: 0,Classifier,Best Parameters,Accuracy (train),Accuracy (val),Accuracy (test),CV Accuracy,Recall (val),Precision (val),F1-score (val),AUC-ROC (val),Jacard score (val),Hamming Loss (val),Recall (test),Precision (test),F1-score (test),AUC-ROC (test),Jacard score (test),Hamming Loss (test)


In [36]:
# parameters for Gridsearch

SVC_params = {
    'penalty' : ['l2'],
    'C': np.linspace(0.000001,0.0001,5),
    'loss': ['hinge']
    
}

SVC = LinearSVC(random_state=0, tol=1e-05)

SVC_GS = GridSearchCV(SVC, SVC_params, scoring='f1', cv=5, verbose=True, n_jobs=-1)

SVC_GS.fit(X_train_balanced, y_train_balanced)

y_pred_val = SVC_GS.predict(X_val)
y_pred_test = SVC_GS.predict(X_test)


# Add to the history
evaluation_result = evaluate_model(clf=SVC_GS, 
                       X_train = X_train_balanced, 
                       y_train = y_train_balanced,
                       X_val = X_val,
                       y_val = y_val,
                       X_test = X_test, 
                       y_test = y_test,
                       y_pred_val = y_pred_val, 
                       y_pred_test = y_pred_test)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  7.7min finished
  return f(**kwargs)


-----------------------------------------------
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  5.7min finished
  return f(**kwargs)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  5.9min finished
  return f(**kwargs)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  5.9min finished
  return f(**kwargs)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  6.3min finished
  return f(**kwargs)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  6.4min finished
  return f(**kwargs)


In [37]:
model_benchmark = model_benchmark.append(evaluation_result, ignore_index=True)


In [38]:
model_benchmark

Unnamed: 0,Classifier,Best Parameters,Accuracy (train),Accuracy (val),Accuracy (test),CV Accuracy,Recall (val),Precision (val),F1-score (val),AUC-ROC (val),Jacard score (val),Hamming Loss (val),Recall (test),Precision (test),F1-score (test),AUC-ROC (test),Jacard score (test),Hamming Loss (test)
0,GridSearchCV,"{'C': 0.00025075000000000005, 'loss': 'hinge',...",0.857699,0.982235,0.982307,0.342999,0.965107,0.941325,0.951451,0.506104,0.934418,0.034893,0.965245,0.941252,0.951577,0.505498,0.934642,0.034755
1,GridSearchCV,"{'C': 0.0001, 'loss': 'hinge', 'penalty': 'l2'}",0.857746,0.982393,0.982452,0.171436,0.965411,0.941439,0.951545,0.505692,0.934683,0.034589,0.965523,0.941234,0.951642,0.504949,0.934875,0.034477


In [39]:
SVC_GS.best_params_

{'C': 0.0001, 'loss': 'hinge', 'penalty': 'l2'}

In [40]:
model_benchmark.to_csv("../assets/model_performance/02_linear_SVC_model_performance.csv", index=False)