In [1]:
import pandas as pd

all_postings = pd.read_csv('processed_description.csv')


### Split validation, training and test set

In [2]:
# define target class and input text
target = all_postings['salary_bin'].astype('category')
text = all_postings['processed_description']

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# split data into 70% training, 10% validation and 20% testing set
tfidf = TfidfVectorizer().fit(text)
X_tfidf =  tfidf.transform(text)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, target, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125)


### TF-IDF Vectorization

In [4]:
# map salary bin to range
salary_info = {all_postings.salary_bin.unique()[i]: all_postings.salary_range.unique()[i] for i in range(len(all_postings.salary_bin.unique()))}
print(salary_info)

{1: '50k-100k', 0: '0-50k', 3: '150k+', 2: '100k-150k'}


## ML Models 
### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_curve, roc_auc_score, f1_score
import numpy as np


# tune the hyperparameter C using the validation set
C_values = [0.01, 0.1, 1, 10, 100]
val_accuracy = []
for C in C_values:
    logistic_regression = LogisticRegression(C=C, max_iter=1000, penalty='l2',multi_class='ovr')
    logistic_regression.fit(X_val, y_val)
    
    y_val_pred = logistic_regression.predict(X_val)
    val_accuracy.append(accuracy_score(y_val, y_val_pred))

best_C = C_values[np.argmax(val_accuracy)]

# fit the model on the training set using the best C
logistic_regression = LogisticRegression(C=C, max_iter=1000, penalty='l2',  multi_class='ovr')
logistic_regression.fit(X_train, y_train)


# evaluate the model on the test set
y_test_pred = logistic_regression.predict(X_test)
y_test_pred_proba = logistic_regression.predict_proba(X_test)


logreg_f1 = f1_score(y_test, y_test_pred, average='weighted')
logreg_auc = roc_auc_score(y_test, y_test_pred_proba, average='weighted', multi_class='ovr')

print(f'Logistic Regression F1 Score: {logreg_f1}')
print(f'Logistic Regression AUC Score: {logreg_auc}')



Logistic Regression F1 Score: 0.7019577414635976
Logistic Regression AUC Score: 0.8838291292022542


### SVM

In [10]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, roc_auc_score

class_weight = dict(all_postings['salary_bin'].value_counts(normalize=True))
# tune the hyperparameter C using the validation set
C_values = [0.01, 0.1, 1, 10, 100]
val_accuracy = []
for C in C_values:
    svm = LinearSVC(C=C, max_iter=10000, class_weight=class_weight, multi_class='ovr')
    svm.fit(X_val, y_val)
    
    y_val_pred = svm.predict(X_val)
    val_accuracy.append(accuracy_score(y_val, y_val_pred))

best_C = C_values[np.argmax(val_accuracy)]

# fit the model on the training set using the best C
svm = LinearSVC(C=best_C, max_iter=10000, class_weight=class_weight)
svm.fit(X_train, y_train)

# evaluate the model on the test set
y_test_pred = svm.predict(X_test)


svm_f1 = f1_score(y_test, y_test_pred, average='weighted')
svm_auc = roc_auc_score(y_test, y_test_pred_proba, average='weighted', multi_class='ovr')
print(f'Linear SVC F1 Score: {svm_f1}')
print(f'Linear SVC AUC Score: {svm_auc}')





Linear SVC F1 Score: 0.6939897027705659
Linear SVC AUC Score: 0.8838291292022542


### Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
clf = RandomForestClassifier(n_estimators=1000, max_depth=50, bootstrap=True)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred, average='weighted'))
print('AUC Score: ', roc_auc_score(y_test, clf.predict_proba(X_test), average='weighted', multi_class='ovr'))





Accuracy:  0.6333333333333333
F1 Score:  0.6128135289724997
AUC Score:  0.8876954041432455


### AdaBoost

### XGBoost


In [8]:

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

xgb = XGBClassifier(n_estimators=1000, learning_rate=0.1)
xgb.fit(X_train, y_train)
xgb.score(X_test, y_test)
y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('F1 Score: ', f1_score(y_test, y_pred, average='weighted'))
print('AUC Score: ', roc_auc_score(y_test, y_pred_proba, average='weighted', multi_class='ovr'))


Accuracy:  0.753558052434457
F1 Score:  0.7517011744654779
AUC Score:  0.9175324994001782
