# Task for Today  

***

## Interview Success Prediction  

Given *data about resumes*, let's try to predict whether a candidate will **pass their interview** based on their resume.

We will train three models to make our predictions, using PCA for dimensionality reduction.

# Getting Started

In [None]:
# ML modules
import pandas as pd
import numpy as np

# kaggle api
import os
import kagglehub
import shutil

# fetch the dataset through kagglehub and show where the files landed
path = kagglehub.dataset_download("vingkan/strategeion-resume-skills")
print("Path to dataset files:", path)

# list what the dataset ships with
files = os.listdir(path)
print("Files in dataset directory:", files)

# load the development and pilot resume tables into dataframes
dev_data, pilot_data = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ('resumes_development.csv', 'resumes_pilot.csv')
)

Path to dataset files: /root/.cache/kagglehub/datasets/vingkan/strategeion-resume-skills/versions/2
Files in dataset directory: ['fairness.py', 'resumes_development.csv', 'PARiS.pickle', 'skills.txt', 'resumes_pilot.csv']


In [None]:
# Count how many pilot-set applicants fall into each protected group and into
# each intersection of groups. Column-wise boolean masks replace a per-row
# .iloc loop; truthiness of each cell is evaluated the same way, so the printed
# numbers are unchanged.
my_data = pilot_data

is_female = my_data['Female'].astype(bool)
is_urm = my_data['URM'].astype(bool)
has_disability = my_data['Disability'].astype(bool)

count1 = int(is_female.sum())                              # Female
count2 = int(is_urm.sum())                                 # URM
count3 = int(has_disability.sum())                         # Disability
count4 = int((is_female & is_urm).sum())                   # Female and URM
count5 = int((is_female & has_disability).sum())           # Female and Disability
count6 = int((is_urm & has_disability).sum())              # URM and Disability
count7 = int((is_female & is_urm & has_disability).sum())  # all three
print(count1, count2, count3, count4, count5, count6, count7)

976 930 872 901 763 791 762


# Preprocessing

In [None]:
# data transformations
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def preprocess_inputs(data, seed=1, train_size=0.7):
    """Split a resume table into train/test sets and standardise the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the target column 'Interview'. A leftover CSV index
        column named 'Unnamed: 0' is removed when present.
    seed : int, default 1
        Passed to ``train_test_split`` as ``random_state``.
    train_size : float, default 0.7
        Passed to ``train_test_split`` as ``train_size``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test, X_train, X_test)``.
        The scaled frames keep the index and column names of the unscaled
        ones; the unscaled frames are returned as well so callers can read the
        original feature values.
    """
    df = data.copy()

    # drop the index column written by the CSV export; tolerate frames that
    # were loaded without it instead of raising KeyError
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    # split df into features X and target y
    y = df['Interview']
    X = df.drop('Interview', axis=1)

    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=seed
    )

    # scale X: the scaler is fitted on the training rows only and then applied
    # to both splits
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train_scaled, X_test_scaled, y_train, y_test, X_train, X_test

# Measuring bias

In [None]:
def bias_loss(data):
  """Compare interview rates between protected and non-protected applicants.

  For every protected attribute ('Female', 'URM', 'Disability') and every
  veteran status, the rows of ``data`` are split by whether the attribute
  equals 1 or 0, and the share of rows with ``Interview == 1`` is computed for
  each side.

  Args:
    data: DataFrame with the columns 'Female', 'URM', 'Disability', 'Veteran'
      and 'Interview'; each is compared against the values 1 and 0.

  Returns:
    Nested dict ``output[feature][status]`` where ``status`` is 'Veteran' or
    'Civilian' and the innermost dict holds:
      'pos_rate'  - interview rate among rows where the attribute == 1
      'neg_rate'  - interview rate among rows where the attribute == 0
      'rate_diff' - neg_rate - pos_rate
    A rate is NaN when its group has no rows (and so is the matching
    'rate_diff'), rather than raising ZeroDivisionError.
  """
  prot_feats = ['Female', 'URM', 'Disability']
  unprot_feat = 'Veteran'

  def _interview_rate(group):
    # share of rows in `group` with Interview == 1; undefined for empty groups
    if len(group) == 0:
      return float('nan')
    return len(group[group['Interview'] == 1]) / len(group)

  output = {}
  for prot_feat in prot_feats:
    output[prot_feat] = {}
    for unprot_name, unprot_value in [('Veteran', 1), ('Civilian', 0)]:
      # restrict to one veteran status, then split on the protected attribute
      subset = data[data[unprot_feat] == unprot_value]
      pos_rate = _interview_rate(subset[subset[prot_feat] == 1])
      neg_rate = _interview_rate(subset[subset[prot_feat] == 0])

      output[prot_feat][unprot_name] = {
          'pos_rate': pos_rate,
          'neg_rate': neg_rate,
          # positive when the non-protected side has the higher rate
          'rate_diff': neg_rate - pos_rate,
      }

  return output

# Custom Wrapper

In [None]:
# base classifier
from sklearn.linear_model import LogisticRegression

class UnbiasedRegression():
  """Logistic regression whose output is overridden for one subgroup.

  The wrapped ``LogisticRegression`` is trained and queried on scaled
  features. At prediction time, every row whose unscaled 'Female' value is
  truthy and whose 'URM' and 'Disability' values are falsy is forced to the
  positive class; all other rows keep the base model's rounded probability.
  """
  # class-level placeholder; __init__ assigns the real estimator per instance
  base_model = None
  # Not read anywhere in this class.
  # NOTE(review): looks like a leftover weight from an earlier approach - confirm before removing.
  fem_civ_w = 0.38

  def __init__(self, random_state=None):
    """Create the underlying logistic regression with the given seed."""
    self.base_model = LogisticRegression(random_state=random_state)

  def fit(self, X_train, y_train):
    """Fit the base model and return ``self`` so calls can be chained."""
    self.base_model.fit(X_train, y_train)
    return self

  def predict(self, X_test_scaled, X_test):
    """Predict labels (as floats 0.0 / 1.0) for the given rows.

    Args:
      X_test_scaled: features in the form the base model was trained on.
      X_test: DataFrame with the same rows, in the same order, holding the
        unscaled 'Female', 'URM' and 'Disability' columns.

    Raises:
      ValueError: if the two inputs do not have the same number of rows.
    """
    # column 1 of predict_proba is taken as the positive-class probability
    proba = self.base_model.predict_proba(X_test_scaled)[:, 1]
    if len(X_test) != len(proba):
      raise ValueError('X_test_scaled and X_test must have the same number of rows')

    # rows that are Female but neither URM nor Disability are always accepted
    forced = (
        X_test['Female'].astype(bool)
        & ~X_test['URM'].astype(bool)
        & ~X_test['Disability'].astype(bool)
    ).to_numpy()
    proba[forced] = 1
    return np.clip(proba, 0, 1).round(0)

  def score(self, X_test_scaled, y_test, X_test):
    """Return the fraction of rows whose prediction equals ``y_test``."""
    # compare by position: predictions come back as a plain array
    return np.mean(self.predict(X_test_scaled, X_test) == np.asarray(y_test))

# Training/Results

In [None]:
# Train the wrapper over `trials` random splits and average accuracy and the
# per-group interview rates reported by bias_loss.
trials = 1
prot_feats = ['Female', 'URM', 'Disability']
vet_statuses = ['Veteran', 'Civilian']
rate_keys = ['pos_rate', 'neg_rate', 'rate_diff']

sum_acc = 0
# running totals with the same nested layout that bias_loss returns
sum_bias = {
    feat: {status: {key: 0 for key in rate_keys} for status in vet_statuses}
    for feat in prot_feats
}

for r_state in range(trials):

  # preprocess
  X_train_scaled, X_test_scaled, y_train, y_test, X_train, X_test  = preprocess_inputs(dev_data, r_state)

  # train model
  model = UnbiasedRegression(random_state=r_state)
  model.fit(X_train_scaled, y_train)

  # find accuracy
  acc = model.score(X_test_scaled, y_test, X_test)
  sum_acc += acc

  # find bias: attach the predictions to a copy of the test features so that
  # X_test itself keeps the same columns as X_train for later cells
  y_pred = model.predict(X_test_scaled, X_test)
  bias = bias_loss(X_test.assign(Interview=y_pred))

  for prot_feat in prot_feats:
    for vet_status in vet_statuses:
      for rate_key in rate_keys:
        sum_bias[prot_feat][vet_status][rate_key] += bias[prot_feat][vet_status][rate_key]

avg_acc = sum_acc / trials * 100
print('Accuracy: {:.2f}%'.format(avg_acc))

for prot_feat in prot_feats:
  for vet_status in vet_statuses:
    group = sum_bias[prot_feat][vet_status]
    pos_rate = group['pos_rate'] / trials * 100
    neg_rate = group['neg_rate'] / trials * 100
    rate_diff = group['rate_diff'] / trials * 100
    # the label is written out explicitly rather than read from a variable
    # left over from the accumulation loop
    print(prot_feat, vet_status, 'rate_diff', '{:.2f}% - {:.2f}% = {:.2f}%'.format(neg_rate, pos_rate, rate_diff))


Accuracy: 94.09%
Female Veteran rate_diff 90.48% - 71.70% = 18.78%
Female Civilian rate_diff 11.11% - 12.90% = -1.79%
URM Veteran rate_diff 89.47% - 72.73% = 16.75%
URM Civilian rate_diff 14.29% - 3.57% = 10.71%
Disability Veteran rate_diff 100.00% - 73.44% = 26.56%
Disability Civilian rate_diff 12.77% - 5.56% = 7.21%


# Training/Results With Dimensionality Reduction

In [None]:
n_components = 5

# X_test can carry an extra 'Interview' column (predictions attached for the
# bias measurement). PCA must see the same columns at fit and transform time,
# so that column is removed from both frames if it is present.
pca_train_input = X_train.drop(columns='Interview', errors='ignore')
pca_test_input = X_test.drop(columns='Interview', errors='ignore')

# NOTE(review): PCA is fitted on the unscaled features here - confirm that is intended.
pca = PCA(n_components=n_components)
pca.fit(pca_train_input)

# component columns are named PC1 .. PC<n_components>
pc_names = ["PC" + str(i) for i in range(1, n_components + 1)]
X_train_reduced = pd.DataFrame(pca.transform(pca_train_input), index=X_train.index, columns=pc_names)
X_test_reduced = pd.DataFrame(pca.transform(pca_test_input), index=X_test.index, columns=pc_names)

In [None]:
# Show the PCA-reduced training features as this cell's output.
X_train_reduced

In [None]:
# These two classifiers are not imported anywhere earlier in the notebook, so
# they are imported here to keep the cell runnable on a fresh kernel.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Keys are left-padded to equal width so the printed lines align.
# The tree-based models are seeded so repeated runs give the same results.
models = {
    "Logistic Regression": LogisticRegression(),
    "      Decision Tree": DecisionTreeClassifier(random_state=0),
    "      Random Forest": RandomForestClassifier(random_state=0)
}

# Fit every model on the PCA-reduced training features.
for name, model in models.items():
    model.fit(X_train_reduced, y_train)
    print(name + " trained.")

In [None]:
# Report each fitted model's accuracy on the PCA-reduced test features.
for name, model in models.items():
    accuracy_pct = model.score(X_test_reduced, y_test) * 100
    print(name + " Accuracy: {:.2f}%".format(accuracy_pct))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/BhlR-kHxc3E