# Basic models on ynp feature set for Role Requirements

* Build the ynp 1-hot feature set from the Presto extraction
* Try logistic regression, decision tree, random forest, xgboost
* SVM takes too long on this size of dataset

Why are there so many applications in the cutdown set with only one feature?

It seems believable that the Role Requirements data can't teach us much about **the subset of people who are actually shortlisted and rejected**. For the questions that are really important, it's likely that the applications will simply be ignored and left in the inbox.

One way to check this would be to look at the distributions of answers to key questions (like right to work) in the general population of applicants vs the applicants with shortlisted / rejected signals.

Consider adding the job sub-class (discipline) as an additional feature. This shouldn't help linear models like logistic regression much (at least not without adding cross terms), but it might well be very helpful to tree models and other non-linear models.

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
import time
import xgboost as xgb

from sqlalchemy.engine import create_engine
import prestodb

In [None]:
# Connectivity check against Presto through the prestodb DB-API client:
# fetch a small sample of rows from the jobs table.
conn = prestodb.dbapi.connect(
    host='searchquality-presto.dataservices.prod.outfra.xyz',
    port=8889,
    user='user',
    catalog='hive',
    schema='sandbox',
)
cur = conn.cursor()
# Presto SQL has no `TOP n` clause and a SELECT needs a column list;
# row limiting is done with a trailing LIMIT.
cur.execute('SELECT * FROM sandbox.kendra_nac_jobs LIMIT 100')
rows = cur.fetchall()

In [None]:
# Optional: open a SQLAlchemy connection to Presto to read the data directly.
# Note that this rebinds `conn`.
engine = create_engine(
    'presto://searchquality-presto.dataservices.prod.outfra.xyz:8889/hive/default'
)
conn = engine.connect()

In [None]:
# Read the local CSV extract taken from Presto.
# Rows are ordered by application ID (with a fresh 0..n-1 index) because the
# sparse matrix built later relies on that ordering.
df = (
    pd.read_csv('/home/ubuntu/data/rr/ynp_training_data.csv')
    .sort_values(by=['application_id'])
    .reset_index(drop=True)
)

In [None]:
# Roll the data up to one row per (application_id, outcome) pair.
# After the count, `feature_id` holds the NUMBER of feature rows for that case,
# which is what the sparse-matrix row pointers are built from later.
r = (
    df.groupby(['application_id', 'outcome'], as_index=False)['feature_id']
    .count()
)
print('There are ', r.shape[0], ' cases in total')

In [None]:
# Preview the first rows of the raw extract
# (presumably one row per application/feature pair — confirm against the extraction query)
df.head()

In [None]:
# Preview the per-case rollup; here `feature_id` is a count, not an ID
r.head()

In [None]:
# How many cases have 1, 2, 3, ... features
r['feature_id'].value_counts()

In [None]:
# Keep only the cases that have 3, 4 or 5 features (the count is an integer)
r_multifeatures = r[r['feature_id'].between(3, 5)]

In [None]:
# Feature-count distribution of the retained cases
r_multifeatures['feature_id'].value_counts()

In [None]:
# Check the class balance of the retained set: number of cases per outcome label
r_multifeatures.outcome.value_counts()

In [None]:
# Baseline accuracy: the fraction of cases a classifier would get right by
# always predicting 'Rejected'
len(r_multifeatures[r_multifeatures['outcome'].eq('Rejected')]) / len(r_multifeatures)

In [None]:
# Restrict the raw feature rows to the retained cases.
# A right join keeps only keys from the case list, in the case list's key order.
df_multifeatures = pd.merge(
    df,
    r_multifeatures[['application_id']],
    on='application_id',
    how='right',
)

In [None]:
# Build the one-hot feature matrix in CSR form: one row per retained case,
# with a 1 in every column whose feature is present.
# NOTE(review): this assumes the rows of df_multifeatures are grouped per case
# in the same order as r_multifeatures — confirm if the merge or sort changes.

# Every stored entry is a 1
data = np.ones(len(df_multifeatures), dtype=int)

# Row pointers: start at zero, then add each case's feature count in turn
indptr = np.concatenate(([0], np.cumsum(r_multifeatures['feature_id'].values)))

# Column index of each entry
# (presumably feature IDs start at 200000, hence the offset — TODO confirm)
indices = (df_multifeatures['feature_id'] - 200000).values

# No explicit shape is passed, so the dimensions are inferred from the index arrays
N = csr_matrix((data, indices, indptr))
N

In [None]:
# Hold out 20% of the cases for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    N,
    r_multifeatures['outcome'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Support Vector Machine (default SVC settings)
# This fit never finished; it is likely to be extremely slow on a dataset of this size
svm_mo = svm.SVC()
svm_mo.fit(X=X_train, y=y_train)

In [None]:
# Random Forest with 10 trees
rf_mo = RandomForestClassifier(n_estimators=10)
rf_mo.fit(X=X_train, y=y_train)

In [None]:
# Decision Tree (default settings), reporting wall-clock training time in seconds
t0 = time.time()
tree_mo = tree.DecisionTreeClassifier().fit(X_train, y_train)
print('Time to train ', time.time() - t0)

In [None]:
# Logistic Regression (default settings), reporting wall-clock training time in seconds
t0 = time.time()
logreg = LogisticRegression().fit(X_train, y_train)
print('Time to train ', time.time() - t0)

In [None]:
# Mean accuracy of the logistic regression on the held-out set
logreg.score(X=X_test, y=y_test)

In [None]:
# Predicted class labels (not probabilities) for the held-out set
scores = logreg.predict(X=X_test)

In [None]:
# Confusion matrix: actual outcome (rows) against predicted label (columns)
pd.crosstab(index=y_test, columns=scores)

In [None]:
# Actual class counts in the held-out set, for comparison with the predicted counts
y_test.value_counts()

In [None]:
# Number of test cases assigned to each predicted label, printed as (label, count) rows
unique, counts = np.unique(scores, return_counts=True)
print(np.array([unique, counts]).transpose())

In [None]:
# xgboost
# Encode the outcome as 1 for 'Shortlisted' and 0 for anything else,
# then wrap features and labels in DMatrix objects.
train_binary_labels = y_train.eq('Shortlisted').astype(int)
test_binary_labels = y_test.eq('Shortlisted').astype(int)

dtrain = xgb.DMatrix(data=X_train, label=train_binary_labels)
dtest = xgb.DMatrix(data=X_test, label=test_binary_labels)

In [None]:
# Booster settings: depth-10 trees, learning rate 0.1, binary logistic objective.
# The evaluation metric must be given under the key `eval_metric`; a key named
# `eval` is not an xgboost parameter, so it would leave AUC unselected.
param = {
    'max_depth': 10,
    'eta': 0.1,
    'silent': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
}
# Both sets are scored every round; early stopping is driven by the last entry ('eval').
evallist = [(dtrain, 'train'), (dtest, 'eval')]

# Upper bound on boosting rounds; training stops sooner if the eval metric
# has not improved for 50 consecutive rounds. Progress is printed every 50 rounds.
num_round = 2000
bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=50, verbose_eval=50)

In [None]:
# Per-feature importance as a dict keyed by feature name;
# 'weight' is the default importance type
feature_importance = bst.get_score(importance_type='weight')

In [None]:
# Importance of feature column 6. Features that were never used in a split are
# absent from the dict, so fall back to 0 instead of raising KeyError.
feature_importance.get('f6', 0)

In [None]:
# Predicted values for the held-out set (thresholded at 0.5 in the cells below)
# NOTE(review): after early stopping, whether predict() uses the best iteration
# or every trained round depends on the xgboost version — confirm.
scores_xgb = bst.predict(data=dtest)

In [None]:
# Confusion matrix at a 0.5 threshold: actual (rows) against predicted (columns)
pd.crosstab(index=test_binary_labels > 0.5, columns=scores_xgb > 0.5)

In [None]:
# Accuracy of the xgboost model at a 0.5 threshold.
# Computed from the predictions rather than from hand-copied confusion-matrix
# counts, so it stays correct when the data or the model changes.
np.mean((scores_xgb > 0.5) == (np.asarray(test_binary_labels) > 0.5))