# Basic models on ynp feature set for Role Requirements

* Build the ynp 1-hot feature set from the Presto extraction
* Try logistic regression, tree, random classifier, xgboost
* SVM takes too long on this size of dataset

Why are there so many applications in the cutdown set with only one feature?

It seems believable that we can't learn much about THE SUBSET OF PEOPLE WHO ARE ACTUALLY SHORTLISTED AND REJECTED from the Role Requirements data. For the questions that are really important, it's likely that the applications will simply be ignored and left in inbox.

One way to check this would be to look at the distributions of answers to key questions (like right to work) in the general population of applicants vs the applicants with shortlisted / rejected signals.

Consider adding the job sub-class (discipline) as an additional feature. This shouldn't much help the linear models like logistic regression (at least without adding cross terms) but might well be very helpful to tree models and non-linear models.

In [53]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
import time
import xgboost as xgb
import operator
#import lightgbm

#from sqlalchemy.engine import create_engine
#import prestodb

In [None]:
conn=prestodb.dbapi.connect(
    host='searchquality-presto.dataservices.prod.outfra.xyz',
    port=8889,
    user='user',
    catalog='hive',
    schema='sandbox',
)
cur = conn.cursor()
cur.execute('SELECT top 100 FROM sandbox.kendra_nac_jobs')
rows = cur.fetchall()

In [None]:
# Connect to presto if we want to pick up the data directly
engine = create_engine('presto://searchquality-presto.dataservices.prod.outfra.xyz:8889/hive/default')
conn=engine.connect()

In [2]:
# Load the data from a local file extracted from Presto
# Sort by application ID in order to construct the sparse matrix correctly later
#df = pd.read_csv('/home/ubuntu/data/rr/ynp_training_data.csv')
df = pd.read_csv('/Users/kvant/src/data/rr/ynp_augmented_training_data_more_than_ten.csv')
df.sort_values(by=['application_id'], inplace = True)
df = df.reset_index(drop=True)

In [3]:
# Building the sparse matrix requires us to know how many features are present in each case
# We also need the rolled up case by case outcome
r = df.groupby(['application_id','outcome'], as_index = False)['feature_id'].count()
print('There are ', len(r), ' cases in total')

There are  5337618  cases in total


In [4]:
df.head()

Unnamed: 0,application_id,outcome,feature_id
0,600663296,Rejected,200306
1,600663296,Rejected,400242
2,600663296,Rejected,201195
3,600663296,Rejected,200002
4,600663296,Rejected,501002


In [5]:
r.head()

Unnamed: 0,application_id,outcome,feature_id
0,600663296,Rejected,6
1,600705149,Rejected,6
2,601056650,Rejected,6
3,601111513,Rejected,6
4,601227968,Shortlisted,6


In [6]:
r['feature_id'].value_counts()

8     2438859
7     1105507
6      854224
5      549687
4      389303
9          36
10          2
Name: feature_id, dtype: int64

In [7]:
r_multifeatures = r[(r.feature_id > 5) & (r.feature_id < 9)]

In [8]:
r_multifeatures['feature_id'].value_counts()

8    2438859
7    1105507
6     854224
Name: feature_id, dtype: int64

In [9]:
# Check the balance of the set
r_multifeatures.outcome.value_counts()

Rejected       2954194
Shortlisted    1444396
Name: outcome, dtype: int64

In [10]:
# Calculate the baseline accuracy from predicting all 'Rejected'
(len(r_multifeatures[r_multifeatures.outcome == 'Rejected']))/(len(r_multifeatures))

0.6716229519004954

In [11]:
df_multifeatures = df.merge(r_multifeatures[['application_id']], 
                            left_on='application_id', 
                            right_on='application_id', 
                            how='right')

In [12]:
# Construct the sparse matrix from all cases
data    = np.ones((len(df_multifeatures),), dtype=int)
indptr  = np.concatenate([np.array([0]), np.cumsum(np.array(r_multifeatures.feature_id))])  # first element is zero, subsequent elements are indptr[i-1] + num_features in row
indices = np.array(df_multifeatures.feature_id)#-200000)

N = csr_matrix((data, indices, indptr))
N

<4398590x505132 sparse matrix of type '<class 'numpy.int64'>'
	with 32374765 stored elements in Compressed Sparse Row format>

In [13]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(N, r_multifeatures.outcome, test_size=0.20, random_state=42)

In [None]:
# Support Vector Machine
# Did not complete - likely to be extremely slow on a dataset of this size
svm_mo = svm.SVC()
svm_mo.fit(X_train, y_train)

In [None]:
# Random Forest
rf_mo = RandomForestClassifier(n_estimators=10)
rf_mo.fit(X_train, y_train)

In [None]:
# Decision Tree
t0 = time.time()

tree_mo = tree.DecisionTreeClassifier()
tree_mo.fit(X_train,y_train)

print('Time to train ', time.time() - t0)

In [14]:
# Logistic Regression
t0 = time.time()

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print('Time to train ', time.time() - t0)

Time to train  161.37361097335815


In [15]:
logreg.score(X_test, y_test)

0.67539484243814496

In [16]:
scores = logreg.predict(X_test)

In [17]:
pd.crosstab(y_test,scores)

col_0,Rejected,Shortlisted
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
Rejected,574991,16536
Shortlisted,269025,19166


In [18]:
y_test.value_counts()

Rejected       591527
Shortlisted    288191
Name: outcome, dtype: int64

In [19]:
unique, counts = np.unique(scores, return_counts=True)
print(np.asarray((unique, counts)).T)

[['Rejected' 844016]
 ['Shortlisted' 35702]]


In [20]:
# xgboost
train_binary_labels = (y_train == 'Shortlisted').astype('int')
test_binary_labels = (y_test == 'Shortlisted').astype('int')

dtrain = xgb.DMatrix(X_train, label = train_binary_labels)
dtest = xgb.DMatrix(X_test, label = test_binary_labels)

In [57]:
param = {'max_depth': 15, 'eta': 0.3, 'silent': 1, 'objective': 'binary:logistic', 'eval':'auc'}
evallist = [(dtrain, 'train'), (dtest, 'eval')]

num_round = 200
bst3 = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=50, xgb_model=bst2, verbose_eval=50)

[0]	train-error:0.294867	eval-error:0.309023
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 50 rounds.
[50]	train-error:0.293547	eval-error:0.308568
[100]	train-error:0.292569	eval-error:0.308354
[150]	train-error:0.291615	eval-error:0.308007
[199]	train-error:0.290838	eval-error:0.307866


In [None]:
[0]	train-error:0.326851	eval-error:0.326723
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 50 rounds.
[50]	train-error:0.313453	eval-error:0.317007
[100]	train-error:0.308085	eval-error:0.314025
[150]	train-error:0.30436	eval-error:0.312499
[199]	train-error:0.301722	eval-error:0.311078
        
[0]	train-error:0.301689	eval-error:0.311058
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 50 rounds.
[50]	train-error:0.299649	eval-error:0.310452
[100]	train-error:0.298047	eval-error:0.309939
[150]	train-error:0.296255	eval-error:0.309356
[199]	train-error:0.294878	eval-error:0.309024

In [None]:
[0]	train-error:0.309788	eval-error:0.314735
Multiple eval metrics have been passed: 'eval-error' will be used for early stopping.

Will train until eval-error hasn't improved in 50 rounds.
[50]	train-error:0.309138	eval-error:0.314244
[100]	train-error:0.308783	eval-error:0.314104
[150]	train-error:0.308464	eval-error:0.313918
[200]	train-error:0.308163	eval-error:0.313818
[250]	train-error:0.307858	eval-error:0.313618
[300]	train-error:0.307436	eval-error:0.313367
[350]	train-error:0.307205	eval-error:0.313264
[400]	train-error:0.306838	eval-error:0.313062
[450]	train-error:0.306522	eval-error:0.312898
[500]	train-error:0.306331	eval-error:0.312847
[550]	train-error:0.305972	eval-error:0.312581
[600]	train-error:0.305791	eval-error:0.312565
[650]	train-error:0.305633	eval-error:0.3125
[700]	train-error:0.30534	eval-error:0.312435
[750]	train-error:0.304895	eval-error:0.312116
[800]	train-error:0.304714	eval-error:0.312064
[850]	train-error:0.304506	eval-error:0.311954
[900]	train-error:0.304286	eval-error:0.311884
[950]	train-error:0.303947	eval-error:0.311729
[999]	train-error:0.303727	eval-error:0.311643

In [58]:
feature_importance = bst3.get_score()

In [59]:
#feature_importance#['f200006']
sorted_feature_importance = sorted(feature_importance.items(), key=operator.itemgetter(1), reverse = True)
sorted_feature_importance

[('f201584', 11402),
 ('f201195', 10235),
 ('f200313', 8902),
 ('f201585', 6623),
 ('f400242', 6181),
 ('f201194', 6003),
 ('f501002', 5514),
 ('f200001', 5370),
 ('f200312', 4760),
 ('f501000', 4757),
 ('f201196', 4533),
 ('f200311', 4399),
 ('f501004', 4151),
 ('f200813', 4146),
 ('f200781', 3713),
 ('f501009', 3541),
 ('f200368', 2892),
 ('f400243', 2823),
 ('f200367', 2240),
 ('f501007', 2236),
 ('f400245', 2150),
 ('f200504', 1862),
 ('f501005', 1675),
 ('f200724', 1656),
 ('f400244', 1610),
 ('f200570', 1553),
 ('f200334', 1501),
 ('f200438', 1453),
 ('f200780', 1384),
 ('f200004', 1325),
 ('f200003', 1316),
 ('f201428', 1307),
 ('f200640', 1225),
 ('f200359', 1216),
 ('f501018', 1190),
 ('f200360', 1141),
 ('f200006', 1140),
 ('f200333', 1131),
 ('f200730', 1096),
 ('f505045', 1078),
 ('f200723', 986),
 ('f200842', 895),
 ('f200916', 844),
 ('f201009', 842),
 ('f200645', 796),
 ('f200502', 769),
 ('f505002', 727),
 ('f501015', 720),
 ('f200382', 706),
 ('f306252', 658),
 ('f2004

In [41]:
scores_xgb = bst3.predict(dtest)

In [65]:
pd.crosstab(test_binary_labels > 0.5, scores_xgb > 0.35)

col_0,False,True
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
False,385734,205793
True,123788,164403


In [37]:
(29186+573650)/len(scores_xgb)

0.6852605039342153

In [51]:
(65552+539418)/len(scores2_xgb)

0.6876862812855938

In [66]:
(164403+385734)/len(scores_xgb)

0.6253560800165507

In [54]:
bst.save_model('/Users/kvant/src/data/rr/xgboost_1000_trees.boost')
bst2.save_model('/Users/kvant/src/data/rr/xgboost_2000_trees.boost')