# Basic models on ynp feature set for Role Requirements

* Build the ynp 1-hot feature set from the Presto extraction
* Try logistic regression, tree, random classifier, xgboost
* SVM takes too long on this size of dataset

In [12]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
import time
import xgboost as xgb

#from sqlalchemy.engine import create_engine

In [None]:
# Connect to presto if we want to pick up the data directly
engine = create_engine('presto://searchquality-presto.dataservices.prod.outfra.xyz:8889/hive/default')
conn=engine.connect()

In [2]:
# Load the data from a local file extracted from Presto
# Sort by application ID in order to construct the sparse matrix correctly later
df = pd.read_csv('/home/ubuntu/data/rr/ynp_training_data.csv')
df.sort_values(by=['application_id'], inplace = True)
df = df.reset_index(drop=True)

In [4]:
# Building the sparse matrix requires us to know how many features are present in each case
# We also need the rolled up case by case outcome
r = df.groupby(['application_id','outcome'], as_index = False)['feature_id'].count()
print('There are ', len(r), ' cases in total')

8472958


In [3]:
df.head()

Unnamed: 0,application_id,job_id,prospect_id,listing_id,outcome,feature_id
0,600663296,30635329,81273229,2332144,Rejected,200001
1,600663296,30635329,81273229,2332144,Rejected,200396
2,600663296,30635329,81273229,2332144,Rejected,201482
3,600705149,30635329,81302286,2332144,Rejected,201482
4,600705149,30635329,81302286,2332144,Rejected,200001


In [5]:
r.head()

Unnamed: 0,application_id,outcome,feature_id
0,600663296,Rejected,3
1,600705149,Rejected,3
2,601056650,Rejected,3
3,601111513,Rejected,3
4,601227968,Shortlisted,3


In [6]:
# Check the balance of the set
r.outcome.value_counts()

Rejected       5703940
Shortlisted    2769018
Name: outcome, dtype: int64

In [7]:
# Calculate the baseline accuracy from predicting all 'Rejected'
5703940/(len(r))

0.6731934703323208

In [8]:
# Construct the sparse matrix from all cases
data    = np.ones((len(df),), dtype=int)
indptr  = np.concatenate([np.array([0]), np.cumsum(np.array(r.feature_id))])  # first element is zero, subsequent elements are indptr[i-1] + num_features in row
indices = np.array(df.feature_id-200000)

N = csr_matrix((data, indices, indptr))
N

<8472958x2496 sparse matrix of type '<class 'numpy.int64'>'
	with 32581526 stored elements in Compressed Sparse Row format>

In [9]:
# Quick and nasty split into training and testing sets
X_train = N[0:len(r.feature_id)-500001]
y_train = r.outcome.iloc[0:len(r.feature_id)-500001]

X_test = N[(len(r.feature_id)-500001):-1]
y_test = r.outcome.iloc[(len(r.feature_id)-500001):-1]

In [None]:
# Support Vector Machine
# Did not complete - likely to be extremely slow on a dataset of this size
svm_mo = svm.SVC()
svm_mo.fit(X_train, y_train)

In [None]:
# Random Forest
rf_mo = RandomForestClassifier(n_estimators=10)
rf_mo.fit(X_train, y_train)

In [25]:
# Decision Tree
t0 = time.time()

tree_mo = tree.DecisionTreeClassifier()
tree_mo.fit(X_train[0:4000000],y_train[0:4000000])

print('Time to train ', time.time() - t0)

Time to train  892.9278666973114


In [22]:
# Logistic Regression
t0 = time.time()

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print('Time to train ', time.time() - t0)

Time to train  338.7265374660492


In [None]:
# xgboost

train_binary_labels = (r.outcome.iloc[0:len(r.feature_id)-500001] == 'Shortlisted').astype('int')
test_binary_labels = (r.outcome.iloc[(len(r.feature_id)-500001):-1] == 'Shortlisted').astype('int')

dtrain = xgb.DMatrix(N[0:len(r.feature_id)-500001], label = train_binary_labels)
dtest = xgb.DMatrix(N[(len(r.feature_id)-500001):-1], label = test_binary_labels)


param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic', 'eval':'auc'}
evallist = [(dtest, 'eval'), (dtrain, 'train')]

num_round = 800
bst = xgb.train(param, dtrain, num_round, evallist, verbose_eval=20)

In [26]:
tree_mo.score(X_test, y_test)

0.65631799999999996

In [27]:
score = tree_mo.predict(X_test)
score

array(['Rejected', 'Rejected', 'Rejected', ..., 'Rejected', 'Rejected',
       'Rejected'], dtype=object)

In [28]:
unique_elements, counts_elements = np.unique(score, return_counts=True)

In [29]:
unique_elements

array(['Rejected', 'Shortlisted'], dtype=object)

In [30]:
counts_elements

array([430819,  69181])

In [34]:
430819/(430819+69181)

0.861638

In [32]:
y_test.value_counts()

Rejected       344524
Shortlisted    155476
Name: outcome, dtype: int64

In [33]:
1-155476/(344524+155476)

0.689048

In [36]:
pd.crosstab(y_test,score)

col_0,Rejected,Shortlisted
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1
Rejected,301751,42773
Shortlisted,129068,26408
