<b><font size=5>Wharton People Analytics Case Competition 2023 - 1st Place</font>
    <br>
<font size=4>Optimizing Teach For America's Matching Process</font></b>

In [867]:
from datetime import datetime
start_time = datetime.now()

import pandas as pd

import scipy.stats as sps

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook;
# several helpers below assign into frames that were produced by filtering/splitting.
pd.options.mode.chained_assignment = None  # default='warn'

# Creating fields

In [1]:
# Load the candidate export and key it by user_ID.
candidate_record = pd.read_csv("WPAC_2023_CandidateRecords_from_Tableau.csv").set_index("user_ID")

# Keep only the trailing four characters of App_year.
candidate_record["App_year"] = candidate_record["App_year"].apply(lambda year_label: year_label[-4:])

# Parse the three date columns in place ...
date_columns = ["App_start_date", "App_submit_date", "ComfirmOffer_date"]
for date_column in date_columns:
    candidate_record[date_column] = pd.to_datetime(candidate_record[date_column])

# ... then derive one `<stem>_month` column per date column (same order as the dates).
for date_column in date_columns:
    month_column = date_column.replace("_date", "_month")
    candidate_record[month_column] = candidate_record[date_column].dt.month


def application_round(app_submit_date):
    """Map a date to its application round (1-4) from its calendar position.

    The cut-offs are compared as (month, day) pairs, so the result does not
    depend on the year of the date. Comparing day-of-year numbers instead
    would shift every date after Feb 29 by one day in leap years and push
    deadline-day submissions into the following round.

    Rounds (upper bound inclusive):
        1: after Jun 1  .. Sep 2
        2: after Sep 2  .. Oct 21
        3: after Oct 21 .. Feb 10 (wraps over New Year)
        4: after Feb 10 .. Jun 1

    Parameters
    ----------
    app_submit_date : datetime-like
        Anything `pd.Timestamp` accepts; missing values (NaT/NaN/None) allowed.

    Returns
    -------
    int or float
        The round number 1-4, or `np.nan` when the date is missing.
    """
    if pd.isnull(app_submit_date):
        return np.nan

    moment = pd.Timestamp(app_submit_date)
    month_day = (moment.month, moment.day)

    first_deadline = (9, 2)
    second_deadline = (10, 21)
    third_deadline = (2, 10)
    # early_deadline = (4, 19)
    early_deadline = (6, 1)  # moved to Jun 1 as a safety margin

    if early_deadline < month_day <= first_deadline:
        return 1
    if first_deadline < month_day <= second_deadline:
        return 2
    # Round 3 spans the year boundary: late October through early February.
    if month_day > second_deadline or month_day <= third_deadline:
        return 3
    # Everything left lies after the third deadline and up to the early deadline.
    return 4

# Derive the round for the start date (kept for comparison) and for the submit date.
for stage in ("start", "submit"):
    candidate_record[f"App_{stage}_round"] = candidate_record[f"App_{stage}_date"].apply(application_round)

# True when the application was started and submitted within the same round.
candidate_record["App_same_round_done"] = candidate_record["App_start_round"].eq(candidate_record["App_submit_round"])
candidate_record.sample(10)

# Functions

## Feature Engineering

In [657]:
def target_encoding_categorical(X, column, target,
                                target_ready = False, target_mean = False):
    """Target-encode `column` of `X` in place and return the mapping used.

    Missing categories are first replaced by the literal label "NA". Each
    category is then replaced by the mean of `target` for that category.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to modify in place.
    column : str
        Categorical column to encode.
    target : str
        Target column; only read when the mapping is learned here.
    target_ready : bool
        False: learn the mapping from `X` (and print it, sorted).
        True: reuse the mapping passed in `target_mean`.
    target_mean : pandas.Series
        Category -> mean target mapping; required when `target_ready` is True.

    Returns
    -------
    pandas.Series
        The category -> mean target mapping that was applied.

    Raises
    ------
    ValueError
        If `target_ready` is True but no mapping was supplied.

    Notes
    -----
    Categories absent from the mapping become NaN after encoding.
    """
    if target_ready and target_mean is False:
        raise ValueError("target_mean must be provided when target_ready is True")

    # Plain column assignment replaces the column (and its dtype). Writing through
    # `.loc[:, [column]]` sets values in place on recent pandas, which would leave
    # the encoded floats inside an object-dtype column.
    X[column] = X[column].fillna("NA")
    if not target_ready:
        target_mean = X.groupby(column)[target].mean()
        print(target_mean.sort_values())
        print()

    X[column] = X[column].map(target_mean)

    return target_mean

In [793]:
def droping_fields(X):
    """Drop non-feature columns and turn the remaining ones into model inputs.

    Steps, in order:
      1. Drop raw dates, cost, preference/placement and progress columns
         (columns that are absent are ignored).
      2. Boolean flags: add `<col>_missing`, then cast the column to bool.
      3. Numeric columns: add `<col>_missing`, then median-impute.
      4. Ordinal columns: treated as numeric, plus one-hot columns of the
         (string-cast) values.
      5. `pd.get_dummies` on whatever categorical columns are left.

    Listed columns that are not present in `X` are skipped. A conversion that
    fails on an existing column emits a warning and leaves that column as it
    was (its `_missing` indicator is still added).

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is not modified (the first drop returns a new frame).

    Returns
    -------
    pandas.DataFrame
        The transformed frame.

    Notes
    -----
    Medians and dummy columns are derived from the frame passed in, so calling
    this separately on different splits can yield different fill values and
    different column sets.
    """
    import warnings

    columns_to_drop = (
        # drop_unnecessary (numeric counterparts exist, e.g. UG_school_selectivity_rank)
        ['App_start_date', 'App_submit_date',
         'ComfirmOffer_date', 'Incurred_cost_year',
         'UG_school_selectivity', 'LIC_served_rating'],
        # drop_numeric_unnecessary
        ['App_year', 'Days_before_confirmed',
         'Match_region_incurred_cost',
         'Teach_region_incurred_cost'],
        # drop_preference
        ['Preference_Region_preference', 'Preference_Region_rank',
         'Preferences_Subject_level', 'Preference_Region_level',
         'Match_region', 'Teach_region', 'Teach_subject',
         'Major_2_from_Sara', 'Major_2_from_Sara_numeric',
         'UG_school_name', 'ComfirmOffer_month'],
        # drop_progress
        ['Progress_1_Invite_Intrvw', 'Progress_2_Complete_Intrvw',
         'Progress_3_Accepted_toCorp', 'Progress_4_Comfirm_Offer',
         'Progress_5_Start_1stDay', 'Progress_6_Complete_2yrs',
         'Progress_6_Complete_2yrs_modified'],
    )
    for group in columns_to_drop:
        X = X.drop(group, axis=1, errors='ignore')

    # Booleans
    def convert_to_bool(X, column):
        if column not in X.columns:
            return X
        X[column + '_missing'] = X[column].isnull()
        try:
            # NOTE: in a float column NaN casts to True; `<col>_missing` keeps
            # the information that the value was absent.
            X[column] = X[column].astype('bool')
        except (TypeError, ValueError) as error:
            warnings.warn(f"could not cast {column!r} to bool: {error}")
        return X

    # Numeric (median-imputed)
    def convert_to_numeric(X, column):
        if column not in X.columns:
            return X
        X[column + '_missing'] = X[column].isnull()
        try:
            values = X[column].values.reshape(-1, 1)
            imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
            imputer = imputer.fit(values)
            # Flatten the (n, 1) result so it is assigned as a regular column.
            X[column] = imputer.transform(values).ravel()
        except (TypeError, ValueError) as error:
            warnings.warn(f"could not median-impute {column!r}: {error}")
        return X

    # Ordinal (numeric column kept, one-hot columns added)
    def convert_to_ordinal(X, column):
        if column not in X.columns:
            return X
        X = convert_to_numeric(X, column)
        dummies = pd.get_dummies(X[[column]].astype(str))
        return pd.concat([X, dummies], axis=1)

    for column in ('UG_sports', 'LIC_served_self_report',
                   'Leadership_role', 'family_responsibility'):
        X = convert_to_bool(X, column)

    for column in ('Days before submit', 'UG_GPA',
                   'App_start_month', 'App_submit_month',
                   'SelectionDimension_1', 'SelectionDimension_2',
                   'SelectionDimension_3', 'SelectionDimension_4',
                   'Selection_Dimension_Total'):
        X = convert_to_numeric(X, column)

    for column in ('LIC_served_rating_rank', 'UG_school_selectivity_rank',
                   'App_submit_round', 'App_start_round'):
        X = convert_to_ordinal(X, column)

    # One-hot encode any categorical columns still present.
    X = pd.get_dummies(X)

    return X

In [794]:
def separate_dataset(dataset = candidate_record,
                     first_process = "Progress_3_Accepted_toCorp",
                     target = "Progress_6_Complete_2yrs_modified"):
    """Build train / test / out-of-population feature sets for one funnel step.

    Rows with a null `target` are discarded. Rows where `first_process` is True
    form the modelling population ("in"), which is split 70/30 into train and
    test with a fixed seed; the remaining rows form "out". With
    `first_process == ""` every row is used for both "in" and "out".

    Feature preparation:
      * one-hot columns are added for 'UG_major_bySchool' and 'career_level';
      * 'UG_major_byGroup', 'career_level' and 'UG_major_bySchool' are
        target-encoded with means learned on the training split only;
      * the target column is removed from the features;
      * `droping_fields` is applied to each split;
      * test and out are re-indexed to the training columns (missing columns
        are filled with 0, extra columns dropped) so all three share one layout.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source records. The default is the module-level `candidate_record` as
        bound when this function was defined.
    first_process : str
        Boolean column selecting the modelling population, or "" for all rows.
    target : str
        Column to predict.

    Returns
    -------
    dict
        Keys 'X_in', 'y_in' (unprocessed population and its target),
        'X_train', 'y_train', 'X_test', 'y_test', 'X_out', 'y_out'.

    Notes
    -----
    Categories unseen in the training split are target-encoded as NaN.
    """
    # Do not predict null y
    dataset_filtered = dataset[~dataset[target].isnull()]

    # Separated X_in and X_out
    if first_process == "":
        X_in = dataset_filtered.copy()
        X_out = dataset_filtered.copy()
    else:
        X_in = dataset_filtered[dataset_filtered[first_process] == True]
        X_out = dataset_filtered[dataset_filtered[first_process] != True]

    y_in = X_in[target].copy().astype('int')
    y_out = X_out[target].copy().astype('int') # Not necessary

    # Separated train and test
    X_train, X_test, y_train, y_test = train_test_split(X_in, y_in, test_size=0.3, random_state=1022)

    # One hot encoding (the original columns stay so they can also be target-encoded)
    def category_getting_dummies(X):
        for category in ('UG_major_bySchool', 'career_level'):
            if category in X.columns:
                X = pd.concat([X, pd.get_dummies(X[[category]])], axis=1)
        return X

    X_train = category_getting_dummies(X_train)
    X_test = category_getting_dummies(X_test)
    X_out = category_getting_dummies(X_out)

    # Target encoding: learn the mapping on train, reuse it for test and out
    def target_encoding_train_test_out(category):
        if category not in X_train.columns:
            return  # column was removed upstream; nothing to encode
        target_mean = target_encoding_categorical(X_train, category, target).copy()
        for frame in (X_test, X_out):
            target_encoding_categorical(frame, category, target,
                                        target_ready=True,
                                        target_mean=target_mean)

    target_encoding_train_test_out('UG_major_byGroup')
    target_encoding_train_test_out('career_level')
    target_encoding_train_test_out('UG_major_bySchool')

    # The target must never be a feature, whatever column was chosen.
    X_train = X_train.drop(columns=[target], errors='ignore')
    X_test = X_test.drop(columns=[target], errors='ignore')
    X_out = X_out.drop(columns=[target], errors='ignore')

    # droping_fields
    X_train = droping_fields(X_train)
    X_test = droping_fields(X_test)
    X_out = droping_fields(X_out)

    # Each split is one-hot encoded on its own, so a category missing from one
    # split would otherwise produce a different column set than the training one.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
    X_out = X_out.reindex(columns=X_train.columns, fill_value=0)
    print(X_train.dtypes, end='\n\n')

    return {'X_in': X_in,
            'y_in': y_in,
            'X_train': X_train,
            'y_train': y_train,
            'X_test': X_test,
            'y_test': y_test,
            'X_out': X_out,
            'y_out': y_out}

## Decision Tree

In [660]:
from scipy.stats import ks_2samp
from sklearn.metrics import roc_auc_score
def evaluate_ks_and_roc_auc(y_real, y_proba):
    """Return the two-sample KS result and the ROC AUC for binary scores.

    Parameters
    ----------
    y_real : array-like
        True labels; rows equal to 0 and 1 form the two KS samples.
    y_proba : 2-D array
        Per-class probabilities; column 1 is used as the score.

    Returns
    -------
    tuple
        (result of `ks_2samp` on class-0 vs class-1 scores, ROC AUC).
    """
    scored = pd.DataFrame({'real': y_real, 'proba': y_proba[:, 1]})

    # Scores of each true class
    scores_class0 = scored.loc[scored['real'] == 0, 'proba']
    scores_class1 = scored.loc[scored['real'] == 1, 'proba']

    return (ks_2samp(scores_class0, scores_class1),
            roc_auc_score(scored['real'], scored['proba']))

In [767]:
def decision_tree_execute(dataset, min_split=120, min_leaf=60, min_impurity=0,
                          output_type = 'tree',
                          print_something = True):
    """Fit a decision tree on the training split and optionally print a report.

    Parameters
    ----------
    dataset : dict
        Must provide 'X_train', 'y_train', 'X_test', 'y_test' and 'X_out'.
    min_split, min_leaf, min_impurity
        Passed to `DecisionTreeClassifier` as `min_samples_split`,
        `min_samples_leaf` and `min_impurity_decrease`.
    output_type : str
        'auc' returns the test ROC AUC; any other value returns the fitted tree.
    print_something : bool
        When True, prints accuracy, KS and ROC AUC for train and test, the share
        of positives (actual for train/test, mean predicted probability for
        out) and a text dump of the tree.

    Returns
    -------
    float or DecisionTreeClassifier
    """
    features_train = dataset['X_train']

    # Decision Tree
    decision_tree = DecisionTreeClassifier(
        max_depth=1000000,
        random_state=1022,
        min_impurity_decrease=min_impurity,
        min_samples_split=min_split,
        min_samples_leaf=min_leaf,
    )
    decision_tree.fit(features_train, dataset['y_train'])

    def hard_label_accuracy(probabilities, labels):
        # Share of rows whose rounded positive-class probability equals the label.
        return sum(np.round(probabilities[:, 1]) == labels) / len(labels)

    # Accuracy
    predictions_train = decision_tree.predict_proba(features_train)
    actual_train = dataset['y_train'].values
    accuracy_train = hard_label_accuracy(predictions_train, actual_train)

    predictions_test = decision_tree.predict_proba(dataset['X_test'])
    actual_test = dataset['y_test'].values
    accuracy_test = hard_label_accuracy(predictions_test, actual_test)

    predictions_out = decision_tree.predict_proba(dataset['X_out'])

    # KS
    ks_train, auc_train = evaluate_ks_and_roc_auc(actual_train, predictions_train)
    ks_test, auc_test = evaluate_ks_and_roc_auc(actual_test, predictions_test)

    if print_something == True:
        split_reports = (
            ('train', 'Train', accuracy_train, ks_train, auc_train),
            ('test', 'Test', accuracy_test, ks_test, auc_test),
        )
        for split, label, accuracy, ks, auc in split_reports:
            print(f"prediction_accuracy from X_{split}: {accuracy}")
            print(f"{label} classifier - ", end='')
            print(f"KS: {ks.statistic:.4f} (p-value: {ks.pvalue:.3e})", end=' ')
            print(f"ROC AUC: {auc:.4f}")
            print()

        # Percent True
        for split, actual in (('train', actual_train), ('test', actual_test)):
            percent_true = sum(np.round(actual)) / len(actual)
            print(f"percent_true from X_{split}: {percent_true} (real)")

        positive_scores_out = predictions_out[:, 1]
        percent_true_out = sum(positive_scores_out) / len(positive_scores_out)
        print(f"percent_true from X_out: {percent_true_out} (predicted)")

        print()
        print(tree.export_text(decision_tree,
                               feature_names=list(features_train.columns),
                               show_weights=True))

    return auc_test if output_type == 'auc' else decision_tree

# Code Execution

## From progress 3 to progress 6

In [863]:
def dropping_columns(df):
    """Return `df` without 'UG_major_byGroup', 'App_start_month' and 'App_submit_month'.

    A new frame is returned; a KeyError is raised if any of the three columns is absent.
    """
    excluded = ['UG_major_byGroup', 'App_start_month', 'App_submit_month']
    return df.drop(columns=excluded)

# Start from the candidate records without the selection-dimension scores
# (drop returns a new frame, so candidate_record itself is left untouched).
selection_columns = ['SelectionDimension_1', 'SelectionDimension_2', 'SelectionDimension_3',
                     'SelectionDimension_4', 'Selection_Dimension_Total']
df_all = candidate_record.drop(selection_columns, axis=1, errors='ignore')

# Some candidates are still in progress; every value other than False is assumed to be True.
df_all['Progress_6_Complete_2yrs_modified'] = (
    df_all['Progress_6_Complete_2yrs_modified'].apply(lambda status: bool(status != False))
)
df_all = dropping_columns(df_all)

In [789]:
# Population: candidates accepted to the corps (progress 3); target: completing two years (progress 6).
all_3_6 = separate_dataset(
    dataset=df_all,
    first_process="Progress_3_Accepted_toCorp",
    target="Progress_6_Complete_2yrs_modified",
)
print(all_3_6['X_train'].columns)

career_level
Co-terming Senior    0.300000
Grad Student         0.423450
Junior               0.442159
Professional         0.496983
Undergrad            0.517677
Name: Progress_6_Complete_2yrs_modified, dtype: float64

UG_major_bySchool
Engineering                    0.383562
Math & Technology              0.409168
Business                       0.444896
Arts                           0.451768
Religion                       0.463636
Policy, Law, Communications    0.485459
Other                          0.492188
Education                      0.492296
Physical Sciences              0.492769
Languages                      0.499127
Health sciences                0.500000
Humanities                     0.505711
Social Science                 0.510076
Physchology / Mental Health    0.541476
NA                             0.582222
Name: Progress_6_Complete_2yrs_modified, dtype: float64

COVID19_online_period            bool
Days before submit            float64
career_level                 

### KS (for informational purposes only)

In [896]:
# Work on copies of the training split so the dataset dict is not touched.
X_train, y_train = (all_3_6[key].copy() for key in ('X_train', 'y_train'))

# Split the training features by label: target 0 -> goods, target 1 -> bads.
goods = X_train.loc[y_train == 0].copy()
bads = X_train.loc[y_train == 1].copy()

In [912]:
# Univariate KS filter: for every training feature, compare its distribution in
# `goods` against `bads` and record the KS statistic and p-value.
# The table is built from one record per feature, so it holds exactly one row
# per column (no zero-filled placeholder row) and 'variable' is a text column.
filter_records = []
for column in X_train.columns:
    KS_result = sps.ks_2samp(goods[column], bads[column])
    filter_records.append({'variable': column,
                           'filter score': KS_result[0],
                           'p-value': KS_result[1]})

filter_score = pd.DataFrame(filter_records, columns=['variable', 'filter score', 'p-value'])

# Strongest separation first, then persist for later reference.
filter_score = filter_score.sort_values(by=['filter score'], ascending=False)
filter_score.to_csv('1_filter_score_all.csv', index=False)

In [913]:
# Display the first 30 rows of the table, i.e. the features with the largest KS statistic
# (the frame was sorted by 'filter score' in descending order).
filter_score.head(30)

Unnamed: 0,variable,filter score,p-value
1,Days before submit,0.095905,4.580672e-44
6,UG_GPA,0.064152,6.170844999999999e-20
12,App_start_round,0.053364,6.334775e-14
59,App_start_round_4.0,0.053364,6.334775e-14
2,career_level,0.04964,4.172776e-12
33,career_level_Undergrad,0.047502,4.025811e-11
13,App_submit_round,0.046961,7.031754e-11
54,App_submit_round_4.0,0.046961,7.031754e-11
4,UG_major_bySchool,0.045217,4.062338e-10
14,App_same_round_done,0.044115,1.189369e-09


### Decision Tree

In [790]:
# Fit and report the tree for the progress 3 -> 6 dataset; min_impurity is forwarded to the
# classifier's min_impurity_decrease, so a split is kept only if it lowers impurity by >= 0.0003.
# NOTE(review): the trailing "either 1000 or 3000" remark does not correspond to any argument
# passed here — confirm what it refers to or remove it.
decision_tree_all_3_6 = decision_tree_execute(all_3_6, min_impurity=0.0003) # either 1000 or 3000

prediction_accuracy from X_train: 0.5553159271325655
Train classifier - KS: 0.1135 (p-value: 1.247e-61) ROC AUC: 0.5813

prediction_accuracy from X_test: 0.5513328337437106
Test classifier - KS: 0.1145 (p-value: 2.220e-16) ROC AUC: 0.5730

percent_true from X_train: 0.49497545083283623 (real)
percent_true from X_test: 0.48817043143132427 (real)
percent_true from X_out: 0.47393492260986536 (predicted)

|--- Days before submit <= 1.50
|   |--- UG_major_bySchool <= 0.45
|   |   |--- UG_school_selectivity_rank <= 3.50
|   |   |   |--- weights: [133.00, 106.00] class: 0
|   |   |--- UG_school_selectivity_rank >  3.50
|   |   |   |--- weights: [430.00, 181.00] class: 0
|   |--- UG_major_bySchool >  0.45
|   |   |--- UG_GPA <= 3.65
|   |   |   |--- weights: [1721.00, 1392.00] class: 0
|   |   |--- UG_GPA >  3.65
|   |   |   |--- weights: [1146.00, 720.00] class: 0
|--- Days before submit >  1.50
|   |--- UG_GPA <= 3.50
|   |   |--- career_level <= 0.47
|   |   |   |--- weights: [639.00, 630.0