
# Notebook Summary

## Introduction

- Build a confidence model that tells us how confident we are that any match is correct.
   - It predicts the probability that Standard-SOC is a match.
- Output data with probability for all matches for use.


## What is in this notebook

- Read in training data from `02_build_and_plot_features.ipynb`
   - The first two parts of the version number refer to which output to read.
- Build a Random Forest model that predicts at a row level if SOC matches
- Check performance
   - Calibration
   - Performance (precision vs. recall)
- Make predictions for all matches (using cross-fold to avoid bias)
      
## Output/Results

- File in format `f'nsfg_data/df_output_modelling_{run_version}.csv'`
 - This is the output data that will be used in the next stage to create a product.






# Notebook setup

## Constants

In [None]:
# Full version tag of this run; it is embedded in the name of the exported file.
#   .2 include core and options as feature
#   .1 Just model B, Include standards that don't have label in output
run_version = 'v4.2.2'

# The input data is identified by the first two dot-separated parts only
# (everything before the 2nd '.').
data_version = '.'.join(run_version.split('.')[:2])

# Column used as the target variable
label = 'autoassign_in_top_3'

# When True, the hyper-parameter scan cells are executed
do_tuning = True

## Regular Imports

In [None]:
### Imports 
import os
import sys
import numpy as np
from pathlib import Path
import datetime

import warnings
warnings.simplefilter(action='ignore')

import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns

# `display` and `HTML` belong to the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container to 90% of the page
display(HTML("<style>.container { width:90% !important; }</style>"))

## My Imports

In [None]:
# Make the helper modules importable: they are expected in ../Functions,
# relative to the current working directory.
current_path = os.getcwd()
functions_path = Path('..') / 'Functions'
sys.path.append(os.fspath(functions_path))

In [None]:
## Pandas functions
import laurie_pandas_functions as pd_funcs
from laurie_pandas_functions import display_full

## Matplotlib funcs
import laurie_plotting_functions as plot_funcs
from laurie_plotting_functions import get_ax, force_ax_grid

In [None]:
## Useful when developing your functions
## Re-import both helper modules so edits to them are picked up without restarting the kernel
from importlib import reload  
reload(pd_funcs)
reload(plot_funcs)

In [None]:
def run_ls_on_path(path):
    """
    Run `ls` on a path from inside the notebook and print the listing.

    Has to be defined in the notebook itself (it cannot be imported from a
    module) because it uses the IPython `!` shell escape.

    Args:
        path (pathlib.WindowsPath): path created by pathlib. Any `~` in it is
            replaced by the current user's home directory.
    """
    userhome = os.path.expanduser("~")
    # Wrap the path in double quotes, turn each double backslash into `"/"` so every
    # segment is quoted separately, un-quote a lone `~` and expand it to the home directory.
    # NOTE(review): '\\\\' is two backslash characters; str(path) normally contains
    # single backslashes, so this replacement may never match - confirm the intent.
    reformatted_path = ('\"' + str(path).replace('\\\\', '\"/\"') + '\"').replace('\"~\"','~').replace('~', userhome)
    print(f'$ ls {path}')
    !ls {reformatted_path}
    print('\n')

## Pandas/Plotting

In [None]:
### Colours
blue, red, green = '#79a5f7', '#ff9696', '#9ebd9e'
sns_colours = sns.color_palette()

### Plot Size (supersize_me)
import matplotlib.pylab as pylab

# Every text element uses 'large', except the figure title which is one size bigger
params = dict.fromkeys(
    [
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    ],
    'large',
)
params['figure.titlesize'] = 'x-large'
pylab.rcParams.update(params)

## ML Stuff

In [None]:
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn import metrics

# Get Data

In [None]:
# Load the training data with features for the selected data version
df_training_raw = pd.read_csv(Path('nsfg_data', f'df_train_w_features__{data_version}.csv'))

In [None]:
# Count the distinct `standard_code` values in the raw training data
df_training_raw['standard_code'].nunique()

## Fill NAs

In [None]:
# Number of missing values in each column of the raw training data
count_nulls = df_training_raw.isna().sum()

In [None]:
# Show only the columns that contain at least one missing value
count_nulls[count_nulls > 0]

In [None]:
# Columns whose missing values are replaced by the sentinel -1
fill_na_minus_1 = [
    'soc_job_matches_standard_title',
    'standard_typical_job_title',
    'soc_job_matches_typical_job',
    'score_soc_job_match_typical_job',
    'score_soc_job_match_typical_job_minus_alt_1',
    'score_soc_job_match_typical_job_minus_alt_2',
]

# Fill on a copy so the raw frame is left untouched
df_training_no_nulls = df_training_raw.copy()
df_training_no_nulls[fill_na_minus_1] = df_training_no_nulls[fill_na_minus_1].fillna(-1)

## Remove Those without labels

In [None]:
# For every row, the total of `label` over all rows sharing its ref_no.
# The aggregation is named by string ('sum') rather than passing the Python builtin
# `sum`, which is the form pandas documents for group-wise transforms.
df_training_no_nulls['n_labels_for_ref_no'] = (
    df_training_no_nulls
    .groupby('ref_no')
    [label]
    .transform('sum')
)

In [None]:
# Split the rows on whether their ref_no has any positive label at all
_n_labels = df_training_no_nulls['n_labels_for_ref_no']
df_training_rm_no_labels = df_training_no_nulls.loc[_n_labels > 0]
df_training_no_labels = df_training_no_nulls.loc[_n_labels == 0]

## Get Train

In [None]:
# Only ref_nos with at least one label are used for training
df_training = df_training_rm_no_labels

# Split at score_rank 50: ranks 1-50 versus everything ranked lower
_score_rank = df_training['score_rank']
df_training_top_ranks = df_training.loc[_score_rank <= 50]
df_training_not_top_ranks = df_training.loc[_score_rank > 50]

# Modelling Set-up

## Functions

In [None]:
def get_probabilites(model, X, class_index=1):
    """
    Return the model's probability of one class for every row of X

    Args:
        model: classifier exposing `predict_proba` (scikit-learn style)
        X (pd.df): DF containing the features the model expects
        class_index (int): Column of the `predict_proba` output to return, default = 1

    Returns:
        y (np.array): One probability per row of X for the class at class_index
    """
    class_probabilities = model.predict_proba(X)
    return class_probabilities[:, class_index]

In [None]:
def plot_validation_and_test(data, x, ax, metric='log_loss', x_seperators=None):
    """
    Plot line plots of a metric for train and test against one scanned parameter

    Args:
        data (pd.df): Scan results. Must contain the column `x` and the columns
            `f'{metric}_train'` and `f'{metric}_test'`
        x (str): Column to plot on the x-axis
        ax: Matplotlib axes to draw on
        metric (str): Prefix of the metric columns, default = 'log_loss'
        x_seperators: Passed straight to `force_ax_grid`

    Returns:
        ax: The axes that were drawn on
    """
    for test_set in ('train', 'test'):
        sns.lineplot(data=data, x=x, y=f'{metric}_{test_set}', ax=ax, label=test_set.title())

    # Name the y-axis after the metric that is actually plotted (e.g. 'Log Loss')
    ax.set(ylabel=metric.replace('_', ' ').title())
    force_ax_grid(ax, x_seperators=x_seperators)
    ax.legend()
    return ax

## Set QA Thresholds

In [None]:
## Probability cut-offs for the QA scenarios:
## above SCORE_NO_QA we accept doing no QA, above SCORE_LOW_QA we accept a low-level QA
SCORE_NO_QA, SCORE_LOW_QA = 0.7, 0.5

## Print Out Scenario Stuff

In [None]:
def _summarise_qa_band(df_band, n_total):
    """Return (row count, count as % of n_total, mean of `label` as %) for one QA band."""
    band_stats = df_band[label].agg(['count', 'mean'])
    n_band = band_stats['count']
    return n_band, n_band * 100.0 / n_total, band_stats['mean'] * 100.0


def print_out_scenario_two_cuts(df_scenario, prob_col, cut_off_no_qa, cut_off_low_level_qa, data_label):
    """
    Print how the rows split into "no QA", "low-level QA" and "full QA" bands

    Rows are assigned to a band by their probability:
        - no QA:        prob >  cut_off_no_qa
        - low-level QA: cut_off_low_level_qa < prob <= cut_off_no_qa
        - full QA:      prob <= cut_off_low_level_qa
    For each band the number of rows, its share of all rows and the mean of the
    module-level `label` column (reported as accuracy) are printed.

    Args:
        df_scenario (pd.df): Rows to summarise; needs `prob_col` and the `label` column
        prob_col (str): Column holding the probability
        cut_off_no_qa (float): Probability above which no QA is done
        cut_off_low_level_qa (float): Probability above which only low-level QA is done
        data_label (str): Description of the data, used in the printed text
    """
    n_scenario = df_scenario.shape[0]
    probs = df_scenario[prob_col]

    ## No QA
    n_no_qa, pct_no_qa, accuracy_no_qa = _summarise_qa_band(
        df_scenario.loc[probs > cut_off_no_qa], n_scenario
    )

    ## Low-level QA
    n_low_qa, pct_low_qa, accuracy_low_qa = _summarise_qa_band(
        df_scenario.loc[(probs > cut_off_low_level_qa) & (probs <= cut_off_no_qa)], n_scenario
    )

    ## Full QA
    n_full_qa, pct_full_qa, accuracy_full_qa = _summarise_qa_band(
        df_scenario.loc[probs <= cut_off_low_level_qa], n_scenario
    )

    print(f'''
    In {data_label},
    which is {n_scenario} standards.
    
    If we were not to QA anything with a probability score > {cut_off_no_qa * 100.0:,.0f}%
    And only apply low-level QA to anything with a probability score > {cut_off_low_level_qa * 100.0:,.0f}%
    
    Then we would not do QA for
    - {n_no_qa:,.0f} Standards ({pct_no_qa:.1f}%)
    - The accuracy would be {accuracy_no_qa:.1f}%
    
    Only do low-level QA for 
    - {n_low_qa:,.0f} Standards ({pct_low_qa:.1f}%)
    - The accuracy would be {accuracy_low_qa:.1f}%
    
    And finally do QA for 
    - {n_full_qa:,.0f} Standards ({pct_full_qa:.1f}%)
    - The accuracy would be {accuracy_full_qa:.1f}%
    ''')

In [None]:
def print_out_scenarios_two_cuts(df_predictions_model, prob_col, cut_off_no_qa, cut_off_low_level_qa):
    """
    Print the two-cut QA summary for every scenario in `dict_scenario_desc_and_filter`

    Args:
        df_predictions_model (pd.df): Predictions to summarise
        prob_col (str): Column holding the probability
        cut_off_no_qa (float): Probability above which no QA is done
        cut_off_low_level_qa (float): Probability above which only low-level QA is done
    """
    scenarios = dict_scenario_desc_and_filter.items()
    for scenario_number, (description, row_filter) in enumerate(scenarios, start=1):
        # A falsy filter (e.g. None) means the scenario uses every row
        df_scenario = df_predictions_model.loc[row_filter] if row_filter else df_predictions_model

        print('\n---------------------------')
        print(f'Scenario {scenario_number}:')
        print_out_scenario_two_cuts(df_scenario, prob_col, cut_off_no_qa, cut_off_low_level_qa, description)


## Calibration

In [None]:
def do_calibration(df_predictions, rank_col, prob_col, model_name='Model', plot_top_rank=False):
    """
    Display a calibration table and plot for the predicted probabilities

    Probabilities are rounded to one decimal place to form groups. For each group
    the mean predicted probability is compared with the mean of the module-level
    `label` column, for all rows and for the rows with rank 1 only.

    Args:
        df_predictions (pd.df): Predictions; needs `prob_col`, `rank_col` and the
            `label` column. It is not modified.
        rank_col (str): Column holding the rank; rows equal to 1 are the top rank
        prob_col (str): Column holding the predicted probability
        model_name (str): Currently unused, kept for backward compatibility
        plot_top_rank (bool): Also plot the curve for the top-ranked rows only
    """
    # Work on a new frame so the grouping column is not written into the caller's data
    df_predictions = df_predictions.assign(
        model_output_prob_grouped=lambda df: df[prob_col].round(1)
    )

    df_calibration = (
        df_predictions
        .groupby('model_output_prob_grouped')
        .agg(**{
            'count': (prob_col, 'count'),
            prob_col: (prob_col, 'mean'),
            label: (label, 'mean'),
        })
        .reset_index()
    )

    df_top_predictions = df_predictions.loc[lambda df: df[rank_col] == 1]

    df_calibration_top = (
        df_top_predictions
        .groupby('model_output_prob_grouped')
        .agg({prob_col: 'mean', label: 'mean'})
        .reset_index()
    )

    # Columns coming from the top-rank table get this suffix in the merge below
    top_rank_suffix = '__top_rank'

    df_calibration = df_calibration.merge(
        df_calibration_top,
        how='outer',
        on='model_output_prob_grouped',
        suffixes=('', top_rank_suffix),
    )

    display(
        df_calibration
        .set_index('model_output_prob_grouped')
        .style
        .format('{:,.2f}')
        .set_table_styles(pd_funcs.get_lauries_table_styles())
    )

    ax = get_ax(width=7, height=7)

    df_calibration.plot(
        x=prob_col, y=label, ax=ax,
        marker='o', linestyle=':', label='Model Performance',
    )

    if plot_top_rank:
        # Use the column names the merge really produced (derived from prob_col and
        # label) instead of fixed names that only exist for one particular model
        df_calibration.plot(
            x=f'{prob_col}{top_rank_suffix}', y=f'{label}{top_rank_suffix}', ax=ax,
            marker='o', linestyle=':', label='Model Performance (Top Rank)',
        )

    # Diagonal reference line: predicted probability equals observed rate
    (
        df_calibration
        .assign(model_output_prob2=lambda df: df[prob_col])
        .plot(x=prob_col, y='model_output_prob2', ax=ax, linestyle='--', color='dimgrey', zorder=0, label='Ideal')
    )

    force_ax_grid(ax, x_seperators=0.1, y_seperators=0.1)
    ax.set(xlabel = 'Probability from Model',
           ylabel = 'True Probability',
           title = 'Calibration Plot - Probability that Auto-Assign is Correct'
          )
    
    

## Plot Probs

In [None]:
# Scenario description -> row filter applied to the predictions (None = use all rows)
dict_scenario_desc_and_filter = {
    'Entire data-set - Cross-folding': None,
}

In [None]:
def plot_probs(df_predictions, prob_col, rank_col):
    """
    Plot histograms of the predicted probability, split by the `label` column

    One figure is drawn for every combination of rank cut-off (all ranks, top 10,
    top rank) and scenario in the module-level `dict_scenario_desc_and_filter`.

    Args:
        df_predictions (pd.df): Predictions; needs `prob_col`, `rank_col` and `label`
        prob_col (str): Column holding the predicted probability
        rank_col (str): Column holding the rank used for the cut-offs
    """
    dict_rank_desc_and_filter = {
        'All Ranks': lambda df: df[rank_col] >= 0,
        # Cut-off agrees with the description shown in the plot title
        'Top 10': lambda df: df[rank_col] <= 10,
        'Top Rank': lambda df: df[rank_col] <= 1,
    }

    # 20 equal-width bins over [0, 1]; identical for every figure
    bins = np.linspace(0, 1, 21)

    for rank_desc, rank_filter in dict_rank_desc_and_filter.items():
        for scenario_desc, scenario_filter in dict_scenario_desc_and_filter.items():

            print(scenario_desc, rank_desc)

            data = df_predictions.loc[rank_filter]
            if scenario_filter:
                # Same filtering style as the other scenario helpers in this notebook
                data = data.loc[scenario_filter]

            ax = get_ax()

            sns.histplot(
                data=data,
                x=prob_col, hue=label,
                ax=ax, stat='percent', common_norm=False, bins=bins,
            )

            ax.set(title = f'{scenario_desc} - {rank_desc}'.replace('the', 'The'))
            force_ax_grid(ax)
            

## Performance Metrics

In [None]:
def _mean_recall_and_precision(data, label, row_filter):
    """Return (recall, precision) of the rows kept by `row_filter`, averaged over ref_no."""
    kept_labels = data.loc[row_filter].groupby('ref_no')[label]

    # Per ref_no: share of all its positive labels that survive the filter. A ref_no
    # with no surviving rows, or with no positive label at all, gives NaN and is
    # skipped by .mean()
    recall = (kept_labels.sum() / data.groupby('ref_no')[label].sum()).mean()

    # Per ref_no: share of the surviving rows that are positive
    precision = kept_labels.mean().mean()

    return recall, precision


def get_performance_metrics_df(data, label, rank_col, prob_col=None):
    """
    Compute recall and precision at several rank cut-offs, averaged over ref_no

    Args:
        data (pd.df): One row per candidate match; needs 'ref_no', `label` and
            `rank_col` (and `prob_col` when given). It is not modified.
        label (str): Column holding the target variable
        rank_col (str): Column holding the rank used for the cut-offs
        prob_col (str): Optional probability column. When given, the log loss and
            the recall/precision of rows with probability >= 0.5 are added

    Returns:
        dict: Metric name -> value, in the order the metrics are computed
    """
    dict_performance_metrics = {}

    dict_ranks_and_filters = {
        'Top Rank': data[rank_col] == 1,
        'Top 5': data[rank_col] <= 5,
        'Top 10': data[rank_col] <= 10,
        'Top 50': data[rank_col] <= 50,
    }

    for rank_str, rank_filter in dict_ranks_and_filters.items():
        recall, precision = _mean_recall_and_precision(data, label, rank_filter)
        dict_performance_metrics[f'Recall of {rank_str}'] = recall
        dict_performance_metrics[f'Precision of {rank_str}'] = precision

    if prob_col:
        dict_performance_metrics['logloss_score'] = metrics.log_loss(data[label], data[prob_col])

        # The key says '> 50%' but the cut-off itself is inclusive (>= 0.5)
        prob_str = '> 50% Prob'
        recall, precision = _mean_recall_and_precision(data, label, data[prob_col] >= 0.5)
        dict_performance_metrics[f'Recall of {prob_str}'] = recall
        dict_performance_metrics[f'Precision of {prob_str}'] = precision

    return dict_performance_metrics

In [None]:
def display_performance_metrics_df(dict_perf_metrics):
    """
    Print the performance metrics as an aligned list between two lines of asterisks

    Metrics whose name starts with 'Recall' or 'Precision' are printed as a
    percentage with one decimal, all others as a number with four decimals.

    Args:
        dict_perf_metrics (dict): Metric name -> numeric value
    """
    border = '*******************'

    print(border)
    for metric_name, metric in dict_perf_metrics.items():
        is_rate = metric_name.startswith(('Recall', 'Precision'))
        formatted = f'{metric*100:.1f}%' if is_rate else f'{metric:.4f}'
        print(f'{metric_name:25} : {formatted}')
    print(f'{border}\n')

In [None]:
## No model: baseline metrics when candidates are simply ranked by the `score_rank` column
dict_no_model_perf = get_performance_metrics_df(df_training, label=label, rank_col='score_rank')
display_performance_metrics_df(dict_no_model_perf)

# B - RF Select Features

## B - Get Data

In [None]:
# List the columns available in the training frame
df_training.columns

In [None]:
def _with_alt_variants(feature):
    """Return the feature followed by its `_minus_alt_1` and `_minus_alt_2` variants."""
    return [f'{feature}{suffix}' for suffix in ('', '_minus_alt_1', '_minus_alt_2')]


# Features of model B. The order is kept fixed because the feature-importance
# table pairs this list with the fitted model's importances by position.
list_model_feature_cols_model_b = [
    *_with_alt_variants('score_soc_job_match_standard_title'),
    *_with_alt_variants('score_soc_job_match_typical_job'),
    *_with_alt_variants('score_overview'),
    *_with_alt_variants('soc_2020_matches_previous_assignment'),
    'soc_2020_ext_is_nec',
    *_with_alt_variants('absolute_relative_distance_between_level_and_soc_major_group'),
    'is_core_and_options',
]

In [None]:
# Model B is fitted on the rows with score_rank 1-50 only
df_training_top_ranks = df_training.loc[df_training['score_rank'] <= 50]

# Features, target and the grouping key used for the grouped cross-validation
X_model_b = df_training_top_ranks[list_model_feature_cols_model_b]
y_model_b = df_training_top_ranks[label]
groups = df_training_top_ranks['ref_no']

## B - Scan Parameters

In [None]:
%%time
if do_tuning:
    ### Lists to store data. I will create a pandas df from this.
    dict_rf_scan_min_split = {
        'n_estimators': [],
        'max_depth': [],
        'log_loss_train': [],
        'log_loss_test': [],
        'min_samples_split': [],
        'model': [],
    }
    
    X = X_model_b
    y = y_model_b
    
    ## Ranges
    n_estimators = 100
    max_depths = [5, 8, 10, 12, 15, 20]
    min_samples_split_choices = [50, 75, 100, 150, 200] 
    
    
    for max_depth in max_depths:
        for min_samples_split in min_samples_split_choices:
            
            timestamp = datetime.datetime.today().strftime('%y/%m/%d %H:%M:%S')
            print(f'{timestamp} - {max_depth:6}, {n_estimators:6}, {min_samples_split:6,.0f}')
        
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)
            model.fit(X, y)
            
            probs_train = get_probabilites(model, X)
            log_loss_train = metrics.log_loss(y, probs_train)
            
            gkf = model_selection.GroupKFold(3)
            probs_test = model_selection.cross_val_predict(model, X, y, cv=gkf, groups=groups, method='predict_proba')
            log_loss_test = metrics.log_loss(y, probs_test)
                
            dict_rf_scan_min_split['n_estimators'].append(n_estimators)
            dict_rf_scan_min_split['max_depth'].append(max_depth)
            dict_rf_scan_min_split['log_loss_train'].append(log_loss_train)
            dict_rf_scan_min_split['log_loss_test'].append(log_loss_test)
            dict_rf_scan_min_split['min_samples_split'].append(min_samples_split)
            dict_rf_scan_min_split['model'].append(model)
            
    ## C - Store results in a df
    df_scan_model_b = pd.DataFrame(dict_rf_scan_min_split)
    df_scan_model_b['min_samples_split_str'] = df_scan_model_b['min_samples_split'].astype(str)

## B - Pick Parameter

### Just see test

In [None]:
if do_tuning:
    # Test log loss against min_samples_split, one line per max_depth
    ax = get_ax()

    sns.lineplot(
        data = df_scan_model_b,
        x = 'min_samples_split_str', y = 'log_loss_test', ax = ax,
        hue = 'max_depth', marker = 'o'
    )
    force_ax_grid(ax)
    ax.set(ylabel = 'LogLoss', xlabel = 'Min Samples Split')

    # Test log loss against max_depth, one line per min_samples_split
    ax = get_ax()

    sns.lineplot(
        data = df_scan_model_b,
        x = 'max_depth', y = 'log_loss_test', ax = ax,
        hue = 'min_samples_split', marker = 'o'
    )
    force_ax_grid(ax, x_seperators=5)
    ax.set(ylabel = 'LogLoss', xlabel = 'Max Depth')

    # The 10 parameter combinations with the lowest test log loss. An expression
    # inside an `if` block is not echoed by the notebook, so display it explicitly.
    display(df_scan_model_b.sort_values('log_loss_test').head(10).reset_index(drop=True))

### See Test vs Train

In [None]:
if do_tuning:
    # Train vs test log loss along min_samples_split, at max_depth fixed to 10
    ax = get_ax(nrows=1, width=10, height=4)
    data = df_scan_model_b.loc[df_scan_model_b['max_depth'] == 10]
    plot_validation_and_test(data, x='min_samples_split_str', ax=ax)
    ax.set(ylabel='Log Loss', xlabel='Min Size for Split')

    # Train vs test log loss along max_depth, at min_samples_split fixed to 100
    ax = get_ax(nrows=1, width=10, height=4)
    data = df_scan_model_b.loc[df_scan_model_b['min_samples_split'] == 100]
    plot_validation_and_test(data, x='max_depth', ax=ax, x_seperators=5)
    ax.set(ylabel='Log Loss', xlabel='Max Depth')

## B - Build Model

In [None]:
# Hyper-parameters used for the final model B
depth_model_b = 10                       # max_depth
min_samples_split_choices_model_b = 50   # min_samples_split
n_estimators_model_b = 100               # number of trees

In [None]:
# Fit model B on the top-rank training rows with the chosen hyper-parameters
model_b = RandomForestClassifier(
    n_estimators=n_estimators_model_b,
    max_depth=depth_model_b,
    min_samples_split=min_samples_split_choices_model_b,
)
model_b.fit(X_model_b, y_model_b)

## B - Score

In [None]:
# Log loss of model B on the data it was fitted on (class 1 probabilities)
probs_train = model_b.predict_proba(X_model_b)[:, 1]
log_loss_train = metrics.log_loss(y_model_b, probs_train)
print(log_loss_train)

In [None]:
# Out-of-fold class 1 probabilities from a 5-fold cross-validation grouped by ref_no
gkf = model_selection.GroupKFold(5)
probs_test = model_selection.cross_val_predict(
    model_b,
    X_model_b,
    y_model_b,
    cv=gkf,
    groups=groups,
    method='predict_proba',
)[:, 1]
log_loss_test = metrics.log_loss(y_model_b, probs_test)

print(f'''
mean_train_score_model_b      = {log_loss_train:,.4f}
mean_test_score_model_b       = {log_loss_test:,.4f}
''')

## B - Predict

In [None]:
# Rows with score_rank <= 50 were used to fit model B, so they receive the
# cross-validated probabilities. This relies on `probs_test` having one entry per
# row of this filter, in the same order.
# `.assign()` binds a new frame instead of writing a column into a filtered slice.
df_training_top_ranks_tmp = (
    df_training
    .loc[lambda df: df['score_rank'] <= 50]
    .assign(model_b_output_prob=probs_test)
)

# The remaining rows were not used for fitting and are scored by the fitted model directly
df_training_not_top_ranks_tmp = (
    df_training
    .loc[lambda df: df['score_rank'] > 50]
    .assign(
        model_b_output_prob=lambda df: get_probabilites(
            model_b,
            df[list_model_feature_cols_model_b],
        )
    )
)

df_training = pd.concat([
    df_training_top_ranks_tmp,
    df_training_not_top_ranks_tmp,
])

## B - Features Importance

In [None]:
import time

# Impurity-based importances of the forest, plus their spread over the individual trees
start_time = time.time()
importances = model_b.feature_importances_
std = np.array([tree.feature_importances_ for tree in model_b.estimators_]).std(axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

# The importances are in the order of the feature list the model was fitted with
dict_forest_importances = {
    'feature': list_model_feature_cols_model_b,
    'importance': importances,
    'std': std
}
df_forest_importances = pd.DataFrame(dict_forest_importances)

# Horizontal bars, sorted by importance, with the spread over trees as error bars
ax = get_ax(width=4)
df_forest_importances.sort_values('importance').plot(
    kind='barh',
    ax=ax,
    x='feature',
    y='importance',
    xerr='std',
)

ax.set(title="Feature importances using MDI", xlabel="Mean decrease in impurity", ylabel='')
force_ax_grid(ax)

## B - Rank and Accuracy

In [None]:
# Rank the candidates of each ref_no by model B probability (1 = highest);
# ties are broken by row order
_rank_within_ref_no = (
    df_training
    .groupby('ref_no')['model_b_output_prob']
    .rank(method='first', ascending=False)
    .astype(int)
)
df_training = df_training.assign(rank_model_b=_rank_within_ref_no)

# The single highest-probability candidate of every ref_no
df_top_predictions_model_b = df_training.loc[df_training['rank_model_b'] == 1]

In [None]:
# Show the baseline (score_rank) metrics first, then model B's metrics for comparison
display_performance_metrics_df(dict_no_model_perf)
dict_model_b = get_performance_metrics_df(df_training, prob_col='model_b_output_prob', label=label, rank_col='rank_model_b')
display_performance_metrics_df(dict_model_b)

## B - Calibration

In [None]:
# Calibration table and plot for model B's probabilities
do_calibration(df_training, prob_col='model_b_output_prob', rank_col='rank_model_b')

## B - Plot Probs

In [None]:
# Histograms of model B's probabilities, split by label
plot_probs(df_training, prob_col='model_b_output_prob', rank_col='rank_model_b')

## B - Print Out Scenarios

In [None]:
# QA scenario summary using only the top-ranked model B prediction of each ref_no
print_out_scenarios_two_cuts(df_top_predictions_model_b, 'model_b_output_prob', SCORE_NO_QA, SCORE_LOW_QA)

# Summary of perf

## Print Outs

In [None]:
# Print the metrics of the baseline and of model B under a heading each
for heading, perf_metrics in (('No Model', dict_no_model_perf), ('Model B', dict_model_b)):
    print(heading)
    display_performance_metrics_df(perf_metrics)

## Plot Recall

In [None]:
def plot_recall(data, dict_rank_models, max_match_rank=1, xrange=50):
    """
    Plot, per ranking, the cumulative % of matching rows found up to each rank

    Only rows with `match_rank` <= max_match_rank are used. For every ranking a
    cumulative step histogram of its rank column is drawn on a shared axes.

    Args:
        data (pd.df): Needs 'match_rank' and every rank column in dict_rank_models
        dict_rank_models (dict): Legend name -> rank column to plot
        max_match_rank (int): Highest `match_rank` to include, default = 1
        xrange (int): Highest rank to plot, default = 50

    Returns:
        ax: The axes that were drawn on
    """
    # One bin per integer rank 1..xrange, centred on the rank: edges 0.5, 1.5, ..., xrange + 0.5
    bins = np.arange(xrange + 1) + 0.5

    ax = get_ax()
    data_matches = data.loc[lambda df: df['match_rank'] <= max_match_rank]

    for i, (model_name, model_rank_col) in enumerate(dict_rank_models.items()):

        sns.histplot(
            data_matches,
            x=model_rank_col,
            bins=bins,
            label=model_name,
            ax=ax,
            cumulative=True,
            stat='percent',
            fill=False,
            element='step',
            color=sns_colours[i]
        )

    force_ax_grid(ax)
    ax.legend()
    # NOTE(review): the title text looks truncated - confirm the intended wording
    ax.set(title='What % ')

    return ax
        

In [None]:
# Legend name -> rank column, for each ranking compared in the recall plot
dict_rank_models = {
    'Baseline': 'score_rank',
    'Model B': 'rank_model_b',
}

In [None]:
# Cumulative rank distribution of the rows with match_rank <= 1, up to rank 50
ax = plot_recall(df_training, dict_rank_models, max_match_rank=1, xrange=50)

## Precision-Recall Curves

### PR Functions

In [None]:
def add_precision_recall_curve_to_ax(data, y_col, prob_col, model_name, ax):
    """
    Draw a precision-recall curve on ax and return the curve as a DF

    Args:
        data (pd.df): Needs the columns `y_col` and `prob_col`
        y_col (str): Column holding the true labels
        prob_col (str): Column holding the predicted probabilities
        model_name (str): Name shown for the curve
        ax: Matplotlib axes to draw on

    Returns:
        ax: The axes that were drawn on
        df (pd.df): Columns 'precision', 'recall' and 'thresholds'
    """
    precision, recall, thresholds = metrics.precision_recall_curve(data[y_col], data[prob_col])

    metrics.PrecisionRecallDisplay(
        precision=precision,
        recall=recall,
        estimator_name=model_name,
    ).plot(ax)

    # A final threshold of 1 is appended so the three columns have the same length
    df_curve = pd.DataFrame({
        'precision': precision,
        'recall': recall,
        'thresholds': [*thresholds, 1],
    })

    return ax, df_curve

In [None]:
def get_closest(list_values, find_value):
    """
    Return the element of list_values with the smallest absolute distance to find_value

    When several elements are equally close, the first one is returned.
    """
    def distance(value):
        return abs(value - find_value)

    return min(list_values, key=distance)

In [None]:
def add_precision_recall_annotation(ax, df_pr, threshold_to_annotate):
    """
    Mark the point of a precision-recall curve closest to a threshold and label it

    Args:
        ax: Matplotlib axes holding the curve
        df_pr (pd.df): Curve with columns 'precision', 'recall' and 'thresholds'
        threshold_to_annotate (float): Threshold to look up; the closest one in
            df_pr is used
    """
    threshold_annotate = get_closest(df_pr['thresholds'], threshold_to_annotate)

    # Several rows can share the threshold, so their precision/recall are averaged
    rows_at_threshold = df_pr.loc[df_pr['thresholds'] == threshold_annotate]
    precision_annotate = rows_at_threshold['precision'].mean()
    recall_annotate = rows_at_threshold['recall'].mean()

    annotation = (
        f'P > {threshold_annotate*100:,.0f}%\n'
        f'Pre. = {precision_annotate*100:,.0f}%\n'
        f'Rec. = {recall_annotate*100:,.0f}%'
    )

    point = (recall_annotate, precision_annotate)
    ax.scatter(x=[point[0]], y=[point[1]], color='black', zorder=3)
    ax.annotate(annotation, point, textcoords='offset points', xytext=(10, 10))
    

### PR Plot

In [None]:
# Precision-recall curve of model B over all rows, annotated at three probability thresholds
ax = get_ax(width=6, height=6)

ax, df = add_precision_recall_curve_to_ax(
    df_training,
    y_col=label,
    prob_col='model_b_output_prob',
    model_name='Model B',
    ax=ax,
)

for threshold in (0.5, 0.25, 0.02):
    add_precision_recall_annotation(ax, df, threshold)

force_ax_grid(ax, y_seperators=0.1, x_seperators=0.1)

## Distribution of Prob

In [None]:
# Cumulative distribution of model B's probability for false and true matches,
# once over the full range and once zoomed in on the lowest probabilities.
ax_full, ax_fine  = get_ax(ncols=2, width=12)

bins_fine = np.linspace(0, 0.1, 101)
bins_full = np.linspace(0, 1, 501)

# (axes, bins, x grid spacing, x limits, title) of each panel, in drawing order
_panels = [
    (ax_fine, bins_fine, 0.01, [0, 0.05], 'Distribution of P < 10%'),
    (ax_full, bins_full, 0.1, [0, 1], 'Distribution of Full Range'),
]

# (value of the label column, legend name) of each curve; colours follow this order
_curves = [(False, 'False Matches'), (True, 'True Matches')]

for _panel_ax, _panel_bins, _x_step, _x_limits, _panel_title in _panels:
    for _colour, (_label_value, _legend_name) in zip(sns_colours, _curves):
        sns.histplot(
            df_training.loc[df_training[label] == _label_value],
            x='model_b_output_prob',
            ax=_panel_ax,
            bins=_panel_bins,
            stat='percent',
            cumulative=True,
            label=_legend_name,
            fill=False,
            element='step',
            color=_colour,
        )

    force_ax_grid(_panel_ax, x_seperators=_x_step, y_seperators=10)
    _panel_ax.set(xlim=_x_limits, ylim=[0, 105], title=_panel_title, xlabel='Model Probability')
    _panel_ax.legend()

## Highest Prob Matches

In [None]:
# Show every row with a model B probability above 0.9, highest first
# (transposed, so each row of the data is shown as a column)
display_full(
    df_training
    .sort_values('model_b_output_prob', ascending=False)
    .loc[lambda df: df['model_b_output_prob'] > 0.9]
    .T

)

# Export Data

In [None]:
# Columns to write to the output file, in output order
export_cols = [
    'standard_code',
    'ref_no',
    'version',
    'standard_title',
    'soc_2020_ext_code',
    'soc_2020_ext_title',
    'match_rank',
    'soc_job_matches_standard_title',
    'score_soc_job_match_standard_title',
    'standard_typical_job_title',
    'soc_job_matches_typical_job',
    'score_soc_job_match_typical_job',
    'score_overview',
    'standard_overview',
    'soc_2020_ext_description',
    'soc_2020_matches_previous_assignment',
    'soc_2020_major_group',
    'soc_major_group_lower',
    'soc_major_group_upper',
    'level',
    'soc_in_suggested_major_group',
    'soc_2020_ext_is_nec',
    'status',
    'route',
    'is_core_and_options',
    'option_titles',
    'ids_soc_2020_code',
    'ids_soc_2020_rationale',
    'soc_2020_code',
    'soc_2020_title',
    'soc_2010_code',
    'autoassign_is_top',
    'autoassign_in_top_2',
    'autoassign_in_top_3',
    'autoassign_in_top_5',
    'autoassign_is_ranked',
    'st_code_has_match_rank',
    'score_rank',
    'model_b_output_prob',
    'rank_model_b',
]

# `st_code_has_match_rank` is only attached with `.assign(...)` when the export frame
# is built, so it is not yet a column of df_training here. It must be exempt from
# the filter below, otherwise it is silently dropped from the export.
cols_added_at_export = {'st_code_has_match_rank'}

# Keep only the columns that will exist in the export frame
export_cols = [
    c for c in export_cols
    if c in df_training.columns or c in cols_added_at_export
]

In [None]:
# Score the rows whose ref_no has no label with the fitted model B.
# `.assign()` binds a new frame instead of writing a column into the filtered
# slice that df_training_no_labels was created as.
df_training_no_labels = df_training_no_labels.assign(
    model_b_output_prob=get_probabilites(
        model_b,
        df_training_no_labels[list_model_feature_cols_model_b],
    )
)

In [None]:
# Stack the labelled and unlabelled rows, flagging which set each row came from
_df_all_predictions = pd.concat([
    df_training.assign(st_code_has_match_rank=1),
    df_training_no_labels.assign(st_code_has_match_rank=0),
])

df_output_modelling = (
    _df_all_predictions
    .sort_values(['ref_no', 'score_rank'])
    .loc[:, export_cols]
    .reset_index(drop=True)
)

# Write the predictions for this run version
path_predicion_export = Path('nsfg_data') / f'df_output_modelling_{run_version}.csv'
df_output_modelling.to_csv(path_predicion_export, index=False)