# Notebook Summary

## Introduction

- Process raw training data to have more features


## What is in this notebook

- Read in raw data from `01_create_training_data_and_apply_w2v_comparisons.ipynb`
   - The first part of the version number (e.g. `v4` from `v4.2`) refers to which output to read.
- Process features
   - Analyse level
   - Delta features (score compared to best alternative)
- Plot the features for common sense
- Output to build models on    
      
## Output/Results

- File in format `f'nsfg_data/df_train_w_features__{run_version}.csv'`
 - This contains the labelled training data with the added features, for the next stage




# Notebook Setup


## Version and Data Name

In [None]:
# Full version of this run; it is used in the name of the exported file
run_version = 'v4.2'
# Only the part before the first '.' (e.g. 'v4') selects which upstream output is read
version_in = run_version.partition('.')[0]

In [None]:
# Name of the input file to read; it is keyed on the major version only (`version_in`)
filename_train_data = f'df_train_data_nlp__{version_in}.csv'

## Regular Imports

In [None]:
### Imports 
import os
import sys
from pathlib import Path

import numpy as np
import math
from scipy import stats

import pandas as pd
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# `display` and `HTML` are part of the public IPython.display API; importing `display`
# from IPython.core.display is deprecated
from IPython.display import display, HTML
# Widen the notebook container so wide tables and plots fit on screen
display(HTML("<style>.container { width:98% !important; }</style>"))

In [None]:
def run_ls_on_path(path):
    """
    List the contents of a path in the notebook, in the style of ``ls``.

    Implemented with the standard library instead of the ``!ls`` shell escape, so the
    function is plain Python (importable), does not need an ``ls`` executable and is
    not affected by spaces or shell metacharacters in the path.

    Args:
        path (pathlib.Path or str): path to list. A leading ``~`` is expanded to the
            current user's home directory.

    Prints (returns None):
        - a ``$ ls {path}`` header showing the path exactly as it was given
        - for a directory: its non-hidden entries, sorted, one per line
        - for any other existing path: the expanded path itself
        - for a missing path: an ``ls``-style "No such file or directory" message
        - a trailing blank separator
    """
    expanded_path = Path(os.path.expanduser(str(path)))
    print(f'$ ls {path}')
    if expanded_path.is_dir():
        # Like plain `ls`, do not show dot-files
        visible_names = [name for name in os.listdir(expanded_path) if not name.startswith('.')]
        for entry_name in sorted(visible_names):
            print(entry_name)
    elif expanded_path.exists():
        print(expanded_path)
    else:
        # Report the problem instead of raising, as the shell command did
        print(f"ls: cannot access '{expanded_path}': No such file or directory")
    print('\n')

## Plotting Setup

In [None]:
### Colours
blue, red, green = '#79a5f7', '#ff9696', '#9ebd9e'
sns_colours = sns.color_palette()

### Make the plot text a nice size
import matplotlib.pylab as pylab
# Every text element is 'large', except the figure title which is one step bigger
params = dict.fromkeys(
    [
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    ],
    'large',
)
params['figure.titlesize'] = 'x-large'
pylab.rcParams.update(params)

## My Imports

In [None]:
# Add Path of where you have imported my functions
current_path = os.getcwd()
# Relative to the working directory, so the notebook must be run from its own folder
functions_path = Path('..', 'Functions')
# Guard the append so that re-running this cell does not pile up duplicate sys.path entries
if str(functions_path) not in sys.path:
    sys.path.append(str(functions_path))

In [None]:
## Pandas functions
import laurie_pandas_functions as pd_funcs
from laurie_pandas_functions import display_full

## Matplotlib funcs
import laurie_plotting_functions as plot_funcs
from laurie_plotting_functions import get_ax, force_ax_grid

In [None]:
## Useful when developing your functions
# Re-import both helper modules so edits to their source files are picked up
# without restarting the kernel
from importlib import reload  
reload(pd_funcs)
reload(plot_funcs)

# Get Data
Read the training data CSV from `path_data` and count the distinct standards it contains

In [None]:
# Local data directory: the training data is read from here and the final feature table is written back to it
path_data = Path('nsfg_data')

In [None]:
## Path to sharepoint directory (created using Add shortcut to OneDrive button)
# NOTE(review): pathlib does not expand '~' by itself - call .expanduser() (or pass the
# path to a helper that expands it) before using it on the filesystem
path_hackathon_project = Path(
 '~', 'OneDrive - Department for Education', 'Documents - DSDST', 'General', 'Hackathon', '2022 Spring Hackathon', 'Project 1 SOC Assignment'
)

In [None]:
## Raw training data (file name is keyed on `version_in`, directory is `path_data`)
df_train_raw = pd.read_csv(f'{path_data}/{filename_train_data}')

In [None]:
# Number of distinct standards in the raw data (missing codes are not counted)
df_train_raw['standard_code'].nunique(dropna=True)

## Add Rank

In [None]:
# De-duplicate the raw rows, build one key per standard version and rank the rows of each key by score
df_train_cols_processed = (
    df_train_raw
    .drop_duplicates()
    .assign(
        # e.g. '<standard_code>_V<version>'
        ref_no=lambda df: df['standard_code'] + '_V' + df['version'].astype(str),
        # Dense rank: rows with the same score share a rank (highest score = 1)
        score_rank_dense=lambda df: (
            df.groupby('ref_no')['score'].rank(method='dense', ascending=False).astype(int)
        ),
        # 'first' rank: ties are broken by row order, so every rank is unique within a ref_no
        score_rank=lambda df: (
            df.groupby('ref_no')['score'].rank(method='first', ascending=False).astype(int)
        ),
    )
    .rename(columns={'soc2020_code': 'soc2020_code_assignment'})
)

## Filter out ones with multiple rank 1s
- These typically have failed at the nlp stage for some reason
- A standard is filtered out when more than 300 of its rows tie for the top score (dense rank 1)

In [None]:
# Count, per ref_no, the rows that share the top (dense rank 1) score
df_n_rows_per_id = pd_funcs.count_n_rows_per_id(
    df_train_cols_processed[df_train_cols_processed['score_rank_dense'].eq(1)],
    'ref_no',
)

In [None]:
# A ref_no with more than 300 rows tied at the top score is treated as an outlier
df_outliers = df_n_rows_per_id[df_n_rows_per_id['n_rows_per_id'].gt(300)]
outlier_std_refs = list(df_outliers['ref_no'].unique())
# Keep every row whose ref_no is not an outlier
df_train_filtered = df_train_cols_processed[
    ~df_train_cols_processed['ref_no'].isin(outlier_std_refs)
]

## Merge Labels and Features

### Add columns

In [None]:
# Candidate labels, all derived from `match_rank` (a missing `match_rank` means "not ranked")
df_train_labelled = df_train_filtered.assign(
    autoassign_is_top=lambda df: df['match_rank'].eq(1),
    autoassign_in_top_2=lambda df: df['match_rank'].isin([1, 2]),
    autoassign_in_top_3=lambda df: df['match_rank'].isin([1, 2, 3]),
    autoassign_in_top_5=lambda df: df['match_rank'].isin([1, 2, 3, 4, 5]),
    autoassign_is_ranked=lambda df: df['match_rank'].notna(),
)

# The label used by the analysis and plots below
label = 'autoassign_in_top_3'

# Add Features

## Level Features

In [None]:
# Get params from Jody's file; it is joined to the training data on `level` in the next cell
df_level_params = pd.read_csv('data/soc_group_level_parameters.csv')

In [None]:
## Merge it on
# Left join, so every training row is kept even if its level has no parameters
df_train_w_features = (
    df_train_labelled
    .merge(df_level_params, how='left', on='level')
    .assign(
        # 1 when the SOC major group is inside the [lower, upper] range (both ends included), else 0
        soc_in_suggested_major_group=lambda df: (
            df['soc_2020_major_group']
            .between(df['soc_major_group_lower'], df['soc_major_group_upper'])
            .astype(int)
        ),
        # Distance of the SOC major group from the mean, in units of the standard deviation
        relative_distance_between_level_and_soc_major_group=lambda df: (
            df['soc_2020_major_group']
            .sub(df['soc_major_mean_train'])
            .div(df['soc_major_std_dev_train'])
        ),
        # Same distance with the sign removed
        absolute_relative_distance_between_level_and_soc_major_group=lambda df: (
            df['relative_distance_between_level_and_soc_major_group'].abs()
        ),
    )
)

## Is NEC

In [None]:
# Flag titles containing the literal text 'n.e.c.' (case-insensitive).
# regex=False is required: as a regular expression every '.' would match any character,
# so unrelated titles (e.g. one containing 'knee cap') would be flagged as well.
# na=False makes a missing title count as "not n.e.c." (0).
df_train_w_features['soc_2020_ext_is_nec'] = (
    df_train_w_features['soc_2020_ext_title']
    .str.lower()
    .str.contains('n.e.c.', regex=False, na=False)
    .astype(int)
)

## Delta Features

### Write Function

In [None]:
def add_delta_feature(data, col, gb_key='ref_no', rank_col='score_rank'):
    """
    Add two "delta" columns that compare each row's value of `col` with the best
    alternative rows of the same `gb_key` group.

    - `{col}_minus_alt_1`: value of col - (value of col for the highest ranked row that is not the same row)
    - `{col}_minus_alt_2`: value of col - (value of col for the next highest ranked row after that)

    So the alternatives are ranks 2 and 3 for a rank 1 row, ranks 1 and 3 for a rank 2 row,
    and ranks 1 and 2 for every other row.

    Args:
        data (pd.DataFrame): table containing `col`, `gb_key` and `rank_col`
        col (str): numeric column to build the deltas for
        gb_key (str): column identifying the group the ranks belong to
        rank_col (str): column holding the rank within the group (1 is the best)

    Returns:
        pd.DataFrame: `data` with the two delta columns added. A delta is NaN when the
        alternative rank does not exist for that group.
    """
    # Temporary column name for the value of `col` at each of the three top ranks
    rank_value_cols = {rank: f'{col}_for_rank_{rank}' for rank in (1, 2, 3)}

    ## Create table that has values for rank 1, 2 and 3 per gb_key.
    # If several rows of a group share a rank, pivot_table's default aggregation (mean) is used.
    df_alternatives = (
        data
        .loc[lambda df: df[rank_col] <= 3]
        .pipe(pd.pivot_table, columns=rank_col, index=gb_key, values=col)
        # pivot_table only returns the ranks that are present (and drops all-NaN columns),
        # so restore any missing rank as a NaN column instead of failing with a KeyError below
        .reindex(columns=list(rank_value_cols))
        .rename(columns=rank_value_cols)
        .reset_index()
    )

    df_out = (
        # Join alternatives on to main
        data
        .merge(df_alternatives, how='left', on=gb_key)
        ## Calculate value - 1st and 2nd highest ranking alternatives
        .assign(**{
            # Best alternative: rank 1, unless this row is rank 1 itself (then rank 2)
            f'{col}_minus_alt_1': lambda df: df[col] - np.where(
                df[rank_col] > 1, df[rank_value_cols[1]], df[rank_value_cols[2]]
            ),
            # Second alternative: rank 2, unless this row is rank 1 or 2 (then rank 3)
            f'{col}_minus_alt_2': lambda df: df[col] - np.where(
                df[rank_col] > 2, df[rank_value_cols[2]], df[rank_value_cols[3]]
            ),
        })
        # Drop alternative values
        .drop(list(rank_value_cols.values()), axis=1)
    )

    return df_out

### Test Function

In [None]:
# Use the public testing module; `pandas._testing` is private and may change without notice
from pandas.testing import assert_frame_equal

In [None]:
# Two standards with three candidate rows each; the second one is deliberately not in rank order
df_input = pd.DataFrame({
    'ref_no': ['ST0001_V1.0'] * 3 + ['ST0002_V1.0'] * 3,
    'score_soc_job_match_standard_title': [0.5, 0.2, 0.1, 0.4, 0.6, 0.5],
})
df_input['score_rank'] = (
    df_input
    .groupby('ref_no')['score_soc_job_match_standard_title']
    .rank(method='first', ascending=False)
    .astype(int)
)

# Expected deltas, written as (own value - alternative value) for each row
df_expected = df_input.assign(
    score_soc_job_match_standard_title_minus_alt_1=[
        0.5 - 0.2,
        0.2 - 0.5,
        0.1 - 0.5,
        0.4 - 0.6,
        0.6 - 0.5,
        0.5 - 0.6,
    ],
    score_soc_job_match_standard_title_minus_alt_2=[
        0.5 - 0.1,
        0.2 - 0.1,
        0.1 - 0.2,
        0.4 - 0.5,
        0.6 - 0.4,
        0.5 - 0.4,
    ],
)

df_result = add_delta_feature(df_input, 'score_soc_job_match_standard_title')

# check_like=True: the order of rows/columns does not matter, only their content
assert_frame_equal(df_result, df_expected, check_like=True)

display(df_result.style.set_table_styles(pd_funcs.get_lauries_table_styles()))

### Run Function

In [None]:
# Work on a copy so that df_train_w_features stays as it was
df_train_w_delta_features = df_train_w_features.copy()

# Each pass adds `<col>_minus_alt_1` and `<col>_minus_alt_2` for one feature
for col in [
    'score_soc_job_match_standard_title',
    'score_soc_job_match_typical_job',
    'score_overview',
    'soc_2020_matches_previous_assignment',
    'soc_in_suggested_major_group',
    'absolute_relative_distance_between_level_and_soc_major_group',
]:
    df_train_w_delta_features = add_delta_feature(
        df_train_w_delta_features,
        col=col,
        gb_key='ref_no',
        rank_col='score_rank',
    )

In [None]:
# Show the first row transposed, so every column name is visible
df_train_w_delta_features.head(1).transpose()

### Store Final Version

In [None]:
# Final table used by the analysis, plots and export below.
# This is an alias of df_train_w_delta_features, not a copy.
df_train = df_train_w_delta_features

# Analysis

## Current Accuracy

In [None]:
## Top Choice
# Keep only the highest scoring row (score_rank 1) of every ref_no
df_train_original_best = df_train[df_train['score_rank'].eq(1)]

pd_funcs.agg_df_by_cols(
    df_train_original_best,
    'autoassign_in_top_3',
    display_df=True,
    do_total=False,
)

## Which Rank Do We Need

In [None]:
## Check Truth
# For the rows where the label is True, aggregate by the score rank they were given
df_train_score_rank = pd_funcs.agg_df_by_cols(
    df_train[df_train[label].eq(True)],
    'score_rank',
    return_df=True,
    do_total=False,
    sort_by_cols=True,
)

# Running total of pct_rows down the table
df_train_score_rank['cumu_pct_rows'] = df_train_score_rank['pct_rows'].cumsum()

# Both percentage columns share one display format
(
    df_train_score_rank
    .set_index('score_rank')
    .style
    .set_table_styles(pd_funcs.get_lauries_table_styles())
    .format(dict.fromkeys(['pct_rows', 'cumu_pct_rows'], '{:,.1f}%'))
)

# Plot Features

In [None]:
# First three rows, transposed so every column name is visible
df_train.head(3).transpose()

In [None]:
# Each of these features is plotted as itself and as its two delta variants,
# followed by the two features that have no delta variants
features = [
    f'{base_feature}{suffix}'
    for base_feature in (
        'score_soc_job_match_standard_title',
        'score_soc_job_match_typical_job',
        'score_overview',
        'soc_2020_matches_previous_assignment',
        'soc_in_suggested_major_group',
    )
    for suffix in ('', '_minus_alt_1', '_minus_alt_2')
] + [
    'soc_2020_ext_is_nec',
    'is_core_and_options',
]


# All rows
data = df_train

for col in features:
    # One figure per feature: distribution on the left, bars on the right
    ax_hist, ax_bar = get_ax(ncols=2, width=15)
    plot_kwargs = {'data': data, 'x': col}

    # Distribution of the feature per label value, each normalised on its own
    sns.histplot(
        hue=label,
        ax=ax_hist,
        stat='probability',
        common_norm=False,
        bins=101,
        **plot_kwargs,
    )

    # Horizontal bars of the feature for each label value
    sns.barplot(y=label, ax=ax_bar, orient='h', **plot_kwargs)
    plot_funcs.annotate_hbar_ax(ax_bar, annotate_format='{:,.2f}')

    force_ax_grid([ax_hist, ax_bar])

In [None]:
# Same plots, restricted to the 50 highest scoring rows of every ref_no
data = df_train[df_train['score_rank'].le(50)]

for col in features:
    # One figure per feature: distribution on the left, bars on the right
    ax_hist, ax_bar = get_ax(ncols=2, width=15)
    plot_kwargs = {'data': data, 'x': col}

    # Distribution of the feature per label value, each normalised on its own
    sns.histplot(
        hue=label,
        ax=ax_hist,
        stat='probability',
        common_norm=False,
        bins=101,
        **plot_kwargs,
    )

    # Horizontal bars of the feature for each label value
    sns.barplot(y=label, ax=ax_bar, orient='h', **plot_kwargs)
    plot_funcs.annotate_hbar_ax(ax_bar, annotate_format='{:,.2f}')

    force_ax_grid([ax_hist, ax_bar])

In [None]:
# Level-distance features, restricted to the 10 highest scoring rows of every ref_no
data = df_train[df_train['score_rank'].le(10)]

for col in [
    f'absolute_relative_distance_between_level_and_soc_major_group{suffix}'
    for suffix in ('', '_minus_alt_1', '_minus_alt_2')
]:

    ax_hist, ax_bar = get_ax(ncols=2, width=15)

    # Fixed bin edges: 0 to 5 for the feature itself, -5 to 5 for its delta variants
    end = 5
    start = -end if '_alt_' in col else 0
    bins = np.linspace(start, end, 51)
    plot_kwargs = {'data': data, 'x': col}

    # Distribution of the feature per label value, each normalised on its own
    sns.histplot(
        hue=label,
        ax=ax_hist,
        stat='percent',
        common_norm=False,
        bins=bins,
        **plot_kwargs,
    )

    # Horizontal bars of the feature for each label value
    sns.barplot(y=label, ax=ax_bar, orient='h', **plot_kwargs)
    plot_funcs.annotate_hbar_ax(ax_bar, annotate_format='{:,.2f}')

    force_ax_grid([ax_hist, ax_bar])

# Export Features with Labels

In [None]:
# Last look at the table before export: first ten rows, transposed
df_train.head(10).transpose()

In [None]:
# Write the labelled feature table into `path_data`; the file name carries the full `run_version`
# and the index is not written
df_train.to_csv(f'{path_data}/df_train_w_features__{run_version}.csv', index=False)