# Import Packages

In [None]:
import numpy as np
import pandas as pd
import boto3
import io
from sklearn import preprocessing
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import model_selection
import shap

# Seed NumPy's global random number generator so code that draws from it is repeatable between runs
np.random.seed(55)

# Define Functions

In [None]:

def create_column_lists(df, drop_columns):
    """Group the column names of a dataframe by the role they play in modelling.

    The result is a dictionary (a collection of key-value pairs written like
    {'key': value}) holding the following keys:
        'all_columns', 'keep_columns', 'date_columns_keep',
        'drop_columns', 'cat_columns', 'num_columns'

    Args:
        df (pd.DataFrame): the data frame whose columns are classified
        drop_columns (list): column names to be excluded from future analysis

    Returns:
        dict: dictionary mapping each group name to its column names
    """
    # -- every column name present in the dataframe
    all_columns = df.columns.tolist()

    # -- everything that has not been marked for dropping
    keep_columns = [name for name in all_columns if name not in drop_columns]

    # -- labels of the retained part of the dataframe
    kept_labels = df[keep_columns].columns

    # -- date columns are recognised by name: they contain 'DATE'
    date_columns_keep = [name for name in kept_labels if 'DATE' in name]

    # -- object-dtype columns count as categorical, all others as numeric
    catcols, numcols = [], []
    for name in kept_labels:
        target = catcols if df[name].dtype == 'O' else numcols
        target.append(name)

    # -- the unique identifier is never used as a numeric feature
    if 'unique_id' in numcols:
        numcols.remove('unique_id')

    return {
        'all_columns': all_columns,
        'keep_columns': keep_columns,
        'date_columns_keep': date_columns_keep,
        'drop_columns': drop_columns,
        'cat_columns': catcols,
        'num_columns': numcols,
    }

In [None]:
def process_date(df, date_columns):
    """
    Transform the date columns to weekday: a number 0-6 (Monday=0, Sunday=6)

    The dataframe is modified in place and is also returned, so both
    ``process_date(df, cols)`` and ``df = process_date(df, cols)`` work.

    Args:
        df (pd.DataFrame): the dataframe to transform
        date_columns (list): list of column names of date columns

    Returns:
        pd.DataFrame: the dataframe with date columns transformed
    """
    # -- nothing to convert: return the dataframe untouched
    if len(date_columns) == 0:
        return df

    # -- turning date columns into datetime format for python to read.
    # -- Every column is parsed before any is overwritten, so a column listed
    # -- twice is not re-parsed from its weekday numbers.
    # -- infer_datetime_format is not passed: it is deprecated since pandas 2.0,
    # -- where the format is inferred by default.
    parsed = {column: pd.to_datetime(df[column], yearfirst=True) for column in date_columns}

    # -- convert the dates to weekday for model processing
    for column, values in parsed.items():
        df[column] = values.dt.weekday  # day_name() would return name of day rather than integer

    return df

In [None]:
def create_model_input_array_one_hot_encoding(df, catcols, numcols):
    """Build the model input array from a dataframe.

    The categorical columns are one hot encoded and the encoded block is
    placed side by side with the numerical columns. The fitted encoder is
    returned as well so that the encoding can be reversed later.

    Args:
        df (pd.DataFrame): the dataframe to create array from
        catcols (list): list of categorical columns to encode
        numcols (list): list of numerical columns to append unchanged

    Returns:
        np.array, preprocessing.OneHotEncoder(): array to pass to model, the one hot encoder

    """
    # -- learn the categories and encode the categorical columns
    oneHot = preprocessing.OneHotEncoder()
    encoded = oneHot.fit_transform(df[catcols])

    # -- expand the sparse encoder output so it can be read by the ML model
    categorical_block = encoded.toarray()
    numeric_block = df[numcols].to_numpy()

    # -- encoded columns first, numerical columns after them
    array_to_go_to_model = np.hstack((categorical_block, numeric_block))

    return array_to_go_to_model, oneHot
    

In [None]:
def label_encoder_for_columns(df, catcols):
    """Function to use label encoder for columns.
    The function returns a dictionary of label encoders for
    all the categorical columns along with the transformed dataframe.
    The categorical columns of ``df`` are overwritten in place with
    their integer codes.

    Args:
        df (pd.DataFrame): the dataframe whose columns are encoded
        catcols (list): list of categorical columns to encode

    Returns:
        dict, pd.DataFrame: dictionary of fitted label encoders keyed by
        column name, and the dataframe with the encoded columns
    """
    encoder_dict = {}
    for colcls in catcols:
        le = preprocessing.LabelEncoder()
        # -- fit on the dataframe that was passed in rather than on a notebook global
        df[colcls] = le.fit_transform(df[colcls])
        encoder_dict[colcls] = le

    return encoder_dict, df

In [None]:
# -- Isolation Forest with a fixed random state
Iso_Forest = IsolationForest(random_state=55)

# -- Candidate values for the Isolation Forest parameter finder
ParamGrid = {
    'n_estimators': list(range(100, 800, 5)),
    'max_samples': list(range(100, 500, 5)),
    'max_features': [0.1, 0.25, 0.5, 0.75, 0.9, 1.0],
    'bootstrap': [True, False],
}

# -- Defining Scorer for GridSearchCV Parameter Finder

def scorer_f(estimator, array_to_go_to_model, y=None):
    """Score a fitted estimator by the mean of its per-sample scores.

    Args:
        estimator: fitted estimator exposing ``score_samples``
        array_to_go_to_model (np.array): the samples to score
        y: ignored. Accepted so the function matches scikit-learn's
            ``scorer(estimator, X, y)`` calling convention.

    Returns:
        float: mean of ``estimator.score_samples(array_to_go_to_model)``
    """
    return np.mean(estimator.score_samples(array_to_go_to_model))

# Read in CSVs including inferring the date format

In [None]:
# parse_dates=True asks pandas to try parsing the index as dates; the date
# columns themselves are converted later by process_date.
# infer_datetime_format is not passed: it is deprecated since pandas 2.0,
# where the date format is inferred by default.
lnsraw = pd.read_csv('xxxxxxxxxxxxxx/Pseud_lns.csv',
                     parse_dates=True)

# Pre-processing

#### Filter to only include paid or rejected bills (none that are still under assessment). Bills with an assessed claim total of 0 are also removed, as there is no risk associated with them.

In [None]:
# Keep only bills whose status is 'P' or 'R' (presumably paid / rejected)
lnsraw = lnsraw[lnsraw['I_STATUS'].isin(['P', 'R'])]

# Discard bills whose AC_TOTALS value is exactly 0
lnsraw = lnsraw[lnsraw['AC_TOTALS'].ne(0)]

## Use this section to select matter type - be sure to rename the output

In [None]:
# Count the rows for each M_NAME value to help choose which matter type to analyse
lnsraw.groupby('M_NAME').size()

In [None]:
# Restrict the data to the 'SCX' matter type, then remove duplicate rows
lnsraw = lnsraw.loc[lnsraw['M_NAME'].eq('SCX')].drop_duplicates()

#### List of columns to drop

In [None]:
# List of column names to exclude from the analysis.
# Selecting the names from lnsraw first keeps the KeyError for any name that is
# not in the data; .columns.tolist() then yields the plain list of names that
# create_column_lists documents, instead of a copy of the data itself.
drop_columns = lnsraw[['U_DATE', 'SR_INC', 'CAT_CODE',
                       'CAT_NAME', 'M_CODE', 'B_TASK_TYPE',
                       'D_TASK_TYPE', 'BS_DATE',
                       'CP_ID', 'SRI_WEEK_COMMENCING', 'DR_DATE',
                       'B_TYPE', 'DS_DATE', 'BUA_DATE',
                       'DEC', 'LOC', 'DT_STATUS', 'A_NUM_PSEUD',
                       'REF_NUM_PSEUD', 'CSL_NAME_PSEUD', 'C_NAME_PSEUD',
                       'I_ANUM_PSEUD', 'I_STATUS', 'BT_STATUS',
                       'AC_TOTAL', 'unique_id',
                       # Remove M Name if not Analysing all matter types
                       'M_NAME',
                       ]].columns.tolist()

Please note that LOC was removed because of null values whose meaning is unclear. DT_STATUS was removed because it 
was found to have no impact on the model.

### Really important: M_NAME is in the drop list because the model is being fed only one matter type. Remove it from the drop list when analysing all matter types.

 # Use the create_column_lists function to identify the columns that are of different types.

In [None]:
# Group the column names into all / keep / date / drop / categorical / numeric lists
column_dict = create_column_lists(lnsraw, drop_columns)

# Replace each retained date column with its weekday number.
# NOTE(review): the column lists above are built before this conversion, so a date
# column that was object dtype at that point stays in 'cat_columns' - confirm intended.
lnsraw = process_date(lnsraw, column_dict['date_columns_keep'])


# Remove NA from numerical values before encoding
lnsraw[column_dict['num_columns']] = lnsraw[column_dict['num_columns']].fillna(0)


 Show list of columns going into model:

In [None]:
# Display the categorical and numeric column names that are passed to the encoder
column_dict['cat_columns'], column_dict['num_columns']

In [None]:
# Display every column that remains after excluding the drop list
column_dict['keep_columns']

### Encoding

Using the functions from above to encode the columns.

In [None]:
# -- Disabled alternative: label-encode the categorical columns instead of one-hot encoding them
#le_dict,lnsraw =  label_encoder_for_columns(lnsraw, column_dict['cat_columns'])
#frame_to_go_to_model = lnsraw[column_dict['keep_columns']]

# -- One-hot encode the categorical columns and stack them with the numeric columns;
# -- the fitted encoder is kept so the encoded feature names can be looked up later
array_to_go_to_model, oneHot = create_model_input_array_one_hot_encoding(lnsraw, column_dict['cat_columns'],
                                                                         column_dict['num_columns'])

# Training the I-Forest Model


### Setting up IForest Model and parameters

In [None]:
# Search with the seeded Iso_Forest estimator and seed the search itself, so the
# sampled parameter combinations and the fitted forests are reproducible
param_optimiser = RandomizedSearchCV(Iso_Forest, ParamGrid, scoring=scorer_f, n_iter=80, random_state=55)

### Running the model

In [None]:
# Run the randomised parameter search on the encoded data and print the best combination found
IFor_params = param_optimiser.fit(array_to_go_to_model)
print(IFor_params.best_params_) 
# -- first attempt: {'n_estimators': 120, 'max_samples': 285, 'max_features': 0.75, 'bootstrap': True}

In [None]:
# random_state uses the same seed as the rest of the notebook so the fitted forest,
# and therefore the predictions and scores derived from it, are repeatable
IFor = IsolationForest(n_estimators=120, max_samples=285, max_features=1.0, bootstrap=False,
                       contamination=0.1, random_state=55)

In [None]:
# Fit the isolation forest on the full encoded feature array
IFor.fit(array_to_go_to_model)

In [None]:
# Predict on the same array the model was fitted on
# (scikit-learn returns -1 for outliers and 1 for inliers)
y_pred = IFor.predict(array_to_go_to_model)

# Explore the predictions

Creating a dataframe to translate each feature

In [None]:

# Feature names produced by the one hot encoder, followed by the numeric column
# names, in the same order as the columns of the model input array
encoded_columns = oneHot.get_feature_names_out()
model_feature_names = np.concatenate(
    [encoded_columns, np.array(column_dict['num_columns'])]
)

# Labelled copy of the model input
df_to_model = pd.DataFrame(data=array_to_go_to_model, columns=model_feature_names)

Using SHAP to explain/evaluate the model

In [None]:


# Build a SHAP explainer for the fitted isolation forest and compute SHAP values for the labelled model input
explainer = shap.Explainer(IFor)
shap_values = explainer(df_to_model)

Visualize the overall importance of each feature across all predictions (SHAP bar summary plot)

In [None]:

# Bar-style summary plot of the SHAP values for the model features
shap.summary_plot(shap_values, df_to_model, plot_type="bar")

Translator for Features

In [None]:
# Names of the one-hot encoded features, displayed as a table for reference
list_of_features = oneHot.get_feature_names_out()

pd.DataFrame(list_of_features)


# Put the predictions in to the original dataframe

In [None]:
# Prediction for each row, as returned by IFor.predict
lnsraw['MODEL_PREDICTION'] = y_pred

# scikit-learn decision function: score_samples shifted by the fitted offset_
lnsraw['SKLEARN_SCORE_ANOMALIES'] = IFor.decision_function(array_to_go_to_model)

# Anomaly score as defined in the original Isolation Forest paper.
# score_samples is the opposite of that score, so it is negated here.
# Deriving it as -decision_function + 0.5 is only valid when offset_ is -0.5
# (contamination='auto'), which is not the case for a fixed contamination.
lnsraw['ANOMALY_SCORE_ORIG_PAPER'] = -IFor.score_samples(array_to_go_to_model)

In [None]:
# Display the dataframe, which now carries the prediction and score columns
lnsraw

# Look at outliers

In [None]:
# List the names of the retained columns
list(lnsraw[column_dict['keep_columns']])

In [None]:
# lnsraw['IA_NUM_PSEUD'].value_counts(normalize=True)

In [None]:
# lnsraw['I_STATUS'].value_counts(normalize=True)

In [None]:
# lnsraw['I_STATUS'].value_counts(normalize=True)

In [None]:
# lnsraw['ACTIVITY'].value_counts(normalize=True)

In [None]:
# Write the dataframe, including the model output columns, to CSV without the index column
lnsraw.to_csv('xxxxxxxxxxxxxxxxxx/mdl.csv', index=False)