In [1]:
import sys
import os
analysis_dir_path = '../'
sys.path.append(analysis_dir_path)

import models
import pandas as pd
import warnings
warnings.filterwarnings('ignore')





In [2]:
# filters = 32
# kernels = 5
# units = 128

In [3]:
# CNN = f'CNN_{filters}_filters_{kernels}_kernels_predictions'
# CNNA = f'CNN_Attention_{filters}_filters_{kernels}_kernels_predictions'
# RNN = f'RNN_{units}_units_predictions'
# LSTM = f'LSTM_{units}_layers_predictions'

# name_maps = {
#     CNN: "CNN",
#     CNNA: "CNN With Attention",
#     RNN: "RNN",
#     LSTM: "LSTM"
# }

In [4]:
def format_model_name(row):
    """Build the prediction-file stem for one row of a results table.

    Args:
        row: Mapping-like object (for example a pandas Series) exposing a
            ``'Name'`` entry with the model family and a ``'Params'`` entry
            with a textual hyper-parameter description.

    Returns:
        str: One of ``LSTM_<units>_layers_predictions``,
        ``CNN_Attention_<filters>_filters_<kernel_size>_kernels_predictions``
        or ``RNN_<units>_units_predictions``.

    Raises:
        ValueError: If ``Name`` contains none of ``LSTM``, ``CNN`` or ``RNN``.
            This case used to fall through and return ``None``, which callers
            would then interpolate into a file path.
    """
    name = row['Name']
    params = row['Params']

    if 'LSTM' in name:
        # The unit count is the first space-separated token of Params.
        units = params.split(' ')[0]
        return f'LSTM_{units}_layers_predictions'

    if 'CNN' in name:
        # NOTE(review): every name containing 'CNN' is mapped to the
        # CNN_Attention stem; confirm plain-CNN rows never reach this function.
        parts = params.split(',')
        filters = parts[0].split(' ')[0]
        # Fourth space-separated token of the second comma-separated field;
        # for a field shaped like ' kernel size 5' the leading space produces
        # an empty first token, so index 3 is the trailing value.
        kernel_size = parts[1].split(' ')[3]
        return f'CNN_Attention_{filters}_filters_{kernel_size}_kernels_predictions'

    if 'RNN' in name:
        units = params.split(' ')[0]
        return f'RNN_{units}_units_predictions'

    raise ValueError(f'Unrecognized model name: {name!r}')

In [5]:
def model_select(csv_filepath):
    """Select four representative models from a results CSV.

    Reads the first 32 rows of ``csv_filepath``, discards rows whose class-0 or
    class-1 precision/recall is exactly 0 or 1, and discards rows whose
    ``Precision (1)`` is below twice the row's ``Prior``. From the remaining
    rows it picks the highest ``F1 (1)``, the highest ``Recall (1)``, the
    highest ``Precision (1)`` and the lowest ``F1 (1)``, printing each choice.

    Args:
        csv_filepath: Path of the results CSV. It must provide the columns
            ``Prior``, ``Precision (0)``, ``Recall (0)``, ``Precision (1)``,
            ``Recall (1)`` and ``F1 (1)``, plus whatever ``format_model_name``
            reads from a row.

    Returns:
        list[str]: Formatted model names in the order
        [highest F1, highest recall, highest precision, lowest F1].

    Raises:
        ValueError: If no row survives the filters.
    """
    # NOTE(review): only the first 32 rows are read; the reason for this limit
    # is not visible here - confirm it matches the CSV layout.
    df = pd.read_csv(csv_filepath, nrows=32)

    metric_columns = ['Precision (0)', 'Recall (0)', 'Precision (1)', 'Recall (1)']
    # Computed as a standalone Series so the loaded frame is not mutated.
    min_precision_requirement = df['Prior'] * 2

    # Drop rows where any precision/recall is exactly 0 or 1, and rows whose
    # class-1 precision does not reach twice the prior.
    keep_mask = (
        (df[metric_columns] != 0).all(axis=1)
        & (df[metric_columns] != 1).all(axis=1)
        & (df['Precision (1)'] >= min_precision_requirement)
    )
    filtered_df = df[keep_mask]

    if filtered_df.empty:
        # idxmax/idxmin on an empty selection also raise ValueError, but with a
        # message that does not say which file or filter was responsible.
        raise ValueError(
            f'No model in {csv_filepath!r} passes the precision/recall filters'
        )

    # Find models based on the criteria
    highest_f1_idx = filtered_df['F1 (1)'].idxmax()
    highest_recall_idx = filtered_df[filtered_df['Recall (1)'] < 1]['Recall (1)'].idxmax()
    highest_precision_idx = filtered_df['Precision (1)'].idxmax()
    lowest_f1_idx = filtered_df['F1 (1)'].idxmin()

    highest_f1 = filtered_df.loc[highest_f1_idx]
    highest_recall = filtered_df.loc[highest_recall_idx]
    highest_precision = filtered_df.loc[highest_precision_idx]
    lowest_f1 = filtered_df.loc[lowest_f1_idx]

    # Print the selected metric values
    print(f"Highest F1: {highest_f1['F1 (1)']} at model index {highest_f1_idx}")
    print(f"Highest Recall: {highest_recall['Recall (1)']} at model index {highest_recall_idx}")
    print(f"Highest Precision: {highest_precision['Precision (1)']} at model index {highest_precision_idx}")
    print(f"Lowest F1: {lowest_f1['F1 (1)']} at model index {lowest_f1_idx}")

    # Named `selected_models` so the imported `models` module is not shadowed.
    selected_models = [
        format_model_name(highest_f1),
        format_model_name(highest_recall),
        format_model_name(highest_precision),
        format_model_name(lowest_f1),
    ]

    return selected_models

In [6]:
def calculate_metrics(df, rule_result_column):
    """Compute F1, recall and precision of a binary prediction column.

    Args:
        df: DataFrame holding the predictions and a ``'corr'`` column that is
            compared against 1 (positive) and 0 (negative).
        rule_result_column: Name of the column with the 0/1 predictions.

    Returns:
        tuple: ``(f1, recall, precision)``. Any metric whose denominator is
        not positive is reported as ``0``.
    """
    predicted = df[rule_result_column]
    actual = df['corr']

    true_pos = ((predicted == 1) & (actual == 1)).sum()
    false_pos = ((predicted == 1) & (actual == 0)).sum()
    false_neg = ((predicted == 0) & (actual == 1)).sum()

    if (true_pos + false_pos) > 0:
        precision = true_pos / (true_pos + false_pos)
    else:
        precision = 0

    if (true_pos + false_neg) > 0:
        recall = true_pos / (true_pos + false_neg)
    else:
        recall = 0

    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0

    return f1, recall, precision

In [7]:
%%capture
import example

# COMMODITY = 'nickel_no_val_20'
# MODEL = CNNA
# RULE_NUM = 10
# confidence_levels = [0.4, 0.45, 0.5, 0.55, 0.6, 0.7, 0.8, 0.9, 0.95]

def exclude_zero(series):
    """Return a copy of ``series`` without the entries equal to 0 or 1.

    Despite the name, values equal to 1 are dropped as well. The input series
    is left untouched and the surviving entries keep their index labels.
    """
    not_degenerate = (series != 0) & (series != 1)
    return series[not_degenerate].copy()

# A function for providing useful labels for the results
def label(results, row):
    """Describe how ``row`` ranks among the base-model metrics in ``results``.

    The extremes are taken over each base-model column after ``exclude_zero``
    has removed the values 0 and 1.

    Args:
        results: DataFrame with the columns ``Precision (Base Model)``,
            ``Recall (Base Model)`` and ``F1 (Base Model)``.
        row: One row of ``results`` (any mapping exposing the same columns).

    Returns:
        str: Comma-separated subset of "Best Precision", "Best Recall",
        "Best F1" and "Worst F1", in that order; empty when none applies.
    """
    # (column, extreme value over the whole table, tag awarded on a match)
    criteria = (
        ("Precision (Base Model)", exclude_zero(results["Precision (Base Model)"]).max(), "Best Precision"),
        ("Recall (Base Model)", exclude_zero(results["Recall (Base Model)"]).max(), "Best Recall"),
        ("F1 (Base Model)", exclude_zero(results["F1 (Base Model)"]).max(), "Best F1"),
        ("F1 (Base Model)", exclude_zero(results["F1 (Base Model)"]).min(), "Worst F1"),
    )

    matched = []
    for column, extreme, tag in criteria:
        if extreme == row[column]:
            matched.append(tag)
    return ', '.join(matched)

def evaluate_df(df, properties=None):
    """Score base-model predictions, run EDCR, and report both side by side.

    Steps performed:
      1. Base-model F1/recall/precision are computed from the ``pred`` column
         via ``calculate_metrics``; the prior is the mean of ``corr``.
      2. ``df`` is dumped as a NumPy array to ``data/test.npy``.
      3. ``example.run_edcr()`` is called and the row at position 50 of its
         result supplies the EDCR precision (``pre``), ``recall`` and ``F1``.

    Args:
        df: DataFrame with at least the columns ``pred`` and ``corr``.
        properties: Optional dict of extra key/value pairs placed at the
            front of the returned record (for example model and algorithm).

    Returns:
        dict: ``properties`` followed by the base-model metrics, the prior,
        the EDCR metrics, the absolute improvements and the relative
        improvements. Groups are separated by blank spacer entries whose keys
        consist of one, two and three spaces.
    """
    # A None default avoids sharing one mutable dict between calls.
    if properties is None:
        properties = {}

    f1, recall, precision = calculate_metrics(df, "pred")
    prior = df["corr"].sum() / len(df)

    # NOTE(review): presumably example.run_edcr() reads this file as its
    # input - confirm against the `example` module.
    df.to_numpy().dump('data/test.npy')
    edcr_results = example.run_edcr()

    # NOTE(review): the meaning of position 50 in the EDCR result is not
    # visible here - confirm which configuration that row represents.
    edcr_row = edcr_results.iloc[50]
    new_precision = edcr_row["pre"]
    new_recall = edcr_row["recall"]
    new_f1 = edcr_row["F1"]

    # NOTE(review): calculate_metrics can return 0 for a base metric, in which
    # case these ratios divide by zero; that case is not handled here.
    percent_precision = (new_precision - precision) / precision
    percent_recall = (new_recall - recall) / recall
    percent_f1 = (new_f1 - f1) / f1

    return {
        **properties,
        "Precision (Base Model)": precision,
        "Recall (Base Model)": recall,
        "F1 (Base Model)": f1,
        "Prior": prior,
        " ": "",
        "Precision (EDCR)": new_precision,
        "Recall (EDCR)": new_recall,
        "F1 (EDCR)": new_f1,
        "  ": "",
        "Precision Improvement": new_precision - precision,
        "Recall Improvement": new_recall - recall,
        "F1 Improvement": new_f1 - f1,
        # Three spaces: this spacer previously reused the two-space key, so
        # the duplicate collapsed into the earlier entry and no blank column
        # separated the absolute and relative improvements.
        "   ": "",
        "Precision Improvement (%)": percent_precision,
        "Recall Improvement (%)": percent_recall,
        "F1 Improvement (%)": percent_f1,
    }

# Top-N rule sweep: for every commodity, evaluate each selected model with both
# algorithms and several rule counts, then write one spreadsheet per commodity.

# Create the output directory up front so a long sweep cannot fail at the very
# end because the target folder is missing.
os.makedirs('out/top_f1', exist_ok=True)

for COMMODITY in [
    # Other available datasets:
    # 'cobalt_20', 'cobalt_no_val_20', 'cobalt_shift_20', 'cobalt_streaming_20', 'cobalt_20',
    # 'copper_20', 'copper_no_val_20', 'copper_shift_20', 'copper_streaming_20', 'copper_20',
    # 'magnesium_20', 'magnesium_no_val_20', 'magnesium_shift_20', 'magnesium_streaming_20', 'magnesium_20',
    # 'nickel_20', 'nickel_no_val_20', 'nickel_shift_20', 'nickel_streaming_20', 'nickel_20'
    # 'cobalt_20', 'copper_20', 'magnesium_20', 'nickel_20',
    'cobalt_shift_20', 'copper_shift_20', 'magnesium_shift_20', 'nickel_shift_20',
]:
    results = []
    MODELS = model_select(f'../{COMMODITY}/test/results_test.csv')
    print(f'({COMMODITY}): {MODELS}')
    for MODEL in MODELS:
        for ALGO in ['correction', 'detection_correction']:
            for RULE_NUM in [5, 10, 15, 20, 50, 100]:
                df = models.npy_to_top_n_f1_bowpy(
                    f'../{COMMODITY}/test/predictions/test/{MODEL}.csv',
                    f'../{COMMODITY}/test/predictions/test_F1_{ALGO}',
                    RULE_NUM,
                )
                results.append(evaluate_df(df, properties={"Model": MODEL, "Algorithm": ALGO, "Rule Num": RULE_NUM}))

    results = pd.DataFrame(results)
    # Tag the rows holding the best/worst base-model metrics for this commodity.
    results['Label'] = results.apply(lambda x: label(results, x), axis=1)
    results.to_excel(f'out/top_f1/{COMMODITY}_results.xlsx', index=False)

In [8]:
%%capture
import example

# COMMODITY = 'nickel_no_val_20'
# MODEL = CNNA
# RULE_NUM = 10
# confidence_levels = [0.4, 0.45, 0.5, 0.55, 0.6, 0.7, 0.8, 0.9, 0.95]

# NOTE(review): this cell re-defines exclude_zero exactly as the earlier top-N
# cell (In [7]) does; one of the two definitions could be dropped.
def exclude_zero(series):
    """Return a copy of ``series`` without the entries equal to 0 or 1.

    Despite the name, values equal to 1 are dropped as well. The input series
    is left untouched and the surviving entries keep their index labels.
    """
    not_degenerate = (series != 0) & (series != 1)
    return series[not_degenerate].copy()

# A function for providing useful labels for the results
# NOTE(review): this cell re-defines label exactly as the earlier top-N cell
# (In [7]) does; one of the two definitions could be dropped.
def label(results, row):
    """Describe how ``row`` ranks among the base-model metrics in ``results``.

    The extremes are taken over each base-model column after ``exclude_zero``
    has removed the values 0 and 1.

    Args:
        results: DataFrame with the columns ``Precision (Base Model)``,
            ``Recall (Base Model)`` and ``F1 (Base Model)``.
        row: One row of ``results`` (any mapping exposing the same columns).

    Returns:
        str: Comma-separated subset of "Best Precision", "Best Recall",
        "Best F1" and "Worst F1", in that order; empty when none applies.
    """
    # (column, extreme value over the whole table, tag awarded on a match)
    criteria = (
        ("Precision (Base Model)", exclude_zero(results["Precision (Base Model)"]).max(), "Best Precision"),
        ("Recall (Base Model)", exclude_zero(results["Recall (Base Model)"]).max(), "Best Recall"),
        ("F1 (Base Model)", exclude_zero(results["F1 (Base Model)"]).max(), "Best F1"),
        ("F1 (Base Model)", exclude_zero(results["F1 (Base Model)"]).min(), "Worst F1"),
    )

    matched = []
    for column, extreme, tag in criteria:
        if extreme == row[column]:
            matched.append(tag)
    return ', '.join(matched)

# Threshold sweep: for every commodity, evaluate each selected model with both
# algorithms and several thresholds, then write one spreadsheet per commodity.

# Create the output directory up front so a long sweep cannot fail at the very
# end because the target folder is missing.
os.makedirs('out/threshold', exist_ok=True)

for COMMODITY in [
    # Other available datasets:
    # 'cobalt_20', 'cobalt_no_val_20', 'cobalt_shift_20', 'cobalt_streaming_20', 'cobalt_20',
    # 'copper_20', 'copper_no_val_20', 'copper_shift_20', 'copper_streaming_20', 'copper_20',
    # 'magnesium_20', 'magnesium_no_val_20', 'magnesium_shift_20', 'magnesium_streaming_20', 'magnesium_20',
    # 'nickel_20', 'nickel_no_val_20', 'nickel_shift_20', 'nickel_streaming_20', 'nickel_20'
    # 'cobalt_20', 'copper_20', 'magnesium_20', 'nickel_20',
    'cobalt_shift_20', 'copper_shift_20', 'magnesium_shift_20', 'nickel_shift_20',
]:
    results = []
    MODELS = model_select(f'../{COMMODITY}/test/results_test.csv')
    print(f'({COMMODITY}): {MODELS}')
    for MODEL in MODELS:
        for ALGO in ['correction', 'detection_correction']:
            for THRESHOLD in [0.1, 0.15, 0.2, 0.25, 0.30, 0.35, 0.4, 0.45, 0.5]:
                df = models.npy_to_threshold_f1_bowpy(
                    f'../{COMMODITY}/test/predictions/test/{MODEL}.csv',
                    f'../{COMMODITY}/test/predictions/test_F1_{ALGO}',
                    THRESHOLD,
                    exclude_models=[],
                )
                results.append(evaluate_df(df, properties={"Model": MODEL, "Algorithm": ALGO, "Threshold": THRESHOLD}))

    results = pd.DataFrame(results)
    # Tag the rows holding the best/worst base-model metrics for this commodity.
    results['Label'] = results.apply(lambda x: label(results, x), axis=1)
    results.to_excel(f'out/threshold/{COMMODITY}_results.xlsx', index=False)