# Model Performance

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

In [None]:
# Filename prefix identifying the model run whose saved outputs are analysed.
# It is used below to build the predictions pickle name
# (f"{FILE_PREFIX}_preds_test.pkl").
# NOTE(review): the prefix presumably encodes the architecture (embedding
# size 256, 4 heads, 4 layers) -- confirm against the training script.
FILE_PREFIX = 'GPT_emb256_nh4_nl4'

## Load Data

In [None]:
# Name of the dataset to evaluate; it selects the input directory below.
dataset = 'Alkaline phosphatase PafA'

# Directory holding this dataset's CSV splits, relative to the notebook.
# NOTE(review): the directory name contains two consecutive spaces before the
# parenthesis -- presumably it mirrors the on-disk name; confirm before
# "tidying" it.
data_dir = os.path.join(
    '..',
    'in_silico_supervised',
    'input',
    f'{dataset}  (In Silico_ Supervised)',
)

# Training split, and the test split that ships with its target values.
train_csv, test_csv = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('train.csv', 'test (with values).csv')
)

In [None]:
# Load the training split and the test split (the file that includes target
# values) into DataFrames.
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [None]:
# Load the model's saved test-set predictions for the run named by FILE_PREFIX.
# The path is relative to the current working directory.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# were produced by a trusted run of this project.
preds_test_df = pd.read_pickle(f"{FILE_PREFIX}_preds_test.pkl")

In [None]:
# Column names of the training CSV, in file order. They are captured here,
# before the 'dataset' label column is added to train_df further down, so the
# list only contains the columns read from disk. Re-running this cell after
# that labelling step would also pick up 'dataset'.
col_names = list(train_df.columns)
col_names

In [None]:
# Split the columns by position: the first two are treated as model inputs and
# every remaining column as a prediction target.
# NOTE(review): this relies on the CSV column order -- presumably the
# identifier/sequence columns come first; confirm against train.csv.
input_cols = col_names[:2]
target_cols = col_names[2:]

# Display both lists for a quick sanity check.
input_cols,target_cols

In [None]:
# Preview the first rows of the test split (with its target values).
test_df.head()

In [None]:
# Preview the first rows of the model's saved test-set predictions.
preds_test_df.head()

## Merge DataFrames

In [None]:
# Tag every frame with a provenance label so rows can still be told apart
# (and used as a plotting hue) once frames are stacked together.
for labelled_frame, provenance in (
    (train_df, 'train'),
    (test_df, 'test'),
    (preds_test_df, 'model'),
):
    labelled_frame['dataset'] = provenance

In [None]:
# Columns kept in the merged frame: the sequence, every target column, and
# the provenance label.
merge_cols = ['mutated_sequence', *target_cols, 'dataset']
merge_cols

In [None]:
# Stack the ground-truth test rows and the model predictions into one long
# frame; the 'dataset' column tells the two sources apart.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it both
# inputs keep their own row labels, so the same label can appear twice in the
# merged frame, and label-aligned operations downstream (e.g. seaborn's hue
# handling) can fail with "cannot reindex on an axis with duplicate labels".
merge_df = pd.concat((test_df, preds_test_df), ignore_index=True)

# Keep only the columns needed for the comparison plots.
merge_df = merge_df[merge_cols]
merge_df.head()

## Target Values

In [None]:
def _average_ranks(values):
    """Return 1-based ranks of a 1-D array, giving tied entries their mean rank."""
    order = np.argsort(values, kind="stable")
    ordered = values[order]

    # True wherever a new run of equal values begins in the sorted sequence.
    starts_run = np.ones(ordered.size, dtype=bool)
    starts_run[1:] = ordered[1:] != ordered[:-1]

    run_id = np.cumsum(starts_run) - 1      # which run each sorted entry is in
    run_len = np.bincount(run_id)           # number of entries in each run
    run_last = np.cumsum(run_len)           # rank of the last entry of each run
    run_mean = run_last - (run_len - 1) / 2.0

    ranks = np.empty(ordered.size, dtype=float)
    ranks[order] = run_mean[run_id]         # scatter back to the input order
    return ranks


def spearman_corr(pred, y):
    """Return Spearman's rank correlation between *pred* and *y*.

    Both inputs are converted to arrays and flattened, so lists and column
    vectors are accepted. Tied values receive the average of the ranks they
    span. A plain double argsort would instead hand ties arbitrary distinct
    ranks, which makes the result depend on element order and gives a
    constant vector a spurious non-zero correlation.

    Args:
        pred: Predicted values (array-like).
        y: Reference values (array-like) with the same number of elements.

    Returns:
        The Pearson correlation of the two rank vectors, or NaN when it is
        undefined (empty input, or either input has a single distinct value).

    Raises:
        ValueError: If the inputs do not contain the same number of elements.
    """
    pred = np.asarray(pred).ravel()
    y = np.asarray(y).ravel()
    if pred.shape != y.shape:
        raise ValueError(
            f"pred and y must have the same number of elements, "
            f"got {pred.size} and {y.size}"
        )
    if pred.size == 0:
        return float("nan")

    pred_rank = _average_ranks(pred)
    y_rank = _average_ranks(y)

    # Zero rank variance means all values are tied; the correlation is
    # undefined, so report NaN explicitly instead of dividing by zero.
    pred_std = pred_rank.std()
    y_std = y_rank.std()
    if pred_std == 0 or y_std == 0:
        return float("nan")

    # Normalize (zero mean and unit variance)
    pred_rank = (pred_rank - pred_rank.mean()) / pred_std
    y_rank = (y_rank - y_rank.mean()) / y_std

    # Pearson correlation of ranks
    return (pred_rank * y_rank).mean()

In [None]:
# One Spearman coefficient per target column, comparing the model's test
# predictions with the ground-truth test values.
# NOTE(review): the two frames are paired purely by row position -- this
# presumably relies on preds_test_df having the same row order as test_df;
# confirm against the code that writes the predictions pickle.
data = {
    "target": list(target_cols),
    "spearman_coeff": [
        spearman_corr(
            preds_test_df[target_name].to_numpy(),
            test_df[target_name].to_numpy(),
        )
        for target_name in target_cols
    ],
}

spearman_df = pd.DataFrame(data=data)
spearman_df


In [None]:
# Grid of pairwise plots over the target columns, coloured by the 'dataset'
# label so the measured test values ('test') and the model's predictions
# ('model') can be compared visually.
sns.pairplot(data=merge_df, hue='dataset',vars=target_cols)
plt.show()