In [86]:
import pandas as pd
import plotly.express as px
import numpy as np

In [38]:
fp_s1 = 'sup_data_s1.xlsx'
fp_s2 = 'sup_data_s2.csv'

In [37]:
s1 = pd.read_excel(fp_s1)

In [61]:
s2 = pd.DataFrame()

rows = []

with open(fp_s2, 'r') as f:
    lines = f.readlines()
    header = lines[0].strip().split(';')
    data_lines = lines[1:]
    for line in data_lines:
        content = line.strip().split(';')
        #print(content)
        for idx in range(0, len(content)):
            if "," in content[idx]:
                # replace , with .
                content[idx] = content[idx].replace(",", ".") + "%"
        # append line to s2 dataframe
        rows.append(content)
s2 = pd.DataFrame(rows, columns=header)
s2 = s2.dropna(how='all').reset_index(drop=True)


In [66]:
s2

Unnamed: 0,﻿Model,fine_tuning,Random_seed,GFP,AAV,GB,Staub,Melt,SubLoc,Dis,SecStr
0,ESM2 8M,full_model,99.0,68.6%,82.5%,89.1%,77.3%,58.1%,54.5%,71.0%,76.2%
1,ESM2 8M,full_model,98.0,68.9%,84.9%,88.2%,74.6%,58.5%,57.1%,71.6%,76.2%
2,ESM2 8M,full_model,97.0,69.1%,80.6%,87.5%,77.8%,58.8%,55.9%,71.8%,76.0%
3,ESM2 35M,full_model,99.0,69.2%,77.6%,87.4%,76.6%,58.2%,59.0%,72.7%,79.1%
4,ESM2 35M,full_model,98.0,68.7%,81.8%,88.1%,77.4%,59.0%,56.5%,74.3%,79.1%
5,ESM2 35M,full_model,97.0,68.9%,83.1%,88.2%,76.7%,62.3%,58.8%,73.1%,79.2%
6,ESM2 150M,full_model,99.0,69.3%,84.8%,87.8%,74.0%,63.8%,60.6%,74.8%,82.7%
7,ESM2 150M,full_model,98.0,68.9%,70.1%,87.8%,74.1%,62.6%,61.4%,72.9%,82.9%
8,ESM2 150M,full_model,97.0,69.0%,84.8%,88.1%,62.6%,61.5%,62.7%,74.4%,82.8%
9,ESM2 150M,full_model,96.0,-,84.2%,-,-,-,-,-,-


In [74]:
# merge s1 and s2. 

s1['fine_tuning'] = "embedding"
s2.columns = s2.columns.str.replace('\ufeff', '', regex=False)
s1 = s1[s2.columns]
merged_df = pd.concat([s1, s2], ignore_index=True)

In [77]:
merged_df.head()

Unnamed: 0,Model,fine_tuning,Random_seed,GFP,AAV,GB,Staub,Melt,SubLoc,Dis,SecStr
0,ESM2 8M,embedding,99,63.5%,68.2%,82.6%,71.6%,56.1%,52.0%,70.0%,75.3%
1,ESM2 8M,embedding,98,63.9%,66.5%,81.4%,75.6%,57.1%,53.1%,70.1%,75.2%
2,ESM2 8M,embedding,97,63.8%,66.3%,81.7%,75.1%,56.8%,51.6%,69.3%,75.2%
3,ESM2 8M,embedding,96,64.1%,66.4%,81.9%,74.8%,55.9%,52.2%,69.7%,75.3%
4,ESM2 8M,embedding,95,64.1%,66.9%,81.4%,78.0%,56.2%,51.2%,69.6%,75.2%


In [89]:
# Convert the relevant columns to integers
cols_to_convert = ["GFP", "AAV", "GB", "Staub", "Melt", "SubLoc", "Dis", "SecStr"]

# Loop over the columns and clean the data
for col in cols_to_convert:
    merged_df[col] = merged_df[col].replace(['-', ''], np.nan)
    merged_df[col] = merged_df[col].str.replace('%', '').astype(float)  # Remove '%' and convert to float
    merged_df[col] = merged_df[col].where(merged_df[col].notna(), np.nan)  # Convert the float to int
    merged_df[col] = merged_df[col].fillna(np.nan)
    merged_df[col] = merged_df[col].apply(lambda x: int(x) if pd.notna(x) else np.nan)


In [90]:
merged_df.head()

Unnamed: 0,Model,fine_tuning,Random_seed,GFP,AAV,GB,Staub,Melt,SubLoc,Dis,SecStr
0,ESM2 8M,embedding,99,63.0,68.0,82.0,71.0,56.0,52.0,70.0,75.0
1,ESM2 8M,embedding,98,63.0,66.0,81.0,75.0,57.0,53.0,70.0,75.0
2,ESM2 8M,embedding,97,63.0,66.0,81.0,75.0,56.0,51.0,69.0,75.0
3,ESM2 8M,embedding,96,64.0,66.0,81.0,74.0,55.0,52.0,69.0,75.0
4,ESM2 8M,embedding,95,64.0,66.0,81.0,78.0,56.0,51.0,69.0,75.0


### Variant specific

In [128]:
def plot_performance_per_feature(df, feature):
    embedding_mean_values = merged_df[merged_df['fine_tuning'] == 'embedding'].groupby('Model')[feature].mean()
    sorted_models = embedding_mean_values.sort_values().index

    fig = px.box(
        merged_df,
        x="Model",
        y=feature,
        color = 'fine_tuning',
        color_discrete_sequence=['darkblue',  'lightblue', 'red']
    )
    fig.update_layout(xaxis={'categoryorder': 'array', 'categoryarray': sorted_models},
        template="plotly_white", font={'family':'Arial', 'color':'#303496'},
    )

    fig.show()

In [129]:
plot_performance_per_feature(merged_df, 'GFP')

In [131]:
plot_performance_per_feature(merged_df, 'AAV')

In [132]:
plot_performance_per_feature(merged_df, 'GB')

In [137]:
plot_performance_per_feature(merged_df, 'Melt')

In [136]:
plot_performance_per_feature(merged_df, 'SubLoc')

In [135]:
plot_performance_per_feature(merged_df, 'Dis')

In [133]:
plot_performance_per_feature(merged_df, 'SecStr')