In [8]:
import re
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.patches as mpatches

In [2]:
def parse_log(file_path, metric = 'f1_weighted', part = 'tune'):
    """Extract every logged value of one metric from a text log.

    Scans the file line by line for tokens of the form
    ``{part}/{metric}=<number>,`` and collects the numbers.

    Args:
        file_path: Path of the log file to read.
        metric: Metric name as it appears after the ``/`` in the log.
        part: Prefix that appears before the ``/`` (this notebook uses
            ``'train'`` and ``'tune'``).

    Returns:
        list[float]: The matched values in the order they occur in the file;
        an empty list when nothing matches.
    """
    # Plain or scientific notation, optional sign, either exponent case.
    number = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
    # Escape the names so characters such as '.' are matched literally, and
    # compile once instead of re-parsing the pattern for every line.
    pattern = re.compile(rf'{re.escape(part)}/{re.escape(metric)}=({number}),')

    parsed = []
    with open(file_path, 'r') as file:
        for line in file:
            parsed.extend(float(token) for token in pattern.findall(line))
    return parsed

## Check progression

In [3]:
# (log file path, legend label) pairs: each path is read by parse_log and each
# label is used as the line label in the progression plots below.
# NOTE: `paths` is rebound by later cells, so cell execution order determines
# which runs and labels a plot uses.
paths = [
    ('experiment/results/inception-random/logs.log.INFO', 'inception_lr0.00001'),
    ('experiment/results/efficientnetb03-random-h1/logs.log.INFO', 'efficientnetb3_lr0.001'),
    ('experiment/results/nasnetlarge-random-h1/logs.log.INFO', 'nasnetlarge_lr0.001'),
    ('experiment/results/resnet152v2-random-h1/logs.log.INFO', 'resnet152v2_random_lr0.001')
]

In [None]:
def _plot_train_tune_curves(metric, runs):
    """Draw one figure with the training and validation curves of a metric.

    Args:
        metric: Metric name handed to ``parse_log``; also used as the y-axis
            label so figures for different metrics can be told apart.
        runs: Iterable of ``(log_path, label)`` pairs; one line is drawn per
            pair in each panel.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel reads the 'train/...' entries, right panel the 'tune/...' ones.
    for ax, part, title in zip(axes, ('train', 'tune'), ('Training', 'Validation')):
        for path, label in runs:
            curve = parse_log(path, metric, part=part)
            # x is simply the position of each value in the log.
            ax.plot(range(len(curve)), curve, label=label)
        ax.set_title(title)
        ax.set_xlabel('Steps')
        ax.set_ylabel(metric)
        ax.legend()

    fig.tight_layout()
    plt.show()


# One figure per metric, using the runs currently bound to `paths`.
for metric in ('loss', 'f1_weighted'):
    _plot_train_tune_curves(metric, paths)

## Figure 1

In [20]:
# (log file path, display name) pairs for the figures and tables that follow.
# The second element is used as the legend label in the plots and as the
# 'Architecture' value in the tables.
# NOTE: this rebinds the `paths` defined in an earlier cell and is itself
# rebound further down, so cell execution order matters.
paths = [
    ('experiment/results/inception-random/logs.log.INFO', 'Inception V3'),
    ('experiment/results/efficientnetb03-random-h1/logs.log.INFO', 'EfficientNet-B3'),
    ('experiment/results/nasnetlarge-random-h1/logs.log.INFO', 'NASNet Large'),
    ('experiment/results/resnet152v2-random-h1/logs.log.INFO', 'ResNet-152 V2')
]

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(6.6, 3))

# (axes, log prefix, panel title) for each panel, left to right.
panels = (
    (ax1, 'train', 'Training Loss'),
    (ax2, 'tune', 'Validation Loss'),
)

for ax, part, title in panels:
    # One loss curve per run; x is the position of the value in the log.
    for path, label in paths:
        curve = parse_log(path, metric='loss', part=part)
        ax.plot(range(len(curve)), curve, label=label, linewidth=0.7, alpha=1)
    ax.set_title(title)
    ax.set_ylabel('Loss')
    ax.set_xlabel('Steps')
    ax.legend()
    # Both panels share the same y-range.
    ax.set_ylim((0.58,0.7))

# Panel letters (A, B) placed in axes coordinates, above and left of each panel.
for i, ax in enumerate((ax1, ax2)):
    letter = f'{chr(65+i)}'
    ax.text(-0.5, 1.15, letter, transform=ax.transAxes,
            fontsize=16, fontweight='bold', va='top')

plt.tight_layout()
plt.show()

## Figure 2

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 3))

# (axes, logged metric name, panel title, y-axis label) per panel, left to right.
panels = (
    (ax1, 'f1_weighted', 'Validation F1', 'F1'),
    (ax2, 'precision_1', 'Validation Precision', 'Precision'),
    (ax3, 'recall_1', 'Validation Recall', 'Recall'),
)

for ax, metric_name, title, y_label in panels:
    # One curve per run, read from the 'tune/...' entries of its log.
    for path, label in paths:
        curve = parse_log(path, metric=metric_name, part='tune')
        ax.plot(range(len(curve)), curve, label=label,linewidth=0.7)
    ax.set_title(title)
    ax.legend()
    # All three panels share the same y-range.
    ax.set_ylim((0.85,0.975))
    ax.set_ylabel(y_label)
    ax.set_xlabel('Steps')

# Panel letters (A, B, C) placed in axes coordinates, above and left of each panel.
for i, ax in enumerate((ax1, ax2, ax3)):
    letter = f'{chr(65+i)}'
    ax.text(-0.5, 1.15, letter, transform=ax.transAxes,
            fontsize=16, fontweight='bold', va='top')

plt.tight_layout()
plt.show()

## Table 4

In [None]:
metrics = ['f1_weighted','precision_1','recall_1']

# One row per run: the last 'tune/<metric>' value found in its log, expressed
# as a percentage rounded to two decimals.
rows = []
for path, label in paths:
    row = {'Architecture': label}
    for metric in metrics:
        values = parse_log(path, metric, part='tune')
        if not values:
            # Name the file and metric instead of failing with a bare IndexError.
            raise ValueError(f"no 'tune/{metric}' values found in {path}")
        row[metric] = round(values[-1] * 100, 2)
    rows.append(row)

# Build the frame once and rename without mutating in place, so re-running
# the cell always starts from the same state.
df = pd.DataFrame(rows, columns=['Architecture', *metrics]).rename(
    columns={'f1_weighted': 'F1',
             'precision_1': 'Precision',
             'recall_1': 'Recall'}
)

# Last expression of the cell, so the table is actually rendered.
df

## Figure 3

In [None]:
# Per-class F1 names as they appear after 'tune/' in the logs.
metrics = ['f1_het','f1_homalt','f1_homref']

# Column-oriented table: run names plus, for each metric, the last logged
# 'tune/<metric>' value as a percentage rounded to two decimals.
data_dict = {'Architecture': [label for _, label in paths]}
for metric in metrics:
    column = []
    for path, _ in paths:
        values = parse_log(path, metric, part='tune')
        if not values:
            # Name the file and metric instead of failing with a bare IndexError.
            raise ValueError(f"no 'tune/{metric}' values found in {path}")
        column.append(round(values[-1] * 100, 2))
    data_dict[metric] = column

# Only the columns that exist in this table are renamed.
df = pd.DataFrame(data_dict).rename(
    columns={'f1_het': 'F1 (het)',
             'f1_homalt': 'F1 (homalt)',
             'f1_homref': 'F1 (homref)'}
)

# Last expression of the cell, so the table is actually rendered.
df

In [None]:
# Per-class metric names looked up under 'tune/' in the logs, one list per
# metric family. The order (homref, het, homalt) matters: create_df labels the
# columns 'Class 0', 'Class 1', 'Class 2' by position in the list.
metrics_f1 = ['f1_homref', 'f1_het', 'f1_homalt']
metrics_precision = ['precision_homref', 'precision_het', 'precision_homalt']
metrics_recall = ['recall_homref', 'recall_het', 'recall_homalt']

# Generate DataFrames for each metric type
def create_df(metrics):
    """Tabulate the last logged validation value of three metrics per run.

    Reads the module-level ``paths`` list of ``(log_path, label)`` pairs and,
    for every run, takes the final ``tune/<metric>`` value returned by
    ``parse_log``, converts it to a percentage and rounds it to two decimals.

    Args:
        metrics: Sequence of metric names. Positions 0, 1 and 2 are relabelled
            as class 0, class 1 and class 2 respectively.

    Returns:
        pandas.DataFrame: An ``'Architecture'`` column followed by one column
        per metric, carrying the class labels as column names.
    """
    table = {'Architecture': [label for _, label in paths]}
    for name in metrics:
        table[name] = [
            round(parse_log(log_path, name, part='tune')[-1]*100, 2)
            for log_path, _ in paths
        ]

    # Positional relabelling: the caller's ordering decides which class a
    # metric is reported under.
    class_labels = {
        metrics[0]: 'Class 0 (homozygous reference)',
        metrics[1]: 'Class 1 (heterozygous)',
        metrics[2]: 'Class 2 (homozygous alternate)',
    }
    return pd.DataFrame(table).rename(columns=class_labels)

# One wide per-class table for each metric family (F1, precision, recall).
df_f1, df_precision, df_recall = (
    create_df(names)
    for names in (metrics_f1, metrics_precision, metrics_recall)
)

def melt_df(df):
    """Reshape a wide per-architecture table into long form.

    ``'Architecture'`` is kept as the identifier column; every other column is
    stacked into a ``'Metric'`` column (the former column name) and a
    ``'Value'`` column (the cell value).

    Args:
        df: DataFrame containing an ``'Architecture'`` column.

    Returns:
        pandas.DataFrame: Columns ``'Architecture'``, ``'Metric'``, ``'Value'``.
    """
    long_form = df.melt(id_vars='Architecture', var_name='Metric', value_name='Value')
    return long_form

# Long-form (Architecture, Metric, Value) version of each per-class table.
melted_f1, melted_precision, melted_recall = (
    melt_df(wide) for wide in (df_f1, df_precision, df_recall)
)

In [None]:
# Architectures and class labels are taken from the F1 table; the precision
# and recall tables are indexed with the same values below.
architectures = melted_f1['Architecture'].unique()
metrics = melted_f1['Metric'].unique()

# One colour per architecture, taken in order from the tab10 palette.
colors = dict(zip(architectures, plt.cm.tab10.colors))

# Hatch per class; the keys must match the column labels assigned in create_df.
patterns = {
    "Class 1 (heterozygous)": '',
    "Class 2 (homozygous alternate)": '//',
    "Class 0 (homozygous reference)": 'xx'
}

bar_width = 0.2


def _draw_class_bars(ax, melted, quantity):
    """Draw one grouped-bar panel: a group per architecture, a bar per class.

    Args:
        ax: Matplotlib axes to draw on.
        melted: Long-form frame with 'Architecture', 'Metric' and 'Value'.
        quantity: Name used for the y-axis label and in the panel title.
    """
    for i, metric in enumerate(metrics):
        for j, arch in enumerate(architectures):
            selected = melted[(melted['Architecture'] == arch) & (melted['Metric'] == metric)]
            ax.bar(
                # Shift by one bar width so a group of three is centred on tick j.
                j + i * bar_width - bar_width,
                selected['Value'].values[0],
                width=bar_width,
                color=colors[arch],
                hatch=patterns[metric],
                edgecolor='black',
            )

    ax.set_xticks(range(len(architectures)))
    ax.set_xticklabels(architectures)
    ax.set_ylabel(quantity)
    ax.set_title(f'Validation {quantity}')
    ax.set_ylim((80,100))
    ax.tick_params(axis='x', rotation=90, labelsize=8)


fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10, 3))

for ax, melted, quantity in (
    (ax1, melted_f1, 'F1'),
    (ax2, melted_precision, 'Precision'),
    (ax3, melted_recall, 'Recall'),
):
    _draw_class_bars(ax, melted, quantity)

# The legend explains the hatches only; architectures are named on the x ticks.
metric_patches = [
    mpatches.Patch(facecolor='white', hatch=patterns[metric], edgecolor='black', label=metric)
    for metric in metrics
]
# Anchor the legend's upper-left corner just outside the right edge of the last
# panel; 'best' would pick the corner depending on the plotted data.
ax3.legend(handles=metric_patches, bbox_to_anchor=(1.05, 1), loc='upper left')

# Panel letters (A, B, C) placed in axes coordinates above each panel.
for i, ax in enumerate((ax1, ax2, ax3)):
    ax.text(-0.2, 1.25, f'{chr(65+i)}', transform=ax.transAxes,
            fontsize=16, fontweight='bold', va='top')

plt.tight_layout()
plt.show()

## Table 5

In [16]:
# (summary CSV path, display name) pairs for the test-set table below; each
# CSV is loaded with pd.read_csv and the name becomes the 'Architecture' value.
# NOTE: this rebinds `paths`, which earlier cells used for training-log files;
# re-running those cells after this one would hand them CSV paths instead.
paths = [
    ('experiment/results_test/inception-random_20-22/happy.output.summary.csv', 'Inception V3'),
    ('experiment/results_test/resnet152v2-random-h1_20-22/happy.output.summary.csv', 'ResNet-152 V2'),
    ('experiment/results_test/nasnetlarge-random-h1_20-22/happy.output.summary.csv', 'NASNet Large'),
    ('experiment/results_test/efficientnetb03-random-h1_20-22/happy.output.summary.csv', 'EfficientNet-B3'),
]

In [None]:
metrics = ['METRIC.F1_Score','METRIC.Precision','METRIC.Recall']

# Value of the 'Type' column to report. Named `variant_type` so the builtin
# `type` is not shadowed for the rest of the kernel session.
variant_type = "SNP"

# One row per run: the first row of its summary CSV that matches the requested
# type with Filter == 'PASS', each metric as a percentage rounded to 2 decimals.
rows = []
for path, name in paths:
    summary = pd.read_csv(path)
    selected = summary.loc[(summary['Type'] == variant_type) & (summary['Filter'] == 'PASS')]
    if selected.empty:
        # Name the file instead of failing later with an out-of-bounds iloc.
        raise ValueError(f"no row with Type={variant_type!r} and Filter='PASS' in {path}")

    row = {'Architecture': name}
    for metric in metrics:
        row[metric] = round(selected[metric].iloc[0] * 100, 2)
    rows.append(row)

df = pd.DataFrame(rows, columns=['Architecture', *metrics])

# Last expression of the cell, so the table is actually rendered.
df