In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics

In [2]:
# Load the tab-separated phenotype table; later cells read its
# 'commensal', 'phylogroup', 'pe_urinaire' and 'pe_digestive' columns.
m = pd.read_csv('phenotype.tsv', delimiter='\t')

In [3]:
# Keep only complete rows: a row is dropped as soon as any column is missing.
m = m.dropna(axis=0, how='any')

In [4]:
# Random-guess baseline for F1.
# For every stratum, N_DRAWS vectors of uniform random 0/1 predictions are
# scored against the observed `commensal` labels; the resulting F1 values are
# collected as (phylogroup, pe, f1) records.
N_DRAWS = 1000  # random prediction vectors drawn per stratum
SEED = 42       # fixed seed so a fresh kernel reproduces the same scores

# Dedicated generator: seeding here does not touch numpy's global random state.
rng = np.random.RandomState(SEED)


def _random_guess_f1(y_true):
    """Score N_DRAWS uniform random 0/1 prediction vectors against y_true.

    Parameters
    ----------
    y_true : numpy.ndarray
        Observed labels (the `commensal` column values).

    Returns
    -------
    list of float
        One F1 score per random draw.
    """
    return [metrics.f1_score(y_true, rng.randint(0, 2, size=y_true.shape[0]))
            for _ in range(N_DRAWS)]


def _baseline_records(frame, pe_label):
    """Build (phylogroup, pe, f1) records for one table.

    The records labelled 'all' use every row of `frame`; they are followed by
    one batch of records per distinct value of `frame['phylogroup']`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the 'commensal' and 'phylogroup' columns.
    pe_label : str
        Value stored in the 'pe' field of every record.

    Returns
    -------
    list of tuple
        (phylogroup, pe_label, f1) tuples, N_DRAWS per stratum.
    """
    # overall prediction expectation
    records = [('all', pe_label, f1)
               for f1 in _random_guess_f1(frame['commensal'].values)]

    # for each phylogroup
    # NOTE(review): the labels scored for `pgroup` are those of every row
    # *outside* that phylogroup (`!=`), not the rows inside it. Kept as in the
    # original analysis -- confirm this matches the evaluation scheme the
    # baseline is compared against.
    for pgroup in frame['phylogroup'].unique():
        y_true = frame.loc[frame['phylogroup'] != pgroup, 'commensal'].values
        records.extend((pgroup, pe_label, f1)
                       for f1 in _random_guess_f1(y_true))
    return records


# all strains, no portal-of-entry restriction
res = _baseline_records(m, 'all')

# PE specific
for pe in ['pe_urinaire', 'pe_digestive']:
    # Rows flagged for this PE column plus all rows with commensal == 1.
    # NOTE(review): a row with both `pe == 1` and `commensal == 1` would appear
    # twice here -- presumably the two flags are mutually exclusive; verify.
    n = pd.concat([m[m[pe] == 1],
                   m[m['commensal'] == 1]])
    res.extend(_baseline_records(n, pe))

r = pd.DataFrame(res, columns=['phylogroup', 'pe', 'f1'])

In [5]:
# Persist every simulated F1 score as a tab-separated file without the index.
# NOTE(review): the '.gz' suffix presumably makes pandas gzip the output via
# its default compression inference -- confirm for the installed version.
r.to_csv(
    path_or_buf='random_f1.tsv.gz',
    sep='\t',
    index=False,
)

In [6]:
# Summarise the simulated F1 scores per (phylogroup, pe) stratum: mean and
# mean absolute deviation around that mean.
# `GroupBy.mad()` is no longer available in recent pandas releases, so the
# deviation is computed explicitly; the resulting table has the same
# (phylogroup, pe) index and the same 'mean' / 'mad' columns.
def _mean_abs_dev(values):
    """Return the mean absolute deviation of `values` around their own mean."""
    return (values - values.mean()).abs().mean()


f1_by_stratum = r.groupby(['phylogroup', 'pe'])['f1']
pd.concat([f1_by_stratum.mean().rename('mean'),
           f1_by_stratum.agg(_mean_abs_dev).rename('mad')],
          axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mad
phylogroup,pe,Unnamed: 2_level_1,Unnamed: 3_level_1
A,all,0.335985,0.014553
A,pe_digestive,0.504229,0.018143
A,pe_urinaire,0.414419,0.016571
B1,all,0.358217,0.012886
B1,pe_digestive,0.520931,0.01857
B1,pe_urinaire,0.435827,0.016394
B2,all,0.418885,0.017523
B2,pe_digestive,0.530271,0.021212
B2,pe_urinaire,0.51698,0.019723
C,all,0.372046,0.013214
