In [None]:
import numpy as np
from pathlib import Path

In [None]:
# One-letter residue codes that make up the counting alphabet
# (20 entries, listed alphabetically).
AAs = list('ACDEFGHIKLMNPQRSTVWY')

# Lookup from residue code to its position in AAs; this position is also the
# slot used for that residue in the count / frequency arrays further down.
aa_index = dict(zip(AAs, range(len(AAs))))
aa_index

In [None]:
import gzip

def _iterate_lines(file_handle, aa_index):
    
    counts = np.zeros(len(aa_index), dtype=np.float64)
    
    header = True
    rows_in_total = 0
    hits_in_total = 0
    
    for line in file_handle:
        line = line.strip().split(',')

        if header:
            header = False
            peptide_at = line.index('peptide')
            if 'binder' in line:
                binder_at = line.index('binder')
            elif 'hit' in line:
                binder_at = line.index('hit')
            else:
                raise ValueError('could not identify bider/hit column')
            continue
        else:
            rows_in_total += 1

        if line[binder_at] == '1': # i.e. if hit
            peptide = line[peptide_at]
            hits_in_total += 1
            for a in peptide:
                counts[aa_index[a]] += 1
    
    print(rows_in_total, hits_in_total)
    
    return counts
    

def count_AAs(input_file, aa_index):
    """Open ``input_file`` as text and return the counts from ``_iterate_lines``.

    Files whose name ends in ``.gz`` or ``.gzip`` are read through
    ``gzip.open`` in text mode; every other file is read with the built-in
    ``open``. The handle is closed before the function returns.

    Parameters
    ----------
    input_file : str or path-like
        Location of the CSV file (optionally gzip-compressed).
    aa_index : dict
        Passed through unchanged to ``_iterate_lines``.

    Returns
    -------
    Whatever ``_iterate_lines`` returns for the opened file.
    """
    file_name = str(input_file)
    is_gzipped = file_name.endswith(('.gz', '.gzip'))

    with (gzip.open(input_file, 'rt') if is_gzipped else open(input_file, 'r')) as handle:
        return _iterate_lines(handle, aa_index)

## MHC class I

In [None]:
# NOTE(review): absolute, machine-specific location — consider moving it to a
# configurable data directory.
dataset_path = Path('/mnt/bfx/bfx_RD/Instadeep/cloud_backup/biondeep-data/datasets/mhc1/binding/MSDF_20200604/')

# Train / tune / test files, processed in this order.
_split_files = (
    'MSDF_20200604_w_fixed_A0211_dedup.train.csv.gz',
    'MSDF_20200604_w_fixed_A0211_dedup.tune.csv.gz',
    'MSDF_20200604_w_fixed_A0211_dedup.test.csv.gz',
)

# Count each split separately and show its tally right after it is computed.
_per_split = []
for _file_name in _split_files:
    _split_counts = count_AAs(dataset_path / _file_name, aa_index)
    print(_split_counts)
    _per_split.append(_split_counts)

counts_train, counts_tune, counts_test = _per_split

# Pool the three splits into a single count vector.
counts = counts_train + counts_tune + counts_test
print(counts)

In [None]:
# Convert the pooled counts into relative frequencies by dividing by their total.
total_count = counts.sum()
frequencies = counts / total_count
frequencies

In [None]:
# Emit the frequencies as a copy-pasteable Python dict literal,
# one residue per line with six decimal places.
entries = [f"    '{a}': {frequencies[aa_index[a]]:.6f}," for a in AAs]
print('\n'.join(['background_MS_data = {', *entries, '    }']))

## MHC class II

In [None]:
# NOTE(review): absolute, machine-specific location — consider moving it to a
# configurable data directory.
dataset_path = Path('/mnt/bfx/bfx_RD/Instadeep/cloud_backup/biondeep-data/datasets/mhc2/binding/')

# Train / tune / test files, processed in this order.
_split_files = (
    'train_drop_1to19.csv',
    'tune_1to19.csv',
    'test_scored_1to19.csv',
)

# Count each split separately and show its tally right after it is computed.
_per_split = []
for _file_name in _split_files:
    _split_counts = count_AAs(dataset_path / _file_name, aa_index)
    print(_split_counts)
    _per_split.append(_split_counts)

# These names are shared with the MHC class I section and are overwritten here.
counts_train, counts_tune, counts_test = _per_split

# Pool the three splits into a single count vector.
counts = counts_train + counts_tune + counts_test
print(counts)

In [None]:
# Convert the pooled counts into relative frequencies by dividing by their total.
total_count = counts.sum()
frequencies = counts / total_count
frequencies

In [None]:
# Emit the frequencies as a copy-pasteable Python dict literal,
# one residue per line with six decimal places.
# NOTE(review): the printed dict name is the same as in the MHC class I
# section — confirm whether the class II output should carry a distinct name.
entries = [f"    '{a}': {frequencies[aa_index[a]]:.6f}," for a in AAs]
print('\n'.join(['background_MS_data = {', *entries, '    }']))