In [2]:
import os
import pandas as pd
import numpy as np

# Select Phages

In [2]:
protein_clusters_file ="PCs_GPD.txt"
protein_clusters = dict()
with open(protein_clusters_file) as myfile:
    for line in myfile.readlines():
        line = line.strip()
        rep = line.split("\t")[0]
        for phID in line.split("\t"):
            protein_clusters[phID] = rep
            
protein_seq_file = "GPD_proteome.faa"
phage_meta_df = pd.read_table("GPD_phage_metadata_renat.csv", sep = ",", index_col = 0)

In [3]:
def isNaN(num):
    return num != num

northamerica_phages = set()
for index, row in phage_meta_df.iterrows():
    if row["checkV_MIUViG"] == "High-quality":
        if not isNaN(row["Continents_detected"]):
            countrylist = row["Continents_detected"].split(",") 
            if 'North America' in countrylist :
                if countrylist.count("North America") > 50 :
                    northamerica_phages.add(index)

print("HQ (checkV_MIUViG = 'High-quality') prevalent (>50 samples) North American Phages", len(northamerica_phages))

HQ (checkV_MIUViG = 'High-quality') prevalent (>50 samples) North American Phages 112


In [9]:
set(phage_meta_df.loc[northamerica_phages,"Host_range_taxon"])


{'Actinobacteriota/Actinobacteria/Actinomycetales/Bifidobacteriaceae/Bifidobacterium/Bifidobacterium angulatum,Actinobacteriota/Actinobacteria/Actinomycetales/Bifidobacteriaceae/Bifidobacterium/Bifidobacterium pseudocatenulatum,Actinobacteriota/Actinobacteria/Actinomycetales/Bifidobacteriaceae/Bifidobacterium/Bifidobacterium pseudocatenulatum,Actinobacteriota/Actinobacteria/Actinomycetales/Bifidobacteriaceae/Bifidobacterium/Bifidobacterium angulatum,Actinobacteriota/Actinobacteria/Actinomycetales/Bifidobacteriaceae/Bifidobacterium/Bifidobacterium pseudocatenulatum,Actinobacteriota/Actinobacteria/Actinomycetales/Bifidobacteriaceae/Bifidobacterium/Bifidobacterium pseudocatenulatum,Actinobacteriota/Actinobacteria/Actinomycetales/Bifidobacteriaceae/Bifidobacterium/Bifidobacterium pseudocatenulatum,Actinobacteriota/Actinobacteria/Actinomycetales/Bifidobacteriaceae/Bifidobacterium/Bifidobacterium pseudocatenulatum,Actinobacteriota/Actinobacteria/Actinomycetales/Bifidobacteriaceae/Bifidobacte

In [29]:
temp_meta = phage_meta_df[phage_meta_df["checkV_MIUViG"] == "High-quality"]
temp_meta = temp_meta[~isNaN(temp_meta["Continents_detected"])]
temp_meta = temp_meta[temp_meta["Continents_detected"].str.contains("North America")]
temp_meta["countries_number"] = temp_meta["Continents_detected"].str.count(",")+1
#temp_meta["countrylist"] = [row["Continents_detected"].split(",") for row in temp_meta]
temp_meta = temp_meta.sort_values("countries_number",ascending=False)
temp_meta[:10].index#.to_csv("top10_pilotPhages.csv")
#temp_meta_acc_set = set(temp_meta[:10].index)
#temp_meta_acc_set


Index(['uvig_361191', 'uvig_430938', 'uvig_409788', 'ivig_2899', 'ivig_2749',
       'ivig_1166', 'ivig_601', 'uvig_357328', 'uvig_385029', 'ivig_2154'],
      dtype='object', name='GPD_id')

In [35]:
na_protein_clusters = dict()
mark_for_write = False
count_X = 0
with open(protein_seq_file) as seq_file:
    for line in seq_file:
        if line.startswith(">"):
            identifier = line.strip()[1:]
            phageID = "_".join(identifier.split("_")[0:2])
            if phageID in northamerica_phages:
                mark_for_write = True
            else:
                mark_for_write = False
        else:
            if mark_for_write:
                if "X" not in line:
                    if identifier in protein_clusters:
                        cluster_name = protein_clusters[identifier]
                    else:
                        cluster_name = identifier
                    if cluster_name not in na_protein_clusters:
                        na_protein_clusters[cluster_name] = []
                    na_protein_clusters[cluster_name].append({"proteinID":identifier,"seq":line, "size":len(line)})
                    
                else:
                    count_X = count_X +1
                
print(len(na_protein_clusters), "protein clusters found, and additional",count_X,"proteins countain an X")

3354 protein clusters found, and additional 6 proteins countain an X


In [36]:
print("X, Number of Clusters with X Elements(Proteins)")
pd.Series([len(cl) for cl in na_protein_clusters.values()]).value_counts()

X, Number of Clusters with X Elements(Proteins)


1     1898
2      730
3      182
4      161
9       92
10      80
7       72
6       36
5       30
8       16
11      14
12      12
16       6
15       5
14       4
17       4
13       3
20       2
31       2
30       1
18       1
19       1
21       1
27       1
dtype: int64

In [1]:
217 + 39 + 68 + 56 + 75 + 96 + 144 + 154 + (8*16) + 150 + 216 + (7*72) + 800 + 828 + 1898 + (2 * 730) + (3 * 182) + (4 * 161)

8023

In [58]:
write_file_name = "phageome2_proteinClusterReps.faa"
proteinClusterReps = dict()
with open(write_file_name,'w') as write_file:
    for cluster in na_protein_clusters.values():
        identifier = ""
        longest = 0
        for protein in cluster:
            identifier += protein["proteinID"] +"_"
            if protein["size"]>longest:
                longest = protein["size"]
                representative_seq = protein["seq"]
                repID = protein["proteinID"]
                
        proteinClusterReps[">REP_"+repID+"_ALL_"+identifier[:-1]] = representative_seq
        write_file.write(">REP_"+repID+"_ALL_"+identifier[:-1] + "\n")
        write_file.write(representative_seq)


In [59]:
proteinClusterReps

{'>REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25': 'MAISAYIGIPGSGKSYEAVYNVIIPAFTSGRRVVTNIYGLQKDKITERYPDATGEIIVVDNDDVLKADFFPFKGGEGSFCQFGDLIVIDEAWRIFGSDKDMTAEKKSFIAEHRHFTHPETGISCDLVIVNQSLSNIARFLKDKIETTYRMRKLKALGLNNHYCIDVYSGHKIYKSNLVTSYRNKYNPDIFELYKSYEGNNGNEKQTDKRQSIWNSGKVRFFLVLFPLMFIGSGWLIYSFFSTFGRSDPSPDLTTTDVRDAAMFRSSAATPAPDTPSEPAEPPLSTEWRISGRMTSEGRAFVILVNGAGVLRAVPASSFNYKGMLMSGIIDGERVTLYTGKK\n',
 '>REP_uvig_80910_2_ALL_uvig_80910_2_uvig_80910_14_uvig_160801_15_uvig_213404_2_uvig_213404_14_uvig_225655_5_uvig_225655_18_uvig_359760_5_uvig_359760_19_uvig_379702_2_uvig_379702_14_uvig_512665_2_uvig_578696_10_uvig_578696_24': 'MFGILISALNTLLGFVFRSLIIKFVVFFALYFVVQGFVEILVELLPDSSNLSSLFANLSDGFWYFINLSKLPQGISMIISAMATRFIIRRIPVIG\n',
 '>REP_uvig_80910_3_ALL_uvig_80910_3_uvig_80910_15_uvig_160801_14_uvig_213404_3_uvig_213404_15

# Library Settings

In [92]:
#200 nt, 250 nt or 300 nt 
# 200-32 = 168 / 3 = 56 / 15 = 3 R 11
# 250-32 = 218 / 3 < 73 / 15 = 4 R 13
# 300-32 = 268 / 3 < 90 / 15 = 6 R 0


tile_size_aa = 56
no_epis_per_tile = 3
five_prime_adapter = "AGGAATTCCGCTGCGT"
three_prime_adapter_1 = "ATGGTCACAGCTGTGC"
three_prime_adapter_2 = "GTCGTGACTGGGAAAC"
len(five_prime_adapter)+len(three_prime_adapter)

32

using tile size 56 and 3 epitopes per tile, we end up with
23745 pepsyn tiles and 5669 dolphin tiles, together 29414 tiles
 
using tile size 73 and 4 epitopes per tile, we end up with
17557 pepsyn tiles and 3781 dolphin tiles, together 21338 tiles
 
using tile size 90 and 5 epitopes per tile, we end up with
14039 pepsyn tiles and 2763 dolphin tiles, together 16802 tiles


In [207]:
# Eval after cells down ran:

print("using tile size", tile_size_aa , "and", no_epis_per_tile, "epitopes per tile, we end up with")
print(len(pepsyn_all_tiles), "pepsyn tiles and", len(dolphin_tiles), "dolphin tiles, together", len(pepsyn_all_tiles)+ len(dolphin_tiles), "tiles" )

using tile size 56 and 3 epitopes per tile, we end up with
23745 pepsyn tiles and 5669 dolphin tiles, together 29414 tiles


![title](oligo_prices_lowerRange.png)

![title](oligo_prices.png)

In [3]:
l = 0
with open("phageome2_proteinClusterReps.faa") as seq_file:
    for line in seq_file:
        if not line.startswith(">"):
            seq = line[:-1]
            l+=len(seq)
print("length of all proteins together:",l,"aa")

length of all proteins together: 750776 aa


In [5]:
l = 0
with open("monkeyv_unique.faa") as seq_file:
    for line in seq_file:
        if not line.startswith(">"):
            seq = line[:-1]
            l+=len(seq)
print("length of all proteins together:",l,"aa")

length of all proteins together: 168130 aa


# Pepsyn tiling approach

In [6]:
def tileProtSeqWOverlap(aa_seq, tilesize, overlap = -1):
    # start tiles from both side, let "middle tile" over lap with already existing tiles
    # start second overlapping set from the middle in both directions to the side
    # ____   ___   ____ #
    #    ____   ____  #
    if overlap == -1:
        overlap = int(tilesize/2)
    tiles = []
    center = int(len(aa_seq)/2)
    seq = aa_seq[:center]
    pos = 0
    
    if len(aa_seq) >= tilesize:
        while len(seq) >= tilesize:
            tiles.append((seq[:tilesize],pos))
            seq = seq[tilesize-overlap:]
            pos = pos + tilesize - overlap
        tiles.append((aa_seq[pos:pos+tilesize],pos))
        seq = aa_seq[center:]
        pos = len(aa_seq) - tilesize
        while len(seq) >= tilesize:
            tiles.append((seq[-tilesize:],pos))
            seq = seq[:-(tilesize-overlap)]
            pos = pos-tilesize+overlap   
        last_tile = (aa_seq[pos:pos+tilesize],pos)
        if aa_seq[pos:pos+tilesize] != "" and last_tile not in tiles:
            tiles.append(last_tile)
    else:
        tiles.append((aa_seq, -1))
        
    #startindex_middletile = center-round(tilesize/2)
    #tiles.append((aa_seq[startindex_middletile:startindex_middletile+tilesize], startindex_middletile))
    return(tiles)

In [7]:
tilesize = tile_size_aa
overlap = int(tile_size_aa/2)

pepsyn_all_tiles = dict()

with open("phageome2_proteinClusterReps.faa") as rep_file:
    for line in rep_file:
        if not line.startswith(">"):
            seq = line.strip()
            tile_vec = tileProtSeqWOverlap(seq, tilesize, overlap)
            for tile, pos in tile_vec:
                if tile not in pepsyn_all_tiles:
                    pepsyn_all_tiles[tile] = []
                pepsyn_all_tiles[tile].append(str(protein)+"pos_"+str(pos))
        else:
            protein = line.strip()[1:]
                
print(len(pepsyn_all_tiles),"unique tiles")
no_pepsyn_tiles = len(pepsyn_all_tiles)

23745 unique tiles


In [44]:
#MONKEY LIB

tilesize = tile_size_aa
overlap = int(tile_size_aa/2)

monkey_all_tiles = dict()

with open("monkeyv_unique.faa") as rep_file:
    for line in rep_file:
        if not line.startswith(">"):
            seq = line.strip()
            tile_vec = tileProtSeqWOverlap(seq, tilesize, overlap)
            for tile, pos in tile_vec:
                if "X" in tile:
                    print("Skip tile containing X:", tile)
                else:
                    if tile not in monkey_all_tiles:
                        monkey_all_tiles[tile] = []
                    monkey_all_tiles[tile].append(str(protein)+"pos_"+str(pos))
        else:
            protein = line.strip()[1:]
                
print(len(monkey_all_tiles),"unique tiles")
no_monkey_tiles = len(monkey_all_tiles)

Skip tile containing X: EKASTGWYWKFPDVLTQTGVFGQNAQFHFLYRSGFCIHVQCNASKFHQGXLLVAIV
Skip tile containing X: FLYRSGFCIHVQCNASKFHQGXLLVAIVPEFVLGSESTEQKPNIAKHPEFNEVMPG
Skip tile containing X: NFSLVVIPVSPLEYQNGATTAIPITVTIAPLCSEXAGXRQAIKQGLPVEMKPGTNQ
Skip tile containing X: APLCSEXAGXRQAIKQGLPVEMKPGTNQFLTTDVGVSAPILPGFDPTPLIHIPGEV
Skip tile containing X: ATGKMLIAYTPPGGEQPTSRDLAMLGTHIIWDFGLQSSITLVVPWISNTHFRSVXT
Skip tile containing X: IIWDFGLQSSITLVVPWISNTHFRSVXTGGLRDYYATGIVTMWYQTNFVVPPKTPT
5536 unique tiles


In [233]:
#with open("phageome2_pepsyn_tiles_72.faa", "w") as pf:
    for seq, protArray in pepsyn_all_tiles.items():
        pf.write(">"+str(protArray)+"\n")
        pf.write(seq+"\n")

In [9]:
pepsyn_all_tiles

{'MAISAYIGIPGSGKSYEAVYNVIIPAFTSGRRVVTNIYGLQKDKITERYPDATGEI': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25pos_0'],
 'SGRRVVTNIYGLQKDKITERYPDATGEIIVVDNDDVLKADFFPFKGGEGSFCQFGD': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25pos_28'],
 'IVVDNDDVLKADFFPFKGGEGSFCQFGDLIVIDEAWRIFGSDKDMTAEKKSFIAEH': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25pos_56'],
 'LIVIDEAWRIFGSDKDMTAEKKSFIAEHRHFTHPETGISCDLVIVNQSLSNIARFL': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_

In [45]:
monkey_all_tiles

{'MGAQVSTQRTGSHETSNVVKDGSTLNFTNINFYRDSYAAAASKQDLSMDPSKFTQP': ['YP_009259673.1 polyprotein [enterovirus A114]pos_0'],
 'NINFYRDSYAAAASKQDLSMDPSKFTQPLLDAIRETAAPLQSPSAEACGYSDRVAQ': ['YP_009259673.1 polyprotein [enterovirus A114]pos_28'],
 'LLDAIRETAAPLQSPSAEACGYSDRVAQLTVGNSTITTQEAANIVVSYGEWPEYCP': ['YP_009259673.1 polyprotein [enterovirus A114]pos_56'],
 'LTVGNSTITTQEAANIVVSYGEWPEYCPDTDATAVDKPTRPDVSVNRFYTLPARLW': ['YP_009259673.1 polyprotein [enterovirus A114]pos_84'],
 'DTDATAVDKPTRPDVSVNRFYTLPARLWEKASTGWYWKFPDVLTQTGVFGQNAQFH': ['YP_009259673.1 polyprotein [enterovirus A114]pos_112'],
 'PEFVLGSESTEQKPNIAKHPEFNEVMPGQKGATFKHPYILDCGIPISQALVFPHQW': ['YP_009259673.1 polyprotein [enterovirus A114]pos_196'],
 'QKGATFKHPYILDCGIPISQALVFPHQWINLRTNNCATIVVPYINALAYDSAINHS': ['YP_009259673.1 polyprotein [enterovirus A114]pos_224'],
 'INLRTNNCATIVVPYINALAYDSAINHSNFSLVVIPVSPLEYQNGATTAIPITVTI': ['YP_009259673.1 polyprotein [enterovirus A114]pos_252'],
 'FLTTDVGVSAPILPGFDPTPLIHIPGEVSSLLELCRIETILEVNNTTRSVE

In [46]:
print("Number duplicated tiles")
pd.Series([len(ar) for ar in pepsyn_all_tiles.values()]).value_counts()

Number duplicated tiles


1    23741
2        4
dtype: int64

In [47]:
print("Number duplicated tiles in monkey")
pd.Series([len(ar) for ar in monkey_all_tiles.values()]).value_counts()

Number duplicated tiles in monkey


1    5535
2       1
dtype: int64

# Dolphin tiling approach

In [13]:
from sklearn.ensemble import RandomForestClassifier
import random
import time 
import math
import json
import matplotlib.pyplot as plt
import matplotlib

AA_FEAT = {'A':["sc_hydrophobic", "d_S"], 
           'R':["sc_poseleccharged", "d_K"], 
           'N':["sc_polaruncharged", "d_K"],
           'D':["sc_negeleccharged", "d_K"], 
           'C':["d_C"], 
           'Q':["sc_polaruncharged", "d_K"], 
           'E':["sc_negeleccharged", "d_K"], 
           'G':["d_G"],
           'H':["sc_poseleccharged", "d_H"], 
           'I':["sc_hydrophobic", "d_I"], 
           'L':["sc_hydrophobic", "d_I"], 
           'K':["sc_poseleccharged", "d_K"], 
           'M':["sc_hydrophobic", "d_M"],
           'F':["sc_hydrophobic", "d_F"], 
           'P':["d_P"], 
           'S':["sc_polaruncharged", "d_S"], 
           'T':["sc_polaruncharged", "d_S"], 
           'W':["sc_hydrophobic", "d_W"],
           'Y':["sc_hydrophobic", "d_Y"], 
           'V':["sc_hydrophobic", "d_I"]
          }

AA_DIAM = {'A' : "S", 'R': "K", 'N': "K", 'D': "K", 'C': "C", 'Q': "K", 'E': "K", 'G': "G", 'H': "H", 'I': "I", 'L': "I", 'K': "K", 'M': "M", 'F': "F", 'P': "P", 'S': "S", 'T': "S", 'W': "W", 'Y': "Y", 'V': "I"}
AMINOACIDS = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
DIAMONDS = ['K', 'C', 'G', 'H', 'I', 'M', 'F', 'Y', 'W', 'P', 'S']

def get_diam(aa_seq):
    diam = ""
    for aa in aa_seq:
        diam += AA_DIAM[aa]        
    return(diam)

def all_feature_names():
    feat_list = ["sc_hydrophobic","sc_polaruncharged","sc_poseleccharged","sc_negeleccharged"]
    for aa in AMINOACIDS:
        feat_list.append(aa)
    
    for d_aa in DIAMONDS:
        feat_list.append("d_" + d_aa)
        
    for aa1 in AMINOACIDS:
        for aa2 in AMINOACIDS:
            feat_list.append(aa1+aa2)
    
    for d_aa1 in DIAMONDS:
        for d_aa2 in DIAMONDS:
            feat_list.append("d_" + d_aa1 + d_aa2)
            
    return feat_list

FEATURE_LIST = all_feature_names()

def kmer_features_of_protein(seq, k):   
    feat = {}
    l = len(seq) 
    diam_seq = get_diam(seq)
    lk1 = l-k+1
    for f in FEATURE_LIST:
        feat[f] = [0] * (l-k+1)    
    feat = pd.DataFrame(feat)    
    for i in range(0,l):
        aa = seq[i]
        feat[aa][max(0,i-k+1):min(lk1,i+1)] += 1
        for f in AA_FEAT[aa]:
            feat[f][max(0,i-k+1):min(lk1,i+1)] += 1    
    rangelen = l-1
    for i in range(0, rangelen):
        double = seq[i:i+2]
        feat[double][max(0,i-k+2):min(lk1,i+1)] += 1
        diam_double = diam_seq[i:i+2]
        feat["d_" + diam_double][max(0,i-k+2):min(lk1,i+1)] += 1
    return(feat)  


def balance_trainDS(X, y, random_state):
    wildtypeIDs = set([item[0] for item in X.index.str.split("_")])
    random.seed(random_state)
    size_smaller_group = y.value_counts().min()
    pos_IDs = set(y[y==1].index)
    neg_IDs = set(y[y==0].index)
    neg_IDs= random.sample(neg_IDs, size_smaller_group)
    pos_IDs= random.sample(pos_IDs, size_smaller_group)
    trainIDs = set(pos_IDs + neg_IDs)
    y_train = y[trainIDs]
    X_train = X.loc[trainIDs,]
    
    return(X_train, y_train)

In [86]:
# TAKES TIME

# for training the model
feat = pd.read_table("training_features.csv", sep=",", index_col = 0)
labs = pd.read_table("training_labels.csv", sep=",", index_col = 0)

#### train the model
y=labs["reactivity_binary"]
y=y.astype('int')
X=feat

X_train, y_train = balance_trainDS(X, y, random_state = 10) 
print('Training Features Shape:', X_train.shape)

clf=RandomForestClassifier(n_estimators=100, n_jobs=6, random_state=42)
clf.fit(X_train,y_train)

# find the epitopes

teststop = 1
fewfortest = False
count = 0
epitile_size = 15
epitope_probability_cutoff = 0.5
global_epitopes = {}

start_time = time.time()
with open("phageome2_proteinClusterReps.faa", "r") as org:
    for line in org.readlines():
        line = line.strip()
        if fewfortest and count/2 > teststop:
            break
        count = count + 1
        
        if (count/2)%500 == 0:
            print(str(count/2), "sequences done")

        if line.startswith(">"):
            protein = line[1:]
        else:
            pseq = line
            protein_length = len(pseq)
            epitope_proba_atpos = {}

            prot_15mer_feat = kmer_features_of_protein(pseq,epitile_size)

            probas = [class_proba[1] for class_proba in clf.predict_proba(prot_15mer_feat)]
            for startpos in range(len(pseq)-epitile_size):
                #epitope_proba_atpos[startpos] = probas[startpos]
                #discuss with Ben if this makes sense downstream
                epitope_proba_atpos[startpos] = np.min(probas[startpos:startpos+3])

            while len(epitope_proba_atpos)>0:
                max_epi_proba_pos = max(epitope_proba_atpos, key=epitope_proba_atpos.get) 
                min_epi_proba = epitope_proba_atpos[max_epi_proba_pos]
                offset = np.argmax(probas[max_epi_proba_pos:max_epi_proba_pos+3])
                max_epi_proba_pos+=offset
                epi_proba = probas[max_epi_proba_pos]
                epi_seq = pseq[max_epi_proba_pos:max_epi_proba_pos+epitile_size]
                if min_epi_proba < epitope_probability_cutoff :
                    break
                else:
                    if epi_seq not in global_epitopes:
                        global_epitopes[epi_seq] = {"probability":epi_proba,"proteins":[],"start_pos":[]}   
                    global_epitopes[epi_seq]["proteins"].append(protein)                    
                    global_epitopes[epi_seq]["start_pos"].append(max_epi_proba_pos)                   

                    for pos in range(max_epi_proba_pos-epitile_size, max_epi_proba_pos+epitile_size):
                        if pos in epitope_proba_atpos:
                            del epitope_proba_atpos[pos]

            
print("classified all 15 mers")  
end_time = time.time()

print(str(int(end_time-start_time)) + "s elapsed")

500.0 sequences done
1000.0 sequences done
1500.0 sequences done
2000.0 sequences done
2500.0 sequences done
3000.0 sequences done
classified all 15 mers
847s elapsed


In [13]:
matplotlib.rcParams['figure.figsize'] = [15, 4]

plt.bar(np.arange(len(probas)),probas)
plt.axhline(0.5, color="gray")
plt.ylim(0, 1)
plt.show()

print(len(global_epitopes))
global_epitopes

NameError: name 'probas' is not defined

In [237]:
#with open("phageome2_dolphin_epitopes_15.faa", "w") as pf:
    for seq, epi in global_epitopes.items():
        pf.write(">"+str(epi["proteins"])+str(epi["start_pos"])+"\n")
        pf.write(seq+"\n")

In [70]:
#with open("epi_probas_phageome.json","w") as f:
    f.write(json.dumps(global_epitopes))

In [14]:
with open('epi_probas_phageome.json') as json_file:
    global_epitopes = json.load(json_file)

In [15]:
global_epitopes

{'HKIYKSNLVTSYRNK': {'probability': 0.66,
  'proteins': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25'],
  'start_pos': [169]},
 'TTYRMRKLKALGLNN': {'probability': 0.65,
  'proteins': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25'],
  'start_pos': [145]},
 'TDKRQSIWNSGKVRF': {'probability': 0.64,
  'proteins': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25'],
  'start_pos': [205]},
 'IYSFFSTFGRSDPSP': {'probability': 0.61,
  'proteins': ['REP_uvig_160801

In [16]:
len(global_epitopes)

19228

In [17]:
dolphin_tiles = {}
unused_epitopes = set()
epi_df = pd.DataFrame.from_dict(global_epitopes, orient='index')
epi_df["no_proteins"] = [len(v) for v in epi_df["proteins"]]
epi_df = epi_df.sort_values(by=["no_proteins", "probability"], ascending=False)
epis_to_be_treated = set(epi_df.index)

for epi in epi_df.index:
    if epi in epis_to_be_treated:
        #find epitopes which have the same protein set
        pset = epi_df.loc[epi,"proteins"]
        proteinset_epis=epi_df[epi_df["proteins"].apply(lambda x: x==pset)]
        epis_to_be_treated = epis_to_be_treated-set(proteinset_epis.index)
        
        #the following will drop epitopes which have lowest probability and have no partners to make tiles of 3
        no_tiles = int(len(proteinset_epis)/no_epis_per_tile) 
        tiles = ["" for _ in range(no_tiles)]
        tile_probas = [list() for _ in range(no_tiles)]
        tile_seq = ""
        epi_index = 0
        for stich in range(1,no_epis_per_tile+1):
            for tn in range(no_tiles):
                tile_probas[tn].append(proteinset_epis.iloc[epi_index,:]["probability"])
                sequence = proteinset_epis.iloc[epi_index,:].name
                epi_index += 1
                if stich == 1:
                    tiles[tn] = sequence
                else:
                    tiles[tn] = tiles[tn] + "GGGGS" + sequence                

        while len(proteinset_epis) > epi_index :
            unused_epitopes.add(proteinset_epis.iloc[epi_index,:].name)
            epi_index += 1
            
        for idx, tile in enumerate(tiles):
            dolphin_tiles[tile] = {"protein set":pset, 
                                  "tile number":(idx+1), 
                                  "tiles in protein(set)": no_tiles,
                                 "probabilities":tile_probas[idx]}
        
print("total number of dolphin tiles = ", len(dolphin_tiles))

total number of dolphin tiles =  5266


In [75]:
unused_epis_lowprob_fewProts

Unnamed: 0,probability,proteins,start_pos,no_proteins,prots
GFATPRAFLEAKALQ,0.5,[REP_uvig_428798_75_ALL_uvig_428798_75],[170],1,2
KLSETSAALGTLSKQ,0.5,[REP_uvig_356526_42_ALL_uvig_356526_42],[427],1,2
IARYEQKYVNSEYGL,0.5,[REP_ivig_3875_26_ALL_ivig_3875_26],[278],1,2
IEMVAAMALEAVRQQ,0.5,[REP_uvig_385029_15_ALL_uvig_385029_15],[107],1,2
VNPLTGYEYEYEKEV,0.5,[REP_uvig_356792_6_ALL_uvig_356792_6],[59],1,2
...,...,...,...,...,...
GAKEINKGLPGAGMT,0.5,[REP_ivig_464_22_ALL_ivig_464_22],[222],1,2
LLDAPADGRDEVKPK,0.5,[REP_uvig_429694_16_ALL_uvig_429694_16],[152],1,2
TISYRVQGFFDRVTL,0.5,[REP_uvig_356577_3_ALL_uvig_356577_3],[447],1,2
ALKNWLKKCGFVDIR,0.5,[REP_uvig_385029_37_ALL_uvig_385029_37],[256],1,2


In [81]:
unused_epis = epi_df.loc[unused_epitopes,]
unused_epis_lowprob = unused_epis[unused_epis["probability"]==0.5]
print(len(unused_epis_lowprob))
unused_epis_lowprob["prots"] = [str(s).count("vig") for s in unused_epis_lowprob["proteins"]]
unused_epis_lowprob_fewProts = unused_epis_lowprob[unused_epis_lowprob["prots"]<3]
print(len(unused_epis_lowprob_fewProts))
random.seed(4)
no_synth = random.sample(set(unused_epis_lowprob_fewProts.index),111, )
print(len(no_synth))
no_synth

732
402
111


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unused_epis_lowprob["prots"] = [str(s).count("vig") for s in unused_epis_lowprob["proteins"]]


['TMCYGCKALHKTCGG',
 'QNRPMFRTGSADVEP',
 'CFCKKKKEGVLVMYL',
 'FTWEKVTTIDVGFDL',
 'EHNENVEWYKNNTDA',
 'GLSDYLDVYPDGELK',
 'NPLGSNTIYDYSYFA',
 'DYEAYVKLVNEYKEQ',
 'SSKFFIETKDYLSAV',
 'DKIAILILLNIYIEN',
 'YVYEKVFENSRTVMR',
 'PLYPQISTKISSSTK',
 'VTLEKERFIYQDMKI',
 'YNNESLIEYQGYFSY',
 'EKIKEYVESSKFLTY',
 'LFSESKFKHQSTLKL',
 'IDYDAIKLQKELESL',
 'RRLLDVCYARSDAWE',
 'FQAHHDQFNGTSEEF',
 'SFREWCSDPKAKEPR',
 'RQEARKAKNFARADE',
 'ATLGASFLPLMLSKG',
 'IGKVPFSQWLTERVD',
 'LGNVKRVPADKVLIM',
 'TWGVFKEKLIEGLNE',
 'YFALTAYTRSLFHWL',
 'SAAAKAQDPVATARP',
 'VDYVRHLMAEVNSSH',
 'LEAAMAFRELGFMHL',
 'MTGLAGISTSAMSAT',
 'SFLVIDEAEGFINFQ',
 'QMRRSEGIRFEQNNN',
 'VEDLIKNHDFHNNRL',
 'EHFVAFPEIELYDDT',
 'QLAQMQQAVTKYATR',
 'PDKNAYHRPVTDIIP',
 'DSFEPQQRISLLIWT',
 'GNIDAKAVVLLRPAW',
 'HMEAQRHREEIEFTR',
 'LLDDNLWQLVNGWLC',
 'EKHLKSKAYPEAEDE',
 'RNISKGMYKVKLPCD',
 'NINYGLTMRYLGMSM',
 'ASKILAGVKDAPAFP',
 'GEAKKYVSEEELEHL',
 'VAKINAPDGADIAKI',
 'LPDADGTINEVYIYQ',
 'IYYSVIFQNPKYPMM',
 'WGAAVAFPSYFPTGT',
 'YLLVNKSIRRGYKSM',


In [80]:
dolphin_tiles

{'GLGWVQRDRRLASQKGGGGSQYRDDVIRNARLEWGGGGGSADFAAQLHQESGWRP': {'protein set': ['REP_ivig_1427_30_ALL_ivig_1427_30_uvig_205260_10_uvig_250342_81_uvig_315372_35_uvig_396772_27',
   'REP_ivig_4427_24_ALL_ivig_4427_24'],
  'tile number': 1,
  'tiles in protein(set)': 2,
  'probabilities': [0.73, 0.61, 0.59]},
 'ASWRENRHYPQRILRGGGGSIRALVSYDRWLWQRVGGGGSTRWFGHVATVNAGRN': {'protein set': ['REP_ivig_1427_30_ALL_ivig_1427_30_uvig_205260_10_uvig_250342_81_uvig_315372_35_uvig_396772_27',
   'REP_ivig_4427_24_ALL_ivig_4427_24'],
  'tile number': 2,
  'tiles in protein(set)': 2,
  'probabilities': [0.62, 0.6, 0.52]},
 'DEIIKGKWGSNPERRGGGGSTAKYGAAAYEAAQARGGGGSVNEIMGAATIKSKTA': {'protein set': ['REP_uvig_356033_19_ALL_uvig_355478_40_uvig_356033_19_uvig_358303_12',
   'REP_uvig_356033_19_ALL_uvig_355478_40_uvig_356033_19_uvig_358303_12'],
  'tile number': 1,
  'tiles in protein(set)': 1,
  'probabilities': [0.62, 0.53, 0.5]},
 'KNSENKYFLKVYKYIGGGGSRAFVQKFDADAQNAYGGGGSSQNLLSLNFTKNVYY': {'protein set': ['R

In [254]:
#with open("phageome2_dolphin_tiles_3epis.faa", "w") as pf:
    for seq, tile in dolphin_tiles.items():
        pf.write(">"+str(tile["protein set"])+"_"+str(tile["tile number"])+"of"+str(tile["tiles in protein(set)"])+"\n")
        pf.write(seq+"\n")

In [221]:
print("X, Number of tiles representing X proteins")
print(pd.Series([len(ar["protein set"]) for ar in dolphin_tiles.values()]).value_counts())
print("When a protein (set) does not have",no_epis_per_tile,"unique epitopes, it will not be respresented")

X, Number of tiles representing X proteins
1    5262
2       4
dtype: int64
When a protein (set) does not have 3 unique epitopes, it will not be respresented


# Build validation DS

In [3]:
hfc = pd.read_table("../avarda/hfc_pubEpitopes.csv", index_col = 0, sep = ",")
rc = pd.read_table("../20180108_pep_RC.txt", index_col = 0, sep = "\t")
hfc.shape

(46070, 475)

In [4]:
beads15mers = rc.iloc[["_15mer" in rowname for rowname in rc.index],["BEADS_ONLY" in colname for colname in rc.columns]]
reactivity_15mers = pd.DataFrame(columns=["reactivity", "samples", "reactivity_binary"])
for index, row in hfc.iterrows():
    if "_15mer" in index : 
        seq = index.split("_")[-1][-15:]
        r = row.sum()
        reactivity_15mers.loc[index, "reactivity"] = r
        reactivity_15mers.loc[index, "samples"] = sum(row > 0)         
        bc = beads15mers.loc[index,].sum()         
        reactivity_15mers.loc[index, "beadcounts"] = bc        
        reactivity_15mers.loc[index, "reactivity_binary"] = 1 if sum(row > 0) > 1 else 0           
        reactivity_15mers.loc[index, "reactivity_m_beadcounts"] = bc*r  

In [5]:
len(reactivity_15mers)

3456

In [6]:
beads15mers.loc["45373_15mer_start_31_GGGGSLYAKAVTQVKVTDPA",]#.sum()

HIV3_plate_1.A1.BEADS_ONLY.1       4
HIV3_plate_1.D11.BEADS_ONLY.7      0
HIV3_plate_1.D12.BEADS_ONLY.8      8
HIV3_plate_1.D3.BEADS_ONLY.4       3
HIV3_plate_1.E2.BEADS_ONLY.2       7
HIV3_plate_1.E8.BEADS_ONLY.6       4
HIV3_plate_1.G2.BEADS_ONLY.3       5
HIV3_plate_1.H3.BEADS_ONLY.5       3
HIV3_plate_2.B2.BEADS_ONLY.9      10
HIV3_plate_2.B5.BEADS_ONLY.11      2
HIV3_plate_2.B8.BEADS_ONLY.15      2
HIV3_plate_2.D5.BEADS_ONLY.12      6
HIV3_plate_2.E9.BEADS_ONLY.16      5
HIV3_plate_2.F2.BEADS_ONLY.10      2
HIV3_plate_2.F7.BEADS_ONLY.13      7
HIV3_plate_2.H7.BEADS_ONLY.14      7
HIV3_plate_3.C9.BEADS_ONLY.21      1
HIV3_plate_3.D3.BEADS_ONLY.18      1
HIV3_plate_3.G12.BEADS_ONLY.22     5
HIV3_plate_3.G6.BEADS_ONLY.19      3
HIV3_plate_3.G8.BEADS_ONLY.20      0
HIV3_plate_3.H2.BEADS_ONLY.17      0
HIV3_plate_4.A5.BEADS_ONLY.23      5
HIV3_plate_4.G10.BEADS_ONLY.25     4
HIV3_plate_4.G11.BEADS_ONLY.26     0
HIV3_plate_4.H12.BEADS_ONLY.27     5
HIV3_plate_4.H8.BEADS_ONLY.24      8
H

In [7]:
reactivity_15mers[reactivity_15mers["samples"]<3][reactivity_15mers["samples"] > 0][reactivity_15mers["reactivity"] > 6][reactivity_15mers["beadcounts"] > 550].sort_values(by=["reactivity_m_beadcounts", "samples", "reactivity"], ascending=[False,True,False])


  reactivity_15mers[reactivity_15mers["samples"]<3][reactivity_15mers["samples"] > 0][reactivity_15mers["reactivity"] > 6][reactivity_15mers["beadcounts"] > 550].sort_values(by=["reactivity_m_beadcounts", "samples", "reactivity"], ascending=[False,True,False])
  reactivity_15mers[reactivity_15mers["samples"]<3][reactivity_15mers["samples"] > 0][reactivity_15mers["reactivity"] > 6][reactivity_15mers["beadcounts"] > 550].sort_values(by=["reactivity_m_beadcounts", "samples", "reactivity"], ascending=[False,True,False])


Unnamed: 0,reactivity,samples,reactivity_binary,beadcounts,reactivity_m_beadcounts
12155_15mer_Cterm_GGGGSAVKAGMPTTRIIAPL,95.08,2,1,1065.0,101260.20
8452_15mer_start_16_GGGGSSKWWEPAAARALERA,65.69,1,0,1055.0,69302.95
83419_15mer_start_6_GGGGSNPIRVKRPKKPIAKR,25.53,1,0,2471.0,63084.63
38957_15mer_start_6_GGGGSIDAAKSMATNVMKKF,19.12,2,1,2392.0,45735.04
68678_15mer_start_36_GGGGSYGSGPALSTKTKFWR,22.73,2,1,1728.0,39277.44
...,...,...,...,...,...
11926_15mer_Cterm_GGGGSLYTPFNMSGREQVTV,7.63,1,0,643.0,4906.09
65043_15mer_start_26_GGGGSQEPQQQEPQQQEPQQ,6.56,1,0,722.0,4736.32
85530_15mer_start_11_GGGGSASRSVIRSIIKSSRI,6.11,1,0,775.0,4735.25
43466_15mer_start_1_GGGGSLGVGLSGHPLYNKLD,7.65,1,0,586.0,4482.90


In [8]:
reactive_15mers = dict()
count = 0
r15s=reactivity_15mers.sort_values(by="samples", ascending=False)
seqs = set()
for index, row in r15s.iterrows():
    s = index.split("_")
    protID = s[0]
    seq = s[-1][-15:]
    if protID not in reactive_15mers and seq not in seqs:
        seqs.add(seq)
        reactive_15mers[protID] = {"samples" : row["samples"], "beadcounts": beads15mers.loc[index,].sum(), "sequence" : seq}
        count += 1
    if count == 48:
        break
        
nonreactive_15mers = dict()
count = 0
#r15s=reactivity_15mers.sort_values(by=["samples","beadcounts"], ascending=[True,False])
r15s=reactivity_15mers[reactivity_15mers["samples"]<3][reactivity_15mers["samples"] > 0][reactivity_15mers["reactivity"] > 6][reactivity_15mers["beadcounts"] > 550].sort_values(by=["reactivity_m_beadcounts", "samples", "reactivity"], ascending=[False,True,False])
for index, row in r15s.iterrows():
    s = index.split("_")
    protID = s[0]
    seq = s[-1][-15:]
    if protID not in nonreactive_15mers and protID not in reactive_15mers and seq not in seqs:
        seqs.add(seq)
        nonreactive_15mers[protID] = {"samples" : row["samples"], "beadcounts": beads15mers.loc[index,].sum(), "sequence" : seq}
        count += 1
    if count == 96:
        break


  r15s=reactivity_15mers[reactivity_15mers["samples"]<3][reactivity_15mers["samples"] > 0][reactivity_15mers["reactivity"] > 6][reactivity_15mers["beadcounts"] > 550].sort_values(by=["reactivity_m_beadcounts", "samples", "reactivity"], ascending=[False,True,False])
  r15s=reactivity_15mers[reactivity_15mers["samples"]<3][reactivity_15mers["samples"] > 0][reactivity_15mers["reactivity"] > 6][reactivity_15mers["beadcounts"] > 550].sort_values(by=["reactivity_m_beadcounts", "samples", "reactivity"], ascending=[False,True,False])


In [11]:
reactive_15mers

{'22110': {'samples': 433, 'beadcounts': 1495, 'sequence': 'GIWGCSGKLICTTAV'},
 '31648': {'samples': 433, 'beadcounts': 532, 'sequence': 'LGIWGCSGKLICTTA'},
 '23996': {'samples': 431, 'beadcounts': 1193, 'sequence': 'GIWGCSGKIICPTNV'},
 '65616': {'samples': 415, 'beadcounts': 624, 'sequence': 'RLNLWGCKGKLICYT'},
 '33029': {'samples': 408, 'beadcounts': 926, 'sequence': 'GQPHDTAPRGARKKQ'},
 '60049': {'samples': 392, 'beadcounts': 3837, 'sequence': 'KAPKRIRLPHIREDD'},
 '78231': {'samples': 382, 'beadcounts': 1355, 'sequence': 'TRKGIHLGPGQTFYA'},
 '52913': {'samples': 373, 'beadcounts': 2450, 'sequence': 'RPQKRPSCIGCKGAH'},
 '58836': {'samples': 371, 'beadcounts': 764, 'sequence': 'IPALTAAETGHTSQV'},
 '93698': {'samples': 363, 'beadcounts': 1808, 'sequence': 'YVDEVLNEVLVVPNI'},
 '20718': {'samples': 361, 'beadcounts': 847, 'sequence': 'APKRLRLPHIRDDDA'},
 '77385': {'samples': 361, 'beadcounts': 1321, 'sequence': 'TRKSIHIGPGRAFYA'},
 '95293': {'samples': 360, 'beadcounts': 430, 'sequence':

In [12]:
nonreactive_15mers

{'12155': {'samples': 2, 'beadcounts': 1065, 'sequence': 'AVKAGMPTTRIIAPL'},
 '8452': {'samples': 1, 'beadcounts': 1055, 'sequence': 'SKWWEPAAARALERA'},
 '83419': {'samples': 1, 'beadcounts': 2471, 'sequence': 'NPIRVKRPKKPIAKR'},
 '38957': {'samples': 2, 'beadcounts': 2392, 'sequence': 'IDAAKSMATNVMKKF'},
 '68678': {'samples': 2, 'beadcounts': 1728, 'sequence': 'YGSGPALSTKTKFWR'},
 '58373': {'samples': 2, 'beadcounts': 1498, 'sequence': 'TTPLVLLSAGPSMIS'},
 '21857': {'samples': 2, 'beadcounts': 905, 'sequence': 'KSKKKAQQAAADTGH'},
 '2933': {'samples': 2, 'beadcounts': 2499, 'sequence': 'TCWAICKRIPNKKPG'},
 '83499': {'samples': 2, 'beadcounts': 1159, 'sequence': 'RLGGKEDRRVKQSRG'},
 '3954': {'samples': 2, 'beadcounts': 2346, 'sequence': 'VRDGVVDNVIFTNKT'},
 '31723': {'samples': 1, 'beadcounts': 2464, 'sequence': 'DRIRERAEDSGNESD'},
 '25297': {'samples': 2, 'beadcounts': 1806, 'sequence': 'LKAWEERQQNLQQRQ'},
 '32852': {'samples': 2, 'beadcounts': 2246, 'sequence': 'PITKTNKIVGLNYTK'},
 '3

In [25]:
len(nonreactive_15mers)

96

In [26]:
validationDS = dict()

# 24 REACTIVE 15mers stand-alone

for protID, details in reactive_15mers.items():
    if details["sequence"] in validationDS:
        print(details["sequence"])
    validationDS[details["sequence"]] = "15mer_reactive_" + str(protID)

#####################################

# 48 NON REACTIVE 15mers stand-alone

for protID, details in nonreactive_15mers.items():
    validationDS[details["sequence"]] = "15mer_nonreactive_" + str(protID)

#####################################

# 24 REACTIVE on 8 tiles, 3 per tile
# BOTH LINKERS

tileG = ""
nameG = "3_different_reactive_G_"
tile = ""
name = "3_different_reactive_G4S_"
count = 0
for protID, details in reactive_15mers.items():
    tile += details["sequence"]
    name += str(protID)
    tileG += details["sequence"]
    nameG += str(protID)
    count += 1
    if count % 3 == 0:
        validationDS[tile] = name
        validationDS[tileG] = nameG
        tileG = ""
        nameG = "3_different_reactive_G_"
        tile = ""
        name = "3_different_reactive_G4S_"
    else:
        tile += "GGGGS"
        name += "-"
        tileG += "G"
        nameG += "-"

######################################

# 48 NON REACTIVE on 16 tiles, 3 per tile
# BOTH LINKERS

tileG = ""
nameG = "3_different_nonreactive_G_"
tile = ""
name = "3_different_nonreactive_G4S_"
count = 0
for protID, details in nonreactive_15mers.items():
    tile += details["sequence"]
    name += str(protID)
    tileG += details["sequence"]
    nameG += str(protID)
    count += 1
    if count % 3 == 0:
        validationDS[tile] = name
        validationDS[tileG] = nameG
        tileG = ""
        nameG = "3_different_nonreactive_G_"
        tile = ""
        name = "3_different_nonreactive_G4S_"
    else:
        tile += "GGGGS"
        name += "-"
        tileG += "G"
        nameG += "-"
    
######################################

# NOT DOING 48 NON REACTIVE ONES three times repeated

######################################

#24 REACTIVE three times repeated
# BOTH LINKERS

#for protID, details in reactive_15mers.items():
#    seq = details["sequence"]
#    validationDS[seq+"GGGGS"+seq+"GGGGS"+seq] = "3_same_reactive_G4S_" + str(protID)
#    validationDS[seq+"G"+seq+"G"+seq] = "3_same_reactive_G_" + str(protID)
    
######################################

# 24 REACTIVE stichted with two NON-REACTIVE
# only at first position 
# BOTH LINKERS

#shuffle these ones randomly

tiles = []
tilesG = []
names = []
namesG = []

all_15mers = reactive_15mers.copy()
for protID, details in all_15mers.items():
    all_15mers[protID]["reactive"] = "r"
for protID, details in nonreactive_15mers.items():
    all_15mers[protID] = details
    all_15mers[protID]["reactive"] = "n"
    
shuffled_keys = [a for a in all_15mers.keys()]
random.seed(7)
random.shuffle(shuffled_keys)

tileG = ""
nameG = "combined_reactive_pos1_G_"
tile = ""
name = "combined_reactive_pos1_G4S_"
count = 0
for k in shuffled_keys:
    print(all_15mers[k]["reactive"])
    tile += all_15mers[k]["sequence"]
    name += str(k)+all_15mers[k]["reactive"]
    tileG += all_15mers[k]["sequence"]
    nameG += str(k)+all_15mers[k]["reactive"]
    count += 1
    if count % 3 == 0:
        print("\n")
        validationDS[tile] = name
        validationDS[tileG] = nameG
        tileG = ""
        nameG = "combined_reactive_pos1_G_"
        tile = ""
        name = "combined_reactive_pos1_G4S_"
    else:
        tile += "GGGGS"
        name += "-"
        tileG += "G"
        nameG += "-"


#for protID, details in reactive_15mers.items():
#    seq = details["sequence"]
#    tiles.append(seq)
#    tilesG.append(seq)
#    names.append("combined_reactive_pos1_G4S_"+protID)
#    namesG.append("combined_reactive_pos1_G_"+protID)
    
#c = 0

#for protID, details in nonreactive_15mers.items():
#    seq = details["sequence"]
#    tiles[c] += "GGGGS"+seq
#    tilesG[c] += "G"+seq
#    names[c] += "-"+protID
#    namesG[c] += "-"+protID
#    c += 1
#    c = c % len(tiles)

#for i in range(len(tiles)):
#    validationDS[tiles[i]] = names[i]
#    validationDS[tilesG[i]] = namesG[i]

r
n
n


n
n
r


r
n
n


r
n
n


r
n
n


n
n
n


n
n
r


n
n
r


n
n
n


r
n
n


r
n
n


n
r
r


n
r
n


n
n
r


n
n
n


n
r
n


r
n
r


n
r
r


r
n
n


n
r
n


n
n
r


n
n
r


n
r
n


n
n
r


n
n
r


n
r
n


n
n
r


n
n
n


r
n
n


n
r
n


n
r
n


n
n
n


r
r
n


n
r
n


n
n
n


n
r
n


n
r
n


r
n
n


n
n
n


n
n
r


r
n
n


r
n
n


r
n
r


n
n
r


r
n
n


r
n
r


n
r
r


n
r
n




In [27]:
print(len(validationDS))
validationDS
c = 0
for k in validationDS:
    if validationDS[k].startswith("15mer_nonrea"):
        c+=1
        
print(c)

336
96



# Build final library

In [1]:
prot_file = "phageome_tiles_proteins.faa"
dna_file_noadap = "phageome_tiles_oligos_noadap.fasta"
dna_file = "phageome_tiles_oligos.fasta"
anno_file = "phageome_tiles_annotation.csv"

In [29]:
global_epitopes

{'HKIYKSNLVTSYRNK': {'probability': 0.66,
  'proteins': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25'],
  'start_pos': [169]},
 'TTYRMRKLKALGLNN': {'probability': 0.65,
  'proteins': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25'],
  'start_pos': [145]},
 'TDKRQSIWNSGKVRF': {'probability': 0.64,
  'proteins': ['REP_uvig_160801_16_ALL_uvig_80910_1_uvig_80910_13_uvig_160801_16_uvig_213404_1_uvig_213404_13_uvig_225655_4_uvig_225655_17_uvig_359760_4_uvig_359760_18_uvig_379702_1_uvig_379702_13_uvig_512665_1_uvig_578696_11_uvig_578696_25'],
  'start_pos': [205]},
 'IYSFFSTFGRSDPSP': {'probability': 0.61,
  'proteins': ['REP_uvig_160801

In [40]:
198/3

66.0

In [237]:
#junkseq = "***KPQRKTKRNTNRRPEDVKFPGGGQIVGGVYLLPRRGPRLG"

junkseq(0)

''

In [82]:
#no_synth = []

In [83]:
anno = dict()
def junkseq(length):
    return(("***" + "".join(random.choice(AMINOACIDS) for i in range(length)))[:length])

with open(prot_file, "w") as pf:
    for seq, tile in dolphin_tiles.items():
        n = str(tile["protein set"])+"_"+str(tile["tile number"])+"of"+str(tile["tiles in protein(set)"])
        h = abs(hash(n))
        anno["dolphin_"+str(h)] = "dolphin_"+n
        pf.write(">"+"dolphin_"+str(h)+"\n")
        pf.write(seq+"*\n")
        
    for seq, prots in pepsyn_all_tiles.items():
        n = str(prots)
        h = abs(hash(n))
        anno["pepsyn_"+str(h)] = "pepsyn_"+n
        pf.write(">"+"pepsyn_"+str(h)+"\n")
        pf.write(seq + junkseq(56-len(seq))+"\n")
        
    for seq, tile in global_epitopes.items():
        if seq not in no_synth:
            n = str(tile["proteins"])+"_"+str(tile["start_pos"])+"_"+str(tile["probability"])
            h = abs(hash(n))
            anno["dolphinepitopes_"+str(h)] = "dolphinepitopes_"+n
            pf.write(">"+"dolphinepitopes_"+str(h)+"\n")
            pf.write("GGGGS" + seq+ junkseq(36) + "\n")

    for seq, name in validationDS.items():
        h = abs(hash(name))
        anno["validationDS_"+str(h)] = "dolphinepitopes_"+name
        pf.write(">"+"validationDS_"+str(h)+"\n")
        pf.write(seq+junkseq(56-len(seq))+"\n")
        
    for seq, prots in monkey_all_tiles.items():
        n = str(prots)
        h = abs(hash(n))
        anno["monkeyvir_"+str(h)] = "monkeyvir_"+n
        pf.write(">"+"monkeyvir_"+str(h)+"\n")
        pf.write(seq + junkseq(56-len(seq))+"\n")
        #pf.write(seq+"\n")
        

In [97]:
with open(anno_file, "w") as pf:
    pf.write("tile_id\tallinfo\n")
    for k, v in anno.items():
        pf.write(k+"\t"+v+"\n")
        

In [84]:
revtrans_command = "pepsyn revtrans "+ prot_file + " " + dna_file_noadap
revtrans_command

'pepsyn revtrans phageome_tiles_proteins.faa phageome_tiles_oligos_noadap.fasta'

In [85]:
stream = os.popen(revtrans_command)
output = stream.read().strip()
output

''

In [93]:
with open(dna_file_noadap, "r") as na:
    with open(dna_file, "w") as wa:
        lib = ">some"
        for line in na:
            l = line.strip()
            if not line.startswith(">"):
                if lib.startswith(">monkey"):
                    l = five_prime_adapter + l + three_prime_adapter_2
                else:
                    l = five_prime_adapter + l + three_prime_adapter_1
            else:
                lib = line.strip()
            wa.write(l+"\n")

In [87]:
lengths = []
c = 0
with open(dna_file, "r") as wa:
    for line in wa:
        line = line.strip()
        if not line.startswith(">"):
            c+=1
            lengths.append(len(line))
            if len(line) < 200:
                print(name, len(line))
        else:
            name = line
            
print(c)
np.unique(lengths, return_counts=True)

54000


(array([200]), array([54000]))

In [41]:
dolp = 0
epis = 0
vds = 0
peps = 0
monk = 0
total = 0
with open(dna_file, "r") as wa:
    for line in wa:
        total +=1
        if line.startswith(">dolphin_"):
            dolp += 1
        elif line.startswith(">dolphinepitopes_"):
            epis += 1
        elif line.startswith(">pep"):
            peps += 1
        elif line.startswith(">valid"):
            vds += 1
        elif line.startswith(">monkey"):
            monk += 1
            
print("total:", total/2 , "dolphin:", dolp, "epitopes:", epis, "pepsyn:", peps, "validationDS:", vds, "MonkeyVir:", monk)

total: 54000.0 dolphin: 5266 epitopes: 19117 pepsyn: 23745 validationDS: 336 MonkeyVir: 5536


In [252]:
95958/2

47979.0

In [None]:
# naming, remove special chars and make hash names
# ben adapter lengths which exactly?
# non reactive ones to be the almost reactove ones and all represented

In [133]:
t= "1234567890ABCDETTAGTAGGTGTGATTTGCGGCTGGAAATGGACCTATAGTCGACCGGGTTCCACTGGAGGAAATTTTTGGATGAATCTGCTTGGACCTAAGACGTTTCGTTTTTGGCTGGGGGTCATTTTAGCGGTCGGCATTGGCCTTTCGTTATATCTGTTTTCCATATCAGGAAAATGA1234567890ABCDE"

In [134]:
len(t)

201

In [106]:
beads40mers = rc.iloc[["_40mer" in rowname for rowname in rc.index],["BEADS_ONLY" in colname for colname in rc.columns]]
reactivity_40mers = pd.DataFrame(columns=["reactivity", "samples", "reactivity_binary"])
for index, row in hfc.iterrows():
    if "_40mer" in index : 
        seq = index.split("_")[-1][-15:]
        reactivity_40mers.loc[index, "reactivity"] = row.sum()
        reactivity_40mers.loc[index, "samples"] = sum(row > 0)         
        reactivity_40mers.loc[index, "beadcounts"] = beads40mers.loc[index,].sum()         
        reactivity_40mers.loc[index, "reactivity_binary"] = 1 if sum(row > 0) > 1 else 0   

In [107]:
reactivity_40mers.sort_values(by=["samples","beadcounts"], ascending=[True,False])

Unnamed: 0,reactivity,samples,reactivity_binary,beadcounts
88229_40mer_start_6_GGGGSKPQRKTKRNTNRRPEDVKFPGGGQIVGGVYLLPRRGPRLG,0.0,0,0,3306.0
52561_40mer_start_1_GGGGSYSVSSCDLRMGSGFCIDYALPSSRRKRRGISSPYRFVTFE,0.0,0,0,3221.0
13809_40mer_start_16_GGGGSCASLLATTSSATRRWKAKSGTASGVGASETCGTWYLPNHC,0.0,0,0,3136.0
2253_40mer_start_11_GGGGSDPFYAQMFTQRTESTSSTWSDSSVSSTYFSKSKQPSTSSS,0.0,0,0,2988.0
90399_40mer_start_6_GGGGSAVVAERVAIPCTSEYATPIPTPRAVRVVPVPAPRIQRAST,0.0,0,0,2924.0
...,...,...,...,...
56504_40mer_start_11_GGGGSNPPKKPKDDYHFEVFNFVPCSICGNNQLCKSICKTIPSNK,19075.86,436,1,942.0
2933_40mer_start_11_GGGGSQNKPNNDFHFEVFNFVPCSICSNNPTCWAICKRIPNKKPG,15627.59,436,1,788.0
56504_40mer_start_6_GGGGSKPRPKNPPKKPKDDYHFEVFNFVPCSICGNNQLCKSICKT,16466.04,436,1,326.0
56504_40mer_Cterm_GGGGSKDDYHFEVFNFVPCSICGNNQLCKSICKTIPSNKPKKKPT,17469.33,437,1,507.0


In [109]:
len("KPQRKTKRNTNRRPEDVKFPGGGQIVGGVYLLPRRGPRLG")

40

In [228]:
"".join(random.choice(AMINOACIDS) for i in range(40))

'AETKCWATINHTKQKGWWTLGVEGMGETSKAAHSHEVKPK'

# Annotations

In [49]:
annoall = pd.read_csv(anno_file, sep="\t")

In [50]:
annoall.index = annoall["tile_id"]

In [51]:
annoall["library"] = [el[0] for el in annoall['tile_id'].str.split("_")]

In [52]:
annoall["AA"] = ""
annoall["oligo"] = ""

In [53]:
identifier = ""
with open(dna_file_noadap, "r") as wa:
    for line in wa:
        if line.startswith(">"):
            identifier = line.strip()[1:]
        else:
            annoall.loc[identifier, "oligo"] = line.strip()

In [54]:
identifier = ""
with open(prot_file, "r") as wa:
    for line in wa:
        if line.startswith(">"):
            identifier = line.strip()[1:]
        else:
            annoall.loc[identifier, "AA"] = line.strip()

In [55]:
annoall

Unnamed: 0_level_0,tile_id,allinfo,library,AA,oligo
tile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dolphin_2868950657743942533,dolphin_2868950657743942533,dolphin_['REP_ivig_1427_30_ALL_ivig_1427_30_uv...,dolphin,GLGWVQRDRRLASQKGGGGSQYRDDVIRNARLEWGGGGGSADFAAQ...,GGTCTGGGCTGGGTACAGCGCGACCGTAGACTGGCAAGTCAGAAAG...
dolphin_259235529426814309,dolphin_259235529426814309,dolphin_['REP_ivig_1427_30_ALL_ivig_1427_30_uv...,dolphin,ASWRENRHYPQRILRGGGGSIRALVSYDRWLWQRVGGGGSTRWFGH...,GCTTCATGGCGGGAGAATCGTCATTACCCCCAGCGTATTCTACGCG...
dolphin_2234732107848788089,dolphin_2234732107848788089,dolphin_['REP_uvig_356033_19_ALL_uvig_355478_4...,dolphin,DEIIKGKWGSNPERRGGGGSTAKYGAAAYEAAQARGGGGSVNEIMG...,GACGAGATCATCAAGGGCAAATGGGGTAGTAACCCGGAACGCCGTG...
dolphin_3865831791141569146,dolphin_3865831791141569146,dolphin_['REP_uvig_355947_23_ALL_uvig_355854_2...,dolphin,KNSENKYFLKVYKYIGGGGSRAFVQKFDADAQNAYGGGGSSQNLLS...,AAAAATTCAGAAAACAAATACTTCCTGAAAGTGTATAAATACATTG...
dolphin_8225991425082145501,dolphin_8225991425082145501,dolphin_['REP_uvig_357441_78_ALL_uvig_357441_7...,dolphin,REIMERRHLDMEDWKGGGGSQKHLAQKMARKKFWKGGGGSEQIYRA...,CGTGAGATTATGGAACGCCGCCACCTGGACATGGAAGATTGGAAGG...
...,...,...,...,...,...
monkeyvir_5705363590145353140,monkeyvir_5705363590145353140,monkeyvir_['tr|Q7TFM3|Q7TFM3_RHCM6_Rh111_OS=Rh...,monkeyvir,TSFEYHQGKGVPIVQDSSEDDSSSSSESEDMDVFEVASTTPHAGTS...,ACATCATTCGAATATCATCAGGGTAAAGGAGTGCCGATTGTTCAGG...
monkeyvir_4175180770374820368,monkeyvir_4175180770374820368,monkeyvir_['tr|Q7TFM3|Q7TFM3_RHCM6_Rh111_OS=Rh...,monkeyvir,SLHFFDRNLFFTYKKTDHSIFRDQYRITTSFEYHQGKGVPIVQDSS...,TCACTCCATTTTTTTGATCGCAACTTGTTTTTTACCTATAAAAAGA...
monkeyvir_2199189252886226186,monkeyvir_2199189252886226186,monkeyvir_['tr|Q7TFM3|Q7TFM3_RHCM6_Rh111_OS=Rh...,monkeyvir,PIMSTQCIFMEIRSLHDSVYIEPFQAIASLHFFDRNLFFTYKKTDH...,CCTATAATGTCCACGCAGTGTATTTTCATGGAGATTAGGAGTCTGC...
monkeyvir_3432625138740377883,monkeyvir_3432625138740377883,monkeyvir_['tr|Q7TFM3|Q7TFM3_RHCM6_Rh111_OS=Rh...,monkeyvir,MIDTYFESDKYIGFICPKSVPGCSISCNPIMSTQCIFMEIRSLHDS...,ATGATAGACACGTATTTCGAATCAGATAAATATATTGGTTTTATTT...


In [56]:
annoall.to_csv("phageome_tiles_annotation_all.csv")