In [1]:
%matplotlib inline

import hops_enrich

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import glob


### Generate clusters of all sequences seen in the hA5, hA6, ancA5/A6, and altAll samples

In [None]:

# Cluster the sequences listed in every "*_all_pooled.counts" file found in
# the current directory. Each input file "<name>.counts" produces a cluster
# file "<name>.cluster" next to it.
for all_seq_file in glob.glob("*_all_pooled.counts"):

    # The sequence is the first whitespace-delimited field of each line.
    # Blank / whitespace-only lines are skipped: indexing [0] on their empty
    # split() result would otherwise raise IndexError.
    seq_list = []
    with open(all_seq_file) as f:
        for line in f:
            fields = line.split()
            if fields:
                seq_list.append(fields[0])

    # Generate clusters
    cluster_file = f"{all_seq_file.split('.counts')[0]}.cluster"
    hops_enrich.cluster.cluster_seqs(seq_list,
                                     epsilon=1,
                                     min_neighbors=1,
                                     dist_function="simple",
                                     out_file=cluster_file)

In [None]:

def get_enrichment(conv_file,comp_file,cluster_file,out_file=None,min_counts=6):
    """
    Calculate enrichment for one pair of counts files and shift it by a
    baseline taken from the fit statistics.

    Parameters
    ----------
    conv_file, comp_file : paths handed to hops_enrich.enrich.load_counts_file
    cluster_file : path handed to hops_enrich.cluster.read_cluster_file
    out_file : forwarded to hops_enrich.enrich.calc_enrichment (default None)
    min_counts : forwarded to hops_enrich.enrich.calc_enrichment (default 6)

    Returns
    -------
    The enrichment mapping returned by calc_enrichment, modified in place so
    that np.max(gaussian_stats[0]) has been subtracted from every value.
    """

    conv_counts = hops_enrich.enrich.load_counts_file(conv_file)
    comp_counts = hops_enrich.enrich.load_counts_file(comp_file)

    # Only the first of the two returned items is needed here
    clusters, _ = hops_enrich.cluster.read_cluster_file(cluster_file)

    calc_result = hops_enrich.enrich.calc_enrichment(conv_counts,
                                                     comp_counts,
                                                     clusters,
                                                     min_counts=min_counts,
                                                     out_file=out_file)
    enrichment, _, _, gaussian_stats = calc_result

    # NOTE(review): presumably gaussian_stats[0] holds the fitted means and the
    # largest one belongs to the unresponsive population -- confirm against
    # hops_enrich.
    baseline = np.max(gaussian_stats[0])

    # Re-reference every value to the baseline (existing keys only, so the
    # mapping does not change size while it is being walked)
    for seq, raw_value in enrichment.items():
        enrichment[seq] = raw_value - baseline

    return enrichment


# Run get_enrichment for every sample and every run label ("1", "2" and
# "pooled"), storing each result under the key "<sample>_<label>".
enrichment = {}
for sample in ("hA5","hA6","aA5A6","alt"):

    # Every run of a sample uses the cluster file built from that sample's
    # all-pooled counts
    cluster_path = f"../enrichment_files/{sample}_all_pooled.cluster"

    for x in ("1","2","pooled"):

        print(sample,x)

        key = f"{sample}_{x}"
        conv_path = f"../enrichment_files/{sample}_conv_{x}.counts"
        comp_path = f"../enrichment_files/{sample}_comp_{x}.counts"

        enrichment[key] = get_enrichment(conv_path,
                                         comp_path,
                                         cluster_path,
                                         out_file=f"{key}.enrich")




In [None]:

def build_df(enrich,names):
    """
    Build a data frame of the sequences shared by several enrichment mappings.

    Parameters
    ----------
    enrich : sequence of dict-like objects mapping sequence -> value
    names : sequence of column names, one per entry in `enrich` (same order)

    Returns
    -------
    pandas.DataFrame with a "seq" column followed by one column per name.
    Only sequences present in every mapping are kept. Rows are sorted by
    sequence so that the result (and any file written from it) is identical
    from run to run; iterating the intersection set directly yields an order
    that can change between interpreter sessions.

    Raises
    ------
    ValueError if `enrich` and `names` differ in length, or if the column
    names (including "seq") are not unique.
    """

    enrich = list(enrich)
    names = list(names)

    if len(enrich) != len(names):
        err = f"enrich has {len(enrich)} entries but names has {len(names)}"
        raise ValueError(err)

    columns = ["seq"] + names
    if len(set(columns)) != len(columns):
        raise ValueError("column names (including 'seq') must be unique")

    # Sequences seen in every mapping; nothing is shared if there are none
    if enrich:
        shared_seq = set(enrich[0].keys())
        for e in enrich[1:]:
            shared_seq.intersection_update(e.keys())
    else:
        shared_seq = set()

    # Fix the row order before filling the columns
    ordered_seq = sorted(shared_seq)

    out_dict = {"seq":ordered_seq}
    for name, e in zip(names,enrich):
        out_dict[name] = [e[seq] for seq in ordered_seq]

    return pd.DataFrame(out_dict)

# Pooled enrichment for sequences shared by hA5, hA6 and aA5A6
aA5A6_df = build_df([enrichment[f"{name}_pooled"] for name in ("hA5","hA6","aA5A6")],
                    ["hA5","hA6","aA5A6"])
aA5A6_df.to_csv("hA5-hA6-aA5A6_enrichment.txt")

# Pooled enrichment for sequences shared by hA5, hA6 and alt
alt_df = build_df([enrichment[f"{name}_pooled"] for name in ("hA5","hA6","alt")],
                  ["hA5","hA6","alt"])
alt_df.to_csv("hA5-hA6-alt_enrichment.txt")


In [None]:
# Named peptide sequences to look up in the enrichment table.
# NOTE(review): there is no "phage_ctl_3" entry -- confirm this is intentional.
anisotropy_peptides = {
    "phage_ctl_0":"EGLDLMSILELI",
    "phage_ctl_1":"RHGFLQDILFKL",
    "phage_ctl_2":"GWLEQYFSRTAD",
    "phage_ctl_4":"SRQTTSTHEWVV",
    "phage_ctl_5":"EQPLLKYLQLMR",
    "phage_ctl_6":"HVQWRDRNVIEW",
    "phage_ctl_7":"GEVTNYGYLVDQ",
    "phage_ctl_8":"SSSTYPGFRQST",
    "phage_ctl_9":"SGPSDWLHKGVL",
}

# One boolean per row of aA5A6_df: True where the row's sequence is one of
# the peptides listed above
peptide_seqs = set(anisotropy_peptides.values())
mask = [seq in peptide_seqs for seq in aA5A6_df["seq"]]

# Show only the matching rows
aA5A6_df[mask]

In [None]:
# Pooled enrichment for sequences shared by all four samples
all_df = build_df([enrichment[f"{name}_pooled"] for name in ("hA5","hA6","aA5A6","alt")],
                  ["hA5","hA6","aA5A6","alt"])
all_df.to_csv("hA5-hA6-aA5A6-alt_enrichment.txt")