In [3]:
%matplotlib inline

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from scipy import optimize

import copy

# Three font-size tiers (in points) used for all figure text below.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 14, 16, 18

# Apply the tiers to matplotlib's global defaults in one pass.
plt.rcParams.update({
    "font.size": SMALL_SIZE,           # default text size
    "axes.titlesize": SMALL_SIZE,      # axes title
    "axes.labelsize": MEDIUM_SIZE,     # x and y axis labels
    "xtick.labelsize": SMALL_SIZE,     # x tick labels
    "ytick.labelsize": SMALL_SIZE,     # y tick labels
    "legend.fontsize": SMALL_SIZE,     # legend entries
    "figure.titlesize": BIGGER_SIZE,   # figure-level title
})


In [5]:
# Build a per-position amino-acid frequency profile from the untreated
# library counts, then write sequences sampled from that profile to
# "seq-for-logo.fasta" (presumably the input for a sequence logo).

aa_string = "ACDEFGHIKLMNPQRSTVWY"
aa_dict = dict([(a,0) for a in aa_string])

SEQ_LENGTH = 12         # number of sequence positions tallied
NUM_LOGO_SEQS = 10000   # number of random sequences written to the FASTA file
LOGO_SEED = 0           # fixed seed so the FASTA output is reproducible

# One independent {amino acid: count} tally per sequence position.
all_aa = [dict(aa_dict) for _ in range(SEQ_LENGTH)]

# Read the untreated library counts for each amino acid.
# Each non-comment, non-blank line is read as "<sequence> <count> ...".
count_list = []  # not used in this cell; kept in case later cells reference it
with open("../fig_s2/untreated-library.counts") as f:
    for line in f:
        if line.startswith("#") or line.strip() == "":
            continue

        fields = line.split()
        seq = fields[0]
        count = int(fields[1])

        for i, aa in enumerate(seq):
            all_aa[i][aa] += count

# Record counts as a (20 amino acids) x (SEQ_LENGTH positions) array.
# Uses the builtin float: the np.float alias was removed from NumPy.
out_seqs = []  # not used in this cell; kept in case later cells reference it
data = np.array(
    [[all_aa[i][aa] for i in range(SEQ_LENGTH)] for aa in aa_string],
    dtype=float,
)

# Normalize so each column sums to 1.0. A position with no counts would
# divide by zero and yield NaN probabilities, so fail early and clearly.
column_totals = np.sum(data, axis=0)
if np.any(column_totals == 0):
    empty = [int(i) for i in np.flatnonzero(column_totals == 0)]
    raise ValueError(f"no counts observed at sequence position(s) {empty}")
data = data/column_totals
aa_to_grab = np.array(list(aa_string))

# Generate random sequences, sampling each position independently from its
# observed amino-acid frequencies. A local generator keeps the draw
# reproducible without touching NumPy's global random state.
rng = np.random.default_rng(LOGO_SEED)
gen_seq = np.array([
    rng.choice(aa_to_grab, size=NUM_LOGO_SEQS, p=data[:, i])
    for i in range(SEQ_LENGTH)
])

# Write one FASTA record per sampled sequence; the sequence itself doubles
# as the record name. The context manager closes the file even on error.
with open("seq-for-logo.fasta","w") as f:
    for i in range(gen_seq.shape[1]):
        seq_as_string = "".join(gen_seq[:,i])
        f.write(f">{seq_as_string}\n{seq_as_string}\n")
