# Simulate Sequences

In [None]:
### IMPORTS
from simulate import run_simulation

### Run Single Simulation
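Set the parameters in the next cell, then run the cell after it. Comment out the `run_simulation` call if you want to skip the single simulation.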

In [None]:
# MONOMERS USED - leave as default if theoretical
# Sequence labels are 1-indexed positions into this list (label 1 -> mon_names[0]).
#mon_names = ['mon1', 'mon2', 'mon3', 'mon4', 'mon5']
mon_names = ['MMA', 'EHMA', 'OEGMA500', 'SPMA', 'NHSMA', 'TMAMA']

### SET MOLECULAR WEIGHTS OF MONOMERS IN ORDER OF LABEL (labels are 1 indexed)
# Mw and HLBs are looked up by the analysis cells as Mw[label-1] / HLBs[label-1],
# so both must list the monomers in the same order as mon_names.
#Mw = [100.121,500,198.3,246.32, 104.15]
Mw = [100.121, 198.3, 500, 246.32, 183.16, 207.70] #MMA, EHMA, OEGMA500, SPMA, NHSMA, TMAMA
#HLBs = [8.45, 11.42, 5.125, 18.5, 4.865]
HLBs = [8.45, 5.125, 11.42, 18.5, 10, 10] #MMA, EHMA, OEGMA500, SPMA, placeholder, placeholder

# Molar ratios of each monomer, one entry per monomer in label order
# ex. if N_MONs = 4: [0.5, 0.25, 0.20, 0.05]
MRs = [46,19,25,0,5,5]

# Number of unique monomers - automatically calculated from MRs above
N_MONs = len(MRs)

# What model of copolymerization to use? (1) mayolewis (terminal) or (2) penultimate
# NOTE FOR NOW ONLY MAYO-LEWIS IMPLEMENTED.
#model = "mayolewis"

# reactivity ratios of monomers, ex. for 4 monomers:
# (row i holds r_ij for every other monomer j, so each row has N_MONs-1 entries)
#[r12, r13, r14]
#[r21, r23, r24]
#[r31, r32, r34]
#[r41, r42, r43]
# FULL TABLE BELOW (MMA, EHMA, OEGMA, NHSMA, SPMA, TMAMA) - kept for reference only;
# the active table further down uses the row order given by its own row comments
# (MMA, EHMA, OEGMA, SPMA, NHSMA, TMAMA).
#RRs = ([[0.9, 1.3, 0.5, 1.1, 0.6], # MMA
#        [0.4, 1.3, 0.6, 1.0, 1.0], # EHMA
#        [0.7, 0.4, 0.6, 0.7, 0.8], # OEGMA
#        [0.7, 1.6, 2.0, 0.6, 0.7], # NHSMA
#        [1.2, 1.0, 1.5, 0.9, ?], # SPMA
#        [1.3, 1.7, 2.0, 0.7, ?] # TMAMA
#      ])
RRs = ([[0.9, 1.3, 1.1, 0.5, 0.6], # MMA
        [0.4, 1.3, 1.0, 0.6, 1.0], # EHMA
        [0.7, 0.4, 0.7, 0.6, 0.8], # OEGMA
        [1.2, 1.0, 1.5, 0.9, 0.0], # SPMA - note SPMA-TMAMA unknown, 0 placeholder.
        [0.7, 1.6, 2.0, 0.6, 0.7], # NHSMA
        [1.3, 1.7, 2.0, 0.0, 0.7], # TMAMA - note TMAMA - SPMA unknown, 0 placeholder.
      ])

# % conversion targeted (0-1), i.e. how much of the monomer pool do you want to use
conv = 0.675

# average degree of polymerization (chain length) you are targeting at YOUR conversion, NOT at 100%.
avgDP = 225

# number of polymer chains to simulate
N_CHAINs = 100000

# Chain transfer % (0-1)
# TODO: replace this with direct PDI control
CTP = 1

# cutoff DP of chains considered as polymers not oligomers that get "purified" out
# set to 0 if you don't want to do any filtration
PRUNE_OLIGOMERS = 0

In [None]:
# Runs one simulation with the parameters set in the configuration cell above.
# This call is active; comment it out to skip the single simulation.
run_simulation(N_MONs, N_CHAINs, MRs, RRs, avgDP, conv, CTP, PRUNE_OLIGOMERS)

### Run Multiple Simulations

Pick the parameter you want to sweep and turn it into a list in the configuration cell below. Then enable the matching loop in the run cell, and make sure the loop variable (not the whole list) is what gets passed to `run_simulation`.

Sources for Styrene RRs:
- https://pubs.acs.org/doi/10.1021/acs.macromol.8b01526 (MMA)

- https://link.springer.com/article/10.1007/s13233-011-1207-z (OEGMA, assume same for EHMA and SPMA)

In [None]:
# MONOMERS USED - leave as default if theoretical
# NOTE(review): this rebinds mon_names (five labels here). The analysis cells further
# down look names up by position in mon_names, so it needs one entry per monomer
# label present in the CSV files being analysed - confirm before running them.
mon_names = ['mon1', 'mon2', 'mon3', 'mon4', 'mon5']

# Number of unique monomers
# (use the commented list form only together with the "VARY N_MONs" loop in the run cell)
N_MONs = 4
#N_MONs = [2,3,4,5]

# Molar ratios of each monomer
# ex. if N_MONs = 4: [0.5, 0.25, 0.20, 0.05]
# Here MRs is a list of compositions (one inner list per simulation), unlike the
# flat list used in the single-simulation cell.
#MRs = [[70,30,0,0,0], [51,27,22,0,0], [50,25,20,5,0], [45,20,15,5,15]]
# MRs = [[10, 65, 20, 5],[20, 55, 20, 5],[30, 45, 20, 5],[40, 35, 20, 5],[50, 25, 20, 5],[60, 15, 20, 5],[70, 5, 20, 5]]
#MRs = [[10,25,60,5],[20,25,50,5],[30,25,40,5],[40,25,30,5],[50, 25, 20, 5],[60,25,10,5]]
MRs = [[12,44,19,25], [42,38,11,9], [47,22,21,10],[26,23,41,10]]

# reactivity ratios of monomers, ex. for 5 monomers:
#[r12, r13, r14, r15]
#[r21, r23, r24, r25]
#[r31, r32, r34, r35]
#[r41, r42, r43, r45]
#[r51, r52, r53, r54]
# NOTE(review): this table has the 5-monomer layout (5 rows x 4 entries) while
# N_MONs is 4 and each MRs entry has 4 values - confirm how run_simulation treats
# the extra row/column.
RRs = ([[1, 1, 1, 0.491],
       [1, 1.09, 1.09, 0.53],
       [1, 1.09, 1.09, 0.53],
       [1, 1.09, 1.09, 0.53], 
       [0.697, 0.53, 0.53, 0.53]])

# % conversion targeted (0-1), i.e. how much of the monomer pool do you want to use
conv = 0.5

# average degree of polymerization (chain length) you are targeting at YOUR conversion, NOT at 100%.
# avgDPs (list) is consumed by the "VARY DPs" loop; the scalar avgDP is used by the other loops.
avgDPs = [50, 100]
#avgDP = 100

# number of polymer chains to simulate
# (use the commented list form only together with the "VARY N_CHAINs" loop in the run cell)
N_CHAINs = 15000
#N_CHAINs = [1000, 3000, 5000, 8000, 10000, 15000, 20000, 30000, 40000, 50000, 75000, 100000]

# Chain transfer % (0-1)
# TODO: replace this with direct PDI control
CTP = 0.15

# cutoff DP of chains considered as polymers not oligomers that get "purified" out
# set to 0 if you don't want to do any filtration
PRUNE_OLIGOMERS = 15

In [None]:
# Parameter sweeps: each loop below varies ONE input of run_simulation and keeps
# the others fixed. Enable exactly one loop at a time.

# Composition used by the sweeps that do not vary the molar ratios. MRs may be a
# list of compositions (list of lists) or a single flat composition; in the first
# case the first composition is used.
# NOTE(review): the loops referenced `MR` without ever defining it (NameError on a
# fresh kernel). MRs[0] is an assumption - set MR to the composition you intend.
MR = MRs[0] if isinstance(MRs[0], (list, tuple)) else MRs

# UNCOMMENT TO VARY N_CHAINs (N_CHAINs must be a list, avgDP a scalar)
# for n in N_CHAINs:
#     run_simulation(N_MONs, n, MR, RRs, avgDP, conv, CTP, PRUNE_OLIGOMERS)

# UNCOMMENT TO VARY MRs (MRs must be a list of compositions, avgDP a scalar)
# for m in MRs:
#     run_simulation(N_MONs, N_CHAINs, m, RRs, avgDP, conv, CTP, PRUNE_OLIGOMERS)

# UNCOMMENT TO VARY N_MONs (N_MONs must be a list with one entry per composition in MRs)
# for n, m in zip(N_MONs, MRs):
#     run_simulation(n, N_CHAINs, m, RRs, avgDP, conv, CTP, PRUNE_OLIGOMERS)

# VARY DPs (currently active): one simulation per target chain length in avgDPs
for d in avgDPs:
    run_simulation(N_MONs, N_CHAINs, MR, RRs, d, conv, CTP, PRUNE_OLIGOMERS)

# Seq Analysis

In [None]:
#importing packages for sequence analysis
import pandas as pd
import numpy as np
import glob, os
import csv
import math
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
import copy
import csv
from tqdm import tqdm_notebook
from itertools import tee
import re
from scipy.optimize import curve_fit
from scipy.interpolate import interp1d

# Font type 42 embeds text as TrueType so saved .pdf/.ps figures stay editable;
# 'svg.fonttype': 'none' does the same for .svg by keeping text as text.
new_rc_params = {'text.usetex': False, "svg.fonttype": 'none'}
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42, **new_rc_params})

# One font size for every text element, to match the Xu Group Igor macros.
XUGROUP_IGOR_MACRO_SIZE = 22

plt.rc('font', family='Helvetica', size=XUGROUP_IGOR_MACRO_SIZE)  # default text
for rc_group, size_key in (
    ('axes', 'titlesize'),    # axes title
    ('axes', 'labelsize'),    # x and y labels
    ('xtick', 'labelsize'),   # x tick labels
    ('ytick', 'labelsize'),   # y tick labels
    ('legend', 'fontsize'),   # legend entries
    ('figure', 'titlesize'),  # figure title
):
    plt.rc(rc_group, **{size_key: XUGROUP_IGOR_MACRO_SIZE})

# Larger markers and thicker lines, again to match the Igor macros.
plt.rc('lines', markersize=10, linewidth=2)

### Load sequence CSV files and set monomer properties
Specify path and what monomers each label corresponds to here (by specifying Mw and HLB)

In [None]:
### IMPORT ALL SEQUENCE CSV FILES IN FOLDER SPECIFIED BY PATH

path = "outputs"

# Every "*.csv" directly inside `path` (no recursion), as "<path>/<file>", sorted
# so that the batch indices used by the later cells follow a fixed order.
csv_files = sorted(
    os.path.join(path, entry)
    for entry in os.listdir(path)
    if entry.endswith(".csv")
)
print("Number of sequence files:", len(csv_files))
print(csv_files)

### Calculate general sequence statistics and average properties for all sequences by batch
Output is a dataframe (`df_prop`) with one row per batch and the columns: Batch Name, Mons, one molar-ratio column per monomer, NC, Avg Mn, Avg Mw, PDI, Avg DP.

In [None]:
### CALCULATE AVERAGE DP, Mn, Mw & PDI OF ALL SEQUENCES

# Number of molar-ratio columns (MR1..MR6) reported in the summary table.
N_MR_COLUMNS = 6


def parse_batch_name(name):
    """Read the monomer count and molar ratios out of a batch (file) name.

    The name is scanned for runs of digits, expected in the order
    ``n_mons, MR1, ... MRN, n_chains, DP, conv, CTP, Filt``. MR1 and MR2 are
    always read; MR3..MR6 are read only when the name holds more than 8, 9, 10
    and 11 digit groups respectively, and are reported as 0 otherwise.
    NOTE(review): this is a count-based heuristic, not a read of `n_mons`;
    decimal values such as conv=0.675 contribute two digit groups.

    Parameters
    ----------
    name : str
        Batch name (file name without directory and extension).

    Returns
    -------
    tuple (int, list of int)
        ``(n_mons, [MR1, ..., MR6])``. A name with fewer than three digit
        groups yields the fallback ``(4, [0, 0, 0, 0, 0, 0])``.
    """
    num_vals = re.findall(r'\d+', name)
    if len(num_vals) < 3:
        # Decide up front instead of catching IndexError midway, so every
        # column receives exactly one value per batch.
        return 4, [0] * N_MR_COLUMNS
    ratios = [int(num_vals[1]), int(num_vals[2])]
    for slot in range(3, N_MR_COLUMNS + 1):
        ratios.append(int(num_vals[slot]) if len(num_vals) > slot + 5 else 0)
    return int(num_vals[0]), ratios


names = []
num_seqs = []
avg_Mns = []
avg_Mws = []
PDIs = []
avg_DPs = []
seqs = []

for m in tqdm_notebook(range(len(csv_files))):
    # Each CSV row is one chain; each cell is a 1-indexed monomer label (string).
    with open(csv_files[m], mode='r') as file:
        seq = list(csv.reader(file))
    i = len(seq)
    seqs.append(seq)  # raw label strings, left untouched

    # Per-chain length and mass (sum of the monomer weights looked up in Mw).
    seqlen = np.array([len(chain) for chain in seq], dtype=float)
    seqweight = np.array(
        [sum(Mw[int(label) - 1] for label in chain) for chain in seq], dtype=float
    )
    seqweight2 = seqweight ** 2

    # Number-average (Mn) and weight-average (Mw) molecular weight,
    # polydispersity index (PDI = Mw/Mn) and mean degree of polymerization (DP).
    # Files without chains (or without mass) report NaN instead of dividing by 0.
    total_weight = seqweight.sum()
    AvgMn = total_weight / i if i else float('nan')
    DP = seqlen.sum() / i if i else float('nan')
    AvgMw = seqweight2.sum() / total_weight if total_weight else float('nan')
    PDI = AvgMw / AvgMn if total_weight else float('nan')

    # Batch name = file name without directory and extension.
    names.append(os.path.splitext(os.path.basename(csv_files[m]))[0])
    num_seqs.append(i)
    avg_Mns.append(AvgMn)
    avg_Mws.append(AvgMw)
    PDIs.append(PDI)
    avg_DPs.append(DP)

# Parse every batch name once, after all files have been read.
parsed_names = [parse_batch_name(name) for name in names]
n_mons = [n_mon for n_mon, _ in parsed_names]
MR1s, MR2s, MR3s, MR4s, MR5s, MR6s = (
    [ratios[k] for _, ratios in parsed_names] for k in range(N_MR_COLUMNS)
)

# Assemble the summary table. Molar-ratio columns are named after mon_names;
# when mon_names has fewer than six entries the remaining columns fall back to
# 'mon<k>' so the table can still be built.
d = {'Batch Name': names, 'Mons': n_mons}
for k, ratio_column in enumerate((MR1s, MR2s, MR3s, MR4s, MR5s, MR6s)):
    column_name = mon_names[k] if k < len(mon_names) else 'mon' + str(k + 1)
    d[column_name] = ratio_column
d.update({'NC': num_seqs, 'Avg Mn': avg_Mns, 'Avg Mw': avg_Mws, 'PDI': PDIs, 'Avg DP': avg_DPs})
df_prop = pd.DataFrame(data=d)
df_prop

### Read csv output files of 'n' batches and convert to n-element pd dataframe

In [None]:
#Read csv output files of 'n' batches and convert to n-element pd dataframe
# dfs[b]      : one DataFrame per batch (row = chain, column = position in chain);
#               the later cells treat the padding of shorter chains as None.
# seq_lens[b] : chain lengths of that batch, in row order.
dfs = []
seq_lens = []

for csv_path in csv_files:
    with open(csv_path, mode = 'r') as file:
        chain_rows = list(csv.reader(file))
    dfs.append(pd.DataFrame(chain_rows))
    seq_lens.append([len(chain) for chain in chain_rows])

dfs[0].head()

### Compdrift analysis
Synthesis plots such as compositional drift, monomer consumption etc.

In [None]:
# Monomer consumption per batch: share of each monomer label among all monomers
# incorporated into the simulated chains (to compare with the feeding ratios).

for i, df in tqdm_notebook(enumerate(dfs), total=len(dfs)):

    # Convert the label strings to numbers ONCE per batch; the None padding of
    # shorter chains becomes NaN and is dropped together with non-positive values.
    label_values = df.apply(pd.to_numeric, errors='coerce').to_numpy(dtype=float).ravel()
    label_values = label_values[~np.isnan(label_values)]
    label_values = label_values[label_values > 0].astype(int)

    # Start every expected monomer at 0 so absent monomers are still reported,
    # then add what was found. Labels above N_MONs are counted too rather than
    # raising KeyError.
    counts = {mon: 0 for mon in range(1, N_MONs+1)}
    found_labels, found_counts = np.unique(label_values, return_counts=True)
    for num, count in zip(found_labels.tolist(), found_counts.tolist()):
        counts[num] = counts.get(num, 0) + count
    total_counts = sum(counts.values())

    # sort by label in ascending order
    counts = dict(sorted(counts.items()))

    # print the percentages
    print(df_prop.iloc[i]['Batch Name'])
    if total_counts == 0:
        print("no monomers found in this batch")
        continue
    for num, count in counts.items():
        percentage = (count / total_counts) * 100
        # fall back to a generic name when mon_names has no entry for this label
        mon_label = mon_names[num-1] if num <= len(mon_names) else 'mon' + str(num)
        print(f"{mon_label}: {percentage:.2f}%")

### Composition Heterogeneity
- mon2mon: visualize sequences simulated (TODO: make this prettier - get code from adv. attack notebook)
- seq2seq: Plot histogram with KDE fit of composition variance per chain around feeding fraction
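- batch2batch: nFWHM (normalized full width at half maximum) to compare histograms between batches. $\mathrm{FWHM} = 2\sqrt{2\ln 2}\,\sigma \approx 2.355\,\sigma$, where $\sigma$ is the standard deviation of the per-chain monomer fraction; nFWHM is the FWHM divided by that monomer's fraction (the code uses the mean per-chain fraction, which tracks the feeding fraction).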

In [None]:
### VISUALIZE SEQUENCE SIMULATED
import matplotlib.colors as colors

# Do you want thin white line between each monomer?
# White line (True) helps you count # monomers in a continuous segment better
# No white line (False) is just prettier, easier to look at
WHITE_LINE = False 

# pick the batch & sequences to plot
batch_no = 2
seq_no = 0
num_seqs = 50

# One row per chain; the None padding of shorter chains is drawn as value 0.
chains = []
for i in range(num_seqs):
    chains.append([int(x) if x is not None else 0 for x in dfs[batch_no].iloc[seq_no + i]])

plt.cla()   # Clear axis
plt.clf()   # Clear figure
plt.figure(num=None, figsize=(30, 8), dpi=80, facecolor='w', edgecolor='k')

# Colour 0 (white) is the padding, colours 1..N are the monomer labels.
# Alternative palette: MMA, EHMA, OEGMA500, SPMA, Mon5 - as per RHPapp publication.
# CMAP = ['white', '#ba6fba', '#d24f38', '#3b3796', '#54aabc', '#ff7bac']
# This CMAP is MMA, EHMA, OEGMA500, SPMA, NHSMA and TMAMA - as per Hilburg et al.
CMAP = ['white', '#555454', '#d74d2f', '#5a7ee6', '#e3cb45', '#3aad97', '#7d57b8']

# NOTE(review): vmax comes from the batch's parsed 'Mons' value while the colour
# list and the ticks use the global N_MONs; the two should agree for this batch.
line_width = 0.1 if WHITE_LINE else 0
ax = sns.heatmap(chains, vmin=0, vmax=df_prop.iloc[batch_no]['Mons'], linewidth=line_width, xticklabels=False, yticklabels=False, square=True, cmap=CMAP[:(N_MONs+1)])
cb = ax.collections[0].colorbar
cb.set_ticks(np.arange(1,N_MONs+1))
# Exactly one label per tick (labels 1..N_MONs); matplotlib rejects a label list
# whose length differs from the number of ticks.
cb.set_ticklabels([mon_names[k] if k < len(mon_names) else 'mon' + str(k + 1) for k in range(N_MONs)])

plt.show()
plt.cla()   # Clear axis
plt.clf()   # Clear figure


In [None]:
### Histograms of composition on each individual monomer chain:
# For every batch: per-chain monomer fractions, their FWHM / normalized FWHM per
# monomer, and a KDE plot of the fractions saved to Figures/<Batch Name>.svg.


###### PARAMS TO VARY ######
RUN_ONLY_SUBSET = False # set False if you want to run all sequences
                        # (when True, define `subset_inds` - the batch indices to run - first)
USE_TITLE = False # Set true if you want to add a plot label i.e. for labeling which plot is which
PLOT_Y_AXIS_TOP_LIM = 20 # set the y-axis height
############################

FWHMs = []
FWHM_norms = []

# Every batch gets a FWHM row of the same width: at least 5 entries (the nFWHM
# plotting cell reads five columns), widened when a batch has more monomers so
# that e.g. a 6-monomer batch no longer indexes past the end of the row.
fwhm_width = max(5, int(df_prop['Mons'].max())) if len(df_prop) else 5

# savefig below does not create the folder itself
os.makedirs('Figures', exist_ok=True)

# loop across each batch of polymers
for m in range(len(csv_files)):
    N_MONS = int(df_prop['Mons'][m]) # number of unique monomers
    seq = []
    
    if RUN_ONLY_SUBSET:
        if m not in subset_inds:
            continue
    
    with open(csv_files[m], mode = 'r') as file:
        csvFile = csv.reader(file)
        num_seqs = 0
        for lines in csvFile:
            num_seqs = num_seqs + 1
            seq.append(lines)

    seqlen = np.zeros(num_seqs)
    seq_comp = np.zeros([num_seqs,N_MONS])

    # this loop counts the composition distribution of each chain as a fraction
    for j in range(0,num_seqs):
        seqlen[j] = len(seq[j])
        for k in range(N_MONS):
            seq_comp[j][k] = seq[j].count(str(k+1))/seqlen[j]
    
    # Calculating full width half maximum (FWHM = 2.355 * standard deviation) and
    # its normalized form (FWHM / mean fraction); unused trailing entries stay 0.
    print(csv_files[m].replace(path, '').replace('/', '').replace('.csv', ''))
    FWHM = np.zeros(fwhm_width)
    FWHM_norm = np.zeros(fwhm_width)
    for i in range(N_MONS):
        FWHM[i] = 2.355*np.std(seq_comp[:,i])
        FWHM_norm[i] = FWHM[i]/np.mean(seq_comp[:,i])
    
    FWHMs.append(FWHM)
    FWHM_norms.append(FWHM_norm)
    
    # Composition Histogram plotting: figure setup
    fig = plt.figure(num=None, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
        
    # Composition Histogram plotting: main code
    # NOTE(review): the legend labels are hard-coded per label position and only
    # the first four/five monomers are drawn - confirm they match this batch.
    sns.set_style(style='white')
    plt.xlabel("Feeding monomer fraction")
    plt.ylabel("Density")
    if USE_TITLE:
        plt.title(str(df_prop['NC'][m]))
    sns.kdeplot(seq_comp[:,0], label="MMA")
    sns.kdeplot(seq_comp[:,1], label="OEGMA")
    if N_MONS > 2:
        sns.kdeplot(seq_comp[:,2], label="EHMA")
    if N_MONS > 3:
        sns.kdeplot(seq_comp[:,3], label="SPMA")
    if N_MONS == 5:
        sns.kdeplot(seq_comp[:,4], label="Styrene")
    ax.spines["top"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["right"].set_linewidth(1.5)
    ax.tick_params('both', length=10, width=1.5, which='major')
    plt.xlim(0,1)
    plt.ylim(0,PLOT_Y_AXIS_TOP_LIM)
    plt.legend(frameon=False)
    plt.tight_layout()
    # comment out the line below to skip saving figures:
    plt.savefig('Figures/' + str(df_prop['Batch Name'][m] + '.svg'), format='svg', bbox_inches='tight')
    plt.show()

    # release the figure so figures do not pile up in memory across batches
    plt.close(fig)


FWHMs = np.array(FWHMs)
FWHM_norms = np.array(FWHM_norms)

In [None]:
### Normalized-FWHM plots to compare histograms between batches
# Uses FWHM_norms from the histogram cell (one row per batch, one column per
# monomer) and plots each monomer's nFWHM against a df_prop column.

###### PARAMS TO VARY ######
N_MONS = 4 # number of monomers, from 2-5
PARAM_2_VARY = "Batch Name" # i.e., 'Batch Name', 'NC', 'Avg Mn'
DISCRETE = True # passed to regplot as fit_reg: True draws the regression fit, False only the points
USE_TITLE = True # Set true if you want to add a plot label i.e. for labeling which plot is which
############################

# Attach the nFWHM of the first five monomers to df_prop (columns nFWHM_0..nFWHM_4).
for i in range(5):
    df_prop['nFWHM_' + str(i)] = FWHM_norms[:,i]

plt.cla()   # Clear axis
plt.clf()   # Clear figure
fig, ax = plt.subplots(1, 1, figsize=(10,10))
sns.set_style(style='white') #style='white' or 'darkgrid'
ax = sns.regplot(x=PARAM_2_VARY, y="nFWHM_0", data=df_prop, fit_reg=DISCRETE, label='MMA')
ax = sns.regplot(x=PARAM_2_VARY, y="nFWHM_1", data=df_prop, fit_reg=DISCRETE, label='OEGMA')
if N_MONS > 2:
    ax = sns.regplot(x=PARAM_2_VARY, y="nFWHM_2", data=df_prop, fit_reg=DISCRETE, label='EHMA') 
if N_MONS > 3:
    ax = sns.regplot(x=PARAM_2_VARY, y="nFWHM_3", data=df_prop, fit_reg=DISCRETE, label='SPMA')
# the 5th monomer (nFWHM_4) belongs to N_MONS == 5; the previous `> 5` test could
# never be true for the documented 2-5 range, so it was silently never plotted
if N_MONS > 4:
    ax = sns.regplot(x=PARAM_2_VARY, y="nFWHM_4", data=df_prop, fit_reg=DISCRETE, label='Styrene')

ax.set_ylabel('nFWHM')
if USE_TITLE:
    ax.set_title('DP300')
ax.spines["top"].set_linewidth(1.5)
ax.spines["bottom"].set_linewidth(1.5)
ax.spines["left"].set_linewidth(1.5)
ax.spines["right"].set_linewidth(1.5)
ax.tick_params('both', length=10, width=1.5, which='major')
plt.legend(loc='best', frameon=False)
plt.savefig('Figures/FWHM_AvgDP_DP300.svg', format='svg')
plt.show()


### Hydrophilic & Hydrophobic Segments
- mon2mon: Visualize hydrophobic & hydrophilic segments along a chain?
- seq2seq: box-whisker of per-chain averaged properties (NOTE: we don't actually need the batch-summed statistics (histogram) for this level - it's only useful for comparing across levels)
- batch2batch: lineplot of summed histograms (summed statistics) with 1 line for each batch.

In [None]:
# HLB_CUTOFF: determines what is considered hydrophobic and what is considered hydrophilic
# Anything less than or equal to HLB_CUTOFF is considered hydrophobic.
# The segment-analysis cells below compare it against each monomer's entry in HLBs.
HLB_CUTOFF = 9

In [None]:
### VISUALIZE HYDROPHILIC & HYDROPHOBIC SEGMENTS
# Every monomer of the selected chains is reduced to a code based on HLB_CUTOFF
# and the coded chains are drawn as a heatmap:
#   0 = hydrophobic (HLB <= HLB_CUTOFF), 1 = hydrophilic, 2 = padding past the chain end

## mon2mon level: visualize hydrophobic & hydrophilic segments along a chain

###### PARAMS TO VARY ######

# input parameters for specific chain
batch_ind = 4
seq_no = 0
num_seqs = 10

############################


# binarizing code
batch_df = dfs[batch_ind]
cols_size = len(batch_df.columns)
binarized_chains = []
for chain_row in range(num_seqs):
    codes = []
    for position in range(cols_size):
        this_mon = batch_df[position][chain_row]
        if this_mon is None:
            code = 2
        else:
            code = 0 if HLBs[int(this_mon)-1] <= HLB_CUTOFF else 1
        codes.append(code)
    binarized_chains.append(codes)

plt.cla()   # Clear axis
plt.clf()   # Clear figure
plt.figure(num=None, figsize=(55, 5), dpi=80, facecolor='w', edgecolor='k')
ax = sns.heatmap(binarized_chains, vmin=0, vmax=2, linewidth=0.2, xticklabels=False, yticklabels=False)
plt.savefig('Figures/ProK_binarized.pdf', format='pdf', bbox_inches='tight')
plt.show()
plt.cla()   # Clear axis
plt.clf()   # Clear figure

In [None]:
## SEQ2SEQ HYDROPHOBIC AND HYDROPHILIC SEGMENTS (averaged per chain properties)
# For one batch: count, per chain, how many hydrophobic / hydrophilic runs of each
# length it contains, then plot - for a few chosen run lengths - how many chains
# contain 1, 2, 3, ... runs of that length.

###### PARAMS TO VARY ######

# input parameters for specific batch for seq2seq analysis

b_no = 5 # select batch to analyze

seg_len_2_plot = [1,2,3,5,7] # segment lengths to plot (>= 1)

MAX_SEG_LEN = 60 # minimum length of the per-chain-count histogram (it grows automatically if needed)

USE_TITLE = True # Set true if you want to add a plot label i.e. for labeling which plot is which

############################

rows_size = len(dfs[b_no])
cols_size = len(dfs[b_no].columns)

# hydrophob_li & hydrophil_li hold one array per chain. Entry [L-1] of an array is
# the number of runs of length L in that chain,
# i.e. [5,4,3,1,0] --> 5 runs of length 1, 4 of length 2, 3 of length 3, 1 of length 4.
hydrophob_li = list()
hydrophil_li = list()

# to_numpy() gives plain rows, avoiding a pandas lookup per monomer
for chain in tqdm_notebook(dfs[b_no].to_numpy(), total=rows_size):
    hydrophobl = np.zeros(cols_size)
    hydrophilbl = np.zeros(cols_size)
    counterpho = 0   # length of the hydrophobic run currently open
    counterphil = 0  # length of the hydrophilic run currently open
    for mon in chain:
        if mon is None:
            break  # padding: the chain ended
        if HLBs[int(mon)-1] <= HLB_CUTOFF:
            # hydrophobic monomer: extend that run, close an open hydrophilic run
            counterpho += 1
            if counterphil > 0:
                hydrophilbl[counterphil-1] += 1
            counterphil = 0
        else:
            # hydrophilic monomer: extend that run, close an open hydrophobic run
            counterphil += 1
            if counterpho > 0:
                hydrophobl[counterpho-1] += 1
            counterpho = 0
    # Close the run still open at the chain end. Done after the loop so it also
    # happens for chains that fill every column (no padding to trigger it).
    if counterpho > 0:
        hydrophobl[counterpho-1] += 1
    if counterphil > 0:
        hydrophilbl[counterphil-1] += 1

    hydrophob_li.append(hydrophobl)
    hydrophil_li.append(hydrophilbl)

phob_or_phil = ['HPHOB', 'HPHIL']
per_chain_tables = {'HPHOB': hydrophob_li, 'HPHIL': hydrophil_li}

# savefig below does not create the folder itself
os.makedirs('Figures', exist_ok=True)

for k in phob_or_phil:

    # For each requested run length: histogram over chains of "how many runs of
    # this length does the chain contain". Index n of the histogram = number of
    # chains with exactly n such runs.
    summed_seg_length_counts = []
    for seg_len in seg_len_2_plot:
        # runs of length L are stored at index L-1 (see counting loop above)
        runs_per_chain = np.array(
            [int(chain_counts[seg_len-1]) for chain_counts in per_chain_tables[k]],
            dtype=int,
        )
        # bincount grows past MAX_SEG_LEN when a chain has more runs than that
        summed_seg_length_counts.append(np.bincount(runs_per_chain, minlength=MAX_SEG_LEN))

    # remove the 0 index (chains without any run of that length)
    trunc_sum_seg_len_counts = [hist[1:] for hist in summed_seg_length_counts]

    # Segments per chain Plotting Code
    # NOTE WE ARE DIVIDING BY NUM_SEQS --> TO GET AN AVERAGE PER CHAIN.
    fig = plt.figure(num=None, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    if k == 'HPHOB':
        plt.xlabel("Counts of hydrophobic segments of length X")
    else:
        plt.xlabel("Counts of hydrophilic segments of length X")
    plt.ylabel("Average frequency per chain")
    sns.set_style(style = 'white')

    for j in range(len(seg_len_2_plot)):
        x_data = np.linspace(1, len(trunc_sum_seg_len_counts[j]), len(trunc_sum_seg_len_counts[j]))
        y_data = trunc_sum_seg_len_counts[j]/df_prop['NC'][b_no]
        label = str(seg_len_2_plot[j])
        # x/y passed by keyword: recent seaborn releases reject positional x, y
        sns.scatterplot(x=x_data, y=y_data, label=label)

        # smooth curve through the points
        cubic_interploation_model = interp1d(x_data, y_data, kind = "cubic")
        x_dense = np.linspace(x_data.min(), x_data.max(), 300)
        sns.lineplot(x=x_dense, y=cubic_interploation_model(x_dense))

    ax.spines["top"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["right"].set_linewidth(1.5)
    ax.tick_params('both', length=10, width=1.5, which='major')
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    #ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xlim(1,25)
    plt.ylim(0,0.4)
    plt.legend(title='X = ', frameon=False)
    if USE_TITLE:
        plt.title("NC = " + str(df_prop['NC'][b_no]))
    # File name carries the batch's number of chains; read it by column name
    # (position 7 in df_prop is a molar-ratio column, not 'NC').
    if k == 'HPHOB':
        plt.savefig('Figures/seq2seq_hydrophobic_NChains_'+ str(df_prop['NC'][b_no]) + '_DP300.svg', format='svg')
    else:
        plt.savefig('Figures/seq2seq_hydrophilic_NChains_'+ str(df_prop['NC'][b_no]) + '_DP300.svg', format='svg')
    plt.show()
    plt.cla()
    plt.clf()


In [None]:
# BATCH SUMMED STATISTICS FOR HYDROPHILIC & HYDROPHOBIC SEGMENTS
# For every batch: total number of hydrophobic / hydrophilic runs of each length,
# summed over all chains. Results are collected in x_*_all (run lengths) and
# graph_*_all (counts), one entry per batch.


def trim_trailing_zeros(counts):
    """Cut `counts` shortly after its last non-zero entry.

    Keeps everything up to the last non-zero entry plus one following entry (so
    the curve does not stop on an uptick). Entry 0 is not examined, and arrays
    whose later entries are all zero are returned unchanged.

    Parameters
    ----------
    counts : numpy.ndarray
        1-D array of counts per run length.

    Returns
    -------
    numpy.ndarray
        A leading slice of `counts` (never empty unless `counts` is empty).
    """
    for back in range(1, len(counts)):
        if counts[-back] != 0.0:
            # A positive end index: a negative one (`-back+2`) flips to 1 or 0
            # when the last non-zero entry is the last or second-to-last one,
            # which would throw almost the whole array away.
            return counts[:len(counts) - back + 2]
    return counts


x_phob_all = []
x_phil_all = []
graph_phob_all = []
graph_phil_all = []
for batch_ind in tqdm_notebook(range(len(csv_files))):

    rows_size = len(dfs[batch_ind])
    cols_size = len(dfs[batch_ind].columns)

    # count the hydrophilic and hydrophobic segments:
    # entry [L-1] of a chain's array = number of runs of length L in that chain
    hydrophob_li = list()
    hydrophil_li = list()
    # to_numpy() gives plain rows, avoiding a pandas lookup per monomer
    for chain in tqdm_notebook(dfs[batch_ind].to_numpy(), total=rows_size):
        hydrophobl = np.zeros(cols_size)
        hydrophilbl = np.zeros(cols_size)
        counterpho = 0   # length of the hydrophobic run currently open
        counterphil = 0  # length of the hydrophilic run currently open
        for mon in chain:
            if mon is None:
                break  # padding: the chain ended
            if HLBs[int(mon)-1] <= HLB_CUTOFF:
                counterpho += 1
                if counterphil > 0:
                    hydrophilbl[counterphil-1] += 1
                counterphil = 0
            else:
                counterphil += 1
                if counterpho > 0:
                    hydrophobl[counterpho-1] += 1
                counterpho = 0
        # Close the run still open at the chain end; placed after the loop so
        # chains that fill every column (no padding) are handled as well.
        if counterpho > 0:
            hydrophobl[counterpho-1] += 1
        if counterphil > 0:
            hydrophilbl[counterphil-1] += 1

        hydrophob_li.append(hydrophobl)
        hydrophil_li.append(hydrophilbl)

    hydrophobic_df = pd.DataFrame(hydrophob_li)
    hydrophilic_df = pd.DataFrame(hydrophil_li)

    # Sum the segment counts over all chains, then drop the empty tail
    graph1_phob = trim_trailing_zeros(np.array(hydrophobic_df.sum(axis=0)))
    graph1_phil = trim_trailing_zeros(np.array(hydrophilic_df.sum(axis=0)))

    # create x-axis for both: index 0 corresponds to run length 1
    x_phob_1 = np.arange(1,len(graph1_phob)+1)
    x_phil_1 = np.arange(1,len(graph1_phil)+1)
    x_phob_all.append(x_phob_1)
    x_phil_all.append(x_phil_1)
    graph_phob_all.append(graph1_phob)
    graph_phil_all.append(graph1_phil)

In [None]:
# BATCH LEVEL PLOTTING CELL.
# Plots the batch-summed segment-length counts from the previous cell
# (x_phob_all / graph_phob_all and x_phil_all / graph_phil_all), one series per batch.

###### PARAMS TO VARY ######

# input parameters for batch-level analysis

PARAM_2_VARY = 'Mons' # df_prop column whose value labels each batch in the legend (e.g. 'Mons', 'NC', 'Avg DP')
LEGEND_TITLE = True # True: show PARAM_2_VARY as the legend title; False: legend without a title

############################

# for plotting smooth curves
def func(x, a, b, c):
    """Exponential decay model ``a * exp(-b * x) + c`` used as the curve_fit target.

    Parameters
    ----------
    x : float or numpy.ndarray
        Point(s) at which to evaluate the model.
    a, b, c : float
        Amplitude, decay rate and constant offset.

    Returns
    -------
    float or numpy.ndarray
        Model value(s), same shape as `x`.
    """
    decay = np.exp(-b * x)
    return a * decay + c

def plot_batch_segment_curves(x_all, counts_all, xlabel, out_path):
    """Plot the per-chain frequency of each segment length, one series per batch.

    For every batch the summed counts are divided by that batch's number of
    chains (df_prop['NC']), drawn as points labelled with the batch's
    PARAM_2_VARY value, and overlaid with a fitted exponential decay (`func`).
    The figure is saved to `out_path` as SVG and then shown.

    Parameters
    ----------
    x_all : list of numpy.ndarray
        Segment lengths per batch.
    counts_all : list of numpy.ndarray
        Summed segment counts per batch, aligned with `x_all`.
    xlabel : str
        Label of the x axis.
    out_path : str
        File the figure is written to; its folder is created if missing.
    """
    fig = plt.figure(num=None, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    sns.set_style(style='white') #can set style='white' or 'darkgrid' depending on preference
    plt.xlabel(xlabel)
    plt.ylabel("Average frequency per chain")
    for i in range(len(csv_files)):
        x_data = x_all[i]
        # WE ARE NORMALIZING BY NUM_SEQS (number of chains in the batch)
        y_data = counts_all[i]/df_prop['NC'][i]
        # x/y passed by keyword: recent seaborn releases reject positional x, y
        sns.scatterplot(x=x_data, y=y_data, label=str(df_prop[PARAM_2_VARY][i]))
        try:
            popt, pcov = curve_fit(func, x_data, y_data)
        except (RuntimeError, TypeError) as err:
            # no convergence, or fewer points than fit parameters: keep the
            # points and skip only the smooth curve for this batch
            print("No exponential fit for batch " + str(i) + ": " + str(err))
            continue
        x_dense = np.linspace(x_data.min(), x_data.max(), 300)
        sns.lineplot(x=x_dense, y=func(x_dense, *popt))
    ax.spines["top"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["right"].set_linewidth(1.5)
    ax.tick_params('both', length=10, width=1.5, which='major')
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    #ax.yaxis.set_major_locator(MaxNLocator(integer=True))
    plt.xlim(1,15)
    plt.ylim(0,15)
    if LEGEND_TITLE:
        plt.legend(title=PARAM_2_VARY, frameon=False)
    else:
        plt.legend(frameon=False)
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)  # savefig does not create folders
    plt.savefig(out_path, format='svg', bbox_inches='tight')
    plt.show()
    plt.cla()
    plt.clf()


# LINE PLOTS ACROSS BATCHES: HYDROPHOBIC
# NOTE(review): this figure goes to 'Figures_Editable/' while the hydrophilic one
# goes to 'Figures/' - confirm the two folders are intentional.
plot_batch_segment_curves(x_phob_all, graph_phob_all, "Hydrophobic Segment Length",
                          'Figures_Editable/batchsum_hydrophobic_AvgDP.svg')

# LINE PLOTS ACROSS BATCHES: HYDROPHILIC
plot_batch_segment_curves(x_phil_all, graph_phil_all, "Hydrophilic Segment Length",
                          'Figures/batchsum_hydrophilic_AvgDP.svg')

### Specific Segment Search
Customizable section to find a specific pattern. The current implementation looks for hydrophobic segments containing one OEGMA, since the hypothesis is that these segments are key for proton transport. Another example (not implemented) would be to look at the regions flanking a specific monomer and analyze their composition.
- mon2mon: find patterns on a single chain
- seq2seq: box-whisker of distribution of patterns per chain and batch summed statistics (histogram)
- batch2batch: to compare across batches

In [None]:
# specific segment pattern search in sequence (mon2mon level)
# Looks, on one chain, for a monomer with label 2 that has at least two
# consecutive hydrophobic monomers directly on each side, and records the total
# length of every such segment (both hydrophobic runs plus the label-2 monomer).
# NOTE(review): label 2 is treated as OEGMA here; with the mon_names order
# ['MMA', 'EHMA', 'OEGMA500', ...] label 2 would be EHMA - confirm the label.

###### PARAMS TO VARY ######

#defined by input parameters b_no and s_no
b_no = 0
s_no = 1

############################

#initializing seg_len and seg_array variables

seg_len = 0 # length of segment with 1 OEGMA
seg_array = [] # segment with 1 OEGMA


def _is_hydrophobic(mon):
    """True for a real monomer (not padding) whose HLB is at or below HLB_CUTOFF."""
    return mon is not None and HLBs[int(mon)-1] <= HLB_CUTOFF


# counting and recording number of segments with 1 oegma in given sequence
chain = list(dfs[b_no].iloc[s_no])
chain_len = len(chain)
for pos, this_mon in enumerate(chain):
    if this_mon is None:
        break  # padding: the chain ended
    if int(this_mon) != 2:
        continue
    # walk outwards from the label-2 monomer while the neighbours stay hydrophobic
    back_counter = 0
    while pos - back_counter - 1 >= 0 and _is_hydrophobic(chain[pos - back_counter - 1]):
        back_counter += 1
    forward_counter = 0
    while pos + forward_counter + 1 < chain_len and _is_hydrophobic(chain[pos + forward_counter + 1]):
        forward_counter += 1
    if back_counter > 1 and forward_counter > 1:
        seg_array.append(back_counter + forward_counter + 1)

if seg_array:
    print('Number of Hydrophobic segments with 1 OEGMA: ' + str(len(seg_array)))
    print('Length of Hydrophobic segments with 1 OEGMA: ' + str(seg_array))
else:
    print('0 patterns found.')

In [None]:
### Specific segment pattern search (seq2seq level)
# This cell loops through all batches and does all calculations
# NOTE: can be very slow if NC is very large.
#
# Same search as the mon2mon cell, applied to every sequence of every batch:
# at each target monomer, measure the hydrophobic run (HLB <= HLB_CUTOFF) on
# either side and record the segment when both runs are at least 2 long.
#
# Results, one entry per processed batch:
#   total_array     --> flat list of every segment length found in the batch
#   total_seg_len   --> number of segments found in each sequence of the batch
#   total_seg_array --> per sequence, the list of its segment lengths

# label of the monomer every segment is centred on
# NOTE(review): 2 is taken to be OEGMA's label here -- confirm against the
# monomer ordering the loaded batches were simulated with.
TARGET_MON = 2

total_array = []
total_seg_len = []
total_seg_array = []

#if taking too long to run, remove 100k file by changing line to:
#for b_no in tqdm_notebook(range(1, len(csv_files))):
for b_no in tqdm_notebook(range(len(csv_files))):
    batch_df = dfs[b_no]
    # every row spans all columns, so this equals len(batch_df.iloc[s_no]) - 1;
    # hoisted because it used to be rebuilt inside the innermost while-condition
    last_pos = batch_df.shape[1] - 1

    # per-batch accumulators (similar to above cell: mon2mon level)
    seg_len_list = []
    seg_array_b = []
    allseg_array = []

    # counting and recording number of segments with 1 oegma per sequence in iterated batch
    for s_no in tqdm_notebook(range(len(batch_df))):
        seg_arr = []
        for i in range(last_pos + 1):
            this_mon = batch_df[i][s_no]
            if pd.isna(this_mon):
                break # missing value (None or NaN): no monomers beyond this point
            if int(this_mon) != TARGET_MON:
                continue

            # walk backwards while the neighbours are real, hydrophobic monomers
            j = 1
            while (i-j >= 0) and (not pd.isna(batch_df[i-j][s_no])) and (HLBs[int(batch_df[i-j][s_no])-1] <= HLB_CUTOFF):
                j += 1
            # walk forwards under the same rule
            k = 1
            while (i+k <= last_pos) and (not pd.isna(batch_df[i+k][s_no])) and (HLBs[int(batch_df[i+k][s_no])-1] <= HLB_CUTOFF):
                k += 1

            back_counter = j - 1
            forward_counter = k - 1
            if forward_counter > 1 and back_counter > 1:
                segment_length = back_counter + forward_counter + 1
                seg_arr.append(segment_length)
                allseg_array.append(segment_length)
        seg_len_list.append(len(seg_arr))
        seg_array_b.append(seg_arr)

    # append counts and segments for all sequence in iterated batch to larger array
    total_array.append(allseg_array)
    total_seg_len.append(seg_len_list)
    total_seg_array.append(seg_array_b)

# After the loop, seg_array_b and seg_len_list still hold the values of the LAST batch processed:
#seg_array_b --> list of lengths of Hydrophobic segments with 1 OEGMA per sequence in that batch
#seg_len_list --> array of number of Hydrophobic segments with 1 OEGMA per sequence in that batch
 

In [None]:
## Frequency of 5, 8, 10, 13 monomer length hydrophobic segments with 1 OEGMA per chain
#
# For the chosen batch, counts per chain how many segments of each selected
# length the seq2seq search found, then plots -- for each length -- the fraction
# of chains (count / NC) that hold exactly 0, 1, 2, ... such segments.

# input parameter for batch within which sequences are counted for hydrophobic segments

###### PARAMS TO VARY ######

#defined by input parameters b_no
b_no = 5

num_a = 5 # select which lengths of segments to be averaged and plotted
num_b = 8
num_c = 10
num_d = 13

############################


def _segments_per_chain(batch_segments, seg_length):
    """For each chain, count how many of its segments are exactly `seg_length` monomers long."""
    return [list(chain_segments).count(seg_length) for chain_segments in batch_segments]


def _chains_per_count(per_chain_counts):
    """Return hist with hist[j] = number of chains holding exactly j segments, for j = 0..max."""
    # bincount allocates max+1 bins, so chains holding the maximum count get
    # their own bin (a max-sized array silently dropped them); the explicit
    # int dtype keeps an empty input from being rejected as a float array
    return np.bincount(np.asarray(per_chain_counts, dtype=int)).astype(float)


# ARR_x[s] = number of segments of length num_x in chain s of the batch
ARR_a = _segments_per_chain(total_seg_array[b_no], num_a)
ARR_b = _segments_per_chain(total_seg_array[b_no], num_b)
ARR_c = _segments_per_chain(total_seg_array[b_no], num_c)
ARR_d = _segments_per_chain(total_seg_array[b_no], num_d)

# data_x[j] = number of chains that contain exactly j segments of length num_x
data_a = _chains_per_count(ARR_a)
data_b = _chains_per_count(ARR_b)
data_c = _chains_per_count(ARR_c)
data_d = _chains_per_count(ARR_d)

# set the style before the axes exist so it actually applies to them
sns.set_style(style = 'white')
# bind the new figure: the axes must belong to THIS figure, not to whatever
# `fig` an earlier cell left behind
fig = plt.figure(num=None, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax.set_xlabel("Counts of hydrophobic with 1 OEGMA segments of length X")
ax.set_ylabel("Average Frequency per chain")

n_chains = df_prop['NC'][b_no]
for seg_length, chain_hist in ((num_a, data_a), (num_b, data_b), (num_c, data_c), (num_d, data_d)):
    sns.lineplot(x = np.arange(len(chain_hist)), y = chain_hist/n_chains, label = str(seg_length), ax = ax)

ax.spines["top"].set_linewidth(2)
ax.spines["bottom"].set_linewidth(2)
ax.spines["left"].set_linewidth(2)
ax.spines["right"].set_linewidth(2)
ax.tick_params('both', length=10, width=2, which='major')
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
#ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_xlim(0,10)
ax.set_ylim(0,0.9)
ax.legend(title="X =", frameon=False)
fig.savefig('Figures/seq2seq_segsearch_DP'+str(df_prop['Avg DP'][b_no])+'.pdf')
plt.show()


In [None]:
#analyzes specific segment search data (batch2batch level)
#
# Prints summary statistics of the segment search and overlays, for every
# batch, a kernel-density curve of the segment lengths stored in total_array.

###### PARAMS TO VARY ######

# input parameters for batch-level analysis

PARAM_2_VARY = 'Batch Name' # name of whatever variable is you are trying to plot & analyze (i.e. legend labels)
LEGEND_TITLE = False # if you have numerical legend headers i.e. different DPs or NCs set this to True

############################


# NOTE: seg_len_list and seg_array_b are the per-batch accumulators left behind
# by the seq2seq search loop, so the statistics below describe only the LAST
# batch that loop processed.
Avg_NumSeg = np.mean(np.array(seg_len_list))
StDev_NumSeg = np.std(np.array(seg_len_list))
Avg_LenSeg = []
StDev_LenSeg = []

# mean / st. dev. of segment length per sequence; sequences without any segment are skipped
for seq_segments in seg_array_b:
    if len(seq_segments) > 0:
        Avg_LenSeg.append(np.mean(np.array(seq_segments)))
        StDev_LenSeg.append(np.std(np.array(seq_segments)))

print("Average Number of Hydrophobic segments with 1 Oegma per sequence in batch: " + str(Avg_NumSeg))
print("StDev of Number of Hydrophobic segments with 1 Oegma per sequence in batch: " + str(StDev_NumSeg))
#Uncomment following lines for length data per sequence
#print("Average lengths of Hydrophobic segments with 1 OEGMA per sequence in batch: " + str(Avg_LenSeg))
#print("StDev of lengths of Hydrophobic segments with 1 Oegma per sequence in batch: " + str(StDev_LenSeg))

sns.set_style(style='white') #style='white' or 'darkgrid'
# bind the new figure: the axes must belong to THIS figure, not to whatever
# `fig` an earlier cell left behind
fig = plt.figure(num=None, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')
ax = fig.add_subplot(111)
ax.set_xlabel("Hydrophobic with 1 OEGMA Segment Length")
ax.set_ylabel("Density")
ax.set_xlim(0,65)
#Comment following lines to remove autolabelling
for i in range(len(csv_files)):
    sns.distplot(total_array[i], label=str(df_prop[PARAM_2_VARY][i]), hist = False, kde_kws={'bw':1}, ax=ax)
    #sns.distplot(total_array[i], label=str(int(df_prop.iloc[i]['Avg DP'])),hist = False, kde_kws={'bw':1})
ax.spines["top"].set_linewidth(2)
ax.spines["bottom"].set_linewidth(2)
ax.spines["left"].set_linewidth(2)
ax.spines["right"].set_linewidth(2)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
#ax.yaxis.set_major_locator(MaxNLocator(integer=True))
## Uncomment to label manually
#sns.distplot(total_array[0], label="5 Monomers", hist=False, kde_kws={'bw':1})
# sns.distplot(total_array[1], label="4 Monomers",hist = False)
# sns.distplot(total_array[2], label="2 Monomers",hist = False)
# sns.distplot(total_array[3], label="3 Monomers",hist = False)
# plt.legend()
# the legend is built once, after all curves have been drawn
if LEGEND_TITLE:
    ax.legend(title=PARAM_2_VARY, frameon=False)
else:
    ax.legend(frameon=False)
fig.savefig('Figures/batchsum_segsearch_DP.pdf')
plt.show()

### Sliding Window Analysis
EXTRA VARIABLE: window_length (note: window_length >= FILT, otherwise you will encounter errors if you try to average a window longer than the shortest chain in a batch)
- mon2mon: sliding window on a single chain --> hydropathy plot
- seq2seq: box-whisker plot showing statistics along the positions of a chain
- batch2batch: seq2seq plot averaged over positions and compared across batches

In [None]:
### SLIDING WINDOW ANALYSIS OF POLYMERS
# Set window length: number of consecutive monomers averaged into one window
# value (passed to window() and seg_analysis() by the cells below)
WIN_LENGTH = 9

# for monomer-level plot: the single chain drawn as a hydropathy plot is
# dfs[batch_num].iloc[sequence_num]
batch_num = 5
sequence_num = 5

# replace every monomer label of a sequence by the HLB value of that monomer
def translate(chars, HLBs):
    """Map a sequence of monomer labels to their HLB values.

    chars: indexable sequence of monomer labels (anything int() accepts);
           labels are 1-indexed, so label n reads HLBs[n-1].
    HLBs:  list of HLB values ordered by monomer label.
    Returns a new list with one HLB value per position of `chars`.
    """
    return [HLBs[int(chars[pos]) - 1] for pos in range(len(chars))]

# slide a window of `size` consecutive items over an iterable, one step at a time
def window( iterable, size ):
    """Return every run of `size` consecutive items of `iterable` as a list of lists.

    The result holds len(iterable) - size + 1 windows, in order; it is empty
    when the iterable has fewer than `size` items.
    """
    width = int(size)
    staggered = tee(iterable, width)
    # copy number `offset` is advanced `offset` items, so zipping the copies
    # lines up item p, p+1, ..., p+width-1 side by side
    for offset, copy in enumerate(staggered):
        for _ in range(offset):
            next(copy, None)
    return [list(group) for group in zip(*staggered)]

# pair the average of every sliding window with a running position index
def seg_analysis(sliding_arrays,win_length):
    """Return {position: mean of window} for consecutive sliding windows.

    The first window is keyed int(win_length) // 2 + 1 and each following
    window by the next integer, in the order the windows are given.
    """
    first_pos = int(win_length) // 2 + 1
    return {
        pos: np.mean(win_values)
        for pos, win_values in enumerate(sliding_arrays, start=first_pos)
    }

#input a dictionary including positions and average value of a sliding arrays, output a plot
def win_plot(pos_val, win_length, xlim=None, figratio=None, inverse=False):
    """Draw, export and save the sliding-window HLB profile of one chain.

    pos_val:    dict {position: window average}, e.g. the output of seg_analysis().
    win_length: window length the averages were computed with; used in the
                names of both output files.
    xlim:       if given, the x-axis is limited to (0, xlim).
    figratio:   accepted for compatibility; not used by the body.
    inverse:    if True, the y-axis runs from 13 at the bottom down to 5 at the
                top instead of the default 5 to 14.5.

    Reads the notebook globals batch_num, HLB_CUTOFF and df_prop.
    Side effects: writes '<batch_num>_WIN<win_length>.csv' to the working
    directory, saves a PDF under Figures/ and shows the figure.

    Raises ValueError if pos_val is empty.
    """
    if not pos_val:
        # without this guard the zip-unpack below fails with an opaque message
        raise ValueError("pos_val is empty: there are no window averages to plot")
    # sorted by key, then unpacked into a tuple of positions and a tuple of averages
    x, y = zip(*sorted(pos_val.items()))
    # Create a Figure
    fig = plt.figure(figsize=(5.4*1.4, 4*1.4))
    # Set up Axes
    ax = fig.add_subplot(111)
    # export the plotted points; the file name follows the win_length argument
    np.savetxt(str(batch_num) + '_WIN' + str(win_length) + '.csv', np.transpose(np.vstack((x,y))), delimiter=',', header="x,y")
    ax.scatter(x, y)
    ax.plot(x, y)
    # dashed reference line at the HLB cutoff
    ax.axhline(y=HLB_CUTOFF, color='k', linestyle='--')
    if xlim is not None:
        ax.set_xlim(0, xlim)
    # the y-limits are set exactly once; applying the default range after the
    # inversion would flip the axis back
    if inverse:
        ax.set_ylim(13, 5)
    else:
        ax.set_ylim(5, 14.5)
    ax.set(xlabel="Central Monomer Sequence Position", ylabel="Window Average HLB Value")
    ax.spines["top"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["right"].set_linewidth(1.5)
    ax.tick_params('both', length=10, width=1.5, which='major')
    plt.tight_layout()
    #ax.set(title= "Hydropathy plot of " + str(pro_name) + " ,window length = " + str(win_length), xlabel="Center AA in AAs window", ylabel="Average Hydrophobicity/Window")
    #plt.savefig('plot.eps', format='eps')
    plt.savefig("Figures/slidingwindow_MMA_EHMA_" + str(df_prop.iloc[batch_num]['MR1']) + "_WinLength" + str(win_length) + "_DP300.pdf",transparent = True)
    plt.show()

# Hydropathy plot of the selected chain:
# 1) keep only the entries that are not None and turn them into integer labels
this_seq = list(map(int, filter(lambda mon: mon != None, dfs[batch_num].iloc[sequence_num])))
# 2) labels -> HLB values
bins = translate(this_seq, HLBs)
# 3) all windows of WIN_LENGTH consecutive HLB values
sliding_arrays = window(bins, WIN_LENGTH)
# 4) window averages keyed by position
pos_val = seg_analysis(sliding_arrays, WIN_LENGTH)
# 5) plot, with the x-axis limited to 0-100
win_plot(pos_val, WIN_LENGTH, xlim=100)

In [None]:
# Build, for every batch, a DataFrame of sliding-window HLB averages
# (one row per sequence, one column per window) using window length WIN_LENGTH.
# Results: dfs_hlb[d] is the DataFrame for batch d.

dfs_hlb = []
seq_lens_hlb = []
for d in tqdm_notebook(range(len(csv_files))):
    # one tuple of window averages per sequence of this batch
    mat = list()
    for seq in tqdm_notebook(seqs[d]):
        # labels -> HLB values -> windows -> {position: window average}
        bins = translate(seq, HLBs)
        sliding_arrays = window(bins, WIN_LENGTH)
        pos_val = seg_analysis(sliding_arrays, WIN_LENGTH)
        lists = list(sorted(pos_val.items()))
        # y is the tuple of averages in position order; this unpack raises
        # ValueError when pos_val is empty, i.e. when the sequence yields no
        # window (fewer than WIN_LENGTH monomers)
        x,y = zip(*lists)

        mat.append(y)
    # rows of different length are presumably padded with NaN by pandas -- TODO confirm
    dfs_hlb.append(pd.DataFrame(mat))
    # NOTE(review): row_size is not assigned anywhere in this cell; it must come
    # from an earlier cell, so every batch appends whatever value it last held.
    # Confirm what seq_lens_hlb is meant to record.
    seq_lens_hlb.append(row_size)

In [None]:
# For every batch: draw a box-whisker plot of the window-average HLB at each
# window position, and store a 5-number summary of the batch in
# batch2batch_arr as [min, 25th percentile, mean, 75th percentile, max],
# each averaged over all window positions.
batch2batch_arr = []
for batch_ind in tqdm_notebook(range(len(csv_files))):
    batch_hlb = dfs_hlb[batch_ind]

    rows_size = len(batch_hlb)
    print("rows size:", rows_size)
    cols_size = len(batch_hlb.columns)
    print("cols size:", cols_size)

    # Box Whisker Plot of Sliding Window Analysis Data: Setup (Data Reduction)

    pos_data = list()
    datacol_array = np.zeros(5)
    for i in range(cols_size):
        # keep strictly positive entries only; NaN > 0 is False, so missing
        # values (presumably padding for shorter chains) are dropped as well
        datacol = [y_ for y_ in batch_hlb[i] if y_ > 0]
        col_values = np.array(datacol)
        lower_q, upper_q = np.percentile(col_values, [25, 75])
        datacol_array[0] += np.amin(col_values)
        datacol_array[1] += lower_q
        datacol_array[2] += np.mean(col_values)
        datacol_array[3] += upper_q
        datacol_array[4] += np.amax(col_values)
        pos_data.append(datacol)

    # sums over positions -> averages over positions
    batch2batch_arr.append(datacol_array/cols_size)

    fig = plt.figure(figsize =(15, 7))
    ax = fig.add_axes([0, 0, 1, 1])
    ax.boxplot(pos_data, whis=(0,100))
    plt.xlabel("Sequence position")
    plt.ylabel("Window average HLB")
    plt.xticks([])
    # the axes fill the whole canvas, so the labels sit outside it;
    # bbox_inches='tight' keeps them from being cut off in the saved file
    plt.savefig('Figures/boxwhisker_slidingwindow_MMA_EHMA_'+str(df_prop.iloc[batch_ind]['MR1'])+'_'+str(df_prop.iloc[batch_ind]['MR3'])+'.pdf', bbox_inches='tight')
    plt.show()
    # release the figure so open figures do not pile up across batches
    plt.close(fig)
    

In [None]:
## Code for figure generation: Box Whisker Plot
# One box per batch, built from the 5-value summaries in batch2batch_arr;
# tick labels read "<MR1>:<MR3>" from the matching row of df_prop.

fig = plt.figure(figsize = (11, 6))
ax = fig.add_axes([0, 0, 1, 1])
ax.boxplot(batch2batch_arr, whis=(0,100))
for side in ("top", "bottom", "left", "right"):
    ax.spines[side].set_linewidth(1.5)
ax.tick_params('both', length=10, width=1.5, which='major')
plt.xlabel("MMA:EHMA Variance")
plt.ylabel("Window average HLB Value")
# tick positions are 1-based and position 1 is always present;
# the label at position p comes from row p-1 of df_prop
xtick_a = [1] + list(range(2, len(csv_files)+1))
xtick_b = [str(df_prop.iloc[pos-1]['MR1'])+':'+str(df_prop.iloc[pos-1]['MR3']) for pos in xtick_a]
plt.xticks(xtick_a, xtick_b)
plt.ylim(5,14.5)
plt.savefig('Figures/boxwhisker_slidingwindow_NChains_DP300.pdf', bbox_inches='tight', dpi=100)
plt.show()