## Dependencies

In [None]:
import pandas as pd
import numpy as np
import logomaker

## Functions

In [None]:
def _mean_normalized_counts(variants, population):
    # Each variant's share of the population's reads, averaged over replicates R1-R3.
    shares = [
        variants[f'{population}-R{rep}'] / variants[f'{population}-R{rep}'].sum()
        for rep in (1, 2, 3)
    ]
    return ((shares[0] + shares[1] + shares[2]) / 3).to_numpy()


# function to construct probability matrix, and subsequently calculate information content
def construct_PWM(df, mut_type, alphabet=None, n_positions=None):
    """Build a position-by-residue enrichment matrix for one variant category.

    Variants whose ``New_ID`` contains ``mut_type`` are selected. For each of
    the ``mCherry`` and ``GFPneg`` populations, the replicate count columns
    (``<population>-R1`` .. ``-R3``) are normalized to sum to 1 within the
    selection and averaged per variant. That average is accumulated into a
    per-position, per-residue frequency table. The returned value for each
    cell is ``p * log2(p / q)`` with ``p`` the GFPneg frequency and ``q`` the
    mCherry frequency; negative values are clipped to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``New_ID``, ``Amino_acid_Sequence`` and the six count
        columns ``mCherry-R1..R3`` and ``GFPneg-R1..R3``.
    mut_type : str
        Category label, matched as a plain substring of ``New_ID``.
    alphabet : sequence of str, optional
        Residue order of the output columns. Defaults to the module-level
        ``aa_list``.
    n_positions : int, optional
        Number of rows (sequence positions). Defaults to the length of the
        longest selected sequence.

    Returns
    -------
    pandas.DataFrame
        ``n_positions`` rows, one column per residue of the alphabet. A cell
        is ``inf`` when a residue is seen in GFPneg but never in mCherry.

    Raises
    ------
    ValueError
        If no variant matches ``mut_type`` or a sequence contains a residue
        that is not in the alphabet.
    IndexError
        If an explicit ``n_positions`` is shorter than a selected sequence.
    """
    alphabet = list(aa_list if alphabet is None else alphabet)
    column_of = {aa: j for j, aa in enumerate(alphabet)}

    # get all variants of a specific category (plain substring match, no regex)
    selected = df[df['New_ID'].str.contains(mut_type, regex=False, na=False)]
    if selected.empty:
        raise ValueError(f"no variants with {mut_type!r} in 'New_ID'")

    sequences = selected['Amino_acid_Sequence'].tolist()
    if n_positions is None:
        n_positions = max(len(seq) for seq in sequences)

    weights_mCherry = _mean_normalized_counts(selected, 'mCherry')
    weights_GFPneg = _mean_normalized_counts(selected, 'GFPneg')

    # accumulate the average frequency of every residue at every position
    freq_arr_mCherry = np.zeros((n_positions, len(alphabet)))
    freq_arr_GFPneg = np.zeros((n_positions, len(alphabet)))
    for seq, w_mCherry, w_GFPneg in zip(sequences, weights_mCherry, weights_GFPneg):
        for pos, aa in enumerate(seq):
            if aa not in column_of:
                raise ValueError(
                    f"residue {aa!r} in sequence {seq!r} is not in the alphabet"
                )
            freq_arr_mCherry[pos, column_of[aa]] += w_mCherry
            freq_arr_GFPneg[pos, column_of[aa]] += w_GFPneg

    # calculate information content; the zero-frequency cases are handled
    # below, so the divide/invalid warnings they trigger are silenced here
    with np.errstate(divide='ignore', invalid='ignore'):
        information = freq_arr_GFPneg * np.log2(freq_arr_GFPneg / freq_arr_mCherry)
    # p * log2(p / q) -> 0 as p -> 0, so residues absent from GFPneg contribute
    # 0 instead of the NaN that 0 * log2(0) evaluates to
    PWM = np.where(freq_arr_GFPneg == 0, 0.0, information)

    PWM[PWM < 0] = 0

    bits = pd.DataFrame(data=PWM, columns=alphabet)
    return bits

# function to plot PWM using logomaker
def plot_PWM(bits, output_dir):
    """Draw a sequence logo of ``bits`` and save it to ``output_dir``.

    Parameters
    ----------
    bits : pandas.DataFrame
        Matrix with one row per sequence position and one column per residue,
        as returned by ``construct_PWM``.
    output_dir : str
        Path (directory and filename) the figure is written to.

    The figure is 10 cm x 7 cm, the y axis is fixed to [0, 0.9] and labelled
    "Bits", and only the left and bottom spines are shown.
    """
    cm = 1 / 2.54  # centimetres to inches

    # No axes are passed in, so logomaker creates the figure itself at the
    # requested size; this avoids depending on a separately imported pyplot.
    logo = logomaker.Logo(bits,
                          figsize=(10 * cm, 7 * cm),
                          vpad=.1,
                          color_scheme='chemistry',
                          font_name='Arial')
    fig = logo.ax.figure

    # style using Logo methods
    logo.style_spines(visible=False)
    logo.style_spines(spines=['left', 'bottom'], visible=True)
    logo.ax.set_ylim([0, 0.9])

    # style using Axes methods
    logo.ax.xaxis.set_ticks_position('none')
    logo.ax.set_ylabel("Bits")
    fig.tight_layout()
    fig.savefig(output_dir, bbox_inches='tight')

## Example lines to run code

In [None]:
# amino-acid alphabet; its order is the column order of the matrix built by construct_PWM
aa_list = ['E','D','R','H','K','F','Y','W','S','Q','T','N','C','P','A','G','M','V','I','L']
# TODO: set to the path of the tab-separated df containing variants and counts
# information processed by analyze_DMS_data.ipynb
df_dir = 'PATH/TO/variants_and_counts.tsv'
df = pd.read_csv(df_dir, sep='\t')
df['Seq_len'] = df['Amino_acid_Sequence'].apply(len)  # calculate sequence length of variant

In [None]:
# TODO: choose the variant category, e.g. 'MutDoub' or 'InsSing' (i.e. single insertion) etc
mut_type = 'MutDoub'
bits = construct_PWM(df, mut_type)

# TODO: set to the dir and filename to output the logo plot to
output_dir = 'PATH/TO/logo.pdf'
plot_PWM(bits, output_dir)