# Import Data and Metadata

In [1]:
# import modules
import pandas as pd

In [2]:
# import data
# Load the measurement table. The displayed sample shows a 'Sample Name'
# column (lipid name) followed by one numeric column per experiment/replicate.
df = pd.read_csv('dataframes/pos_df.csv')
# Show three random rows as a sanity check; no random_state is set, so the
# rows shown differ from run to run.
df.sample(3)

Unnamed: 0,Sample Name,CAS9-A,CAS9-A.1,CAS9-B,CAS9-B.1,CAV_A,CAV_A.1,CAV_B,CAV_B.1,CAVIN_A,...,SPTLC_B,SPTLC_B.1,UGCG-A,UGCG-A.1,UGCG-B,UGCG-B.1,WT_A,WT_A.1,WT_B,WT_B.1
2591,PS O-38:7 (FA 22:6),471.5542,491.1332,495.6704,530.5287,867.7554,815.7106,794.2523,703.3807,0.0,...,388.2414,459.6613,845.1921,848.9416,649.6467,683.4475,381.2033,514.3158,219.3906,559.8027
6129,TAG 56:4+NH4 (-FA 19:2 (NH4)),0.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
460,"GD3 28:2;3 (LCB 18:1;2-H2O,LCB 18:0;3-2H2O)",26.2056,0.0,18.8291,0.0,0.0,32.8027,27.0,0.0,25.2184,...,0.0,0.0,0.0,24.0736,0.0,0.0,0.0,0.0,0.0,13.9822


In [3]:
# import lipid metadata
# The displayed sample shows one row per lipid ('Sample Name') with its
# 'Head Group', 'Acyl Chain Length' and 'Unsaturation'.
df_meta = pd.read_csv('dataframes/pos_lipids_df_meta.csv')
# Show three random rows as a sanity check (not seeded, so output varies).
df_meta.sample(3)

Unnamed: 0,Sample Name,Head Group,Acyl Chain Length,Unsaturation
4182,"GM3 30:4;2 (LCB 18:1;2-2H2O,LCB 18:0;3-3H2O)",GM3,30,4
454,"GD2 31:4;4 (LCB 18:1;2-H2O,LCB 18:0;3-2H2O)",GD2,31,4
2543,PC O-38:2 (PC),PC O,38,2


In [4]:
# import experiment metadata
# The displayed sample shows that each experiment column name ('Exp') is
# mapped to the 'Mutation' group it belongs to.
df_exps = pd.read_csv('dataframes/pos_df_exps.csv')
# Show three random rows as a sanity check (not seeded, so output varies).
df_exps.sample(3)

Unnamed: 0,Exp,Mutation
28,RAJU-Blank,RAJU
22,CERS5-B,CERS5
24,Flot2-A,Flot2


## Get Chain Lengths

We want to find the total amount of each acyl chain length in each experiment.

In [7]:
# Attach each lipid's acyl chain length (taken from the metadata) to its
# measurements, then index the result by lipid name.
chain_length_meta = df_meta[['Sample Name', 'Acyl Chain Length']]
dfa = chain_length_meta.merge(df, on='Sample Name')
dfa = dfa.set_index('Sample Name')

# Sum the measurements of all lipids that share a chain length, giving one
# row per chain length and one column per experiment.
lengths = dfa.groupby('Acyl Chain Length').sum()
lengths.head()

Unnamed: 0_level_0,CAS9-A,CAS9-A.1,CAS9-B,CAS9-B.1,CAV_A,CAV_A.1,CAV_B,CAV_B.1,CAVIN_A,CAVIN_A.1,...,SPTLC_B,SPTLC_B.1,UGCG-A,UGCG-A.1,UGCG-B,UGCG-B.1,WT_A,WT_A.1,WT_B,WT_B.1
Acyl Chain Length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.0,0.0,0.0,0.0,21.6224,0.0,234.634,276.7446,0.0,22.0,...,40.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0
12,35.0,35.0,56.9083,33.0,32.0,44.6469,96.8573,106.0,0.0,30.0,...,22.0,0.0,0.0,30.0,49.0,57.0,0.0,0.0,37.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.402,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,101.0,54.0,56.0,28.0,3514.0802,3209.3048,2387.564,2308.1189,42.0,73.0,...,2501.8477,2379.0624,38.0,69.0,23.0,83.0,0.0,0.0,0.0,0.0
18,3151.6889,3196.6663,3517.1031,3100.7914,9804.0135,6799.5276,4619.7055,4772.9836,3882.7404,3969.9773,...,2950.9891,3296.1865,3545.4834,3870.7,3903.5885,3691.8628,5067.7189,4964.1132,3803.6267,3579.8054


In [8]:
# Express every chain length as a fraction of its sample: each column is
# divided by that column's own total.
column_totals = lengths.sum()
lengths = lengths.div(column_totals, axis='columns')
lengths.head()

Unnamed: 0_level_0,CAS9-A,CAS9-A.1,CAS9-B,CAS9-B.1,CAV_A,CAV_A.1,CAV_B,CAV_B.1,CAVIN_A,CAVIN_A.1,...,SPTLC_B,SPTLC_B.1,UGCG-A,UGCG-A.1,UGCG-B,UGCG-B.1,WT_A,WT_A.1,WT_B,WT_B.1
Acyl Chain Length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.0,0.0,0.0,0.0,6e-06,0.0,7.4e-05,8.6e-05,0.0,5e-06,...,1.3e-05,1.5e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5e-06
12,8e-06,8e-06,1.1e-05,7e-06,9e-06,1.2e-05,3e-05,3.3e-05,0.0,7e-06,...,7e-06,0.0,0.0,9e-06,1.5e-05,1.7e-05,0.0,0.0,9e-06,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5e-06,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,2.2e-05,1.2e-05,1.1e-05,6e-06,0.000968,0.000893,0.000748,0.00072,1e-05,1.8e-05,...,0.000791,0.000692,1.1e-05,2e-05,7e-06,2.5e-05,0.0,0.0,0.0,0.0
18,0.000679,0.000705,0.000695,0.000638,0.0027,0.001892,0.001448,0.001489,0.000961,0.00099,...,0.000933,0.000959,0.001044,0.001137,0.001193,0.001091,0.001064,0.00103,0.000881,0.000812


In [13]:
# Mean chain-length fraction per mutation.
# 1) Put experiments on the rows and label each with its mutation.
exp_fractions = lengths.T.reset_index().rename(columns={'index': 'Exp'})
exp_fractions = exp_fractions.merge(df_exps, on='Exp').set_index('Exp')

# 2) Average the experiments of each mutation, then flip back so that chain
#    lengths are rows and mutations are columns.
mean_by_mutation = exp_fractions.groupby('Mutation').mean()
ptns = mean_by_mutation.T.reset_index().rename(columns={'index': 'Acyl Chain Length'})

# 3) The RAJU group is excluded from the analysis.
ptns = ptns.drop(columns=['RAJU'])
ptns.head()

Mutation,Acyl Chain Length,CAS9,CAV,CAVIN,CERS2-1g,CERS2_mg,CERS5,Flot2,SPTLC,UGCG,WT
0,10,0.0,4.1e-05,1e-06,0.0,4.841222e-07,0.0,0.0,2.7e-05,0.0,1e-06
1,12,8e-06,2.1e-05,4e-06,7e-06,0.0,0.0,1e-06,4e-06,1e-05,2e-06
2,14,0.0,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,1.3e-05,0.000832,1.8e-05,8e-06,0.0001889634,0.000364,1.3e-05,0.000796,1.6e-05,0.0
4,18,0.000679,0.001882,0.001136,0.001275,0.0009628772,0.001406,0.000392,0.000997,0.001116,0.000947


In [14]:
# Standard deviation of the chain-length fractions per mutation.
# 1) Put experiments on the rows and label each with its mutation.
exp_fractions = lengths.T.reset_index().rename(columns={'index': 'Exp'})
exp_fractions = exp_fractions.merge(df_exps, on='Exp').set_index('Exp')

# 2) Spread across the experiments of each mutation, then flip back so that
#    chain lengths are rows and mutations are columns.
std_by_mutation = exp_fractions.groupby('Mutation').std()
std = std_by_mutation.T.reset_index().rename(columns={'index': 'Acyl Chain Length'})

# 3) The RAJU group is excluded from the analysis.
std = std.drop(columns=['RAJU'])
std.head()

Mutation,Acyl Chain Length,CAS9,CAV,CAVIN,CERS2-1g,CERS2_mg,CERS5,Flot2,SPTLC,UGCG,WT
0,10,0.0,4.5e-05,3e-06,0.0,9.682444e-07,0.0,0.0,1.6e-05,0.0,3e-06
1,12,2e-06,1.2e-05,3e-06,8e-06,0.0,0.0,3e-06,5e-06,8e-06,4e-06
2,14,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16,7e-06,0.000118,6e-06,6e-06,0.0003559915,0.000413,9e-06,7.5e-05,8e-06,0.0
4,18,3e-05,0.000581,0.000186,6.2e-05,7.282461e-05,0.00013,0.000423,7.5e-05,6.4e-05,0.00012


# Test graphs for CAS9
We want to graph the distribution of chain lengths for each experiment.

In [15]:
# import modules
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

In [16]:
# # try with matplotlib
# plt.figure(figsize=(8,8))
# plt.bar(
#     x=lengths.index,
#     height=lengths.iloc[:,0]
# )

# plt.title('Chain Length Distribution for PosMSMALL-CAS9-A')
# plt.xlabel('Chain Lengths')
# plt.ylabel('Count')
# plt.show()

In [17]:
# Test charts for three individual samples (CAS9-A, CAV_A, CAVIN_A).
# move 'Acyl Chain Length' from index into column so altair can access
l = lengths.reset_index()

a = alt.Chart(l).mark_bar(color='#008000', opacity=0.5).encode(
    x='Acyl Chain Length:Q',
    y=alt.Y('CAS9-A:Q').title('Concentration')
).properties(
    title='CAS9-A'
)

b = alt.Chart(l).mark_bar(opacity=0.5).encode(
    x='Acyl Chain Length:Q',
    y=alt.Y('CAV_A:Q').title('Concentration'),
).properties(
    title='CAV_A'
)

# Declared with an explicit quantitative type and the same axis title as the
# two charts above so the three panels read consistently side by side.
c = alt.Chart(l).mark_bar(opacity=0.5).encode(
    x='Acyl Chain Length:Q',
    y=alt.Y('CAVIN_A:Q').title('Concentration')
).properties(
    title='CAVIN_A'
)

# a | b | c

In [18]:
# get chain length distributions for CAS9
# The first five columns of `l` are 'Acyl Chain Length' plus the first four
# sample columns; melt turns them into long form with one row per
# (chain length, sample) pair, the sample name stored in 'exps'.
l_4 = l.iloc[:,:5].melt('Acyl Chain Length', var_name='exps')

In [19]:
# alt.Chart(l_4).mark_bar().encode(
#     x='Acyl Chain Length:O',
#     xOffset='exps',
#     y='value:Q',
#     color='exps:N',
#     tooltip=['Acyl Chain Length', 'exps', 'value']
# ).properties(
#     width=600,
#     height=400,
#     title="Chain Length Distribution for CAS9"
# )

# Get chain length distributions for whole dataset

In [None]:
# Long-form copy of the per-protein means: one row per
# (chain length, protein) pair, with the mean fraction in 'value'.
ptns_long = ptns.melt(id_vars='Acyl Chain Length', var_name='protein')

In [None]:
# Clicking a bar selects its protein; bars of every other protein turn grey.
click = alt.selection_point(fields=['protein'])
highlight_selected = alt.condition(click, alt.Color('protein:N'), alt.value('lightgray'))

# Grouped bar chart: one group per chain length, one bar per protein.
grouped = (
    alt.Chart(ptns_long)
    .mark_bar()
    .encode(
        x='Acyl Chain Length:O',
        xOffset='protein',
        y=alt.Y('value:Q').title('Fraction (mean)'),
        color=highlight_selected,
        tooltip=['Acyl Chain Length', 'protein', 'value'],
    )
    .properties(width=3000, title="Average Chain Length Distributions")
    .add_params(click)
)

# grouped

In [None]:
# Overlay the mean chain-length distribution of every protein in one chart.
# Each protein gets a semi-transparent bar layer in its own colour; the
# colour map replaces ten copy-pasted chart definitions and fixes the order
# in which the layers are stacked.
protein_colors = {
    'CAS9': '#440154',
    'CAV': '#482173',
    'CAVIN': '#433e85',
    'CERS2-1g': '#38588c',
    'CERS2_mg': '#2d708e',
    'CERS5': '#25858e',
    'Flot2': '#1e9b8a',
    'SPTLC': '#52c569',
    'UGCG': '#86d549',
    'WT': '#c2df23',
}

protein_layers = [
    alt.Chart(ptns).mark_bar(color=bar_color, opacity=0.5).encode(
        x='Acyl Chain Length:Q',
        y=alt.Y(f'{protein_name}:Q').title('Fraction (mean)'),
        tooltip=['Acyl Chain Length', protein_name]
    ).properties(
        title=protein_name
    )
    for protein_name, bar_color in protein_colors.items()
]

# Stack the layers with `+` in dictionary order, as the original chain of
# additions did.
layered = protein_layers[0]
for protein_layer in protein_layers[1:]:
    layered = layered + protein_layer
# layered

In [None]:
# One bar chart per protein, all on the same y-scale so panels are directly
# comparable, arranged in a grid of three per row.
FRACTION_DOMAIN = [0, .26]  # shared y-axis range for every panel
panel_proteins = [
    'CAS9', 'CAV', 'CAVIN',
    'CERS2-1g', 'CERS2_mg', 'CERS5',
    'Flot2', 'SPTLC', 'UGCG',
    'WT',
]

panels = [
    alt.Chart(ptns).mark_bar().encode(
        x='Acyl Chain Length:Q',
        y=alt.Y(f'{protein_name}:Q').title('Fraction (mean)').scale(domain=FRACTION_DOMAIN),
        tooltip=['Acyl Chain Length', protein_name]
    ).properties(
        title=protein_name
    )
    for protein_name in panel_proteins
]

# `|` places charts side by side, `&` stacks the rows; the tenth protein sits
# alone on the last row.
chain_lengths = (
    (panels[0] | panels[1] | panels[2])
    & (panels[3] | panels[4] | panels[5])
    & (panels[6] | panels[7] | panels[8])
    & panels[9]
)
# chain_lengths

# Include error bars and density plot with Graphs

In [20]:
# Long-form table of per-experiment fractions: one row per
# (experiment, chain length) pair labelled with the experiment's mutation.
# The individual experiments are kept (not averaged) so that altair can
# compute both the mean and the standard deviation itself.
ptnstd = lengths.T.reset_index().rename(columns={'index': 'Exp'}).merge(df_exps, on='Exp').drop(columns=['Exp'])
ptnstd = ptnstd[ptnstd['Mutation'] != 'RAJU']  # RAJU is excluded from the analysis
ptnstd = ptnstd.melt('Mutation', var_name='Acyl Chain Length', value_name='Fraction')

# error bars: standard deviation across the experiments of each mutation
err = alt.Chart(ptnstd).mark_errorbar(extent='stdev').encode(
    x='Acyl Chain Length:O',
    xOffset='Mutation:N',
    y='Fraction:Q'
)

# bars: mean fraction per mutation, grouped by chain length
bar = alt.Chart(ptnstd).mark_bar().encode(
    x='Acyl Chain Length:O',
    xOffset='Mutation:N',
    y='mean(Fraction):Q',
    color='Mutation:N'
).properties(
    width=3000
)

grouped_err = bar + err

# Altair's PNG export takes its resolution through `ppi` (as the other save
# calls in this notebook do); `dpi` is not a keyword it recognises.
grouped_err.save('graphs/pos_mode_whole_dataset/chain_length_graphs/updated_ChainLengthByProtein_GroupedBarGraph_withPS.png', ppi=350)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [None]:
# For every mutation: build an Altair bar chart (mean fraction with std-dev
# error bars) and overlay a density curve. The curve is obtained by drawing a
# weighted seaborn KDE on a matplotlib axes and reading the line data back.
import matplotlib.pyplot as plt
chart_list = []
# index of the KDE line drawn in the current iteration
# (assumes every kdeplot call adds exactly one line to the same axes -- TODO confirm)
i = 0
for protein in ptnstd['Mutation'].unique():
    source = ptnstd[ptnstd['Mutation'] == protein]
    
    # create error bars
    err = alt.Chart(source).mark_errorbar(extent='stdev').encode(
        x='Acyl Chain Length:Q',
        y='Fraction:Q',
    )
    
    # create histogram
    bar = alt.Chart(source).mark_bar().encode(
        x='Acyl Chain Length:Q',
        y=alt.Y('mean(Fraction):Q').scale(domain=[0,.26]),
    ).properties(title=protein)
    
    # create density plot
    # chain lengths weighted by this protein's mean fraction
    x = ptns['Acyl Chain Length']
    y = ptns[protein]

    plt.xlim(10,70)
    sns.set(rc={"figure.facecolor": "white"})
    a = sns.kdeplot(x=x, weights=y, label=protein)
    plt.legend()
    
    # read the curve just drawn back out of matplotlib; note that this
    # rebinds x and y from the input data to the KDE curve coordinates
    x, y = a.lines[i].get_data()
    # print(len(a.lines))
    s = pd.DataFrame({'x':x, 'density':y})
    
    dens = alt.Chart(s).mark_line(color='red').encode(
        x=alt.X('x').scale(domain=[0,60], clamp=True).title('Acyl Chain Length'),
        y=alt.Y('density').title('density').scale(domain=[0,.1]),
        tooltip=['x','density']
    )
    
    # append to chart list
    # bars and density use different units, so each gets its own y-axis
    chart = alt.layer(bar+err, dens).resolve_scale(y='independent')
    
    chart_list.append(chart)
    
    i +=1
    

# grid of three charts per row; indexing chart_list[9] requires exactly ten mutations
side_by_side_err = alt.hconcat(*chart_list[:3]) & alt.hconcat(*chart_list[3:6]) & alt.hconcat(*chart_list[6:9]) & chart_list[9]

# side_by_side_err2 = alt.hconcat(*chart_list[:4]) & alt.hconcat(*chart_list[4:8]) & alt.hconcat(*chart_list[8:])

# side_by_side_err2
# NOTE: this saves the current matplotlib figure (the seaborn KDE curves
# accumulated by the loop), not the Altair chart `side_by_side_err`.
plt.savefig('graphs/pos_mode_whole_dataset/chain_length_graphs/updated_ChainLengthByProtein_LayeredDensityPlots_WithPS.png', dpi=350)

# Fit Gaussian

In [34]:
from __future__ import print_function 
import numpy as np 
import matplotlib.pyplot as plt 
from scipy.optimize import curve_fit 

In [None]:
# # data
# x = ptns[(ptns['Acyl Chain Length'] % 2 == 0)]['Acyl Chain Length']
# y = ptns['CAS9'].loc[x.index]

# n = len(x)                          #the number of data
# mean = sum(x*y)/n                   #note this correction
# sigma = sum(y*(x-mean)**2)/n        #note this correction

# def gaus(x,a,x0,sigma):
#     return a*np.exp(-(x-x0)**2/(2*sigma**2))

# popt,pcov = curve_fit(gaus,x,y,p0=[1,mean,sigma])

# plt.plot(x,y,'b+:',label='data')
# plt.plot(x,gaus(x,*popt),'ro:',label='fit')
# plt.legend()
# plt.show()
# print(popt)

In [None]:
# Weighted summary statistics of y over x.
# NOTE(review): x and y are not defined in this cell; they are whatever an
# earlier cell left behind (the only live definitions above are in the
# density-plot loop) -- confirm this is the intended data before relying on
# the printed numbers.
n = sum(y)                         # total weight
mean = sum(x*y)/n                 # weighted mean of x
sigma = sum(y*(x-mean)**2)/n      # weighted variance of x (no square root is taken)

print(n, mean, sigma)

In [None]:
chart_list = []


def gaus(x, a, x0, sigma):
    """Gaussian curve with amplitude `a`, centre `x0` and standard deviation `sigma`."""
    return a*np.exp(-(x-x0)**2/(2*sigma**2))


# For every mutation: bar chart of the mean fraction per chain length with
# std-dev error bars, plus a single Gaussian fitted to the even chain lengths.
for protein in ptnstd['Mutation'].unique():
    source = ptnstd[ptnstd['Mutation'] == protein]

    # create error bars
    err = alt.Chart(source).mark_errorbar(extent='stdev').encode(
        x='Acyl Chain Length:Q',
        y='Fraction:Q',
    )

    # create histogram
    bar = alt.Chart(source).mark_bar().encode(
        x='Acyl Chain Length:Q',
        y=alt.Y('mean(Fraction):Q').scale(domain=[0,.26]),
    ).properties(title=protein)

    # create gaussian: fit on the even chain lengths only
    x = ptns[(ptns['Acyl Chain Length'] % 2 == 0)]['Acyl Chain Length']
    y = ptns[protein].loc[x.index]

    # expected vals used as the starting point of the fit
    n = sum(y)
    mean = sum(x*y)/n
    # `gaus` takes a standard deviation, so use the square root of the
    # weighted variance as the initial guess
    sigma = np.sqrt(sum(y*(x-mean)**2)/n)

    popt,pcov = curve_fit(gaus,x,y,p0=[1,mean,sigma])

    # evaluate the fitted curve on a dense grid for a smooth line
    xx = np.linspace(10, 60, 500)

    s = pd.DataFrame({'Acyl Chain Length':xx, 'y':gaus(xx, *popt)})

    gauss = alt.Chart(s).mark_line(color='red').encode(
        x='Acyl Chain Length:Q',
        y='y:Q'
    )

    # annotate the panel with the fitted parameters
    text = alt.Chart({'values':[{}]}).mark_text(
        align="left", baseline="top"
    ).encode(
        x=alt.value(5),  # pixels from left
        y=alt.value(5),  # pixels from top
        text=alt.value([f"a: {popt[0]:.3f}", f"mean: {popt[1]:.3f}", f"sigma: {popt[2]:.3f}"])
    )

    # append to chart list
    chart = bar + err + gauss + text

    chart_list.append(chart)


# grid of three charts per row; indexing chart_list[9] requires exactly ten mutations
side_by_side_err = alt.hconcat(*chart_list[:3]) & alt.hconcat(*chart_list[3:6]) & alt.hconcat(*chart_list[6:9]) & chart_list[9]

side_by_side_err

# side_by_side_err2 = alt.hconcat(*chart_list[:4]) & alt.hconcat(*chart_list[4:8]) & alt.hconcat(*chart_list[8:])

# side_by_side_err2

# Odds Only

In [None]:
# Keep only the rows whose acyl chain length is an odd number.
is_odd_length = ptnstd['Acyl Chain Length'] % 2 != 0
odds_long = ptnstd.loc[is_odd_length]
odds_long.head()

In [None]:
# One panel per mutation showing only the odd chain lengths: bars are the
# mean fraction, error bars the standard deviation across experiments.
chart_list = []
for protein in odds_long['Mutation'].unique():
    source = odds_long[odds_long['Mutation'] == protein]
    panel_base = alt.Chart(source)

    # histogram of mean fractions
    bar = panel_base.mark_bar().encode(
        x='Acyl Chain Length:N',
        y=alt.Y('mean(Fraction):Q').scale(domain=[0,.04]),
    ).properties(title=protein)

    # std-dev error bars on top of the bars
    err = panel_base.mark_errorbar(extent='stdev').encode(
        x='Acyl Chain Length:N',
        y='Fraction:Q',
    )

    chart = bar + err
    chart_list.append(chart)

# odds_distribution = alt.hconcat(*chart_list[:3]) & alt.hconcat(*chart_list[3:6]) & alt.hconcat(*chart_list[6:9]) & chart_list[9]
# odds_distribution

# lay the panels out four per row and write the figure to disk
top_row = alt.hconcat(*chart_list[:4])
middle_row = alt.hconcat(*chart_list[4:8])
bottom_row = alt.hconcat(*chart_list[8:])
odds_distribution2 = top_row & middle_row & bottom_row
odds_distribution2.save('graphs/pos_mode_whole_dataset/chain_length_graphs/updated_OddChainLengths_Histogram_WithPS.png', ppi=300)

In [None]:
# chart_list[0].save('graphs/presentation_figures/cas9_odds.png', ppi=300)
# chart_list[-1].save('graphs/presentation_figures/wt_odds.png', ppi=300)
# chart_list[2].save('graphs/presentation_figures/cavin_odds.png', ppi=300)
# chart_list[7].save('graphs/presentation_figures/sptlc_odds.png', ppi=300)

In [None]:
# Build a table with the odd and even chain-length fraction of every sample
# (`lengths` is already normalised down each column).

# chain length as an ordinary column
l_o = lengths.reset_index().rename(columns={'index':'Acyl Chain Length'})

# split the rows by parity of the chain length
has_odd_length = l_o['Acyl Chain Length'] % 2 != 0
odds = l_o[has_odd_length]
evens = l_o[~has_odd_length]

# column sums give the odd / even share per sample; the first entry of each
# sum is the 'Acyl Chain Length' column itself, so it is sliced off
odd_share = odds.sum()[1:]
even_share = evens.sum()[1:]
df_odds = pd.concat([odd_share, even_share], axis=1, keys=['odd', 'even'])

# label every sample with its mutation, then drop the sample name
df_odds = df_odds.reset_index().rename(columns={'index':'Exp'})
df_odds = df_exps.merge(df_odds, on='Exp').drop(columns=['Exp'])

# RAJU is excluded from the analysis
not_raju = df_odds['Mutation'] != 'RAJU'
df_odds = df_odds[not_raju]

df_odds.to_csv('dataframes/pos_df_odds.csv', index=False)

In [None]:
# Long form of df_odds: one row per (mutation, odd/even) pair, with the
# column name in 'variable' and the fraction in 'value'.
df_odds_long = df_odds.melt(id_vars='Mutation')
df_odds_long.head()

In [None]:
# create donut chart
# One arc chart per mutation showing the mean odd vs even share.
chart_list = []

for ptn in df_odds_long['Mutation'].unique():
    source = df_odds_long[df_odds_long['Mutation'] == ptn]

    # arc size is the mean of 'value' over the mutation's rows,
    # coloured by odd/even
    donut = alt.Chart(source).mark_arc().encode(
        theta='mean(value):Q',
        color='variable:N'
    ).properties(title={'text': [], 'subtitle':ptn})

    chart_list.append(donut)

# grid of three charts per row; indexing chart_list[9] requires exactly ten mutations
o_donut = alt.hconcat(*chart_list[:3]) & alt.hconcat(*chart_list[3:6]) & alt.hconcat(*chart_list[6:9]) & chart_list[9]
o_donut = o_donut.properties(title={'text':['Percentage of Odd vs Even Chain Lengths']})
# o_donut

In [None]:
# Odd-chain fraction per mutation: bars show the mean over each mutation's
# rows in df_odds, error bars the standard deviation.
odd_base = alt.Chart(df_odds)

# bar graph of the mean odd fraction
b = odd_base.mark_bar().encode(
    x='Mutation:N',
    y='mean(odd):Q',
).properties(title='Fraction of Odd Chain Lengths')

# error bars
err = odd_base.mark_errorbar(extent='stdev').encode(
    x='Mutation:N',
    y=alt.Y('odd:Q').title('Fraction'),
)

# white background so the figure is readable when exported
(b + err).configure(background='white')
# .save('graphs/pos_mode_whole_dataset/chain_length_graphs/updated_OddChainLengths_Bar_WithPS.png', ppi=300)

# Chain Lengths over 50

In [None]:
# Join the lipid metadata (head group, chain length, unsaturation) onto the
# measurements, matching rows by lipid name.
c = df_meta.merge(df, on="Sample Name")
c.head()

In [None]:
# Keep the lipids whose acyl chain length is 50 or more (the bound is inclusive).
d = c[c['Acyl Chain Length'] >= 50]
d.head()

In [None]:
# Relative frequency of each head group among the lipids in `d`
# (chain length >= 50). This counts lipid rows; measured intensities are not
# taken into account.
d['Head Group'].value_counts(normalize=True)

# Fit Bimodal Gaussian

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
# Starting values for a two-peak Gaussian fit, worked out for one protein.
protein = 'CAV'
print(protein)

# Even chain lengths only. Defined here explicitly so the cell does not
# depend on an `x` left behind by an earlier loop.
x = ptns[(ptns['Acyl Chain Length'] % 2 == 0)]['Acyl Chain Length']

# split the distribution into a lower (< 50) and an upper (>= 50) peak
x1 = x[x < 50]
y1 = ptns[protein].loc[x1.index]
x2 = x[x >= 50]
y2 = ptns[protein].loc[x2.index]

# weighted mean, standard deviation and peak height of the lower peak
n1 = sum(y1)
mean1 = sum(x1*y1)/n1
sigma1 = np.sqrt(sum(y1*(x1-mean1)**2)/n1)  # square root of the weighted variance
a1 = max(y1)

# same for the upper peak
n2 = sum(y2)
mean2 = sum(x2*y2)/n2
sigma2 = np.sqrt(sum(y2*(x2-mean2)**2)/n2)  # square root of the weighted variance
a2=max(y2)

print(mean1, sigma1, n1, a1)
print(mean2, sigma2, n2, a2)

In [39]:
chart_list = []


def gauss(x,mu,sigma,A):
    """Gaussian curve with centre `mu`, standard deviation `sigma` and amplitude `A`."""
    return A*np.exp(-(x-mu)**2/(2*sigma**2))


def bimodal(x,mu1,sigma1,A1,mu2,sigma2,A2):
    """Sum of two Gaussian curves, used to model a two-peak distribution."""
    return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)


# For every mutation: bar chart of the mean fraction per chain length with
# std-dev error bars, plus a two-peak Gaussian fitted to the even chain lengths.
for protein in ptnstd['Mutation'].unique():
    source = ptnstd[ptnstd['Mutation'] == protein]

    # create error bars
    err = alt.Chart(source).mark_errorbar(extent='stdev').encode(
        x='Acyl Chain Length:Q',
        y='Fraction:Q',
    )

    # create histogram
    bar = alt.Chart(source).mark_bar().encode(
        x='Acyl Chain Length:Q',
        y=alt.Y('mean(Fraction):Q').scale(domain=[0,.26]),
    ).properties(title=protein)

    # create gaussian: fit on the even chain lengths only
    x = ptns[(ptns['Acyl Chain Length'] % 2 == 0)]['Acyl Chain Length']
    y = ptns[protein].loc[x.index]

    # expected vals: split into a lower peak (20 <= x < 51) and an upper
    # peak (x >= 51) and estimate each peak's parameters separately
    x1 = x[(x < 51) & (x >= 20)]
    y1 = ptns[protein].loc[x1.index]
    x2 = x[(x >= 51)]
    y2 = ptns[protein].loc[x2.index]

    n1 = sum(y1)
    mean1 = sum(x1*y1)/n1
    # `gauss` takes a standard deviation, so use the square root of the
    # weighted variance as the initial guess
    sigma1 = np.sqrt(sum(y1*(x1-mean1)**2)/n1)
    a1 = max(y1)

    n2 = sum(y2)
    mean2 = sum(x2*y2)/n2
    sigma2 = np.sqrt(sum(y2*(x2-mean2)**2)/n2)
    a2=max(y2)

    expected = [mean1, sigma1, a1, mean2, sigma2, a2]
    # print(protein, expected)

    popt,pcov = curve_fit(bimodal,x,y,p0=expected)

    # evaluate the fitted curve on a dense grid for a smooth line
    xx = np.linspace(10, 60, 500)

    s = pd.DataFrame({'Acyl Chain Length':xx, 'y':bimodal(xx, *popt)})

    # named `fit_line` so it does not overwrite the `gaus` function defined
    # in the single-Gaussian cell
    fit_line = alt.Chart(s).mark_line(color='red').encode(
        x='Acyl Chain Length:Q',
        y='y:Q'
    )

    # fitted parameters of the first peak, top-left corner
    text1 = alt.Chart({'values':[{}]}).mark_text(
        align="left", baseline="top"
    ).encode(
        x=alt.value(5),  # pixels from left
        y=alt.value(5),  # pixels from top
        text=alt.value([f"mu1: {popt[0]:.3f}", f"sigma1: {popt[1]:.3f}", f"a1: {popt[2]:.3f}"])
    )

    # fitted parameters of the second peak, further to the right
    text2 = alt.Chart({'values':[{}]}).mark_text(
        align="left", baseline="top"
    ).encode(
        x=alt.value(230),  # pixels from left
        y=alt.value(5),  # pixels from top
        text=alt.value([f"mu2: {popt[3]:.3f}", f"sigma2: {popt[4]:.3f}", f"a2: {popt[5]:.3f}"])
    )

    # append to chart list
    chart = bar + err + fit_line + text1 + text2

    chart_list.append(chart)

# only the first three panels are shown; the full grid is kept below for reference
bimodal_gauss = alt.hconcat(*chart_list[:3])
#& alt.hconcat(*chart_list[3:6]) & alt.hconcat(*chart_list[6:9]) & chart_list[9]
bimodal_gauss

# bimodal_gauss.save('graphs/pos_mode_whole_dataset/chain_length_graphs/updated_ChainLengthByProtein_SideBySide_BimodalGaussian_WithPS.png', dpi=350)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [None]:
# chart_list[0].save('graphs/presentation_figures/cas9_cl.png', ppi=300)

In [None]:
# chart_list[-1].save('graphs/presentation_figures/wt_cl.png', ppi=300)

In [None]:
# chart_list[2].save('graphs/presentation_figures/cavin_cl.png', ppi=300)

In [None]:
# chart_list[4].save('graphs/presentation_figures/cers1g_cl.png', ppi=300)

# Save Graphs

In [None]:
# side_by_side_err.save('graphs/pos_mode_whole_dataset/updated_ChainLengthByProtein_SideBySide_interactive.html')

# grouped_err.save('graphs/pos_mode_whole_dataset/chain_length_graphs/updated_ChainLengthByProtein_GroupedBarGraph_interactive.html')

# save layered density plot
# a.get_figure().savefig('graphs/pos_mode_whole_dataset/layered_density_plots.jpg')