# Import Data

In [1]:
# import modules
import glob
import re
from pathlib import Path

import pandas as pd

In [2]:
# read in all csv files from pos directory
# sorted() pins the load order: glob returns matches in arbitrary,
# filesystem-dependent order, which would otherwise change the row order of df
pos_pattern = "data/csv/pos/*.csv"
pos_data = sorted(glob.glob(pos_pattern))

# fail early with a readable message instead of pandas' "No objects to concatenate"
if not pos_data:
    raise ValueError(f"No CSV files matched {pos_pattern!r}")

# one DataFrame per file, stacked row-wise with a fresh 0..n-1 index
df_list = [pd.read_csv(file) for file in pos_data]
df = pd.concat(df_list, ignore_index=True)

df.head()

Unnamed: 0,Sample Name,PosMSMSALL-CAS9-A,PosMSMSALL-CAS9-A.1,PosMSMSALL-CAS9-B,PosMSMSALL-CAS9-B.1,PosMSMSALL-CAV_A,PosMSMSALL-CAV_A.1,PosMSMSALL-CAV_B,PosMSMSALL-CAV_B.1,PosMSMSALL-CAVIN_A,...,PosMSMSALL-SPTLC_B,PosMSMSALL-SPTLC_B.1,PosMSMSALL-UGCG-A,PosMSMSALL-UGCG-A.1,PosMSMSALL-UGCG-B,PosMSMSALL-UGCG-B.1,PosMSMSALL-WT_A,PosMSMSALL-WT_A.1,PosMSMSALL-WT_B,PosMSMSALL-WT_B.1
0,Hex2Cer 26:3;2 (LCB 18:0;2-2H2O),167.0,143.0,125.0,132.0,171.6091,182.9007,120.0,120.0,281.3523,...,130.0,142.0,68.0529,46.4729,88.0,91.0,229.5233,278.1255,206.0,182.0
1,Hex2Cer 26:2;2 (LCB 18:0;2-2H2O),53.2158,52.8015,47.4908,76.2783,27.6563,23.0569,9.0712,24.0712,0.0,...,0.0,2.1176,47.256,62.7063,11.1856,0.0,10.8768,6.0672,0.0,34.4747
2,Hex2Cer 26:2;2 (LCB 18:0;2-2H2O),167.8122,131.3388,220.0689,128.3439,342.976,331.4596,268.9979,289.4925,295.0869,...,290.0353,223.3067,187.9922,219.2679,231.3212,219.2373,329.9421,313.2847,251.0406,226.8066
3,Hex2Cer 26:2;2 (LCB 18:0;2-H2O),54.0,47.0,39.0,44.0,74.0,64.0,84.5341,88.7361,70.0,...,44.0,37.0,48.6404,51.9843,38.9995,52.0,62.0,57.0,51.0,40.0
4,Hex2Cer 26:0;2 (LCB 18:0;2-2H2O),391.224,326.4641,444.772,385.3202,175.0126,296.5953,238.6997,214.9442,215.1018,...,309.7805,272.9937,373.2111,292.2522,348.9607,348.3176,304.3199,271.4133,303.6544,253.3654


# Rename columns + get experiment metadata

In [3]:
# every column after the first one is renamed (the first column is left untouched)
sample_columns = df.columns[1:]

# short names: the original column names with 'PosMSMSALL-' removed
short_names = [original.replace('PosMSMSALL-', '') for original in sample_columns]

# dict mapping original column name -> short column name
cols = dict(zip(sample_columns, short_names))

# rows for the metadata table: the short sample name plus its protein,
# where the protein is whatever precedes the first -A / _A / -B / _B marker
row_list = [
    {'sample': short, 'protein': re.split('-A|_A|-B|_B', short)[0]}
    for short in short_names
]

# rename df columns and create metadata
df = df.rename(columns=cols)
df_meta_exps = pd.DataFrame(row_list)

In [4]:
# check df: preview the first rows to confirm the 'PosMSMSALL-' prefix
# is gone from the column names after the rename
df.head()

Unnamed: 0,Sample Name,CAS9-A,CAS9-A.1,CAS9-B,CAS9-B.1,CAV_A,CAV_A.1,CAV_B,CAV_B.1,CAVIN_A,...,SPTLC_B,SPTLC_B.1,UGCG-A,UGCG-A.1,UGCG-B,UGCG-B.1,WT_A,WT_A.1,WT_B,WT_B.1
0,Hex2Cer 26:3;2 (LCB 18:0;2-2H2O),167.0,143.0,125.0,132.0,171.6091,182.9007,120.0,120.0,281.3523,...,130.0,142.0,68.0529,46.4729,88.0,91.0,229.5233,278.1255,206.0,182.0
1,Hex2Cer 26:2;2 (LCB 18:0;2-2H2O),53.2158,52.8015,47.4908,76.2783,27.6563,23.0569,9.0712,24.0712,0.0,...,0.0,2.1176,47.256,62.7063,11.1856,0.0,10.8768,6.0672,0.0,34.4747
2,Hex2Cer 26:2;2 (LCB 18:0;2-2H2O),167.8122,131.3388,220.0689,128.3439,342.976,331.4596,268.9979,289.4925,295.0869,...,290.0353,223.3067,187.9922,219.2679,231.3212,219.2373,329.9421,313.2847,251.0406,226.8066
3,Hex2Cer 26:2;2 (LCB 18:0;2-H2O),54.0,47.0,39.0,44.0,74.0,64.0,84.5341,88.7361,70.0,...,44.0,37.0,48.6404,51.9843,38.9995,52.0,62.0,57.0,51.0,40.0
4,Hex2Cer 26:0;2 (LCB 18:0;2-2H2O),391.224,326.4641,444.772,385.3202,175.0126,296.5953,238.6997,214.9442,215.1018,...,309.7805,272.9937,373.2111,292.2522,348.9607,348.3176,304.3199,271.4133,303.6544,253.3654


In [5]:
# check exp metadata: one row per renamed column, holding the short
# sample name and the protein label split out of it
df_meta_exps

Unnamed: 0,sample,protein
0,CAS9-A,CAS9
1,CAS9-A.1,CAS9
2,CAS9-B,CAS9
3,CAS9-B.1,CAS9
4,CAV_A,CAV
5,CAV_A.1,CAV
6,CAV_B,CAV
7,CAV_B.1,CAV
8,CAVIN_A,CAVIN
9,CAVIN_A.1,CAVIN


# Get Lipid Metadata

In [6]:
row_list = []

# Each distinct lipid name is parsed exactly once. A row is fully determined by
# its name, so de-duplicating the names up front (first occurrence kept, order
# preserved) yields the same rows as checking `row not in row_list` on every
# iteration, without rescanning the whole list each time.
for name in df["Sample Name"].drop_duplicates():
    # split sample name string on spaces, colons and semicolons,
    # e.g. "HexCer 34:1;4 (LCB 18:0;2-H2O)" -> ["HexCer", "34", "1", "4", ...]
    qual = re.split(' |:|;', name)

    # first token is the head group
    head_group = qual[0]

    # get chain length; a token such as "O-44" carries a prefix that is moved
    # onto the head group ("MMPE O-44:5 ..." -> head group "MMPE O", length 44)
    chain_length = qual[1]
    if "-" in chain_length:
        c = chain_length.split(sep="-")
        chain_length = c[1]
        head_group += " " + c[0]
    chain_length = int(chain_length)

    # get unsaturation; drop anything after a "+" ("1+NH4" -> 1)
    unsaturation = qual[2]
    if "+" in unsaturation:
        u = unsaturation.split(sep="+")
        unsaturation = u[0]
    unsaturation = int(unsaturation)

    # one metadata row per distinct lipid name
    row_list.append({"Sample Name": name,
                     "Head Group": head_group,
                     "Acyl Chain Length": chain_length,
                     "Unsaturation": unsaturation})


df_meta_lipids = pd.DataFrame(row_list)

# spot-check ten random rows of the parsed metadata
df_meta_lipids.sample(10)

Unnamed: 0,Sample Name,Head Group,Acyl Chain Length,Unsaturation
4165,MADAG 52:8+NH4 (-FA 18:1 (NH4)),MADAG,52,8
3190,HexCer 34:1;4 (LCB 18:0;2-H2O),HexCer,34,1
3318,GM2 38:0;3 (LCB 18:0;2-H2O),GM2,38,0
1710,MMPE O-44:5 (-MMPE),MMPE O,44,5
4156,MADAG 51:2+NH4 (-FA 20:1 (NH4)),MADAG,51,2
3223,HexCer 42:2;2 (LCB 18:0;2-2H2O),HexCer,42,2
3607,GM1 34:0;2 (LCB 17:0;2-2H2O),GM1,34,0
4786,DAG 35:1+NH4 (-FA 18:1 (NH4)),DAG,35,1
4065,MADAG 50:8+NH4 (-FA 16:1 (NH4)),MADAG,50,8
4226,MADAG 54:10+NH4 (-FA 15:0 (NH4)),MADAG,54,10


# Save DataFrames

In [7]:
# make sure the output folder exists first: to_csv does not create missing
# directories, so a fresh checkout without 'dataframes/' would fail here
out_dir = Path('dataframes')
out_dir.mkdir(parents=True, exist_ok=True)

# save raw data (df), without index
df.to_csv(out_dir / 'pos_df.csv', index=False)

# save lipid metadata
df_meta_lipids.to_csv(out_dir / 'pos_lipids_df_meta.csv', index=False)

# save experiment metadata
df_meta_exps.to_csv(out_dir / 'pos_exps_df_meta.csv', index=False)