In [122]:
""" creates a by-sample breakdown of AA level mutations for three genes of interest;
    EGRF, BRAF and KRAS. goal here is to generate data for the top panel of figure 2 """

import re
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 999)

In [123]:
def char_strip(df):
    """ string strip all of the wierd dictionary characters, 
        and then rewriting egfr_muts """
    for idx, row in df.iterrows():
        cell = row.cell
        mutations = row.mutations
        new_list = []

        if not pd.isna(mutations):
            mutations = mutations.split(',')

            for j in range(len(mutations)):
                temp = re.sub(r'\W+', '', mutations[j])
                new_list.append(temp)

            df.mutations[idx] = new_list
    
    df = df.fillna(0)

    return(df)

In [124]:
def build_dict(df, gene, d):
    """ builds a by-sample dict for all the mutations found to a given gene """
    for idx, row in df.iterrows():
        curr_cell = row.cell
        mutations = row.mutations

        meta_row = meta[meta.cell == curr_cell]
        sample = list(meta_row.sample_name)[0]

        if mutations != 0:
            if sample in d:
                elm = d.get(sample)

                for mut in mutations:
                    mut = gene + ' ' + mut
                    if mut not in elm:
                        elm.append(mut)

                d.update({sample:elm})
            else:
                muts = []
                for elm in mutations:
                    elm = gene + ' ' + elm
                d.update({sample:muts})

    return(d)

In [125]:
def dict_to_csv(d):
    """ convert a dict to csv, even if dict values have uneven
        number of elements """
    for k in d.keys():
        v = d.get(k)
        curr_len = len(v)
        if curr_len < 22:
            to_add = 22 - curr_len
            v = np.append(v, np.repeat(np.nan, to_add))
            v = list(v)
            d.update({k:v})
            
    t = pd.DataFrame.from_dict(d)
    return(t)

In [128]:
meta = pd.read_csv('/Users/lincoln.harris/code/SNP_calling_pipeline/metadata_all_cells_4.10.19.csv')
meta = meta.rename(columns={'Unnamed: 0':'cell'})

egfr_muts = pd.read_csv('/Users/lincoln.harris/Desktop/EGFR_AA.csv', names=['cell', 'mutations'])
kras_muts = pd.read_csv('/Users/lincoln.harris/Desktop/KRAS_AA.csv', names=['cell', 'mutations'])
braf_muts = pd.read_csv('/Users/lincoln.harris/Desktop/BRAF_AA.csv', names=['cell', 'mutations'])

egfr_muts = char_strip(egfr_muts)
kras_muts = char_strip(kras_muts)
braf_muts = char_strip(braf_muts)

big_dict = {}
big_dict = build_dict(egfr_muts, 'EGFR', big_dict)
big_dict = build_dict(kras_muts, 'KRAS', big_dict)
big_dict = build_dict(braf_muts, 'BRAF', big_dict)

df = dict_to_csv(big_dict)
df.to_csv('top_panel_muts_by_sample.csv', index=False)

In [129]:
pd.read_csv('top_panel_muts_by_sample.csv')

Unnamed: 0,LT_S11,LT_S21,LT_S75,LT_S66,LT_S69,LT_S57,LT_S08,LT_S71,LT_S50,LT_S63,...,LT_S53,LT_S56,LT_S51,LT_S58,LT_S52,LT_S42,LT_S80,LT_S34,LT_S78,LT_S49
0,EGFR R521K,EGFR T903T,EGFR N158N,EGFR N158N,EGFR T903T,EGFR T629T,EGFR Q787Q,EGFR T903T,EGFR R521K,EGFR N158N,...,EGFR T903T,EGFR N158N,EGFR R521K,EGFR T903T,EGFR T903T,,EGFR A237Y,EGFR T903T,EGFR N158N,
1,EGFR K745_A750T,EGFR Q787Q,EGFR Q787Q,EGFR R521K,EGFR T629T,EGFR T903T,EGFR T629T,EGFR Q787Q,EGFR Q787Q,EGFR T903T,...,EGFR Q787Q,EGFR V300M,EGFR T903T,EGFR N158N,EGFR G42D,,EGFR T903T,EGFR Q787Q,EGFR Q787Q,
2,EGFR T629T,EGFR R521K,EGFR T903T,EGFR T629T,EGFR I1093M,EGFR Q787Q,EGFR R521K,EGFR K745_A750T,EGFR T903T,EGFR R1100S,...,EGFR D1014N,,EGFR Q787Q,EGFR Q787Q,EGFR Q787Q,,EGFR Q787Q,EGFR T629T,,
3,EGFR T903T,EGFR T629T,EGFR L387M,EGFR Q787Q,EGFR Q787Q,EGFR R521K,,EGFR L1034I,EGFR T629T,EGFR S442I,...,EGFR F856L,,EGFR T629T,KRAS Q61H,,,,,,
4,EGFR L833V,EGFR L858R,KRAS Q61H,EGFR T903T,EGFR S921R,EGFR N158N,,EGFR L1167V,EGFR I569I,EGFR L1167V,...,KRAS L19F,,EGFR N158N,,,,,,,
5,KRAS G13V,EGFR A237V,,EGFR V1142V,EGFR L1034I,EGFR K745_A750T,,EGFR A21A,,EGFR R521K,...,KRAS G13V,,,,,,,,,
6,BRAF W450L,EGFR A237Y,,EGFR G42D,EGFR K745_A750T,EGFR L1167V,,KRAS G13V,,EGFR V1142V,...,,,,,,,,,,
7,,EGFR V536M,,EGFR F856L,EGFR R1052I,EGFR G42D,,BRAF L89L,,EGFR R831H,...,,,,,,,,,,
8,,EGFR G42D,,EGFR D1014N,EGFR S811F,EGFR A21A,,,,EGFR G42D,...,,,,,,,,,,
9,,EGFR G598V,,EGFR L1167V,KRAS L19F,KRAS C118S,,,,KRAS C118S,...,,,,,,,,,,
