# Generating table for GEO upload with DiMSum outputs for all the datasets used in the study

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# Show up to 500 rows when a DataFrame/Series is displayed
pd.options.display.max_rows = 500

## Read in DiMSum output files with all variants of doubles datasets

In [3]:
# Keys of the double-mutant datasets; also used to build the input file names
datasets = [
    'eLife_doubles',
    'NNK_doubles',
    'Nicking_doubles',
]

In [4]:
# Corrected per-variant tables of the doubles datasets
# (produced and saved in notebook M0 in this directory)

corrected_files_dir = '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Doubles_datasets/dimsum_outputs/corrected_fitness_abundance_files_Anna/'

# dataset key -> DataFrame of all variants, indexed by the first column of the file
all_variants = {}

for dataset in datasets:
    print(dataset)
    # NOTE(review): the files carry a .tsv extension but are parsed with the default
    # ',' separator; the column listings further down suggest this is correct — confirm.
    # low_memory=False makes pandas infer every column's dtype from the whole file
    # instead of chunk by chunk; this avoids mixed-type columns and the warning pandas
    # raised on this read (presumably a DtypeWarning — the saved output is truncated).
    all_variants[dataset] = pd.read_csv(
        f"{corrected_files_dir}all_variants_corrected_{dataset}_full_table_for_plotting.tsv",
        index_col=0,
        low_memory=False,
    )

eLife_doubles
NNK_doubles
Nicking_doubles


  all_variants[dataset] = pd.read_csv(corrected_files_dir + 'all_variants_corrected_' + dataset + '_full_table_for_plotting.tsv',


In [5]:
# number of variants (rows) in the eLife doubles table
all_variants['eLife_doubles'].shape[0]

18649

In [6]:
# number of variants (rows) in the NNK doubles table
all_variants['NNK_doubles'].shape[0]

28530

In [7]:
# number of variants (rows) in the Nicking doubles table
all_variants['Nicking_doubles'].shape[0]

54709

In [8]:
# Sanity check: print the distinct aa_seq lengths present in each doubles table
for dataset in datasets:
    print(dataset)
    print(np.unique(all_variants[dataset]['aa_seq'].map(len)))

eLife_doubles
[42]
NNK_doubles
[15]
Nicking_doubles
[28]


## Read in DiMSum output files with all variants of combinatorial datasets

In [9]:
# DTS libraries/datasets we will be working with
libs_DTS = ['DTS01', 'DTS05']

# library key -> directory holding that library's DiMSum output
lib_paths_dimsum = {
    'DTS01': '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Combinatorial_DTS01_c-terminus/dimsum_min10/',
    'DTS05': '/lustre/scratch126/gengen/projects/amyloid_beta_epistasis/Previous_files_from_Mireia/Combinatorial_DTS_c-terminus/dimsum/AB_DTS05/',
}


In [10]:
# Load each combinatorial library's DiMSum table into the shared all_variants dict.
# keep_default_na=False leaves empty fields as '' instead of converting them to NaN.
for lib in libs_DTS:
    print(lib)
    all_variants[lib] = pd.read_csv(
        f"{lib_paths_dimsum[lib]}all_variants.csv",
        keep_default_na=False,
        index_col=0,
    )

DTS01
DTS05


In [11]:
all_variants['DTS01']

Unnamed: 0,nt_seq,aa_seq,Nham_nt,Nham_aa,Nmut_codons,WT,indel,STOP,STOP_readthrough,count_e1_s0,...,count_e3_s1,mean_count,fitness1_uncorr,fitness2_uncorr,fitness3_uncorr,sigma1_uncorr,sigma2_uncorr,sigma3_uncorr,fitness,sigma
1,,GAIIGIIIGGMLLA,,6,,,False,False,False,57,...,0,45.000000,-6.29053521179474,-7.14809196834481,,1.49279257546146,3.08419531769185,,-6.453303,1.343676
2,,GAIIGIIIGGVMMA,,5,,,False,False,False,490,...,7,372.666667,-6.57008715816327,-7.1750243660309,-6.56069181549137,0.875822329055295,1.33904356492612,1.06221836345588,-6.689845,0.603279
3,,GAIIGIIIGGVIMA,,5,,,False,False,False,33,...,0,29.500000,-6.43713868598662,,,1.97777649195475,,,-6.437139,1.977776
4,,GAIIGIIMGGFMFA,,5,,,False,False,False,55,...,5,112.333333,,-5.59153127700666,-6.44188278403944,,1.22839899062725,1.18020173448254,-6.033716,0.851058
5,,GAIIGIIVGGIILA,,5,,,False,False,False,23,...,18,16.500000,,,-2.67646016687167,,,1.38928292456688,-2.676460,1.389283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37667,,GALLGLLLGGVMFA,,5,,,False,False,False,139,...,16,159.000000,,-6.43112057193056,-5.57746890384693,,1.52928197462871,0.899488200209036,-5.796884,0.775319
37668,,GALLGLLLGGVLVA,,6,,,False,False,False,155,...,0,143.000000,-6.19229677221133,-7.52312808633473,,1.04206894651161,3.07632452941573,,-6.329283,0.986981
37669,,GALLGLLLGGVFIA,,6,,,False,False,False,81,...,5,91.000000,,-5.39147534169413,-5.8292317562616,,1.21333443870815,1.21026620028259,-5.610908,0.856870
37670,,GALLGLLLGGFIFA,,6,,,False,False,False,144,...,7,96.666667,-6.11868495486808,-8.02822603288643,-4.58625302029274,1.04393337020561,3.06911460776611,1.21782402854677,-5.629564,0.767410


In [12]:
# Sanity check: print the distinct aa_seq lengths present in each DTS library
for dataset in libs_DTS:
    print(dataset)
    print(np.unique(all_variants[dataset]['aa_seq'].map(len)))

DTS01
[14]
DTS05
[31]


## It is not strictly necessary to complement the variants to the full Abeta sequence here, but I will add that column to help future users and to make it possible to mark fAD mutations

### Adding full Abeta sequence to tables here

In [13]:
# Wild-type full-length Abeta amino-acid sequence (42 residues); partial library
# sequences are padded with slices of this string to build aa_seq_full
AB_WT = "DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA"

In [14]:
# NNK_doubles only covers the C-terminal part of the peptide:
# prepend the WT N-terminus (aa1-27) to every sequence

all_variants['NNK_doubles']['aa_seq_full'] = AB_WT[:27] + all_variants['NNK_doubles']['aa_seq']

In [15]:
# every complemented NNK sequence should now be full length
np.unique(all_variants['NNK_doubles']['aa_seq_full'].map(len))

array([42])

In [16]:
# Nicking_doubles only covers the N-terminal part of the peptide:
# append the WT C-terminus (aa29-42) to every sequence

all_variants['Nicking_doubles']['aa_seq_full'] = all_variants['Nicking_doubles']['aa_seq'] + AB_WT[28:]

In [17]:
# every complemented Nicking sequence should now be full length
np.unique(all_variants['Nicking_doubles']['aa_seq_full'].map(len))

array([42])

In [18]:
# for consistency with the other datasets: the eLife_doubles sequences are already
# 42 residues long, so the full-length column is simply a copy of aa_seq

all_variants['eLife_doubles']['aa_seq_full'] = all_variants['eLife_doubles']['aa_seq']


In [19]:
# Check the newly created full-length column (not the input aa_seq column),
# in line with the equivalent checks for NNK_doubles and Nicking_doubles
np.unique([len(seq) for seq in all_variants['eLife_doubles']['aa_seq_full']])

array([42])

In [20]:
# Distinct aa_seq lengths (and how many variants have each) per DTS library, before complementing
for lib in libs_DTS:
    print(lib)
    print(np.unique(all_variants[lib]['aa_seq'].map(len), return_counts=True), '\n')

DTS01
(array([14]), array([37671])) 

DTS05
(array([31]), array([5283])) 



In [21]:
# complement DTS01: prepend the first 28 WT residues to each library sequence
all_variants['DTS01']['aa_seq_full'] = AB_WT[:28] + all_variants['DTS01']['aa_seq']


In [22]:
# complement DTS05: prepend the first 11 WT residues to each library sequence
all_variants['DTS05']['aa_seq_full'] = AB_WT[:11] + all_variants['DTS05']['aa_seq']


In [23]:
# Distinct aa_seq_full lengths (and how many variants have each) per DTS library, after complementing
for lib in libs_DTS:
    print(lib)
    print(np.unique(all_variants[lib]['aa_seq_full'].map(len), return_counts=True), '\n')

DTS01
(array([42]), array([37671])) 

DTS05
(array([42]), array([5283])) 



## Select relevant columns and merge datasets

In [24]:
# doubles datasets first, then the combinatorial DTS libraries
datasets_all = [*datasets, *libs_DTS]

In [25]:
datasets_all

['eLife_doubles', 'NNK_doubles', 'Nicking_doubles', 'DTS01', 'DTS05']

In [26]:
all_variants['eLife_doubles'].columns

Index(['merge_seq', 'aa_seq', 'Nham_nt', 'Nham_aa', 'Nmut_codons', 'WT',
       'STOP', 'STOP_readthrough', 'mean_count', 'count_e1_s0', 'count_e2_s0',
       'count_e3_s0', 'count_e1_s1', 'count_e2_s1', 'count_e3_s1',
       'fitness1_uncorr', 'sigma1_uncorr', 'fitness2_uncorr', 'sigma2_uncorr',
       'fitness3_uncorr', 'sigma3_uncorr', 'fitness', 'sigma', 'nscore_c',
       'sigma_norm_first_toWT', 'category_sigma', 'zscore_mode',
       'p.adjust_mode', 'category_dead', 'aa_seq_full'],
      dtype='object')

In [27]:
all_variants['Nicking_doubles'].columns

Index(['nt_seq', 'aa_seq', 'Nham_nt', 'Nham_aa', 'Nmut_codons', 'WT', 'indel',
       'STOP', 'STOP_readthrough', 'mean_count', 'count_e1_s0', 'count_e2_s0',
       'count_e3_s0', 'count_e1_s1', 'count_e2_s1', 'count_e3_s1',
       'fitness1_uncorr', 'sigma1_uncorr', 'fitness2_uncorr', 'sigma2_uncorr',
       'fitness3_uncorr', 'sigma3_uncorr', 'fitness', 'sigma', 'aa_seq_full'],
      dtype='object')

In [28]:
all_variants['NNK_doubles'].columns

Index(['nt_seq', 'aa_seq', 'Nham_nt', 'Nham_aa', 'Nmut_codons', 'WT', 'indel',
       'STOP', 'STOP_readthrough', 'mean_count', 'count_e1_s0', 'count_e2_s0',
       'count_e3_s0', 'count_e1_s1', 'count_e2_s1', 'count_e3_s1',
       'fitness1_uncorr', 'fitness2_uncorr', 'fitness3_uncorr',
       'sigma1_uncorr', 'sigma2_uncorr', 'sigma3_uncorr', 'fitness', 'sigma',
       'aa_seq_full'],
      dtype='object')

In [29]:
all_variants['DTS01'].columns

Index(['nt_seq', 'aa_seq', 'Nham_nt', 'Nham_aa', 'Nmut_codons', 'WT', 'indel',
       'STOP', 'STOP_readthrough', 'count_e1_s0', 'count_e2_s0', 'count_e3_s0',
       'count_e1_s1', 'count_e2_s1', 'count_e3_s1', 'mean_count',
       'fitness1_uncorr', 'fitness2_uncorr', 'fitness3_uncorr',
       'sigma1_uncorr', 'sigma2_uncorr', 'sigma3_uncorr', 'fitness', 'sigma',
       'aa_seq_full'],
      dtype='object')

In [30]:
all_variants['DTS05'].columns

Index(['nt_seq', 'aa_seq', 'Nham_nt', 'Nham_aa', 'Nmut_codons', 'WT', 'indel',
       'STOP', 'STOP_readthrough', 'count_e1_s0', 'count_e2_s0', 'count_e3_s0',
       'count_e1_s1', 'count_e2_s1', 'count_e3_s1', 'mean_count',
       'fitness1_uncorr', 'fitness2_uncorr', 'fitness3_uncorr',
       'sigma1_uncorr', 'sigma2_uncorr', 'sigma3_uncorr', 'fitness', 'sigma',
       'aa_seq_full'],
      dtype='object')

In [31]:
# select only relevant columns and merge

columns_to_keep = ['aa_seq','mean_count','fitness','sigma','aa_seq_full']

# Dataset labels written to the GEO table where they differ from the internal key
geo_dataset_names = {'eLife_doubles': 'shallow_double_mutants'}

# dataset key -> trimmed copy of its table with an added 'dataset' label column
geo_tables = {}

for dataset in datasets_all:
    print(dataset)
    # .loc with a column list returns a new frame, so all_variants is left untouched;
    # the scalar label is broadcast to every row by .assign
    geo_tables[dataset] = (
        all_variants[dataset]
        .loc[:, columns_to_keep]
        .assign(dataset=geo_dataset_names.get(dataset, dataset))
    )
    

eLife_doubles
NNK_doubles
Nicking_doubles
DTS01
DTS05


In [32]:
# Stack the per-dataset tables row-wise, in the order they were added to geo_tables
geo_table_merged = pd.concat([geo_tables[name] for name in geo_tables], axis=0)

In [33]:
geo_table_merged

Unnamed: 0,aa_seq,mean_count,fitness,sigma,aa_seq_full,dataset
1,KAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,210.500000,-0.117352,0.387033,KAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
2,NAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,28544.000000,0.352500,0.062247,NAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
3,NTEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,170.000000,0.381524,0.420981,NTEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
4,NEEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,146.000000,0.052856,0.450957,NEEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
5,NAKFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,64.500000,0.495394,0.711069,NAKFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
...,...,...,...,...,...,...
5279,VHHQKLVLLAEDLGSNKGALFGMMVGGVVIA,814.666667,-0.447388,1.620909,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALFGMMVGGVVIA,DTS05
5280,VHHQKLVLLAEDLGSNKGALFGLMVGGVVIA,696.333333,1.447367,0.953764,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALFGLMVGGVVIA,DTS05
5281,VHHQKLVLLAEDLGSNKGALLGVMVGGVVIA,2741.333333,0.034470,0.736629,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALLGVMVGGVVIA,DTS05
5282,VHHQKLVLLAEDLGSNKGALLGFMVGGVVIA,935.333333,-0.625840,1.737394,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALLGFMVGGVVIA,DTS05


In [34]:
# Replace the per-dataset row labels with one running 0..N-1 index
geo_table_merged = geo_table_merged.reset_index(drop=True)

In [41]:
# how often does each (library-length) aa_seq occur in the merged table?
geo_table_merged['aa_seq'].value_counts()

aa_seq
DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA    2582
DAEFRHDSGYEVHHQKLVFFAEDVGSNK                    61
DAWFRHDSGYEVHHQKLVFFAEDVGGNK                     1
DAWFRHDSGYEVHHQKLVFFAEDVGHNK                     1
DAWFRHDSGYEVHHQKLVFFAEDVGPNK                     1
                                              ... 
DREFRHDSGYEVHHQKLVFFAEVVGSNK                     1
DREFRHDSGYEVHHQKLVFFAEGVGSNK                     1
DREFRHDSGYEVHHQKLVFFAEAVGSNK                     1
DREFRHDSGYEVHHQKLVFFAEDCGSNK                     1
VHHQKLVLLAEDLGSNKGALLGLMVGGVVIA                  1
Name: count, Length: 142201, dtype: int64

In [42]:
# rows whose aa_seq is the most frequent sequence found above
geo_table_merged.loc[geo_table_merged['aa_seq'].eq('DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA')]

Unnamed: 0,aa_seq,mean_count,fitness,sigma,aa_seq_full,dataset
424,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,140.500000,-0.254861,0.465475,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
425,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,303.666667,0.181283,0.265036,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
426,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,347.000000,-0.451204,0.262208,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
427,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,219.666667,-0.042756,0.312757,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
428,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,340.333333,-0.298518,0.253437,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
...,...,...,...,...,...,...
17748,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,153.000000,0.075942,0.452915,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
17749,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,126.000000,0.032686,0.402109,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
17750,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,465.333333,-0.082525,0.222729,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
17751,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,607.333333,-0.426131,0.201956,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants


In [44]:
# which datasets contribute the rows with this full-length aa_seq?
np.unique(geo_table_merged.loc[geo_table_merged['aa_seq'] == 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA', 'dataset'])

array(['shallow_double_mutants'], dtype=object)

In [47]:
# confirm that the sequence literal queried above is exactly the WT sequence
AB_WT == 'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA'

True

In [49]:
# which datasets contribute the rows with this 28-residue aa_seq?
np.unique(geo_table_merged.loc[geo_table_merged['aa_seq'] == 'DAEFRHDSGYEVHHQKLVFFAEDVGSNK', 'dataset'])

array(['Nicking_doubles'], dtype=object)

In [84]:
# fAD single substitutions, written as <WT residue><1-based position in AB_WT><new residue>
fAD_mutations = ['H6R','D7H','D7N','E11K','K16Q','L17V','A21G','E22G','E22K','E22Q','D23N','L34V','A42T']

In [85]:
AB_WT

'DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA'

In [86]:
def _apply_substitution(wt_seq, mutation):
    """Return ``wt_seq`` with one substitution applied.

    Parameters
    ----------
    wt_seq : str
        Reference amino-acid sequence.
    mutation : str
        Substitution such as ``'H6R'``: WT residue, 1-based position, new residue.

    Returns
    -------
    str
        Copy of ``wt_seq`` carrying the new residue at the given position.

    Raises
    ------
    ValueError
        If the WT residue named in ``mutation`` is not found at that position.
    """
    wt_residue, new_residue = mutation[0], mutation[-1]
    position = int(mutation[1:-1])
    if not 1 <= position <= len(wt_seq) or wt_seq[position - 1] != wt_residue:
        raise ValueError(f"{mutation} does not match the reference sequence")
    return wt_seq[:position - 1] + new_residue + wt_seq[position:]


# Full-length sequence of every fAD single mutant, in the same order as fAD_mutations.
# Deriving them from AB_WT avoids hand-typed 42-mers drifting out of sync with the list.
fAD_mutations_full_aa_seqs = [_apply_substitution(AB_WT, mutation) for mutation in fAD_mutations]

In [87]:
geo_table_merged

Unnamed: 0,aa_seq,mean_count,fitness,sigma,aa_seq_full,dataset
0,KAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,210.500000,-0.117352,0.387033,KAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
1,NAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,28544.000000,0.352500,0.062247,NAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
2,NTEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,170.000000,0.381524,0.420981,NTEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
3,NEEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,146.000000,0.052856,0.450957,NEEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
4,NAKFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,64.500000,0.495394,0.711069,NAKFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants
...,...,...,...,...,...,...
144837,VHHQKLVLLAEDLGSNKGALFGMMVGGVVIA,814.666667,-0.447388,1.620909,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALFGMMVGGVVIA,DTS05
144838,VHHQKLVLLAEDLGSNKGALFGLMVGGVVIA,696.333333,1.447367,0.953764,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALFGLMVGGVVIA,DTS05
144839,VHHQKLVLLAEDLGSNKGALLGVMVGGVVIA,2741.333333,0.034470,0.736629,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALLGVMVGGVVIA,DTS05
144840,VHHQKLVLLAEDLGSNKGALLGFMVGGVVIA,935.333333,-0.625840,1.737394,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALLGFMVGGVVIA,DTS05


In [88]:
# Flag every row whose full-length sequence is exactly one of the fAD single mutants;
# all other rows (including multi-mutants that merely contain an fAD substitution) stay 'non-fAD'.
# Vectorised membership test instead of a row-by-row .loc loop.
is_fAD = geo_table_merged['aa_seq_full'].isin(fAD_mutations_full_aa_seqs)
geo_table_merged['fAD_category'] = np.where(is_fAD, 'fAD', 'non-fAD')

In [89]:
geo_table_merged

Unnamed: 0,aa_seq,mean_count,fitness,sigma,aa_seq_full,dataset,fAD_category
0,KAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,210.500000,-0.117352,0.387033,KAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants,non-fAD
1,NAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,28544.000000,0.352500,0.062247,NAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants,non-fAD
2,NTEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,170.000000,0.381524,0.420981,NTEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants,non-fAD
3,NEEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,146.000000,0.052856,0.450957,NEEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants,non-fAD
4,NAKFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,64.500000,0.495394,0.711069,NAKFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,shallow_double_mutants,non-fAD
...,...,...,...,...,...,...,...
144837,VHHQKLVLLAEDLGSNKGALFGMMVGGVVIA,814.666667,-0.447388,1.620909,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALFGMMVGGVVIA,DTS05,non-fAD
144838,VHHQKLVLLAEDLGSNKGALFGLMVGGVVIA,696.333333,1.447367,0.953764,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALFGLMVGGVVIA,DTS05,non-fAD
144839,VHHQKLVLLAEDLGSNKGALLGVMVGGVVIA,2741.333333,0.034470,0.736629,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALLGVMVGGVVIA,DTS05,non-fAD
144840,VHHQKLVLLAEDLGSNKGALLGFMVGGVVIA,935.333333,-0.625840,1.737394,DAEFRHDSGYEVHHQKLVLLAEDLGSNKGALLGFMVGGVVIA,DTS05,non-fAD


In [90]:
# how many rows were flagged as fAD vs non-fAD
geo_table_merged['fAD_category'].value_counts()

fAD_category
non-fAD    144814
fAD            28
Name: count, dtype: int64

In [92]:
# Write the merged table to a CSV in the current working directory.
# NOTE(review): with the default index=True the running row index is written as an
# unnamed first column — confirm that this is wanted in the GEO sheet.
geo_table_merged.to_csv("GEO_data_table_sheet.csv")