In [1]:
import pandas as pd
import numpy as np 
import os
from functools import reduce

In [9]:
def load_dataframes(path):
    filelist = [os.path.splitext(file)[0] for file in os.listdir(path) if file.endswith('.csv')]
    list_of_DFs = []
    for file in filelist: 
        DF = path + '/' + file + '.csv'
        file = pd.read_csv(DF, header=0, sep='\t', low_memory=False)
        file = file.drop(['Unnamed: 0', 'REF', 'FREQ', 'ALT', '% of population'], 1)
        list_of_DFs.append(file)
    SNVs = reduce(lambda x, y: pd.concat([x, y], axis=0), list_of_DFs)
    SNVs = SNVs.drop_duplicates(subset='SNP', keep="first")
    SNVs = SNVs[~SNVs.SNP.str.contains("_syn")]
    SNVs = SNVs.sort_values(['gene', 'POS'], ascending=[True, True])
    SNVs = SNVs.drop(['gene', 'POS'], 1)
    print(filelist)
    return(SNVs)

In [10]:
SNV_list = load_dataframes('/Users/katbraun/Documents/research/kat_braun/projects/H7N9_transmission-bottlenecks/H7N9-evolution-in-mammals/data_derived/avian_vs_mammalian/cleaned_csvs')


##

['GD3_ferret5_day5-cleaned', 'Anhui_ferret29_day3-cleaned', 'Anhui_ferret28_day5-cleaned', 'GD3_ferret7_day1-cleaned', 'Anhui_ferret25_day1-cleaned', 'GD3_ferret3_day1-cleaned', 'Anhui_ferret27_day5-cleaned', 'GD3_ferret1_day5-cleaned', 'Anhui_ferret31_day1-cleaned', 'GD3_ferret4_day5-cleaned', 'GD3_ferret4_day11-cleaned', 'GD3_ferret5_day3-cleaned', 'Anhui_ferret29_day5-cleaned', 'Anhui_ferret28_day3-cleaned', 'GD3_ferret7_day7-cleaned', 'GD3_ferret4_day9-cleaned', 'Anhui_ferret25_day7-cleaned', 'GD3_ferret3_day7-cleaned', 'Anhui_ferret27_day3-cleaned', 'GD3_ferret1_day3-cleaned', 'GD3_ferret5_day1-cleaned', 'GD3_ferret4_day7-cleaned', 'GD3_ferret7_day5-cleaned', 'Anhui_ferret29_day7-cleaned', 'GD3_ferret3_day5-cleaned', 'Anhui_ferret25_day5-cleaned', 'Anhui_ferret31_day5-cleaned', 'GD3_ferret1_day1-cleaned', 'Anhui_ferret27_day1-cleaned', 'GD3_ferret5_day7-cleaned', 'Anhui_ferret28_day7-cleaned', 'GD3_ferret7_day3-cleaned', 'Anhui_ferret29_day1-cleaned', 'GD3_ferret3_day3-cleaned', '

In [30]:
SNV_list.to_csv('/Users/katbraun/Documents/research/kat_braun/projects/H7N9_transmission-bottlenecks/H7N9-evolution-in-mammals/data_derived/avian_vs_mammalian/SNV_list.csv', sep=',')

##

In [13]:
## generate the average SNV frequency per SNV to include in this dataframe
## I need to include this to plot the y axis of figure 8

def load_dataframes_avgfreq(path):
    filelist = [os.path.splitext(file)[0] for file in os.listdir(path) if file.endswith('.csv')]
    list_of_DFs = []
    for file in filelist: 
        DF = path + '/' + file + '.csv'
        file = pd.read_csv(DF, header=0, sep='\t', low_memory=False)
        file = file.drop(['Unnamed: 0', 'REF', 'ALT', '% of population'], 1)
        list_of_DFs.append(file)
    SNVs = reduce(lambda x, y: pd.concat([x, y], axis=0), list_of_DFs)
#     SNVs = SNVs.drop_duplicates(subset='SNP', keep="first")
#     SNVs = SNVs[~SNVs.SNP.str.contains("_syn")]
    SNVs = SNVs.sort_values(['gene', 'POS'], ascending=[True, True])
    SNVs = SNVs.drop(['gene', 'POS'], 1)
    print(filelist)
    return(SNVs)

In [16]:
SNV_list_avg_freq = load_dataframes_avgfreq('/Users/katbraun/Documents/research/kat_braun/projects/H7N9_transmission-bottlenecks/H7N9-evolution-in-mammals/data_derived/avian_vs_mammalian/cleaned_csvs')
SNV_list_avg_freq['average_freq'] = SNV_list_avg_freq.groupby('SNP')['FREQ'].transform('mean')
SNV_list_avg_freq = SNV_list_avg_freq.drop_duplicates(subset='SNP', keep="first")

SNV_list_avg_freq

##

['GD3_ferret5_day5-cleaned', 'Anhui_ferret29_day3-cleaned', 'Anhui_ferret28_day5-cleaned', 'GD3_ferret7_day1-cleaned', 'Anhui_ferret25_day1-cleaned', 'GD3_ferret3_day1-cleaned', 'Anhui_ferret27_day5-cleaned', 'GD3_ferret1_day5-cleaned', 'Anhui_ferret31_day1-cleaned', 'GD3_ferret4_day5-cleaned', 'GD3_ferret4_day11-cleaned', 'GD3_ferret5_day3-cleaned', 'Anhui_ferret29_day5-cleaned', 'Anhui_ferret28_day3-cleaned', 'GD3_ferret7_day7-cleaned', 'GD3_ferret4_day9-cleaned', 'Anhui_ferret25_day7-cleaned', 'GD3_ferret3_day7-cleaned', 'Anhui_ferret27_day3-cleaned', 'GD3_ferret1_day3-cleaned', 'GD3_ferret5_day1-cleaned', 'GD3_ferret4_day7-cleaned', 'GD3_ferret7_day5-cleaned', 'Anhui_ferret29_day7-cleaned', 'GD3_ferret3_day5-cleaned', 'Anhui_ferret25_day5-cleaned', 'Anhui_ferret31_day5-cleaned', 'GD3_ferret1_day1-cleaned', 'Anhui_ferret27_day1-cleaned', 'GD3_ferret5_day7-cleaned', 'Anhui_ferret28_day7-cleaned', 'GD3_ferret7_day3-cleaned', 'Anhui_ferret29_day1-cleaned', 'GD3_ferret3_day3-cleaned', '

Unnamed: 0,FREQ,SNP,average_freq
46,0.0156,HA_C57T_syn,0.015600
55,0.1280,HA_C96T_syn,0.078950
55,0.0140,HA_C140T_A47V,0.014000
28,0.0194,HA_G194A_R65K,0.017950
56,0.0556,HA_C236T_T79I,0.050867
54,0.0155,HA_G384A_syn,0.019850
34,0.8810,HA_G410A_G137E,0.799650
43,0.0130,HA_G416T_R139I,0.013400
57,0.0274,HA_G416A_R139K,0.032960
58,0.0976,HA_C419T_T140I,0.065767


In [24]:
avian_or_mammalian = pd.read_csv('avian_mammalian_SNVs_for_pandas.csv', header=0, sep=',', low_memory=False)
avian_or_mammalian = avian_or_mammalian.rename(columns={'SNPs': 'SNP'})

In [25]:
avian_or_mammalian_final = pd.merge(SNV_list_avg_freq, avian_or_mammalian, on='SNP')
# avian_or_mammalian_final

Unnamed: 0,FREQ,SNP,average_freq,avian_or_mammalian
0,0.0140,HA_C140T_A47V,0.014000,not_found
1,0.0194,HA_G194A_R65K,0.017950,avian
2,0.0556,HA_C236T_T79I,0.050867,mammalian
3,0.8810,HA_G410A_G137E,0.799650,avian
4,0.0130,HA_G416T_R139I,0.013400,not_found
5,0.0274,HA_G416A_R139K,0.032960,avian
6,0.0976,HA_C419T_T140I,0.065767,avian
7,0.2052,HA_A421G_N141D,0.363294,mammalian
8,0.2636,HA_G427A_A143T,0.153429,mammalian
9,0.0167,HA_A442G_R148G,0.016700,mammalian


In [26]:
avian_or_mammalian_final.to_csv('avian_or_mammalian_with_avg_frequency.csv')