In [1]:
import os
import pandas as pd

os.chdir(os.path.realpath('viruses_alignment_stats'))

files = os.listdir()
print(files)

['amelRNA_01_abpv_stats.txt', 'amelRNA_01_alpv_stats.txt', 'amelRNA_01_beemlv_stats.txt', 'amelRNA_01_bqcv_stats.txt', 'amelRNA_01_bsrv_stats.txt', 'amelRNA_01_cbpv_stats.txt', 'amelRNA_01_dwv_stats.txt', 'amelRNA_01_iapv_stats.txt', 'amelRNA_01_kbv_stats.txt', 'amelRNA_01_kv_stats.txt', 'amelRNA_01_lsv_stats.txt', 'amelRNA_01_sbpv_stats.txt', 'amelRNA_01_sb_stats.txt', 'amelRNA_01_vdv1_stats.txt', 'amelRNA_02_abpv_stats.txt', 'amelRNA_02_alpv_stats.txt', 'amelRNA_02_beemlv_stats.txt', 'amelRNA_02_bqcv_stats.txt', 'amelRNA_02_bsrv_stats.txt', 'amelRNA_02_cbpv_stats.txt', 'amelRNA_02_dwv_stats.txt', 'amelRNA_02_iapv_stats.txt', 'amelRNA_02_kbv_stats.txt', 'amelRNA_02_kv_stats.txt', 'amelRNA_02_lsv_stats.txt', 'amelRNA_02_sbpv_stats.txt', 'amelRNA_02_sb_stats.txt', 'amelRNA_02_vdv1_stats.txt', 'amelRNA_03_abpv_stats.txt', 'amelRNA_03_alpv_stats.txt', 'amelRNA_03_beemlv_stats.txt', 'amelRNA_03_bqcv_stats.txt', 'amelRNA_03_bsrv_stats.txt', 'amelRNA_03_cbpv_stats.txt', 'amelRNA_03_dwv_stats

In [86]:
def get_sample_ids(filename_list):
    id = []
    for filename in filename_list: # looping through each line
        if filename[0:10] in id:
            continue
        else:
            id.append(filename[0:10])
    return id

sample_ids = get_sample_ids(files)

In [87]:
print(sample_ids)

['amelRNA_01', 'amelRNA_02', 'amelRNA_03']


In [84]:
def virus_name(filename_list):
    virus_id = []
    for filename in filename_list: 
        start = filename.index('_', filename.index('_')+1)
        filename_temp = filename[start+1:]
        end = filename_temp.index('_', filename_temp.index('_'))
        virus = filename_temp[:end]
        
        if virus in virus_id:
            continue
        else:
            virus_id.append(virus)
    return virus_id

viruses = virus_name(files)

In [85]:
print(viruses)

['abpv', 'alpv', 'beemlv', 'bqcv', 'bsrv', 'cbpv', 'dwv', 'iapv', 'kbv', 'kv', 'lsv', 'sbpv', 'sb', 'vdv1']


In [82]:
def reads_mapped_paired(file):
    for i in range(0, len(file)):
        if "reads mapped and paired" in file[i]:
            reads = file[i].split("\t")
    return reads[2]

def viral_read_dict(filename_list, sample_ids, virus_ids):
    sample_viral_reads = {}
    for sample in sample_ids:
        reads = []
        for filename in filename_list:
            if sample in filename:
                with open(filename, 'r') as f:
                    text = f.read().split("\n")

                reads.append(reads_mapped_paired(text))
                s = dict(zip(virus_ids,reads))
                sample_viral_reads.update({sample:s})
    return sample_viral_reads

viral_dict = viral_read_dict(files, sample_ids, viruses)

In [83]:
print(viral_dict)

{'amelRNA_01': {'abpv': '0', 'alpv': '0', 'beemlv': '0', 'bqcv': '402', 'bsrv': '0', 'cbpv': '0', 'dwv': '633142', 'iapv': '0', 'kbv': '0', 'kv': '949523', 'lsv': '0', 'sbpv': '0', 'sb': '0', 'vdv1': '760468'}, 'amelRNA_02': {'abpv': '0', 'alpv': '0', 'beemlv': '0', 'bqcv': '1710', 'bsrv': '0', 'cbpv': '0', 'dwv': '122516', 'iapv': '0', 'kbv': '0', 'kv': '242259', 'lsv': '0', 'sbpv': '0', 'sb': '0', 'vdv1': '384316'}, 'amelRNA_03': {'abpv': '0', 'alpv': '0', 'beemlv': '0', 'bqcv': '19924', 'bsrv': '0', 'cbpv': '0', 'dwv': '37310', 'iapv': '0', 'kbv': '0', 'kv': '7076', 'lsv': '0', 'sbpv': '0', 'sb': '0', 'vdv1': '48102'}}


In [81]:
viral_reads_df = pd.concat([pd.Series(sample_ids), pd.DataFrame.from_dict(viral_dict.values())], axis=1)
viral_reads_df = viral_reads_df.rename(columns={0: 'sample'}).set_index('sample')

In [77]:
viral_reads_df

Unnamed: 0_level_0,abpv,alpv,beemlv,bqcv,bsrv,cbpv,dwv,iapv,kbv,kv,lsv,sbpv,sb,vdv1
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
amelRNA_01,0,0,0,402,0,0,633142,0,0,949523,0,0,0,760468
amelRNA_02,0,0,0,1710,0,0,122516,0,0,242259,0,0,0,384316
amelRNA_03,0,0,0,19924,0,0,37310,0,0,7076,0,0,0,48102


In [73]:
viral_reads_df.to_csv("viral_reads.csv")