In [1]:
import pandas as pd

In [2]:
# Load the two input tables.
# runs: tab-separated table whose header row supplies the column names.
runs = pd.read_csv('output.csv', delimiter='\t')
# files: space-separated listing; column names are supplied explicitly here.
files = pd.read_csv(
    'fq_well_and_plate.txt',
    delimiter=' ',
    names=['DNA_plate', 'DNA_well', 'filename'],
)

In [3]:
# The files table currently contains 2 entries for each plate and well (R1 and R2).
# Group by plate and well and aggregate the two file paths into two independent
# columns: the first and the last filename seen in each group.
# NOTE(review): 'first'/'last' follow the row order of the input listing, so this
# assumes R1 appears before R2 for every plate/well pair -- confirm.
grp_files = files.groupby(['DNA_plate', 'DNA_well']).agg(['first', 'last'])
# Turn the grouped result into a flat dataframe: move the group keys back into
# ordinary columns, then replace the two-level column labels with plain names.
# (This replaces the DataFrame.as_matrix() round-trip; as_matrix was deprecated
# in pandas 0.23 and removed in pandas 1.0.)
dedup_files = grp_files.reset_index()
dedup_files.columns = ['DNA_plate', 'DNA_well', 'r1_filename', 'r2_filename']

In [4]:
# Left-join the FASTQ file paths onto the runs table: every row of runs is kept,
# and the r1/r2 filename columns are attached where the keys match.
# Both tables are indexed on the same (DNA_plate, DNA_well) pair so that the
# join aligns on those keys.
join_keys = ['DNA_plate', 'DNA_well']
out = (
    runs
    .set_index(join_keys)
    .join(dedup_files.set_index(join_keys), how='left')
)

In [5]:
# number of rows in the joined table
out.shape[0]

920

In [6]:
# Write the joined table to disk as tab-separated text (the index is written too).
out.to_csv(path_or_buf='joined-left.csv', sep='\t')

In [7]:
# Preview only the first rows of the joined table rather than dumping the whole frame.
out.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,*sample_name,sample_title,bioproject_accession,*organism,*collection_date,*env_biome,*env_feature,*env_material,*geo_loc_name,*host,...,samp_mat_process,samp_size,samp_store_temp,samp_vol_we_dna_ext,description,PigPen,Cohort,Dysentery,r1_filename,r2_filename
DNA_plate,DNA_well,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
P1,A1,Fe21/14194,,,Gut microflora metagenomics,2017-02-21 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,2D,,0.0,MON5838/MON5838_1/plate_1_A1_S1_R1_001.fastq.gz,MON5838/MON5838_1/plate_1_A1_S1_R2_001.fastq.gz
P1,A10,Fe14/29631,,,Gut microflora metagenomics,2017-02-14 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,1C,,0.0,MON5838/MON5838_73/plate_1_A10_S73_R1_001.fast...,MON5838/MON5838_73/plate_1_A10_S73_R2_001.fast...
P1,A11,Fe28/29707,,,Gut microflora metagenomics,2017-02-28 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,1A,,0.0,MON5838/MON5838_81/plate_1_A11_S81_R1_001.fast...,MON5838/MON5838_81/plate_1_A11_S81_R2_001.fast...
P1,A12,Fe21/14297,,,Gut microflora metagenomics,2017-02-21 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,3D,,0.0,MON5838/MON5838_89/plate_1_A12_S89_R1_001.fast...,MON5838/MON5838_89/plate_1_A12_S89_R2_001.fast...
P1,A2,Fe28/14286,,,Gut microflora metagenomics,2017-02-28 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,1B,,0.0,MON5838/MON5838_9/plate_1_A2_S9_R1_001.fastq.gz,MON5838/MON5838_9/plate_1_A2_S9_R2_001.fastq.gz
P1,A3,Fe21/29644,,,Gut microflora metagenomics,2017-02-21 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,2B,,0.0,MON5838/MON5838_17/plate_1_A3_S17_R1_001.fastq.gz,MON5838/MON5838_17/plate_1_A3_S17_R2_001.fastq.gz
P1,A4,Fe14/29898,,,Gut microflora metagenomics,2017-02-14 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,1C,,0.0,MON5838/MON5838_25/plate_1_A4_S25_R1_001.fastq.gz,MON5838/MON5838_25/plate_1_A4_S25_R2_001.fastq.gz
P1,A5,Fe21/29679,,,Gut microflora metagenomics,2017-02-21 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,3D,,0.0,MON5838/MON5838_33/plate_1_A5_S33_R1_001.fastq.gz,MON5838/MON5838_33/plate_1_A5_S33_R2_001.fastq.gz
P1,A6,Fe14/14284,,,Gut microflora metagenomics,2017-02-14 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,3C,,0.0,MON5838/MON5838_41/plate_1_A6_S41_R1_001.fastq.gz,MON5838/MON5838_41/plate_1_A6_S41_R2_001.fastq.gz
P1,A7,Fe14/29792,,,Gut microflora metagenomics,2017-02-14 00:00:00,Host- gut,Faecal,Faecal,"NSW, Australia",Porcine,...,,,-80C,,,2A,,0.0,MON5838/MON5838_49/plate_1_A7_S49_R1_001.fastq.gz,MON5838/MON5838_49/plate_1_A7_S49_R2_001.fastq.gz
