In [1]:
from pathlib import Path
import traceback
import pandas as pd
import numpy as np
import re

### Timing functions

In [2]:
"""
OBJECTIVE:
Investigate if it's better to directly concatenate dataframes, or to merge dataframes first
and select out columns and convert them to a series with pd.concat
"""

"\nOBJECTIVE:\nInvestigate if it's better to directly concatenate dataframes, or to merge dataframes first\nand select out columns and convert them to a series with pd.concat\n"

In [13]:
# Replicate 1 toy fixture: one tuple per genomic position, zipped against the
# column names so every record is a plain dict (None marks a missing base count).
df1_list = [
    dict(zip(
        ("Chrom",
         "GenomicModBase",
         "Rep1_A_BS",
         "Rep1_C_BS",
         "Rep1_G_BS",
         "Rep1_T_BS",
         "Rep1_TotalCoverage_BS",
         "Rep1_DeletionRate_BS"),
        row,
    ))
    for row in (
        ("NW233434", 231, 4, 1, 1, 1, 2, 0.87),
        ("NW233438", 231, 4, 1, 1, 1, 5, 0.67),
        ("NW233435", 331, None, None, None, None, 3, 0.4),
        ("NW233433", 230, 3, 1, 1, 1, 0, 0.2),
        ("NW233432", 229, 2, 6, 4, 7, 0, 0.93),
    )
]

# Build the frame from the records and drop any fully identical rows.
df1 = pd.DataFrame(df1_list).drop_duplicates()

In [12]:
# Replicate 2 toy fixture: one tuple per genomic position, zipped against the
# column names so every record is a plain dict (None marks a missing base count).
df2_list = [
    dict(zip(
        ("Chrom",
         "GenomicModBase",
         "Rep2_A_BS",
         "Rep2_C_BS",
         "Rep2_G_BS",
         "Rep2_T_BS",
         "Rep2_TotalCoverage_BS",
         "Rep2_DeletionRate_BS"),
        row,
    ))
    for row in (
        ("NW233434", 231, 4, 1, 1, 1, 77, 0.56),
        ("NW233438", 231, 4, 1, 1, 1, 88, 0.76),
        ("NW233435", 331, None, None, None, None, 99, 0.86),
        ("NW233433", 230, 3, 1, 1, 1, 100, 0.23),
        ("NW233432", 229, 2, 6, 4, 7, 40, 0.10),
    )
]

# Build the frame from the records and drop any fully identical rows.
df2 = pd.DataFrame(df2_list).drop_duplicates()

In [17]:
# Bundle both replicate frames (replicate 1 first) so list-based helpers can take them as one argument.
df_list = [df1, df2]

In [None]:
%%timeit
def manual_concat(df_list):
    """
    Stack the total-coverage column of every dataframe into one long column.

    For each dataframe, only the columns whose name contains
    "_TotalCoverage_" are kept and the first of them is renamed to
    "TotalCoverage". The reduced frames are then concatenated row-wise with a
    fresh 0..n-1 index. Dataframes without such a column are skipped.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames to scan. They are not modified.

    Returns
    -------
    pandas.DataFrame
        The concatenated coverage values. When no input frame has a coverage
        column (including an empty ``df_list``), an empty frame with a single
        "TotalCoverage" column is returned.
    """
    coverage_frames = []
    for df in df_list:
        cov_col = [col for col in df.columns if re.search("_TotalCoverage_", col)]
        if not cov_col:
            continue
        # Select the coverage columns directly instead of computing and
        # dropping the complement; the column order of df is preserved.
        coverage_only = df.loc[:, df.columns.isin(cov_col)]
        # Only the first match is renamed; any further matches keep their name.
        coverage_frames.append(coverage_only.rename(columns = {cov_col[0]: "TotalCoverage"}))

    # pd.concat raises ValueError on an empty list, so handle the case where
    # every frame was skipped explicitly.
    if not coverage_frames:
        return pd.DataFrame(columns = ["TotalCoverage"])

    return pd.concat(coverage_frames, ignore_index = True)

manual_concat(df_list)

840 μs ± 287 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
%%timeit

def merge_first(df1, df2):
    """
    Outer-merge two frames on df1's first two columns, then stack every
    coverage column of the merged frame into a single Series.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to merge; the first two columns of ``df1`` are used as join
        keys and must also exist in ``df2``.

    Returns
    -------
    pandas.Series
        Values of all merged columns whose name contains "_TotalCoverage_",
        appended one column after the other under a fresh 0..n-1 index.
    """
    # The join keys are taken positionally from the left frame.
    key_columns = list(df1.columns[:2])
    merged = df1.merge(df2, on = key_columns, how = "outer", sort = False)

    coverage_series = []
    for name in merged.columns:
        if re.search("_TotalCoverage_", name):
            coverage_series.append(merged[name])

    return pd.concat(coverage_series, ignore_index = True)

merge_first(df1, df2)

1.58 ms ± 461 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Testing pre-processing function

In [None]:
# Map the replicate-1 coverage and deletion-rate column names to replicate-agnostic names.
name_dict = {"Rep1_TotalCoverage_BS": "TotalCoverage",
             "Rep1_DeletionRate_BS": "DeletionRate"}

# rename() returns a new frame and df1 is rebound to it here, so any cell run after this one
# sees "TotalCoverage"/"DeletionRate" in df1 instead of the Rep1_* names.
df1 = df1.rename(columns = name_dict)

Unnamed: 0,Chrom,GenomicModBase,Rep1_A_BS,Rep1_C_BS,Rep1_G_BS,Rep1_T_BS,TotalCoverage,DeletionRate
0,NW233434,231,4.0,1.0,1.0,1.0,2,0.87
1,NW233438,231,4.0,1.0,1.0,1.0,5,0.67
2,NW233435,331,,,,,3,0.4
3,NW233433,230,3.0,1.0,1.0,1.0,0,0.2
4,NW233432,229,2.0,6.0,4.0,7.0,0,0.93


In [None]:
from itertools import chain
def preprocess_df(df_list, pattern_list):
    """
    Rename pattern-matching columns of each dataframe to a shared name.

    Every column whose name matches a pattern (via ``re.search``) is renamed
    to that pattern with leading/trailing underscores stripped, e.g. a column
    matching "_TotalCoverage_" becomes "TotalCoverage".

    The rename map is built separately for each dataframe, directly from each
    (matched column, pattern) pair. A pattern that matches nothing in one
    frame therefore cannot shift the names assigned in that frame or in any
    later one.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames to process. They are not modified.
    pattern_list : iterable of str
        Regular-expression patterns to look for in the column names.

    Returns
    -------
    list of pandas.DataFrame
        One renamed copy per input frame, in input order. If a pattern matches
        several columns of one frame, all of them receive the same new name.
        If several patterns match one column, the last pattern wins.
    """
    # Compile once; the target name only depends on the pattern itself.
    renamers = [(re.compile(pattern), pattern.strip("_")) for pattern in pattern_list]

    processed_list = []
    for df in df_list:
        # Fresh mapping per frame so state never leaks between dataframes.
        name_dict = {}
        for regex, new_name in renamers:
            for col in df.columns:
                if regex.search(col):
                    name_dict[col] = new_name
        processed_list.append(df.rename(columns = name_dict))

    return processed_list

pattern_list = ["_TotalCoverage_", "_DeletionRate_"]
preprocess_df(df_list, pattern_list)

[      Chrom  GenomicModBase  Rep1_A_BS  Rep1_C_BS  Rep1_G_BS  Rep1_T_BS  \
 0  NW233434             231        4.0        1.0        1.0        1.0   
 1  NW233438             231        4.0        1.0        1.0        1.0   
 2  NW233435             331        NaN        NaN        NaN        NaN   
 3  NW233433             230        3.0        1.0        1.0        1.0   
 4  NW233432             229        2.0        6.0        4.0        7.0   
 
    TotalCoverage  DeletionRate  
 0              2          0.87  
 1              5          0.67  
 2              3          0.40  
 3              0          0.20  
 4              0          0.93  ,
       Chrom  GenomicModBase  Rep2_A_BS  Rep2_C_BS  Rep2_G_BS  Rep2_T_BS  \
 0  NW233434             231        4.0        1.0        1.0        1.0   
 1  NW233438             231        4.0        1.0        1.0        1.0   
 2  NW233435             331        NaN        NaN        NaN        NaN   
 3  NW233433             230     