In [None]:
import os

import pandas as pd

from tqdm.auto import tqdm

# Parameters

In [None]:
# Input files and the wildcard identifying this run.
source = snakemake.wildcards.source
fname_list = snakemake.input.fname_list

# Settings read from the workflow configuration.
workflow_config = snakemake.config
tad_borders = workflow_config['tad_borders']
border_fraction_threshold = workflow_config['parameters']['snp_majority_vote_border_fraction_threshold']

# Output paths.
fname_output_tads = snakemake.output.fname_tads
fname_output = snakemake.output.fname

# Load data

In [None]:
# Read every input table and check that its file name belongs to the TAD
# source of this run.
df_list = []
for input_path in tqdm(fname_list):
    input_table = pd.read_csv(input_path)

    # The base name must consist of exactly four dot-separated parts; the
    # second is the TAD source and the third the window size.
    name_parts = os.path.basename(input_path).split('.')
    _, tad_source, window_size, _ = name_parts
    assert tad_source == source

    df_list.append(input_table)

# Stack all tables row-wise (original row labels are kept).
df_all = pd.concat(df_list)
df_all.head()

# SNP Majority Vote

Notes:
* Within a disease-SNP group, the `odds_ratio` can vary because different studies reported different values; it is therefore excluded from the per-group consistency check.

In [None]:
%%time

variable_columns = {'odds_ratio'}

row_list = []
for idx, group in tqdm(df_all.groupby(['diseaseId', 'snpId'])):
    # make sure all other columns are equal for all grouped SNPs
    for col in set(group.columns) - set(tad_borders.keys()) - variable_columns:
        assert group[col].nunique() <= 1, group[col]  # can be 0 if all NaN
    
    # find majority SNP classification
    row = group.iloc[0].copy()

    for border_col in tad_borders.keys():
        counts = group[border_col].value_counts()

        if counts.get('border', 0) >= border_fraction_threshold * counts.sum():
            type_ = 'border'
        else:
            type_ = counts.idxmax()

        row[tad_borders] = type_
    
    row_list.append(row)

In [None]:
# Assemble the per-group representative rows into a single table.
df_agg = pd.DataFrame(data=row_list)
df_agg.head(n=5)

# Inspect result

In [None]:
# For each input file, show how the '20in' column is distributed before
# aggregation.
for input_path in fname_list:
    per_file_table = pd.read_csv(input_path)

    print(os.path.basename(input_path))
    # the extra newline leaves a blank line between files
    print(per_file_table['20in'].value_counts(), end='\n\n')

In [None]:
# Distribution of the '20in' column after aggregation.
aggregated_20in = df_agg['20in']
aggregated_20in.value_counts()

# Save result

In [None]:
# Write the aggregated table without the row index.
df_agg.to_csv(path_or_buf=fname_output, index=False)

# Create dummy TAD file

This is needed for Snakemake DAG building.

In [None]:
# Header-only CSV: the three column names and no rows.
dummy_tad_columns = ['chrname', 'tad_stop', 'tad_start']
pd.DataFrame(columns=dummy_tad_columns).to_csv(fname_output_tads, index=False)