In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from tad_helper_functions import parse_tad_annotations

In [None]:
# Use seaborn's 'talk' plotting context for every figure in this notebook.
sns.set_context(context='talk')

# Parameters

In [None]:
# NOTE(review): `snakemake` is not defined anywhere in this notebook;
# presumably it is injected by Snakemake when the notebook is executed as
# part of a workflow -- confirm before running it standalone.

# Input files
info_fname = snakemake.input.info_fname
db_fname = snakemake.input.db_fname
tads_fname = snakemake.input.tads_fname

# Output files
tad_length_plot = snakemake.output.tad_length_plot
db_out_fname = snakemake.output.db_fname

# Wildcards and configuration
tad_borders = snakemake.config['tad_borders']
source = snakemake.wildcards.source

# Load data

In [None]:
# Read the input database; its rows are merged with the TAD annotations
# further down.
df = pd.read_csv(filepath_or_buffer=db_fname)
df.head(n=5)

In [None]:
# Read the info table, using its second column (position 1) as the index so
# that rows can be looked up by `source` below.
df_info = pd.read_csv(filepath_or_buffer=info_fname, index_col=1)
df_info.head(n=5)

# Access SNP positions

In [None]:
# Genome assembly recorded for the current source; it selects which
# chromosome/position columns of the database are used below.
genome_assembly = df_info['genome_assembly'].loc[source]
genome_assembly

In [None]:
# One row per SNP: keep the SNP id plus the coordinate columns of the chosen
# assembly, rename the coordinates to assembly-independent names and drop
# repeated SNP ids (the first occurrence is kept).
df_snppos = (
    df[['snpId', f'chromosome_{genome_assembly}', f'position_{genome_assembly}']]
    .rename(
        columns={
            f'chromosome_{genome_assembly}': 'chromosome',
            f'position_{genome_assembly}': 'position',
        }
    )
    .drop_duplicates(subset=['snpId'])
    # Detach from the intermediate frames so columns can be added later.
    .copy()
)

df_snppos.head()

# Load TAD data

In [None]:
# Read the TAD table (the `tad_start` / `tad_stop` columns are used below).
df_tads = pd.read_csv(filepath_or_buffer=tads_fname)

In [None]:
# Preview the first rows of the TAD table.
df_tads.head(n=5)

### TAD statistics

In [None]:
# TAD length = stop coordinate minus start coordinate.
df_tads['length'] = df_tads['tad_stop'].sub(df_tads['tad_start'])

In [None]:
# Histogram of TAD lengths on a logarithmic x-axis; only strictly positive
# lengths are plotted.
sns.displot(
    data=df_tads.loc[df_tads['length'] > 0],
    x='length',
    element='step',
    log_scale=True,
)

# Label the axes of the figure that was just drawn.
plt.gca().set_xlabel('TAD length')
plt.gca().set_ylabel('Count')

plt.tight_layout()
plt.savefig(tad_length_plot)

## Annotate SNPs with TAD membership

In [None]:
def access_range_dict(row, dict_):
    """Look up the annotation of one SNP position.

    Parameters
    ----------
    row
        Mapping-like object (e.g. a DataFrame row) providing the keys
        ``'chromosome'`` and ``'position'``.
    dict_
        Mapping from chromosome name (as ``str``) to a per-chromosome lookup
        object that is indexed by position and raises ``KeyError`` for
        positions it does not cover.

    Returns
    -------
    The value stored for the position, ``'undef'`` if the chromosome is not
    a key of ``dict_``, or ``'outside'`` if the position lookup raises
    ``KeyError``.
    """
    chromosome = row['chromosome']
    # Integer chromosome columns that contain missing values are stored as
    # floats by pandas; without this normalisation chromosome 1 would be
    # looked up as '1.0' and always be reported as 'undef'.
    if isinstance(chromosome, float) and chromosome.is_integer():
        chromosome = int(chromosome)

    range_dict_ = dict_.get(str(chromosome), None)
    if range_dict_ is None:
        return 'undef'

    try:
        return range_dict_[row['position']]
    except KeyError:
        return 'outside'

In [None]:
# `progress_apply` only exists on DataFrames after `tqdm.pandas()` has been
# called somewhere; this notebook never calls it itself, so fall back to the
# plain `apply` (same result, no progress bar) when it is not registered.
apply_rows = getattr(df_snppos, 'progress_apply', df_snppos.apply)

for border_name, border_range in tad_borders.items():
    # Build the annotation lookup for this border definition and store the
    # per-SNP result in a column named after the border configuration.
    tad_anno = parse_tad_annotations(border_range, fname=tads_fname)
    df_snppos[border_name] = apply_rows(
        lambda row: access_range_dict(row, tad_anno), axis=1
    )

In [None]:
# Keep only the SNP id and the annotation columns; the coordinates are
# already present in the input database.
df_snptads = df_snppos.drop(columns=['chromosome', 'position'])
df_snptads.head(n=5)

Possible values in each TAD annotation column:
* `tad`: SNP is in TAD body (i.e. not in border)
* `border`: SNP is in TAD border
* `undef`: no TAD information is available for the chromosome the SNP is on
* `outside`: SNP is outside of any TAD

# Merge into input database

In [None]:
# Attach the per-SNP annotation columns to every database row with the
# same `snpId` (inner join, the pandas default).
df_merged = pd.merge(left=df, right=df_snptads, how='inner', on='snpId')

In [None]:
# Preview the first rows of the merged table.
df_merged.head(n=5)

# Save result

In [None]:
# Write the annotated database as CSV without the row index.
df_merged.to_csv(path_or_buf=db_out_fname, index=False)