# Analyze sex of patients from sex chromosome mapping counts
Import Python modules:

In [None]:
import altair as alt

import pandas as pd

Get variables from `snakemake`:

In [None]:
# `snakemake` is not defined anywhere in this notebook; it is expected to be
# supplied by the Snakemake runtime that executes it.

# Inputs: the counts files, read with `pd.read_csv` further down.
counts_files = snakemake.input.counts

# Params: `descriptors` holds one mapping of extra columns per counts file,
# and `reported_sex` is used as a sample -> reported sex lookup.
descriptors = snakemake.params.descriptors
reported_sex = snakemake.params.reported_sex

# Outputs: where the summary table and the chart are written.
stats_file, chart_file = snakemake.output.stats, snakemake.output.chart

# The files and descriptors are paired up with `zip`, which would silently
# truncate to the shorter of the two, so require equal lengths up front.
assert len(counts_files) == len(descriptors)

Aggregate all the data into one data frame:

In [None]:
# Preview only: the stacked table is the cell's output but is not stored in
# a variable; the following cell rebuilds it and assigns it to `sex_data`.
pd.concat(
    [
        pd.read_csv(counts_path).assign(**run_info)
        for counts_path, run_info in zip(counts_files, descriptors)
    ]
)

In [None]:
# Stack every counts file into one long table, adding that file's descriptor
# values as constant columns so rows from different files stay distinguishable.
sex_data = pd.concat(
    [
        pd.read_csv(counts_path).assign(**run_info)
        for counts_path, run_info in zip(counts_files, descriptors)
    ]
)

# The pivot below produces one column per chromosome and later code indexes
# 'X' and 'Y' directly, so insist that exactly those two are present.
assert set(sex_data['chromosome']) == {'X', 'Y'}

# Every column other than the chromosome label, its count and the per-row
# 'sex' column identifies a row of the wide table.
_key_columns = [
    col for col in sex_data.columns
    if col not in {'chromosome', 'count', 'sex'}
]

# Long -> wide: one row per key combination with the X and Y counts side by
# side.
# NOTE(review): `pivot_table` is called without `aggfunc`, so pandas' default
# aggregation (the mean) applies; duplicate rows for the same key and
# chromosome would be averaged silently - confirm each appears only once.
_wide_counts = (
    sex_data
    .drop(columns='sex')
    .pivot_table(index=_key_columns, columns='chromosome', values='count')
    .reset_index()
)

# Annotate each row with the reported sex of its sample and the X:Y ratio.
_wide_counts['reported_sex'] = _wide_counts['sample'].map(reported_sex)
_wide_counts['X_to_Y_ratio'] = _wide_counts['X'] / _wide_counts['Y']

# Keep only the columns that are written out and plotted, in a fixed order.
_output_columns = [
    'sample', 'reported_sex', 'host_genome', 'aligner',
    'X', 'Y', 'X_to_Y_ratio',
]
sex_data = _wide_counts[_output_columns]

print(f"Writing data to {stats_file}")
sex_data.to_csv(stats_file, index=False)

sex_data

Make an interactive Altair plot:

In [None]:
# NOTE(review): `selection_single`, `init=` and `add_selection` are the
# Altair 4 spellings; newer Altair releases deprecate them - confirm the
# pinned Altair version before upgrading.
def _dropdown_selection(name, field, data):
    """Return a single-value selection on `field` bound to a dropdown.

    The dropdown offers the unique values of `data[field]` and starts on the
    first of them.
    """
    choices = data[field].unique()
    return alt.selection_single(
        name=name,
        fields=[field],
        bind=alt.binding_select(options=choices),
        init={field: choices[0]},
    )


# Dropdowns choosing which aligner / host genome is shown.
aligner_selection = _dropdown_selection('read', 'aligner', sex_data)
host_genome_selection = _dropdown_selection('reference', 'host_genome', sex_data)

# Hovering over a point selects its sample; nothing is selected otherwise.
sample_selection = alt.selection_single(
    fields=['sample'],
    on='mouseover',
    empty='none',
)

# Points with fewer mapped reads than this are filtered out of the plot.
lower_bound = 10

_hover_tooltip = [
    'sample',
    'reported_sex',
    alt.Tooltip('X', title='X reads'),
    alt.Tooltip('Y', title='Y reads'),
    alt.Tooltip('X_to_Y_ratio', title='X to Y ratio', format='.2g'),
]

_encodings = dict(
    x=alt.X(
        'reads mapped:Q',
        scale=alt.Scale(type='log'),
        axis=alt.Axis(grid=False),
    ),
    y='sample:N',
    fill='sex chromosome:N',
    shape='reported_sex',
    # Make the points of the hovered sample larger and outlined in black.
    size=alt.condition(sample_selection, alt.value(100), alt.value(75)),
    stroke=alt.condition(sample_selection, alt.value('black'), alt.value(None)),
    tooltip=_hover_tooltip,
)

chart = (
    alt.Chart(sex_data)
    # Fold the X and Y count columns into (sex chromosome, reads mapped)
    # pairs so each sample gets one point per chromosome.
    .transform_fold(['X', 'Y'], ['sex chromosome', 'reads mapped'])
    .transform_filter(alt.datum['reads mapped'] >= lower_bound)
    # Show only the aligner / host genome picked in the dropdowns.
    .transform_filter(aligner_selection)
    .transform_filter(host_genome_selection)
    .mark_point()
    .encode(**_encodings)
    .add_selection(
        aligner_selection,
        host_genome_selection,
        sample_selection,
    )
    .properties(title=f"Reads mapping to each sex chromosome (counts <{lower_bound} clipped)")
)

print(f"Saving chart to {chart_file}")
chart.save(chart_file)

chart