# Genotype data formatting

This module implements a collection of workflows used to format genotype data.

## Overview

The module streamlines conversion between PLINK and VCF formats, as well as splitting and merging of genotype data (more may be added later), specifically:

1. Conversion between VCF and PLINK formats
2. Split data (by specified input, by chromosomes, by genes)
3. Merge data (by specified input, by chromosomes)

## Input

Depending on the analysis task, input files are specified in one of the following formats:

1. A single whole-genome data set in VCF format, or as a PLINK bed/bim/fam bundle; or
2. A list of VCF or PLINK bed files; or
3. A single-column file listing VCF or PLINK bed files; or
4. A two-column file listing per-chromosome VCF or PLINK bed files, where the first column is the chromosome and the second column is the file name

## Output

Genotype data after reformatting.

## Examples

A minimal working example data set, as well as the Singularity container `bioinfo.sif`, can be downloaded from [Google Drive](https://drive.google.com/drive/u/0/folders/1ahIZGnmjcGwSd-BI91C9ayd_Ya8sB2ed).

### PLINK file merger

```
sos run genotype_formatting.ipynb merge_plink \
    --genoFile data/genotype/chr1.bed data/genotype/chr6.bed \
    --cwd output/genotype \
    --name chr1_chr6 \
    --container container/bioinfo.sif
```

...

## Command interface

In [1]:
sos run genotype_formatting.ipynb -h

usage: sos run genotype_formatting.ipynb
               [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  plink_to_vcf
  vcf_to_plink
  plink_by_gene
  plink_by_chrom
  merge_plink
  merge_vcf

Global Workflow Options:
  --cwd output (as path)
                        Work directory & output directory
  --container ''
                        The filename name for containers
  --job-size 1 (as int)
                        For cluster jobs, number commands to run per job
  --walltime 5h
                        Wall clock time expected
  --mem 3G
                        Memory expected
  --numThreads 20 (as int)
                        Number of threads
  --genoFile  paths

                

In [None]:
[global]
# Parameters and helpers shared by every workflow in this notebook.

# Work directory & output directory
parameter: cwd = path("output")
# The filename name for containers
parameter: container = ''
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "3G"
# Number of threads
parameter: numThreads = 20
# the path to a bed file or VCF file, a vector of bed files or VCF files, or a text file listing the bed files or VCF files to process
parameter: genoFile = paths
# use this function to edit memory string for PLINK input
from sos.utils import expand_size
# Replace `cwd` with its absolute-path string (SoS `:a` path format); the steps
# below build every output name as f'{cwd}/...'.
cwd = f"{cwd:a}"

import os
def get_genotype_file(geno_file_paths):
    """Resolve the `genoFile` parameter into the genotype files to process.

    Accepted inputs:

    * several genotype files (each ending in `.bed` or `.vcf.gz`);
    * a single genotype file;
    * a single text file listing genotype files, either one file name per
      line, or two whitespace-separated columns (chrom, file name). Blank
      lines and lines starting with `#` are ignored. File names that are not
      found as given are looked up relative to the directory of the list file.

    Returns:
        A `paths` collection, or, for a two-column list file, a dict mapping
        the chromosome name (leading `chr` removed) to the file `path`.
        NOTE: if a chromosome is listed more than once, the last entry wins.

    Raises:
        ValueError: if no input is given, a file does not look like a
            genotype file, a listed file cannot be found, or the list file is
            empty or mixes one- and two-column lines.
    """
    def valid_geno_file(x):
        # `suffixes` is empty for extension-less names (e.g. a list file
        # called "geno_list"); guard before indexing so such names are
        # reported as "not a genotype file" instead of raising IndexError.
        suffixes = path(x).suffixes
        if not suffixes:
            return False
        return suffixes[-1] == '.bed' or ''.join(suffixes[-2:]) == '.vcf.gz'

    def complete_geno_path(x, geno_file):
        if not valid_geno_file(x):
            raise ValueError(f"Genotype file {x} should be VCF (end with .vcf.gz) or PLINK bed file (end with .bed)")
        if os.path.isfile(x):
            return x
        # Not found as given: try it relative to the directory of the list file.
        candidate = f'{geno_file:ad}/' + x
        if not os.path.isfile(candidate):
            raise ValueError(f"Cannot find genotype file {x}")
        return candidate

    def format_chrom(chrom):
        # Normalise "chr1" and "1" to the same key.
        if chrom.startswith('chr'):
            chrom = chrom[3:]
        return chrom

    if len(geno_file_paths) == 0:
        raise ValueError("No genotype file is specified")
    # Inputs are a vector of VCF or bed files
    if len(geno_file_paths) > 1:
        if all(valid_geno_file(x) for x in geno_file_paths):
            return paths(geno_file_paths)
        raise ValueError(f"Invalid input {geno_file_paths}")
    # Input is one genotype file or a text list of genotype files
    geno_file = geno_file_paths[0]
    if valid_geno_file(geno_file):
        return paths(geno_file)
    # Read the list file, closing the handle deterministically.
    with open(geno_file) as handle:
        units = [line.split() for line in handle
                 if line.strip() and not line.strip().startswith('#')]
    if not units:
        # `all()` over an empty list is True, which would otherwise silently
        # yield an empty set of inputs.
        raise ValueError(f"{geno_file} does not list any genotype file")
    if all(len(x) == 1 for x in units):
        return paths([complete_geno_path(x[0], geno_file) for x in units])
    if all(len(x) == 2 for x in units):
        return {format_chrom(chrom): path(complete_geno_path(file_name, geno_file))
                for chrom, file_name in units}
    raise ValueError(f"{geno_file} should contain one column of file names, or two columns of chrom number and corresponding file name")
                        
# Resolve the user-supplied `genoFile` once, up front: afterwards it is either a
# `paths` collection or, for a two-column list file, a dict of chrom -> path.
genoFile = get_genotype_file(genoFile)

## PLINK to VCF

In [1]:
[plink_to_vcf_1]
# Convert every PLINK bed/bim/fam bundle in `genoFile` into a bgzip-compressed,
# tabix-indexed VCF written to `cwd`.
# A two-column (chrom, file) list is parsed into a dict; only its files are needed here.
if isinstance(genoFile, dict):
    genoFile = genoFile.values()

# `group_by = 1`: each genotype file is handled as its own input group.
input: genoFile, group_by = 1
output: f'{cwd}/{_input:bn}.vcf.gz'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
# plink is given the output name without `.vcf.gz` as prefix and writes `<prefix>.vcf`;
# bgzip then compresses it into the declared output and tabix indexes it.
# `--memory` receives 90% of `mem` divided by 1e6 (presumably bytes -> MB; confirm
# against `expand_size`).
bash: expand= "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container, volumes = [f'{_input:ad}:{_input:ad}']
    plink --bfile ${_input:n} \
        --recode vcf-iid  \
        --out ${_output:nn} \
        --threads ${numThreads} \
        --memory ${int(expand_size(mem) * 0.9)/1e06} --output-chr chrMT

    bgzip -l 9 ${_output:n}
    tabix -f -p vcf ${_output}

## VCF to PLINK

In [None]:
[vcf_to_plink]
# Convert every VCF in `genoFile` into a PLINK bed/bim/fam bundle written to `cwd`.
# A two-column (chrom, file) list is parsed into a dict; only its files are needed here.
if isinstance(genoFile, dict):
    genoFile = genoFile.values()

# `group_by = 1`: each VCF is handled as its own input group.
input: genoFile, group_by = 1
# Name the output after the input's base name only (`b`) with `.vcf.gz` stripped (`nn`).
# Without `b` the input's own directory was appended to `cwd`, so results landed in
# `<cwd>/<input dir>/...` instead of `<cwd>/...` as in `plink_to_vcf`.
output: f'{cwd}/{_input:bnn}.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
# `--memory` receives 90% of `mem` divided by 1e6 (presumably bytes -> MB; confirm
# against `expand_size`).
bash: container = container, expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    plink --vcf ${_input} \
        --vcf-half-call m \
        --vcf-require-gt \
        --allow-extra-chr \
        --make-bed --out ${_output:n} \
        --threads ${numThreads} \
        --memory ${int(expand_size(mem) * 0.9)/1e06}

## Split PLINK by genes

In [None]:
[plink_by_gene_1]
# cis window size
parameter: window = 500000
# Region definition
parameter: region_list = path
# Parse the region list into whitespace-split fields, skipping blank lines and lines
# starting with `#`. The script below reads fields 0-2 as chromosome, start and end,
# and field 3 names the per-region output file.
regions = [x.strip().split() for x in open(region_list).readlines() if x.strip() and not x.strip().startswith('#')]
# `for_each = 'regions'`: the step is repeated for every parsed region, exposed as `_regions`.
input: genoFile, for_each = 'regions'
output: f'{cwd}/{region_list:bn}_plink_files/{_input:bn}.{_regions[3]}.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
# Extract variants from `start - window` to `end + window` on the region's chromosome;
# the lower bound is set to 1 when `start - window` would be negative. If plink fails,
# `touch` creates an empty file at the declared output path.
bash: expand= "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container, volumes = [f'{region_list:ad}:{region_list:ad}']
    plink --bfile ${_input:an} \
        --make-bed \
        --out ${_output[0]:n} \
        --chr ${_regions[0]} \
        --from-bp ${f'1' if (int(_regions[1]) - window) < 0 else f'{(int(_regions[1]) - window)}'} \
        --to-bp ${int(_regions[2]) + window} \
        --allow-no-sex --output-chr chrMT || touch ${_output}

## Split PLINK by Chromosome

In [None]:
[plink_by_chrom_1]
# Split one PLINK bundle into one bundle per requested chromosome, written to `cwd`.
stop_if(len(paths(genoFile))>1, msg = "This workflow expects one input genotype file.")
# Chromosomes to extract; one output bundle is produced per entry
parameter: chrom = list
# `for_each = "chrom"`: the step is repeated for every entry of `chrom`, exposed as `_chrom`.
input: genoFile, for_each = "chrom"
output: f'{cwd}/{_input:bn}.{_chrom}.bed'
# look up for genotype file
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
# This script uses `$[ ]` (not `${ }`) as the interpolation sigil.
# `|| true` makes the script exit successfully even when plink fails.
bash: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container, volumes = [f'{genoFile:ad}:{genoFile:ad}']
    ##### Get the locus genotypes for $[_chrom]
    plink --bfile $[_input:an] \
    --make-bed \
    --out $[_output[0]:n] \
    --chr $[_chrom] \
    --allow-no-sex  || true 

In [None]:
[plink_by_chrom_2, plink_by_gene_2]
# Shared second step of `plink_by_chrom` and `plink_by_gene`: write a tab-separated
# index of all input files (presumably the outputs of the preceding `_1` step) with
# two columns, `#id` and `dir`.
# `#id` is the second-to-last dot-separated field of each path, i.e. the chromosome or
# region name for files named `<prefix>.<id>.bed`; `dir` is the file path itself.
input: group_by = "all"
output: f'{_input[0]:nn}.plink_files_list.txt'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
# NOTE(review): the `csv` import in the script below is not used.
python: expand= "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container
    import csv
    import pandas as pd 
    data_tempt = pd.DataFrame({
    "#id" : [x.split(".")[-2] for x in  [${_input:r,}]],
    "dir" : [${_input:r,}]
    })
    data_tempt.to_csv("${_output}",index = False,sep = "\t" )

In [None]:
[plink_to_vcf_2]
# Write a tab-separated index of all input VCF files (presumably the outputs of
# `plink_to_vcf_1`) with two columns, `#id` and `dir`.
# `#id` is the third-to-last dot-separated field of each path, i.e. `<id>` for files
# named `<prefix>.<id>.vcf.gz`; `dir` is the file path itself.
# NOTE(review): for a file without such a field (e.g. `/a/b/chr1.vcf.gz`) the `#id`
# is everything before `.vcf.gz`, directory included.
input: group_by = "all"
output: f'{_input[0]:nnn}.vcf_files_list.txt'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
# NOTE(review): the `csv` import in the script below is not used.
python: expand= "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container
    import csv
    import pandas as pd 
    data_tempt = pd.DataFrame({
    "#id" : [x.split(".")[-3] for x in  [${_input:r,}]],
    "dir" : [${_input:r,}]
    })
    data_tempt.to_csv("${_output}",index = False,sep = "\t" )

## Split VCF by Chromosome

**FIXME: add this as needed**

## Merge PLINK files

In [None]:
[merge_plink]
# Merge several PLINK bundles into a single bundle `<cwd>/<name>.bed`.
# The step is skipped when only one genotype file is given.
skip_if(len(genoFile) == 1)
# File prefix for the analysis output
parameter: name = str
# `group_by = 'all'`: all genotype files are handled together as one input group.
input: genoFile, group_by = 'all'
output: f"{cwd}/{name}.merge_list", f"{cwd}/{name}.bed"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[1]:bn}'

# The first input is passed to plink as `--bfile`; every remaining input goes into the
# merge list, one per line, with its extension removed.
with open(_output[0], 'w') as f:
    f.write('\n'.join([str(f'{x:n}') for x in _input[1:]]))

# `--memory` receives 90% of `mem` divided by 1e6 (presumably bytes -> MB; confirm
# against `expand_size`).
bash: container=container, expand= "${ }", stderr = f'{_output[1]:n}.stderr', stdout = f'{_output[1]:n}.stdout'
    plink \
    --bfile ${_input[0]:n} \
    --merge-list ${_output[0]} \
    --make-bed \
    --out ${_output[1]:n} \
    --threads ${numThreads} \
    --memory ${int(expand_size(mem) * 0.9)/1e06}

## Merge VCF files

In [None]:
[merge_vcf]
# Concatenate several VCF files into a single bgzip-compressed, tabix-indexed VCF
# `<cwd>/<name>.vcf.gz`. The step is skipped when only one genotype file is given.
# NOTE(review): `bcftools concat` is used, so the inputs are presumably expected to
# share the same samples and to be given in the desired order; confirm with callers.
skip_if(len(genoFile) == 1)
# File prefix for the analysis output
parameter: name = str
# `group_by = 'all'`: all genotype files are handled together as one input group.
input: genoFile, group_by = 'all'
output:  f"{cwd}/{name}.vcf.gz"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash: container=container, expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    bcftools concat -Oz ${_input} > ${_output}
    tabix -p vcf ${_output}