# GENE REPRESENTATIONS FOR DEEP GWAS

## Workspace Setup

### Packages

In [2]:
import numpy as np
import pandas as pd
import scipy as sp
import torch as pt

### Global Variables

### Functions

In [34]:
def create_samples(n,nc,p,l,m,i):
    '''Creates artificial-genotype samples for one single gene across multiple samples of a population.
    
    parameters
    ----------
    n: `int`
        number of samples
    nc: `int`
        number of causal SNPs
    p: `float`
        prevalence (i.e., minor-class ratio) in [0,1]
    l: `int`
        gene length in terms of SNPs (rather than bp); SNP identifiers are 1,...,l
    m: `int`
        maximum number of (minor-allele) SNPs present in a sample
    i: `bool`
        whether or not SNP interaction is required to produce the minor-class phenotype
        
    returns
    -------
    X: `list`
        list of samples, each a list of the SNP identifiers present;
        the minor-class samples come first, followed by the major-class samples
    C: `list`
        list of causal-SNP identifiers
    '''
    # create the list of SNP identifiers 1,...,l
    S = list(np.arange(1,l+1))
    
    # random choice of causal SNPs
    C = list(np.random.choice(S,size=nc,replace=False))
    
    # define list of non-causal SNPs (set lookup avoids rescanning C for every SNP)
    causal = set(C)
    D = [s for s in S if s not in causal]
    
    # calculate number of positive/negative samples required;
    # rounding before truncation removes floating-point representation error,
    # e.g., 0.29*100 evaluates to 28.999999999999996 and would otherwise yield 28
    n_pos = int(round(p*n,8))
    n_neg = n - n_pos
    
    # create positive and negative samples w/ or w/o interaction
    if i:
        P = pos_samples_and(n_pos,m,C,D)
        N = neg_samples_and(n_neg,m,C,D)
    else:
        P = pos_samples_or(n_pos,m,C,D)
        N = neg_samples_or(n_neg,m,D)
        
    # entire list of samples: positive samples followed by negative samples
    X = P+N
    
    return X,C

def pos_samples_and(n_pos,m,C,D):
    '''Creates minor-class artificial-genotype samples for one single gene under SNP interaction.
    
    Every sample contains all causal SNPs plus at least one randomly chosen non-causal SNP.
    
    parameters
    ----------
    n_pos: `int`
        number of positive samples
    m: `int`
        maximum number of SNPs present
    C: `list`
        causal SNPs
    D: `list`
        non-causal SNPs
        
    returns
    -------
    P: `list`
        list of minor-class samples; each sample is a list holding the causal SNPs
        followed by the chosen non-causal SNPs
    '''
    # determine number of causal SNPs
    nc = len(C)
    
    # largest admissible number of non-causal SNPs per sample: limited by the
    # remaining budget m-nc and by how many non-causal SNPs exist at all
    # (drawing more than len(D) without replacement would raise)
    max_noncausal = min(m-nc,len(D))
    
    # initialize P as empty list
    P = []
    
    # loop over minor-class samples to be created
    for k in range(n_pos):
        
        # random choice of number of non-causal SNPs present
        nk = np.random.randint(low=1,high=max_noncausal+1)
        
        # random choice of non-causal SNPs present
        Dk = list(np.random.choice(D,size=nk,replace=False))
        
        # sample is the union of all causal and the chosen non-causal SNPs
        P.append(list(C)+Dk)
    
    return P

def neg_samples_and(n_neg,m,C,D):
    '''Creates major-class artificial-genotype samples for one single gene under SNP interaction.
    
    Every sample contains a strict subset of the causal SNPs (possibly none) plus at least
    one randomly chosen non-causal SNP.
    
    parameters
    ----------
    n_neg: `int`
        number of negative samples
    m: `int`
        maximum number of SNPs present
    C: `list`
        causal SNPs
    D: `list`
        non-causal SNPs
        
    returns
    -------
    N: `list`
        list of major-class samples; each sample is a list holding the chosen causal SNPs
        followed by the chosen non-causal SNPs
    '''
    # determine number of causal SNPs
    nc = len(C)
    
    # random choice of number of causal SNPs present: maximally all but one, and
    # maximally m-1 so that the mandatory non-causal SNP still fits into the sample
    s = np.random.randint(low=0,high=min(nc,m),size=n_neg)
    
    # initialize N as empty list
    N = []
    
    # loop over major-class samples to be created
    for sk in s:
        
        # random choice of number of non-causal SNPs present, limited by the
        # remaining budget and by the number of non-causal SNPs available
        nk = np.random.randint(low=1,high=min(m-sk,len(D))+1)
        
        # random choice of causal SNPs present
        Ck = list(np.random.choice(C,size=sk,replace=False))
        
        # random choice of non-causal SNPs present
        Dk = list(np.random.choice(D,size=nk,replace=False))
        
        # sample is the union of the chosen causal and non-causal SNPs
        N.append(Ck+Dk)
    
    return N

def pos_samples_or(n_pos,m,C,D):
    '''Creates minor-class artificial-genotype samples for one single gene w/o SNP interaction.
    
    Every sample contains at least one causal SNP plus at least one randomly chosen
    non-causal SNP.
    
    parameters
    ----------
    n_pos: `int`
        number of positive samples
    m: `int`
        maximum number of SNPs present
    C: `list`
        causal SNPs
    D: `list`
        non-causal SNPs
        
    returns
    -------
    P: `list`
        list of minor-class samples; each sample is a list holding the chosen causal SNPs
        followed by the chosen non-causal SNPs
    '''
    # determine number of causal SNPs
    nc = len(C)
    
    # random choice of number of causal SNPs present: minimally one, and
    # maximally m-1 so that the mandatory non-causal SNP still fits into the sample
    s = np.random.randint(low=1,high=min(nc,m-1)+1,size=n_pos)
    
    # initialize P as empty list
    P = []
    
    # loop over minor-class samples to be created
    for sk in s:
        
        # random choice of number of non-causal SNPs present, limited by the
        # remaining budget and by the number of non-causal SNPs available
        nk = np.random.randint(low=1,high=min(m-sk,len(D))+1)
        
        # random choice of causal SNPs present
        Ck = list(np.random.choice(C,size=sk,replace=False))
        
        # random choice of non-causal SNPs present
        Dk = list(np.random.choice(D,size=nk,replace=False))
        
        # sample is the union of the chosen causal and non-causal SNPs
        P.append(Ck+Dk)
    
    return P

def neg_samples_or(n_neg,m,D):
    '''Creates major-class artificial-genotype samples for one single gene w/o SNP interaction.
    
    Every sample consists of at least one randomly chosen non-causal SNP and no causal SNP.
    
    parameters
    ----------
    n_neg: `int`
        number of negative samples
    m: `int`
        maximum number of SNPs present
    D: `list`
        non-causal SNPs
        
    returns
    -------
    N: `list`
        list of major-class samples; each sample is a list of non-causal SNPs
    '''
    # random choice of number of non-causal SNPs present, limited by m and by the
    # number of non-causal SNPs available (drawing more w/o replacement would raise)
    s = np.random.randint(low=1,high=min(m,len(D))+1,size=n_neg)
    
    # one random choice of non-causal SNPs per major-class sample
    N = [list(np.random.choice(D,size=sk,replace=False)) for sk in s]
    
    return N

def assign_phenotype(s,C,i):
    '''Assigns class label based on a single-gene sample and ground-truth of causal SNPs and interaction.
    
    parameters
    ----------
    s: `list` or `set`
        present (minor-allele) SNPs
    C: `list` or `set`
        causal-SNP identifiers
    i: `bool`
        whether or not SNP interaction is required to produce the minor-class phenotype
        
    returns
    -------
    p: `int`
        binary phenotype, 1/0 for minor/major class
    '''
    # compare as sets so that a repeated SNP identifier is never counted twice
    causal = set(C)
    
    # in case of required SNP interaction
    if i:
        # sample is positive iff all causal SNPs are present
        return int(causal.issubset(s))
    
    # in case of no SNP interaction:
    # sample is positive iff at least one causal SNP is present
    return int(not causal.isdisjoint(s))

## Data Generation

In [36]:
# simulation parameters (see create_samples for their meaning)
n  = 100
nc = 3
p  = .35
l  = 20
# the gene has l-nc non-causal SNPs; derive m from l so both stay in sync when l changes
m  = l-nc
i  = False

# create samples and the ground-truth list of causal SNPs
X,C = create_samples(n,nc,p,l,m,i)

# sample dictionary: 1-based sample identifier -> list of present SNPs
sample_dict = dict(enumerate(X,start=1))

# label dictionary: sample identifier -> binary phenotype
label_dict = {j: assign_phenotype(s,C,i) for j,s in sample_dict.items()}