# Likelihood Analysis of Parameter space
This notebook contains analysis regarding the Likelihood of various parameter combinations.

- Do maximum-likelihood analysis (with Nelder-Mead)
- Parallelize the likelihood calculation

In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize

import os as os
import multiprocessing as mp
import sys as sys
import socket
import h5py  # Python Package to do the HDF5.

### Pick the right path (whether on cluster or at home)
socket_name = socket.gethostname()
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    # Prefix match instead of one exact node name ("midway2-0402.rcc.local"):
    # presumably the numeric suffix differs between cluster nodes, so an exact
    # comparison would reject every other node of the same cluster.
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

sys.path.append("./Python3/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root directory (set by os.chdir above)
print(f"CPU Count: {mp.cpu_count()}")

Midway jnovmbre partition detected.
/project/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


### Need: Parallel, quick computation of likelihoods for target individuals
Problem: The bottleneck is I/O. The goal is to load the 100 TSI individuals only once, and then just update the parameters and recalculate the likelihood.

### Potential Plan:
Keep the 100 TSI individuals stored in the Load object, and give it a loading function that provides the genotype matrix of one of these individuals (from storage).

### First try: Only do one individual

In [2]:
def prep_hmm_object(path_mosaic, exclude_pops, iid, prefix_out="", ch=3, n_ref=503):
    """Build an HMM_Analyze object, load its data and models, and return it.

    path_mosaic: Folder handed to the pre-processing object (set_folder)
    exclude_pops: Populations handed to the pre-processing object (set_exclude_pops)
    iid: Identifier of the individual, passed to load_data
    prefix_out: Output prefix handed to the pre-processing object
    ch: Passed to load_data as ch
    n_ref: Passed to load_data as n_ref
    Return: The fully prepared HMM_Analyze object"""
    constructor_settings = dict(cython=2, p_model="MosaicHDF5", manual_load=True,
                                save=False, save_fp=False, output=False)
    analyzer = HMM_Analyze(**constructor_settings)

    # Step 1: Pre-processing model -- load it first, then configure it
    analyzer.load_preprocessing_model()
    analyzer.p_obj.set_folder(path_mosaic)
    analyzer.p_obj.set_prefix_out_data(prefix_out)
    analyzer.p_obj.set_exclude_pops(pops=exclude_pops)

    # Step 2: Data of the requested individual, followed by both models
    analyzer.load_data(iid=iid, ch=ch, n_ref=n_ref)
    analyzer.load_emission_model()
    analyzer.load_transition_model()

    # Step 3: Switch to diploid observations
    analyzer.set_diploid_observations()
    return analyzer

def _ll_single_hmm(hmm, roh_in=1, roh_out=10, roh_jump=100):
    """Set the transition parameters on one HMM object and return its total
    log likelihood.

    This used to be a first definition of ll_mosaic_individual that the
    definition below silently overwrote; it now has its own private name so
    that both remain usable.
    hmm: Object providing t_obj.set_params and calc_posterior
    roh_in, roh_out, roh_jump: Passed to t_obj.set_params
    Return: Fourth element returned by calc_posterior (the total log likelihood)"""
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)  # Set the Parameters
    _, _, _, tot_ll = hmm.calc_posterior(save=False, full=True)  # Calculate the LL
    return tot_ll

def ll_mosaic_individual(prms):
    """Objective function for the minimizer: negative log likelihood of one
    Mosaic individual.
    ASSUME THAT THE GLOBAL HMM (hmm) IS INITIALIZED PROPERLY!!!

    prms: Sequence of three values [roh_in, roh_out, roh_jump]
    Return: -1 * total log likelihood, or np.inf if any parameter is negative"""
    roh_in, roh_out, roh_jump = prms
    print("\nParameters Current Step:")
    print(f"ROH In {roh_in:.3f}")
    print(f"ROH Out {roh_out:.3f}")
    print(f"ROH Jump: {roh_jump:.3f}")
    
    if np.min(prms)<0:   # If Parameter not feasible, penalize
        return np.inf
    
    # The module-level hmm object is looked up at call time
    tot_ll = _ll_single_hmm(hmm, roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    
    print(f"LL: {tot_ll:.6f}")
    return -tot_ll  # Return the negative one (for minimization)

### Load HMM Object

In [5]:
### Settings for the single test individual
iid, ch, n_ref = "iid0", 3, 503
path_mosaic = "./Simulated/1000G_Mosaic/TSI5/ch3_8cm/"
exclude_pops = ["TSI"]

### Build the HMM object that the single-individual likelihood function reads
hmm = prep_hmm_object(
    path_mosaic,
    exclude_pops=exclude_pops,
    iid=iid,
    ch=ch,
    n_ref=n_ref,
)


Loaded 77652 variants
Loaded 100 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI5/ch3_8cm/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5
396 / 503 Individuals included in Reference


In [24]:
### Evaluate the objective once at a hand-picked parameter set
### (old call signature: ll_mosaic_individual(hmm, roh_in=1, roh_out=10, roh_jump=100))
ll = ll_mosaic_individual([1, 10, 100])  # Order: [roh_in, roh_out, roh_jump]
ll


Parameters Current Step:
ROH In 1.000
ROH Out 10.000
ROH Jump: 100.000
Reference Number: 792
Total Log likelihood: -24388.063
Likelihood: -24388.063309


24388.063309288642

# Optimize Single Individual Function

In [7]:
# Starting point of the search, ordered as [roh_in, roh_out, roh_jump]
x0 = np.array([5, 10, 500])
# Nelder-Mead run on the single-individual objective
res = minimize(
    fun=ll_mosaic_individual,
    x0=x0,
    method='nelder-mead',
    options={'fatol': 1e-2, 'disp': True},
)


Parameters Current Step:
ROH In 5.000
ROH Out 10.000
ROH Jump: 500.000
Reference Number: 792
Total Log likelihood: -23783.968
LL: -23783.968242

Parameters Current Step:
ROH In 5.250
ROH Out 10.000
ROH Jump: 500.000
Reference Number: 792
Total Log likelihood: -23776.304
LL: -23776.304274

Parameters Current Step:
ROH In 5.000
ROH Out 10.500
ROH Jump: 500.000
Reference Number: 792
Total Log likelihood: -23776.463
LL: -23776.462745

Parameters Current Step:
ROH In 5.000
ROH Out 10.000
ROH Jump: 525.000
Reference Number: 792
Total Log likelihood: -23779.428
LL: -23779.428025

Parameters Current Step:
ROH In 5.167
ROH Out 10.333
ROH Jump: 516.667
Reference Number: 792
Total Log likelihood: -23770.693
LL: -23770.692578

Parameters Current Step:
ROH In 5.250
ROH Out 10.500
ROH Jump: 525.000
Reference Number: 792
Total Log likelihood: -23764.245
LL: -23764.245117

Parameters Current Step:
ROH In 5.333
ROH Out 10.667
ROH Jump: 491.667
Reference Number: 792
Total Log likelihood: -23765.440
LL:

# Optimize LL for TSI copies
Idea: Keep a list of HMM objects `[hmm0, hmm1, ...]`. The LL function updates the jump parameters of each one and calculates the total LL in parallel.

### 1) Load HDF5

In [3]:
h5_path = "Simulated/1000G_Mosaic/TSI0/ch3/data.h5"  ### Which HDF to look into
# Open read-only inside a context manager so that the file handle is closed
# again right after the sample IDs have been read (it was left open before).
with h5py.File(h5_path, "r") as f:
    samples = f["samples"][:]  # [:] reads the dataset, so samples stays usable after closing

print(f"Loaded {len(samples)} targets")

Loaded 107 targets


### 2) Prepare HMM Object vector hmms

In [4]:
%%time
n_lls = 20  # Number of target individuals to use
ch, n_ref = 3, 503
path_mosaic = "./Simulated/1000G_Mosaic/TSI0/ch3/"
exclude_pops = ["TSI"]

### One prepared HMM object per target (first n_lls entries of samples)
hmms = [
    prep_hmm_object(path_mosaic, exclude_pops=exclude_pops, iid=sample_iid, ch=ch, n_ref=n_ref)
    for sample_iid in samples[:n_lls]
]


Loaded 77652 variants
Loaded 107 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI0/ch3/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5
396 / 503 Individuals included in Reference

Loaded 77652 variants
Loaded 107 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI0/ch3/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5
396 / 503 Individuals included in Reference

Loaded 77652 variants
Loaded 107 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI0/ch3/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5
396 / 503 Individuals included in Reference

Loaded 77652 variants
Loaded 107 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI0/ch3/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240ch

In [6]:
### Some Functions for Printing of output
org_print = sys.stdout   # Stream that enablePrint() restores
_blocked_stream = None   # Currently open devnull handle (None while printing is enabled)

def blockPrint():
    """Block Printing to Console.
    Redirects sys.stdout to os.devnull. The devnull handle is remembered so
    that enablePrint() can close it; repeated calls reuse the open handle
    instead of opening (and leaking) a new one every time."""
    global _blocked_stream
    if _blocked_stream is None:
        _blocked_stream = open(os.devnull, 'w')
    sys.stdout = _blocked_stream

# Restore
def enablePrint():
    """Re-enable Printing to Console.
    Restores sys.stdout to org_print and closes the devnull handle opened by
    blockPrint(), if there is one. Safe to call without a prior blockPrint()."""
    global _blocked_stream
    sys.stdout = org_print
    if _blocked_stream is not None:
        _blocked_stream.close()
        _blocked_stream = None

### Likelihood Functions
def ll_worker(hmm, roh_in, roh_out, roh_jump):
    """Worker task: apply the given parameters to hmm and return its log likelihood.

    hmm: Object providing t_obj.set_params and calc_posterior
    roh_in, roh_out, roh_jump: Passed to t_obj.set_params
    Return: Fourth element returned by calc_posterior"""
    transition_prms = {"roh_in": roh_in, "roh_out": roh_out, "roh_jump": roh_jump}
    hmm.t_obj.set_params(**transition_prms)

    # Only the last of the four returned values is needed here
    _, _, _, log_lik = hmm.calc_posterior(save=False, full=True)
    return log_lik

def ll_mosaic_individuals(prms):
    """Objective function for the minimizer: negative summed log likelihood
    over all Mosaic individuals.
    ASSUME THAT THE GLOBAL hmms [:] array AND procses ARE INITIALIZED PROPERLY!!!

    prms: Sequence of three values [roh_in, roh_out, roh_jump]
    Return: -1 * sum of the per-individual log likelihoods, or np.inf if any
    parameter is negative"""
    roh_in, roh_out, roh_jump = prms
    print("\nParameters Current Step:")
    print(f"ROH In {roh_in:.3f}")
    print(f"ROH Out {roh_out:.3f}")
    print(f"ROH Jump: {roh_jump:.3f}")
    
    ### Return infinite value for infeasible params:
    if np.min(prms)<0:   
        return np.inf 
    
    ### Prepare the Parameter Array (one argument list per HMM object):
    arguments = [[hmm0, roh_in, roh_out, roh_jump] for hmm0 in hmms]
    
    ### Do the Multiprocessing
    blockPrint()
    try:
        ll_vec = multi_run(ll_worker, arguments, procses)
    finally:
        # Always restore printing, also when a worker raises. Otherwise
        # sys.stdout would stay redirected and all later output would vanish.
        enablePrint()
    
    tot_ll = np.sum(ll_vec)
    print(f"Mean LL: {np.mean(ll_vec)}")
    print(f"STD LL: {np.std(ll_vec)}")
    print(f"Sum LL: {tot_ll:.8f}")
    
    return -tot_ll  # Return the negative one (for minimization)

def multi_run(fun, prms, processes = 4):
    """Evaluate fun on every argument list in prms using a process pool.

    fun: Function to call
    prms: Sequence of argument lists, each unpacked into one call of fun
    processes: Number of worker processes in the pool
    Return: List with one result per entry of prms"""
    n_jobs = len(prms)
    print(f"Running {n_jobs} jobs in parallel.")

    # A fresh pool is created for every call and torn down when the block exits
    with mp.Pool(processes=processes) as worker_pool:
        return worker_pool.starmap(fun, prms)

In [None]:
%%time
### Optimize over all loaded individuals (the hmms list must already exist!!)
procses = 10  # Number of parallel processes; read as a global by ll_mosaic_individuals

# Starting point [roh_in, roh_out, roh_jump] (chosen according to single run)
x0 = np.array([2500, 2500, 200])
res = minimize(
    fun=ll_mosaic_individuals,
    x0=x0,
    method='nelder-mead',
    options={'fatol': 1e-1, 'disp': True},
)


Parameters Current Step:
ROH In 2500.000
ROH Out 2500.000
ROH Jump: 200.000
Mean LL: -25580.235358344504
STD LL: 396.0469902531018
Sum LL: -511604.70716689

Parameters Current Step:
ROH In 2625.000
ROH Out 2500.000
ROH Jump: 200.000
Mean LL: -25579.499445558024
STD LL: 396.9173172558023
Sum LL: -511589.98891116

Parameters Current Step:
ROH In 2500.000
ROH Out 2625.000
ROH Jump: 200.000
Mean LL: -25581.46306689112
STD LL: 394.3694720260241
Sum LL: -511629.26133782

Parameters Current Step:
ROH In 2500.000
ROH Out 2500.000
ROH Jump: 210.000
Mean LL: -25580.192168559275
STD LL: 396.05219142731374
Sum LL: -511603.84337119

Parameters Current Step:
ROH In 2583.333
ROH Out 2375.000
ROH Jump: 206.667
Mean LL: -25581.34239427759
STD LL: 398.3653618777517
Sum LL: -511626.84788555

Parameters Current Step:
ROH In 2562.500
ROH Out 2437.500
ROH Jump: 205.000
Mean LL: -25580.162532089817
STD LL: 397.34677086970055
Sum LL: -511603.25064180

Parameters Current Step:
ROH In 2625.000
ROH Out 2458.333

# Area51

In [None]:
# Scratch cell: only prints a marker string
print("Done")