# Likelihood Analysis of Parameter space
This notebook contains analysis regarding the Likelihood of various parameter combinations.

- Do a maximum likelihood analysis (with Nelder-Mead)
- Parallelize the likelihood computation

In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize

import os as os
import multiprocessing as mp
import sys as sys
import socket
import h5py  # Python Package to do the HDF5.

### Pick the right path (whether on cluster or at home)
# The repository root is selected by exact hostname match; any other machine
# (including a cluster node with a different hostname) ends in the raise below.
if socket.gethostname() == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket.gethostname() == "midway2-0402.rcc.local":
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    # NOTE(review): RuntimeWarning is a warning category; raising it aborts the cell
    # like any exception. A RuntimeError may be the clearer choice -- confirm intent.
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Make the repository root the working directory (in line with Atom default)

# The relative entry below only resolves because of the os.chdir above.
sys.path.append("./Python3/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Must come after the sys.path.append above.
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. Should be the HAPSBURG root set by os.chdir above
print(f"CPU Count: {mp.cpu_count()}")

/home/harald/git/HAPSBURG
CPU Count: 4


### Need: Parallel, quick computation of likelihoods for target individuals
Problem: The bottleneck is I/O. Therefore, load the 100 TSI individuals only once, and then just update the parameters and recalculate the likelihood.

### Potential Plan:
Have the 100 TSI individuals in storage in Load Object. Have loading function there that provides the genotype Matrix of one of these Individuals (from storage)

### First try: Only do one individual

In [4]:
def prep_hmm_object(path_mosaic, exclude_pops, iid, prefix_out="", ch=3, n_ref=503):
    """Prepare and return a working HMM object.

    Builds an HMM_Analyze instance with the "MosaicHDF5" pre-processing model,
    configures that model, loads the data for one individual and then loads
    the emission and transition models.

    path_mosaic: Folder handed to the pre-processing object (set_folder)
    exclude_pops: Populations handed to the pre-processing object (set_exclude_pops)
    iid: Identifier of the individual, passed on to load_data
    prefix_out: Output prefix handed to the pre-processing object (set_prefix_out_data)
    ch: Chromosome, passed on to load_data
    n_ref: Number of reference individuals, passed on to load_data
    Return: The prepared HMM_Analyze object"""
    
    # manual_load=True: the loading steps are triggered explicitly below.
    # save/save_fp/output are all switched off here.
    hmm = HMM_Analyze(cython=2, p_model="MosaicHDF5",
                      manual_load=True, save=False, save_fp=False, output=False)

    # Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Creates hmm.p_obj, which is configured next
    hmm.p_obj.set_folder(path_mosaic)         # Set the Folder
    hmm.p_obj.set_prefix_out_data(prefix_out)   # Set the prefix for output data
    hmm.p_obj.set_exclude_pops(pops=exclude_pops)  # Populations to exclude

    # NOTE(review): presumably the steps below must stay in this order (data before models) -- confirm
    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    hmm.load_emission_model()
    hmm.load_transition_model()
    hmm.set_diploid_observations()             # To diploidize Individuals
    return hmm

def ll_mosaic_hmm(hmm, roh_in=1, roh_out=10, roh_jump=100):
    """Calculate the log likelihood of a Mosaic individual for an explicit HMM.

    This used to be a first definition of `ll_mosaic_individual` that was
    silently shadowed by the optimizer objective defined right after it; it
    now has its own name so that both variants are callable.

    hmm: Prepared HMM object (e.g. from prep_hmm_object)
    roh_in, roh_out, roh_jump: Jump parameters set on the transition object
    Return: Total log likelihood (NOT negated)"""
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    _, _, _, tot_ll = hmm.calc_posterior(save=False, full=True)
    return tot_ll


def ll_mosaic_individual(prms, hmm_obj=None):
    """Objective for the optimizer: NEGATIVE log likelihood of a Mosaic individual.

    prms: Sequence [roh_in, roh_out, roh_jump]
    hmm_obj: HMM object to evaluate. If None, the module-level `hmm` is used,
        WHICH THEN HAS TO BE INITIALIZED PROPERLY! Passing it explicitly
        (e.g. minimize(..., args=(hmm,))) avoids this hidden global state.
    Return: -1 * total log likelihood, or np.inf if any parameter is negative"""
    roh_in, roh_out, roh_jump = prms
    print("\nParameters Current Step:")
    print(f"ROH In {roh_in:.3f}")
    print(f"ROH Out {roh_out:.3f}")
    print(f"ROH Jump: {roh_jump:.3f}")

    if np.min(prms) < 0:   # If Parameter not feasible, penalize
        return np.inf

    # Resolve the global only when it is needed, so infeasible steps never touch it
    model = hmm if hmm_obj is None else hmm_obj
    tot_ll = ll_mosaic_hmm(model, roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)

    print(f"LL: {tot_ll:.6f}")
    return -tot_ll  # Return the negative one (for minimization)

### Load HMM Object

In [5]:
# Settings for the single-individual run; all of them are passed to prep_hmm_object below
iid = "iid0"
ch=3
n_ref=503
path_mosaic = "./Simulated/1000G_Mosaic/TSI5/ch3_8cm/"
exclude_pops = ["TSI", ]

### Prepare HMM Object
# `hmm` is kept at module level: ll_mosaic_individual reads it as a global.
hmm = prep_hmm_object(path_mosaic, exclude_pops=exclude_pops, iid = iid, ch=ch, n_ref=n_ref)


Loaded 77652 variants
Loaded 100 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI5/ch3_8cm/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5
396 / 503 Individuals included in Reference


In [24]:
### Test the log likelihood evaluations
# The commented-out call matches the first (explicit-hmm) definition of
# ll_mosaic_individual, which is shadowed by the later definition taking `prms`.
#ll = ll_mosaic_individual(hmm, roh_in=1, roh_out=10, roh_jump=100)
# Note: the function returns the NEGATIVE log likelihood, so `ll` is the negated value.
ll = ll_mosaic_individual([1,10,100])
ll


Parameters Current Step:
ROH In 1.000
ROH Out 10.000
ROH Jump: 100.000
Reference Number: 792
Total Log likelihood: -24388.063
Likelihood: -24388.063309


24388.063309288642

# Optimize Function

In [7]:
# Starting point, in the order [roh_in, roh_out, roh_jump] that ll_mosaic_individual unpacks
x0 = np.array([5, 10, 500])  # The Starting Value

# Minimize the negative log likelihood with Nelder-Mead (derivative-free).
# Infeasible (negative) parameters are penalized inside the objective via np.inf.
res = minimize(ll_mosaic_individual, x0, method='nelder-mead', options={'fatol': 1e-2, 'disp': True})


Parameters Current Step:
ROH In 5.000
ROH Out 10.000
ROH Jump: 500.000
Reference Number: 792
Total Log likelihood: -23783.968
LL: -23783.968242

Parameters Current Step:
ROH In 5.250
ROH Out 10.000
ROH Jump: 500.000
Reference Number: 792
Total Log likelihood: -23776.304
LL: -23776.304274

Parameters Current Step:
ROH In 5.000
ROH Out 10.500
ROH Jump: 500.000
Reference Number: 792
Total Log likelihood: -23776.463
LL: -23776.462745

Parameters Current Step:
ROH In 5.000
ROH Out 10.000
ROH Jump: 525.000
Reference Number: 792
Total Log likelihood: -23779.428
LL: -23779.428025

Parameters Current Step:
ROH In 5.167
ROH Out 10.333
ROH Jump: 516.667
Reference Number: 792
Total Log likelihood: -23770.693
LL: -23770.692578

Parameters Current Step:
ROH In 5.250
ROH Out 10.500
ROH Jump: 525.000
Reference Number: 792
Total Log likelihood: -23764.245
LL: -23764.245117

Parameters Current Step:
ROH In 5.333
ROH Out 10.667
ROH Jump: 491.667
Reference Number: 792
Total Log likelihood: -23765.440
LL:

LL: -22821.697702

Parameters Current Step:
ROH In 159.924
ROH Out 222.222
ROH Jump: 2317.673
Reference Number: 792
Total Log likelihood: -22665.567
LL: -22665.566555

Parameters Current Step:
ROH In 188.738
ROH Out 265.453
ROH Jump: 2260.768
Reference Number: 792
Total Log likelihood: -22600.163
LL: -22600.163117

Parameters Current Step:
ROH In 149.592
ROH Out 229.212
ROH Jump: 996.273
Reference Number: 792
Total Log likelihood: -22535.809
LL: -22535.808827

Parameters Current Step:
ROH In 152.852
ROH Out 252.096
ROH Jump: -83.608

Parameters Current Step:
ROH In 182.975
ROH Out 261.485
ROH Jump: 1407.884
Reference Number: 792
Total Log likelihood: -22507.334
LL: -22507.333871

Parameters Current Step:
ROH In 211.637
ROH Out 301.033
ROH Jump: 1089.243
Reference Number: 792
Total Log likelihood: -22428.216
LL: -22428.215504

Parameters Current Step:
ROH In 242.037
ROH Out 359.324
ROH Jump: 974.992
Reference Number: 792
Total Log likelihood: -22372.014
LL: -22372.013782

Parameters Cur

Reference Number: 792
Total Log likelihood: -22095.552
LL: -22095.551570

Parameters Current Step:
ROH In 1021.845
ROH Out 1286.561
ROH Jump: 385.607
Reference Number: 792
Total Log likelihood: -22082.431
LL: -22082.430969

Parameters Current Step:
ROH In 1075.545
ROH Out 1304.592
ROH Jump: 396.317
Reference Number: 792
Total Log likelihood: -22074.947
LL: -22074.947095

Parameters Current Step:
ROH In 984.676
ROH Out 1145.005
ROH Jump: 639.242
Reference Number: 792
Total Log likelihood: -22082.465
LL: -22082.464638

Parameters Current Step:
ROH In 972.697
ROH Out 1046.610
ROH Jump: 608.851
Reference Number: 792
Total Log likelihood: -22070.341
LL: -22070.341031

Parameters Current Step:
ROH In 958.161
ROH Out 904.274
ROH Jump: 708.854
Reference Number: 792
Total Log likelihood: -22069.378
LL: -22069.377908

Parameters Current Step:
ROH In 1110.774
ROH Out 1118.674
ROH Jump: 671.956
Reference Number: 792
Total Log likelihood: -22061.811
LL: -22061.811211

Parameters Current Step:
ROH I

Total Log likelihood: -22014.157
LL: -22014.157190

Parameters Current Step:
ROH In 1824.510
ROH Out 1307.112
ROH Jump: 242.519
Reference Number: 792
Total Log likelihood: -22014.167
LL: -22014.166840

Parameters Current Step:
ROH In 1831.349
ROH Out 1313.944
ROH Jump: 248.636
Reference Number: 792
Total Log likelihood: -22014.159
LL: -22014.158601

Parameters Current Step:
ROH In 1831.235
ROH Out 1310.546
ROH Jump: 252.549
Reference Number: 792
Total Log likelihood: -22014.159
LL: -22014.159448

Parameters Current Step:
ROH In 1830.597
ROH Out 1311.222
ROH Jump: 250.372
Reference Number: 792
Total Log likelihood: -22014.157
LL: -22014.157085

Parameters Current Step:
ROH In 1828.066
ROH Out 1308.037
ROH Jump: 248.912
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.156124

Parameters Current Step:
ROH In 1826.425
ROH Out 1305.084
ROH Jump: 249.050
Reference Number: 792
Total Log likelihood: -22014.158
LL: -22014.158448

Parameters Current Step:
ROH In 1832.806
ROH Out

Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155809

Parameters Current Step:
ROH In 1829.859
ROH Out 1309.764
ROH Jump: 248.547
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155809

Parameters Current Step:
ROH In 1829.937
ROH Out 1309.839
ROH Jump: 248.556
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155809

Parameters Current Step:
ROH In 1830.012
ROH Out 1309.867
ROH Jump: 248.530
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155809

Parameters Current Step:
ROH In 1829.899
ROH Out 1309.811
ROH Jump: 248.559
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155808

Parameters Current Step:
ROH In 1829.898
ROH Out 1309.809
ROH Jump: 248.595
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155809

Parameters Current Step:
ROH In 1829.933
ROH Out 1309.828
ROH Jump: 248.543
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155808

Parameters Current Step:
R

Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155808

Parameters Current Step:
ROH In 1829.920
ROH Out 1309.821
ROH Jump: 248.552
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155808

Parameters Current Step:
ROH In 1829.920
ROH Out 1309.821
ROH Jump: 248.552
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155808

Parameters Current Step:
ROH In 1829.920
ROH Out 1309.821
ROH Jump: 248.552
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155808

Parameters Current Step:
ROH In 1829.920
ROH Out 1309.821
ROH Jump: 248.552
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155808

Parameters Current Step:
ROH In 1829.920
ROH Out 1309.821
ROH Jump: 248.552
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155808

Parameters Current Step:
ROH In 1829.920
ROH Out 1309.821
ROH Jump: 248.552
Reference Number: 792
Total Log likelihood: -22014.156
LL: -22014.155808

Parameters Current Step:
R

# Optimize LL for TSI copies
Idea: Keep an array of HMM objects `[hmm0, hmm1, ...]`. The LL function updates the jump parameters of each HMM and calculates the total LL in parallel.

### 1) Load HDF5

In [8]:
h5_path = "Simulated/1000G_Mosaic/TSI0/ch3/data.h5"  ### Which HDF to look into
# NOTE(review): `f` is never closed in the visible cells. If no later cell needs it,
# a `with h5py.File(...)` block would release the handle -- confirm before changing.
f = h5py.File(h5_path, "r") # Load for Sanity Check. See below!
samples = f["samples"][:]  # Read the whole "samples" dataset; used below as the list of iids

print(f"Loaded {len(samples)} targets")

Loaded 107 targets


### 2) Prepare HMM Object vector hmms

In [9]:
%%time
# Same settings as for the single-individual run, but pointing at the TSI0 mosaic folder
ch=3
n_ref=503
path_mosaic = "./Simulated/1000G_Mosaic/TSI0/ch3/"
exclude_pops = ["TSI", ]

# One fully prepared HMM object per sample. prep_hmm_object is called once per iid,
# so the target and reference HDF5 files are loaded again for every individual (see output).
# `hmms` is read as a global by ll_mosaic_individuals.
hmms = [prep_hmm_object(path_mosaic, exclude_pops=exclude_pops, iid=iid, ch=ch, n_ref=n_ref) for iid in samples]


Loaded 77652 variants
Loaded 107 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI0/ch3/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5
396 / 503 Individuals included in Reference

Loaded 77652 variants
Loaded 107 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI0/ch3/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5
396 / 503 Individuals included in Reference

Loaded 77652 variants
Loaded 107 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI0/ch3/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5
396 / 503 Individuals included in Reference

Loaded 77652 variants
Loaded 107 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI0/ch3/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240ch

In [5]:
### Some Functions for Printing of output
org_print = sys.stdout   # The stream that enablePrint() restores
_devnull_out = None      # Currently open os.devnull sink (None while printing is enabled)

def blockPrint():
    """Block Printing to Console.
    Redirects sys.stdout to os.devnull. The sink is reused on repeated calls
    instead of opening (and leaking) a new file handle every time."""
    global _devnull_out
    if _devnull_out is None or _devnull_out.closed:
        _devnull_out = open(os.devnull, 'w')
    sys.stdout = _devnull_out

# Restore
def enablePrint():
    """Re-enable Printing to Console.
    Restores the stream saved in org_print and closes the os.devnull sink
    opened by blockPrint(). Safe to call when printing is not blocked."""
    global _devnull_out
    sys.stdout = org_print
    if _devnull_out is not None:
        _devnull_out.close()
        _devnull_out = None

### Likelihood Functions
def ll_worker(hmm, roh_in, roh_out, roh_jump):
    """Worker task: push the jump parameters into one hmm and evaluate it.

    hmm: HMM object whose transition object (t_obj) gets updated
    roh_in, roh_out, roh_jump: Jump parameters to set
    Return: Fourth entry of hmm.calc_posterior(save=False, full=True),
    which is used as the log likelihood"""
    jump_rates = dict(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)
    hmm.t_obj.set_params(**jump_rates)  # Set the Parameters

    posterior_out = hmm.calc_posterior(save=False, full=True)  # Calculate the LL
    _, _, _, log_lik = posterior_out  # Only the last of the four entries is needed
    return log_lik

def ll_mosaic_individuals(prms):
    """Objective for the optimizer: NEGATIVE summed log likelihood over all hmms.

    ASSUMES THAT THE GLOBALS `hmms` (list of prepared HMM objects) AND
    `procses` (number of worker processes) ARE INITIALIZED PROPERLY!!!

    prms: Sequence [roh_in, roh_out, roh_jump]
    Return: -1 * sum of the per-individual log likelihoods,
    or np.inf if any parameter is negative"""
    roh_in, roh_out, roh_jump = prms
    print("\nParameters Current Step:")
    print(f"ROH In {roh_in:.3f}")
    print(f"ROH Out {roh_out:.3f}")
    print(f"ROH Jump: {roh_jump:.3f}")

    ### Return infinite value for infeasible params:
    if np.min(prms) < 0:
        return np.inf

    ### Prepare the Parameter Array (one argument list per hmm, for starmap):
    arguments = [[hmm0, roh_in, roh_out, roh_jump] for hmm0 in hmms]

    ### Do the Multiprocessing
    blockPrint()
    try:
        ll_vec = multi_run(ll_worker, arguments, procses)
    finally:
        # Always restore stdout: otherwise a failing worker would leave all
        # subsequent notebook output silently redirected to os.devnull.
        enablePrint()

    tot_ll = np.sum(ll_vec)
    print(f"LL: {tot_ll:.8f}")

    return -tot_ll  # Return the negative one (for minimization)

def multi_run(fun, prms, processes = 4):
    """Run fun once per argument list, spread over a pool of worker processes.
    fun: Function to call (must be usable by multiprocessing workers)
    prms: List of argument lists; each entry is unpacked as fun(*entry)
    processes: How many Processes to use
    Return: List of the results, in the order of prms"""
    n_jobs = len(prms)
    print(f"Running {n_jobs} jobs in parallel.")

    # A fresh pool is created for every call and cleaned up on leaving the block
    with mp.Pool(processes=processes) as workers:
        return workers.starmap(fun, prms)

In [None]:
%%time
### Run the Optimization (hmms array needs to be initialized!!)
# `procses` is read as a global by ll_mosaic_individuals and passed to multi_run.
# NOTE(review): the setup cell reported 4 CPUs on the home machine; 10 processes
# presumably targets the cluster -- confirm.
procses = 10 ### How many processes in parallel 

# Starting point, in the order [roh_in, roh_out, roh_jump] that the objective unpacks
x0 = np.array([1000, 2000, 500])  # The Starting Value (chosen according to single run)
res = minimize(ll_mosaic_individuals, x0, method='nelder-mead', options={'fatol': 1e-1, 'disp': True})

# Area51