# Likelihood Analysis of Parameter space
This notebook contains analysis regarding the Likelihood of various parameter combinations.

Goal: Perform a maximum likelihood analysis of the transition parameters (possibly using the Nelder-Mead optimizer).

In [20]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize

import os as os
import multiprocessing as mp
import sys as sys
import socket

### Pick the right path (whether on cluster or at home)
# The project root is chosen by comparing the machine's hostname against the
# two known hosts below; on any other host the cell aborts with a RuntimeWarning.
# NOTE(review): the cluster branch matches one exact node name only, so other
# nodes of the same cluster would end up in the else branch.
if socket.gethostname() == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket.gethostname() == "midway2-0402.rcc.local":
    print("Midway jnovmbre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else: 
    raise RuntimeWarning("Not compatible machine. Check!!")
    
os.chdir(path)  # Set the right Path (in line with Atom default)

# "./Python3/" is a relative entry, so it only resolves correctly because the
# working directory was switched to the project root just above.
sys.path.append("./Python3/")  # Since now we are in the Root Directory
from hmm_inference import HMM_Analyze   # Do not move. Should be after sys.path..
#sys.path.append("./Python3/create1000G_Mosaic/")  # Since now we are in the Root Directory
#from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd()) # Show the current working directory. After the chdir above this is the HAPSBURG root
print(f"CPU Count: {mp.cpu_count()}")

/home/harald/git/HAPSBURG
CPU Count: 4


### Need: Parallel, quick computation of likelihoods for target individuals
Problem: The bottleneck is I/O. Therefore, load the 100 TSI individuals only once, and afterwards just update the parameters and recalculate the likelihood.

### Potential Plan:
Keep the 100 TSI individuals in storage in the Load Object, and have a loading function there that provides the genotype matrix of one of these individuals (from storage).

### First try: Only do one individual

In [25]:
def prep_hmm_object(path_mosaic, prefix_out, exclude_pops, ch=3, n_ref=503, iid=None):
    """Prepare and return a working HMM Object.

    Creates an HMM_Analyze instance in manual-load mode, configures its
    pre-processing object, loads the data of one individual and finally
    loads the emission and transition models.

    path_mosaic:  Folder handed to the pre-processing object (set_folder)
    prefix_out:   Value handed to set_prefix_out_data
    exclude_pops: Value handed to set_exclude_pops(pops=...)
    ch:           Forwarded to hmm.load_data (presumably the chromosome number)
    n_ref:        Forwarded to hmm.load_data (presumably the number of references)
    iid:          Identifier of the individual forwarded to hmm.load_data.
                  If None (default), the notebook-level variable `iid` is used,
                  which keeps older calls without this argument working.

    Return: The prepared HMM_Analyze object
    Raises: NameError if iid is None and no notebook-level `iid` exists."""
    if iid is None:
        # Backward compatibility: older calls relied on a global `iid`.
        # The parameter shadows that global, so it is looked up explicitly.
        try:
            iid = globals()["iid"]
        except KeyError:
            raise NameError("name 'iid' is not defined: pass iid=... "
                            "or define a notebook-level `iid`") from None

    hmm = HMM_Analyze(cython=2, p_model="MosaicHDF5",
                      manual_load=True, save=False, save_fp=False, output=False)

    # Load and prepare the pre-processing Model
    hmm.load_preprocessing_model()              # Load the preprocessing Model
    hmm.p_obj.set_folder(path_mosaic)           # Set the Folder
    hmm.p_obj.set_prefix_out_data(prefix_out)
    hmm.p_obj.set_exclude_pops(pops=exclude_pops)

    hmm.load_data(iid=iid, ch=ch, n_ref=n_ref)  # Load the actual Data
    hmm.load_emission_model()
    hmm.load_transition_model()
    hmm.set_diploid_observations()              # To diploidize Individuals
    return hmm

def calc_ll_mosaic_individual(hmm, roh_in=1, roh_out=10, roh_jump=100):
    """Calculate the Log Likelihood of Mosaic Individual for an explicit HMM object.

    This is the former (hmm, roh_in, roh_out, roh_jump) variant. It used to
    share its name with the parameter-vector variant below and was therefore
    silently overwritten; it now has a distinct name.

    hmm:      Object providing t_obj.set_params(...) and calc_posterior(...)
    roh_in, roh_out, roh_jump: Transition parameters handed to set_params
    Return:   The total log likelihood (4th return value of calc_posterior)"""
    hmm.t_obj.set_params(roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)  # Set the Parameters
    _, _, _, tot_ll = hmm.calc_posterior(save=False, full=True)  # Calculate the LL
    return tot_ll

def ll_mosaic_individual(prms):
    """Calculate the negative log likelihood of Mosaic individual.
    ASSUME THAT HMM (hmm) IS INITIALIZED PROPERLY!!!
    (The HMM object is read from the notebook-level variable `hmm`.)

    prms:   Sequence of exactly three values [roh_in, roh_out, roh_jump]
    Return: The NEGATIVE total log likelihood, so the function can be
            handed directly to a minimizer."""
    roh_in, roh_out, roh_jump = prms
    print("\nParameters Current Step:")
    print(f"ROH In {roh_in:.3f}")
    print(f"ROH Out {roh_out:.3f}")
    print(f"ROH Jump: {roh_jump:.3f}")

    tot_ll = calc_ll_mosaic_individual(hmm, roh_in=roh_in, roh_out=roh_out, roh_jump=roh_jump)

    print(f"LL: {tot_ll:.6f}")
    return -tot_ll  # Return Negative one (for minimization)

### Load HMM Object

In [23]:
# Settings for the single test individual. All of these are notebook-level
# names; prep_hmm_object picks up `iid` from the notebook namespace.
path_mosaic = "./Simulated/1000G_Mosaic/TSI5/ch3_8cm/"
exclude_pops = ["TSI"]
iid, ch, n_ref = "iid0", 3, 503

# Build the HMM object once, so that the following cells can reuse it
hmm = prep_hmm_object(
    path_mosaic=path_mosaic,
    prefix_out="",
    exclude_pops=exclude_pops,
    ch=ch,
    n_ref=n_ref,
)


Loaded 77652 variants
Loaded 100 individuals
HDF5 loaded from ./Simulated/1000G_Mosaic/TSI5/ch3_8cm/data.h5

Loaded 77652 variants
Loaded 503 individuals
HDF5 loaded from ./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr3.hdf5
396 / 503 Individuals included in Reference


In [24]:
### Test the log likelihood evaluation at a single parameter combination
# Earlier keyword-argument call style, kept for reference:
#ll = ll_mosaic_individual(hmm, roh_in=1, roh_out=10, roh_jump=100)
# NOTE: ll_mosaic_individual returns -tot_ll, so `ll` holds the NEGATIVE log likelihood
ll = ll_mosaic_individual([1,10,100])
ll


Parameters Current Step:
ROH In 1.000
ROH Out 10.000
ROH Jump: 100.000
Reference Number: 792
Total Log likelihood: -24388.063
Likelihood: -24388.063309


24388.063309288642

# Optimize Function

In [21]:
# Initial guess for the parameter vector, unpacked by ll_mosaic_individual
# as (roh_in, roh_out, roh_jump)
x0 = np.array([1, 10, 100])

# Minimize the negative log likelihood with Nelder-Mead
res = minimize(
    fun=ll_mosaic_individual,
    x0=x0,
    method="nelder-mead",
    options=dict(fatol=1e-2, disp=True),
)


Parameters Current Step:
ROH In 1.000
ROH Out 10.000
ROH Jump: 100.000
Minimum Genetic Map: 0.0000
Maximum Genetic Map: 2.2326
Gaps bigger than 0.1 cM: 214
Maximum Gap: 0.2348 cM
Loaded Transition and Emission Matrix:
(3, 3)
(793, 77652, 2)
Loaded Observations:
(77652,)
Reference Number: 792
Total Log likelihood: -24286.566
Likelihood: -24286.565636

Parameters Current Step:
ROH In 1.050
ROH Out 10.000
ROH Jump: 100.000
Minimum Genetic Map: 0.0000
Maximum Genetic Map: 2.2326
Gaps bigger than 0.1 cM: 214
Maximum Gap: 0.2348 cM
Loaded Transition and Emission Matrix:
(3, 3)
(793, 77652, 2)
Loaded Observations:
(77652,)
Reference Number: 792


KeyboardInterrupt: 

In [None]:
# Show the parameter vector stored on the optimizer result (`res` comes from the
# minimize call above, so that cell has to run to completion first).
print(res.x)

# Area51

In [None]:
### From HMM Object

def optimze_ll_transition_param(self, roh_trans_params):
    """Calculate and return the log likelihoods for Transitions Parameters.

    self:             HMM object providing t_obj.set_params(roh_jump=...)
                      and calc_posterior(save=..., full=...)
    roh_trans_params: Iterable of values that are tried, one after the other,
                      as roh_jump. Any iterable works (it need not have a length).
    Return:           Numpy array with one total log likelihood per tried value,
                      in the order of the input. Empty input gives an empty array."""
    ll_hoods = []  # The Vector for the log likelihoods

    for p in roh_trans_params:
        # Set the transition Parameter, then recalculate the total log likelihood
        self.t_obj.set_params(roh_jump=p)
        _, _, _, tot_ll = self.calc_posterior(save=False, full=True)
        ll_hoods.append(tot_ll)

    return np.array(ll_hoods)