# Parallel Mosaic Creation 
Notebook to run the creation of Mosaic individuals in parallel.

It imports the code for the Mosaic creation and then defines functions that parallelize it for various cases.

@Author: Harald Ringbauer, June 2019

In [1]:
import numpy as np
import os as os
import sys as sys
import multiprocessing as mp
import socket

# Pick the repository root depending on which machine the notebook runs on.
socket_name = socket.gethostname()
print(socket_name)
if socket_name == "VioletQueen":
    path = "/home/harald/git/HAPSBURG/"   # The Path on Harald's machine
elif socket_name.startswith("midway2"):
    print("Midway jnovembre partition detected.")
    path = "/project/jnovembre/hringbauer/HAPSBURG/"  # The Path on Midway Cluster
else:
    raise RuntimeWarning("Not compatible machine. Check!!")

os.chdir(path)  # Set the right Path (in line with Atom default)

# Relative to the repository root we just changed into.
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
mosaic_code_dir = "./Python3/create1000G_Mosaic/"
if mosaic_code_dir not in sys.path:
    sys.path.append(mosaic_code_dir)
from createMosaicsMulti import Mosaic_1000G_Multi  # Import the object that can create the Multiruns

print(os.getcwd())  # Show the current working directory. Should be the HAPSBURG root set above
print(f"CPU Count: {mp.cpu_count()}")

midway2-0401.rcc.local
Midway jnovmbre partition detected.
/project/jnovembre/hringbauer/HAPSBURG
CPU Count: 28


# Function for the MultiRun

In [2]:
def create_individual_mosaic(base_path="./Simulated/1000G_Mosaic/TSI1/",
                             path1000G="./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr",
                             pop_list=["TSI"], n=2, ch=3, chunk_length=0.005, l=1, n_blocks=5):
    """Configure one Mosaic_1000G_Multi run and create its individuals.

    Everything printed while the run is active is written to
    <save_path>/mosaic_out.txt, where save_path is
    base_path + "ch<ch>_<int(l)>cm/". sys.stdout is restored (and the log
    file closed) when the run finishes or raises.

    base_path:  Start of the save paths; must end with a path separator
    path1000G: Where to find the 1000 Genome Data
    pop_list: The Reference Populations for Mosaic
    n: Number of Individuals to simulate
    ch: Chromosome to use
    chunk_length: Lengths of the Chunks to mosaic
    l: Length of each copied-in block. It is scaled by 0.01 before being
       handed to the run and truncated to an int for the folder name
       (presumably centiMorgan, given the "cm" folder suffix -- TODO confirm)
    n_blocks: The Nr of the Blocks to copy in"""
    from contextlib import redirect_stdout  # Local import keeps this cell self-contained

    save_path = base_path + "ch" + str(ch) + "_" + str(int(l)) + "cm/"

    # exist_ok avoids a check-then-create race when several workers start at once
    os.makedirs(save_path, exist_ok=True)

    print(f"Setting save path...: {save_path}")

    ########### Pipe the output
    # Scoped redirect: the log file is flushed and closed, and sys.stdout is put
    # back, even if the run raises. Otherwise a direct call from the notebook
    # would leave every later print going into the log file.
    with open(save_path + "mosaic_out.txt", 'w') as log_file, redirect_stdout(log_file):
        t = Mosaic_1000G_Multi()  # Create the MultiRun Object

        ##################################
        ### Set the parameters for the run
        t.pop_list = pop_list
        t.path1000G = path1000G
        t.n = n
        t.chunk_length = chunk_length
        t.ch = ch  # The Chromosome
        t.roh_lengths = np.ones(n_blocks) * 0.01 * l  # n_blocks entries, each 0.01 * l
        t.save_path = save_path
        t.load_m_object()
        t.create_individuals()
    
def multi_run(fun, prms, processes = 4):
    """Implementation of running in Parallel.

    fun: Function to call once per parameter set
    prms: List of parameter lists; each entry is unpacked into the
          positional arguments of fun (pool.starmap)
    processes: How many Processes to use

    Returns the list of return values of fun, one per entry of prms
    (previously the results were collected but discarded)."""
    print(f"Running {len(prms)} jobs in parallel.")

    with mp.Pool(processes = processes) as pool:
        results = pool.starmap(fun, prms)
    return results

### Multirun TSI Individuals

In [3]:
#### Settings shared by all TSI runs (one starmap job per block length)
base_path = "./Simulated/1000G_Mosaic/TSI5/"
path1000G = "./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr"
pop_list = ["TSI"]
n = 100                 # Number of Individuals
ch = 3
chunk_length = 0.0025
l = 0                   # Placeholder; the loop below assigns the actual block lengths
n_blocks = 5            # How many blocks will be copied in

### One parameter list per block length (input for starmap)
prms = []
for l in range(2, 11, 2):   # 2, 4, 6, 8, 10
    prms.append([base_path, path1000G, pop_list, n, ch, chunk_length, l, n_blocks])

assert len(prms[0]) == 8   # The function takes 8 Parameters as input

In [5]:
# Launch one worker job per parameter list built above
multi_run(fun=create_individual_mosaic, prms=prms, processes=8)

Running 5 jobs in parallel.
Setting save path...: ./Simulated/1000G_Mosaic/TSI5/ch3_6cm/
Setting save path...: ./Simulated/1000G_Mosaic/TSI5/ch3_4cm/
Setting save path...: ./Simulated/1000G_Mosaic/TSI5/ch3_10cm/
Setting save path...: ./Simulated/1000G_Mosaic/TSI5/ch3_2cm/
Setting save path...: ./Simulated/1000G_Mosaic/TSI5/ch3_8cm/


In [7]:
# Simple marker message to run after the parallel jobs above
print("Run complete")

Run complete


### Create False Positive Individuals without any copied in blocks (for TSI)

In [5]:
#### Settings for the false-positive run: same TSI setup, but nothing copied in
base_path = "./Simulated/1000G_Mosaic/TSI5/"
path1000G = "./Data/1000Genomes/HDF5/1240kHDF5/Eur1240chr"
pop_list = ["TSI"]
n = 100                 # Number of Individuals
ch = 3
chunk_length = 0.0025
l = 0                   # No blocks copied in
n_blocks = 0            # How many blocks will be copied in

### A single parameter list (input for starmap)
prms = [[base_path, path1000G, pop_list, n, ch, chunk_length, l, n_blocks]]

assert len(prms[0]) == 8   # The function takes 8 Parameters as input

In [6]:
# Launch the single false-positive job through the same pool helper
multi_run(fun=create_individual_mosaic, prms=prms, processes=4)
print("Run Complete!")

Running 1 jobs in parallel.
Setting save path...: ./Simulated/1000G_Mosaic/TSI5/ch3_0cm/
Run Complete!


In [7]:
# Simple marker message to run after the false-positive job above
print("Run Complete!")

Run Complete!


# Test another Population [CHB, CLM and YRI]
Loop through pops and through lengths

In [None]:
#### Settings shared by all runs on the non-European reference panel
path1000G = "./Data/1000Genomes/HDF5/1240kHDF5/NonEur1240chr"

n = 100                 # Number of Individuals
ch = 3
chunk_length = 0.0025
l = 0                   # Will be overwritten by the inner loop below
n_blocks = 0            # How many blocks will be copied in (Default: 5)

# Currently only CLM; the full set would be ["CHB", "CLM", "YRI"]
target_pops = ["CLM"]

### One parameter list per (population, block length) pair (input for starmap)
prms = []
for t in target_pops:
    base_path = f"./Simulated/1000G_Mosaic/{t}/"
    pop_list = [str(t)]

    # Currently only length 0; the full set would be [2, 4, 6, 8, 10]
    for l in [0]:
        prms.append([base_path, path1000G, pop_list, n, ch, chunk_length, l, n_blocks])

assert len(prms[0]) == 8   # The function takes 8 Parameters as input

In [None]:
# Launch one worker job per (population, block length) parameter list
multi_run(fun=create_individual_mosaic, prms=prms, processes=5)

# Area 51

In [6]:
# Scratch: run the first parameter set directly in this process (no pool)
create_individual_mosaic(*prms[0])

Setting save path...: ./Simulated/1000G_Mosaic/CLM/ch3_0cm/


In [8]:
# Scratch: marker message to run after the direct call above
print("Run finished")

In [5]:
# Scratch: how many parameter sets are currently in prms
len(prms)

1

In [17]:
# Scratch: show the first parameter set
prms[0]

['./Simulated/1000G_Mosaic/CLM/',
 './Data/1000Genomes/HDF5/1240kHDF5/NonEur1240chr',
 ['CLM'],
 100,
 3,
 0.0025,
 0,
 0]