In [1]:
# import needed packages
import multiprocessing as mp
import numpy as np
import pandas as pd
import parallelization_demo_model as pdm
import time
from typing import *


# read in data for random n
df_random_n = pd.read_csv("random_n.csv")
# set the number of values to iterate over 
n_iter = 20
n_iter = min(n_iter, len(df_random_n))

# setup some fields 
field_id = "random_n_id"
field_output = "function_calc"
field_random_n = "random_n"




In [2]:
# demonstration run using a large n
t0 = time.time()
n = 5000
rv = pdm.log_sum_binomial(n, None)
time.time() - t0

5.634175062179565

In [3]:
###########################
#    SERIAL 'FOR' LOOP    #
###########################

# initialize output values
vec_logsums = [0 for x in range(len(df_random_n))]

# set timer baseline
t0_serial = time.time()

# simple loop over
for i in range(n_iter):
    # important for this model to conver to int based on numerical issues
    vec_logsums[i] = pdm.log_sum_binomial(int(df_random_n[field_random_n].iloc[i]), int(df_random_n[field_id].iloc[i]))
    if i%round(n_iter/5) == 0:
        print("\t%s iterations complete"%(i + 1))

t1_serial = time.time()
t_elapse_serial = t1_serial - t0_serial

print("Serial run complete in %s seconds."%t_elapse_serial)



	1 iterations complete
	5 iterations complete
	9 iterations complete
	13 iterations complete
	17 iterations complete
Serial run complete in 77.94128203392029 seconds.


In [4]:
###############################
#    ASYNCHRONOUS PARALLEL    #
###############################

t0_par_async = time.time()

#
# SOLUTION TO GET APPLY_ASYNC TO WORK WITH JUPYTER LAB: FUNCTION HAS TO BE PLACED IN MODULE AND IMPORTED: https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3
# https://towardsdatascience.com/asynchronous-parallel-programming-in-python-with-multiprocessing-a3fc882b4023
#

# initialize output vector/array (pre-allocate memory)
vec_logsums_par_async = []

# set up dummy functions to get results
def _get_result(
    result: Any,
) -> None:
    
    global vec_logsums_par_async
    
    # update
    vec_logsums_par_async.append(result)
    
    return None

    

# check to ensure current module is "__main__"; this is necessary in scripts that use multiprocessing. Without it, the processing framework will run the entirety of the original script in parallel
if __name__ == "__main__":
    
    # start the MP pool for asynchronous parallelization
    pool = mp.Pool(int(mp.cpu_count()))

    # apply the function; note: if the function only takes one argument (e.g., f(x)), make sure the args is args = (x, ) - that extra comma is important
    for i in range(n_iter):
        pool.apply_async(
            # target function
            pdm.log_sum_binomial,
            # function arguments 
            args = (
                int(df_random_n[field_random_n].iloc[i]), 
                int(df_random_n[field_id].iloc[i])
            ),
            callback = _get_result,
        )

        """
        pseudocode attempt at describing this function:
            for i in 0:(n_iter - 1):
                assign task i to Pool
                use pdm.log_sum_binomial with arguments (int(df_random_n[field_random_n].iloc[i]), int(df_random_n[field_id].iloc[i]))
                when task i finishes, apply _get_result to the output
        """;

    pool.close()
    pool.join()
    t1_par_async = time.time()

    # 
    t_elapse_par_async = t1_par_async - t0_par_async

# print the reduction in time
print("Asynchronous parallelization across %s cores reduced computational time by %s%s."%(mp.cpu_count(), round(100*(1 - t_elapse_par_async/t_elapse_serial), 2), "%"))



Asynchronous parallelization across 14 cores reduced computational time by 87.05%.


In [6]:
# examine results from the pool, which gives us tuples with the random_n_id + the output value associated with it
vec_logsums_par_async


[(4, 2796.848873559379),
 (11, 2842.5965874763356),
 (9, 2874.4813577820933),
 (13, 2909.83186399065),
 (2, 2959.0453138104062),
 (12, 3044.9955641998395),
 (5, 3055.392771908239),
 (6, 3058.8585078110386),
 (8, 3180.852411589589),
 (7, 3223.8275367843057),
 (3, 3259.1780429928626),
 (14, 3284.8244886735806),
 (1, 3332.651644132217),
 (10, 3366.615855979654),
 (15, 2974.9876989632853),
 (20, 2775.361310962021),
 (19, 2891.116890115532),
 (17, 3015.190235435762),
 (18, 3176.693528506229),
 (16, 3415.1361586188505)]

In [15]:
##  verify the values shown above (interactive)
# set the random id to check
rand_id_check = 4
# get the applicable data row
row = df_random_n[df_random_n[field_id] == rand_id_check]
#
pdm.log_sum_binomial(int(row[field_random_n]), int(row[field_id]))


(4, 2796.848873559379)

In [16]:
##############################
#    SYNCHRONOUS PARALLEL    #
##############################

#
# check to ensure current module is "__main__"; this is necessary in scripts that use multiprocessing. Without it, the processing framework will run the entirety of the original script in parallel
# more on this is available at: https://docs.python.org/3/library/multiprocessing.html#multiprocessing-programming
#
# this approach is similar to running it in R
#

if __name__ == "__main__":
    
    # start the MP pool for asynchronous parallelization
    n_cores = mp.cpu_count()

    # copy the range
    list_task = list(range(n_iter))
    
    # set the outer return dictionary
    return_values = {}
    
    # set kill timer
    t0_par_sync = time.time()
    # upper threshold
    t_max = len(list_task)*20
    
    # set
    while ((len(list_task) > 0) & (time.time() - t0_par_sync < t_max)):
        
        # initialize the manager
        man = mp.Manager()
        return_dict = man.dict()
        
        # initialize the list of processes
        processes = []
        list_task_drop = []
        
        # start processes on available cores
        for i in range(min(n_cores, len(list_task))):
            
            # get the row index to work with
            ind = list_task[i]
            list_task_drop.append(ind)
            
            p = mp.Process(
                target = pdm.log_sum_binomial_sync,
                args = (int(df_random_n[field_random_n].iloc[ind]), int(df_random_n[field_id].iloc[ind]), return_dict)
            )
            
            processes.append(p)
            p.start()
            
        # loop to close and join after starting
        for p in processes:
            p.join()

        # update the return values
        return_values.update(return_dict)
        
        # reduce the task list
        list_task = [x for x in list_task if x not in list_task_drop]

    t1_par_sync = time.time()

    # 
    t_elapse_par_sync = t1_par_sync - t0_par_sync


# print the reduction in time
print("Synchronous parallelization across %s cores reduced computational time by %s%s."%(n_cores, round(100*(1 - t_elapse_par_sync/t_elapse_serial), 2), "%"))



Synchronous parallelization across 12 cores reduced computational time by 67.7%.


In [17]:
t_elapse_serial

177.7190670967102

In [18]:
t_elapse_par_async

50.61187791824341

In [19]:
t_elapse_par_sync

57.39674234390259

# Try aligning the output to the input in a single data frame
- use `pd.DataFrame` along with 

In [None]:
import graph_tool.all as gt
g = gt.collection.ns["kangaroo"]

In [30]:
%pip install graph_tool

[31mERROR: Could not find a version that satisfies the requirement graph_tool (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for graph_tool[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [33]:
y = f(w1:w6) = w1x1 + w2x2 + w3x3 + w4x4 + w5x5 + w6x6 where (x1,x2,x3,x4,x5,x6)=(4,-2,3.5,5,-11,-4.7) and y=44

SyntaxError: invalid syntax (2044189652.py, line 1)