In [None]:
import numpy 
import cupy 
import time
import importlib

## NumPy (CPU) vs. CuPy (GPU) -- math operations

In [None]:
def matrix_multiplication_helper(m, sh):
    """Time the creation of a random matrix and its product with itself.

    Parameters
    ----------
    m : str
        Importable name of the array backend, e.g. ``'numpy'`` or ``'cupy'``.
    sh : tuple of int
        Shape of the random matrix (square, so that ``a.dot(a)`` is defined).

    Returns
    -------
    float
        Elapsed wall-clock seconds, covering both the random generation and
        the matrix product.
    """
    # Resolve the backend from the argument itself. The previous body read a
    # global named ``mod`` and only worked when the caller's loop variable
    # happened to have that name.
    xp = importlib.import_module(m)

    start = time.perf_counter()
    a = xp.random.random(sh)
    # Keep the result referenced so releasing it is not part of the timing.
    product = a.dot(a)
    # GPU backends queue kernels asynchronously; wait for the device before
    # stopping the clock, otherwise only the launch overhead is measured.
    cuda = getattr(xp, "cuda", None)
    if cuda is not None:
        cuda.get_current_stream().synchronize()
    end = time.perf_counter()
    return end - start
    

def sum_helper(m, sh):
    """Time the creation of a random array and the sum of all its elements.

    Parameters
    ----------
    m : str
        Importable name of the array backend, e.g. ``'numpy'`` or ``'cupy'``.
    sh : tuple of int
        Shape of the random array.

    Returns
    -------
    float
        Elapsed wall-clock seconds, covering both the random generation and
        the reduction.
    """
    # Resolve the backend from the argument rather than from a global ``mod``.
    xp = importlib.import_module(m)

    start = time.perf_counter()
    a = xp.random.random(sh)
    # ``a.sum`` alone only looks the method up; it has to be called for the
    # reduction to run at all.
    total = a.sum()
    # GPU backends queue kernels asynchronously; wait for the device before
    # stopping the clock, otherwise only the launch overhead is measured.
    cuda = getattr(xp, "cuda", None)
    if cuda is not None:
        cuda.get_current_stream().synchronize()
    end = time.perf_counter()
    return end - start


# Benchmark grid: every helper is run against every backend and every shape,
# `duplicates` times each, and the mean wall-clock time is reported.
funcs = [matrix_multiplication_helper, sum_helper]
modules = ['numpy', 'cupy']
shapes = [(side, side) for side in (1000, 10000, 20000)]
duplicates = 3

for fun in funcs:
    print(f"\nFUN : ", fun)
    # NOTE: the loop variable must stay named `mod`; the helper cells have
    # historically read it as a global.
    for mod in modules:
        print('module : ', mod)
        for sh in shapes:
            print('shape : ', sh)
            run_times = []
            for _ in range(duplicates):
                exec_time = fun(mod, sh)
                print('time : ', exec_time)
                run_times.append(exec_time)
                # Pause between repetitions (presumably to let the device
                # settle between runs -- confirm with the author).
                time.sleep(8)
            print('avg : ', sum(run_times) / float(duplicates))
            



## pandas (CPU) vs. cuDF (GPU) -- DataFrame operations

In [None]:
# Directory holding the trip-data CSV files. It is concatenated directly with
# the file names below, so it has to end with a path separator.
data_dir = '<PATH_TO_DATA>'
file_name1, file_name2 = 'yellow_tripdata_2015-01.csv', 'yellow_tripdata_2015-02.csv'

# `data_path2` lists both files (input of the merge benchmark); `data_path`
# is the first of them (input of the read benchmark).
data_path2 = [data_dir + name for name in (file_name1, file_name2)]
data_path = data_path2[0]

def read_csv(mod, data_path, nrows):
    """Measure how long ``<mod>.read_csv`` takes to load a CSV file.

    Parameters
    ----------
    mod : str
        Importable name of a module exposing ``read_csv`` (the notebook
        passes ``'pandas'`` and ``'cudf'``).
    data_path : str
        Path of the CSV file to read.
    nrows : int
        Number of rows to read, forwarded as ``nrows=``.

    Returns
    -------
    float
        Elapsed wall-clock seconds of the ``read_csv`` call.
    """
    backend = importlib.import_module(mod)
    t0 = time.time()
    # The frame is bound to a name so it stays alive until the clock stops.
    frame = backend.read_csv(data_path, nrows=nrows)
    return time.time() - t0

def merge_DataFrames(mod, data_path2, nrows):
    """Measure reading several CSV files and merging the first two.

    Parameters
    ----------
    mod : str
        Importable name of a module exposing ``read_csv`` (the notebook
        passes ``'pandas'`` and ``'cudf'``).
    data_path2 : list of str
        Paths of the CSV files to read; every file is read, the first two
        are merged.
    nrows : int
        Number of rows to read from each file.

    Returns
    -------
    float
        Elapsed wall-clock seconds, covering the reads *and* the merge.
    """
    backend = importlib.import_module(mod)
    t0 = time.time()

    frames = []
    for path in data_path2:
        frames.append(backend.read_csv(path, nrows=nrows))

    left, right = frames[0], frames[1]
    # Result is kept referenced so releasing it is not part of the timing.
    merged = left.merge(right, on="trip_distance")
    return time.time() - t0


# Benchmark grid for the dataframe libraries: every function in `funcs` is
# run `duplicates` times per backend and the mean wall-clock time is printed.
funcs = [read_csv, merge_DataFrames]
modules = ['pandas', 'cudf']
nrows = 10_000_000
duplicates = 4

# The two benchmarks take different inputs: `read_csv` reads a single file,
# `merge_DataFrames` needs the list of files to read and join.
path_args = {read_csv: data_path, merge_DataFrames: data_path2}

for fun in funcs:
    print("\nFUN : ", fun)
    for mod in modules:
        print(f"\nModule: {mod}")
        dup_sum = 0
        for i in range(duplicates):
            # Call the function under test. The previous version always
            # called `read_csv`, so `merge_DataFrames` was never measured.
            # NOTE(review): a join of two `nrows`-row frames on a key with
            # repeated values can produce a very large result -- confirm
            # that `nrows` is appropriate for the merge benchmark.
            exec_time = fun(mod, path_args[fun], nrows)
            print(f"Run {i+1}: {exec_time:.4f} sec")
            dup_sum += exec_time
        print(f"Avg time: {dup_sum / duplicates:.4f} sec")


## scikit-learn (CPU) vs. cuML (GPU) -- machine learning operations

In [None]:
import importlib
import numba, numba.cuda
import numpy as np
import pandas as pd
import time

# Directory and file name of the gzip-compressed NumPy array used by the ML
# benchmark. The two are concatenated directly further down, so the directory
# has to end with a path separator.
mortgage_data_path, mortgage_file = "<PATH_TO_DATA>", "mortgage.npy.gz"



def _columns_to_frame(block):
    """Build a DataFrame with one ``fea<i>`` column per column of a 2-D array."""
    return pd.DataFrame({"fea%d" % i: block[:, i] for i in range(block.shape[1])})


def load_data(nrows, ncols, cached, train_split=1.0, label_col=None):
    """Load a gzip-compressed ``.npy`` array and sample rows from it.

    ``nrows`` rows are drawn at random, with replacement, from the stored
    2-D array and at most ``ncols`` feature columns are kept.

    Parameters
    ----------
    nrows : int
        Number of rows to sample.
    ncols : int
        Maximum number of feature columns to keep.
    cached : str
        Path of the gzip-compressed ``.npy`` file.
    train_split : float, default 1.0
        Fraction of the sampled rows that goes to the training set.
    label_col : int or None, default None
        Index of the column to use as the label.

    Returns
    -------
    dict or pandas.DataFrame
        When ``train_split < 1.0`` and ``label_col`` is given, a dict with the
        DataFrames ``X_train``, ``X_test``, ``y_train`` and ``y_test``; the
        label column is excluded from the features. Otherwise a single
        DataFrame holding the sampled rows. Columns are named ``fea0``,
        ``fea1``, ...

    Raises
    ------
    FileNotFoundError
        If ``cached`` does not exist.
    """
    import gzip
    import os

    if not os.path.exists(cached):
        # throws FileNotFoundError error if mortgage dataset is not present
        raise FileNotFoundError(
            "Please download the required dataset or check the path"
        )

    with gzip.open(cached) as f:
        X = np.load(f)

    train_rows = int(nrows * train_split)
    # randint's upper bound is exclusive: passing X.shape[0] lets every row
    # (including the last one) be drawn, and also works for one-row arrays.
    rindices = np.random.randint(0, X.shape[0], nrows)

    if train_split < 1.0 and label_col is not None:
        # Slice the label out *before* dropping its column from the features;
        # doing it afterwards would pick up the neighbouring feature instead.
        y = X[:, label_col : label_col + 1]
        X = X[:, [i for i in range(X.shape[1]) if i != label_col]]

        # The same row indices are used for features and labels so that the
        # two stay aligned.
        X = X[rindices, :ncols]
        y = y[rindices]

        return {
            "X_train": _columns_to_frame(X[:train_rows]),
            "X_test": _columns_to_frame(X[train_rows:]),
            "y_train": _columns_to_frame(y[:train_rows]),
            "y_test": _columns_to_frame(y[train_rows:]),
        }

    return _columns_to_frame(X[rindices, :ncols])


def load_mortgage(mod, shape, data_path):
    """Load a train/test split of the dataset in the format ``mod`` expects.

    Parameters
    ----------
    mod : str
        Benchmark backend name; ``"cuml"`` triggers conversion to cuDF
        objects, anything else keeps the pandas objects.
    shape : tuple of int
        ``(rows, columns)`` requested from ``load_data``.
    data_path : str
        Path of the cached dataset file, forwarded to ``load_data``.

    Returns
    -------
    dict
        ``{"module": mod, "data": data}`` where ``data`` is what
        ``load_data`` returned, converted to cuDF when ``mod == "cuml"``.
    """
    n_rows, n_cols = shape[0], shape[1]
    data = load_data(
        nrows=n_rows,
        ncols=n_cols,
        cached=data_path,
        train_split=0.8,
        label_col=4,
    )

    # Non-GPU backends use the pandas objects as they are.
    if mod != "cuml":
        return {"module": mod, "data": data}

    import cudf

    if not isinstance(data, dict):
        data = cudf.DataFrame.from_pandas(data)
    else:
        for key in list(data):
            data[key] = cudf.DataFrame.from_pandas(data[key])
        # The training target is handed over as a Series built from its
        # single `fea0` column.
        data["y_train"] = cudf.Series(data["y_train"]["fea0"])

    return {"module": mod, "data": data}
    

def run_benchmark(mod, shape, data_path, duplicates):
    """Run each ML benchmark `duplicates` times and print per-run and mean times.

    Parameters
    ----------
    mod : str
        Backend name passed through to the benchmark function.
    shape : tuple of int
        ``(rows, columns)`` of the data to benchmark on.
    data_path : str
        Path of the cached dataset file.
    duplicates : int
        Number of repetitions per benchmark.
    """
    for helper_func in (linear_regression,):
        # Human-readable title derived from the function name.
        label = helper_func.__name__.replace('_helper', '').replace('_', ' ').title()
        print(f"\n{label} - Module: {mod}, Shape: {shape}")

        timings = []
        for run_no in range(1, duplicates + 1):
            elapsed = helper_func(mod, shape, data_path)
            print(f"Run {run_no}: {elapsed:.4f} sec")
            timings.append(elapsed)

        print(f"Avg time: {sum(timings) / duplicates:.4f} sec")





def linear_regression(mod, shape, data_path):
    """Time fitting a ``LinearRegression`` model with the chosen backend.

    Parameters
    ----------
    mod : str
        ``"sklearn"`` selects ``sklearn.linear_model``; any other value
        selects ``cuml``.
    shape : tuple of int
        ``(rows, columns)`` of the training data to load.
    data_path : str
        Path of the cached dataset file.

    Returns
    -------
    float
        Elapsed wall-clock seconds of the ``fit`` call only; loading the data
        and building the estimator are not timed.
    """
    module_name = "sklearn.linear_model" if mod == "sklearn" else "cuml"
    estimator_lib = importlib.import_module(module_name)

    loaded = load_mortgage(mod, shape, data_path)["data"]
    X_train, y_train = loaded["X_train"], loaded["y_train"]

    options = dict(fit_intercept=True)
    if mod == "cuml":
        # Only the cuML estimator is given an explicit solver.
        options.update(algorithm="eig")
    model = estimator_lib.LinearRegression(**options)

    t0 = time.time()
    model.fit(X_train, y_train)
    return time.time() - t0
   
    
    

# ML benchmark grid: each backend is run on three data sizes
# (2**20, 2**21 and 2**22 rows, 512 columns each).
modules = ["sklearn", "cuml"]
shapes = [(2 ** power, 512) for power in (20, 21, 22)]
data_path = f"{mortgage_data_path}{mortgage_file}"
duplicates = 4

for mod in modules:
    for shape in shapes:
        run_benchmark(mod=mod, shape=shape, data_path=data_path, duplicates=duplicates)

