In [20]:
# import pandas dataframe
import pandas as pd
import numpy as np
import os, os.path
import time
import itertools
from typing import *

# Directory Init
- Set the working directory here to start
- Use `os.path.join` to avoid system-dependent path structs
    - **Note**: If we are writing a python script, we can set the directory of the script/module that is being called by using `os.path.dirname(os.path.realpath(__file__))`

In [6]:
dir_project = "/Users/jsyme/Documents/Projects/git_jbus/dmdu_scalable_computing_2023"
dir_workbooks = os.path.join(dir_project, "workbooks")
dir_joins = os.path.join(dir_workbooks, "data_examples", "joins")
os.path.exists(dir_joins)

True

In [7]:
# note, if you launched the notebook from the directory you want to use as the base, you can use the following
os.path.dirname(os.getcwd())

'/Users/jsyme/Documents/Projects/git_jbus/dmdu_scalable_computing_2023'

# Syntax basics

##  if/else

In [13]:
# if statements can be executed in a few ways
x = 5

if x > 10:
    print("It's larger than 10!")
else:
    print("Failure. It's less than or equal to 10.")
    
    

Failure. It's less than or equal to 10.


In [9]:
print("It's larger than 10!") if (x > 10) else print("Failure. It's less than or equal to 10.")

Failure. It's less than or equal to 10.


In [15]:
y = 10.6 if (x > 10) else np.exp(4)
y

54.598150033144236

##  Iterating using for/while

In [16]:
for i in range(10):
    print(i)

0
1
2
3
4
5
6
7
8
9


In [17]:
i = 0
while i <= x:
    print(i)
    i += 1

0
1
2
3
4
5


##  Functions

In [19]:
# basic definition structure
def myfunc(x):
    out = x + 2
    
    return x

myfunc(4)

4

In [21]:
?myfunc

[0;31mSignature:[0m [0mmyfunc[0m[0;34m([0m[0mx[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      /var/folders/8m/3ll2cn6d1hdcs6gjqxr2jx5d2hffc9/T/ipykernel_8265/2404259888.py
[0;31mType:[0m      function


In [None]:
# some more advanced function elements
def myfunc(
    x: Union[float, int],
    param: Union[float, int] = 0.5,
) -> Union[float, int]:
    """
    Here's a docstring. `myfunc` transforms x using the parameter param
    
    Function Arguments
    ------------------
    
    Keyword Arguments
    -----------------
    - param: 
    """
    
    out = x + 2
    out **= param
    
    return x

myfunc(4)

# Read data, set some shared variables
- Good practice to set variables that will be called often instead of hard-coding (e.g., fields)
     - more generalizable, easier to change things, helps more systematically describe what's going on

In [47]:
# set some shared variables
field_key_future = field_key_future
field_key_strategy = field_key_strategy

fp_input_ex = os.path.join(dir_workbooks, "class_1_dataframes_example_input.csv")
fp_output_ex = os.path.join(dir_workbooks, "class_1_dataframes_example_output.csv")

# read in data frmes
df_input = pd.read_csv(fp_input_ex)
df_output = pd.read_csv(fp_output_ex)



In [48]:
##  EXAMINE THE TABLES

# input table has two input fields associated with two keys (future_id and strategy_id)
df_input

Unnamed: 0,future_id,strategy_id,input_1,input_2
0,0,1,0.77787,5.700752
1,1,1,0.511656,6.81565
2,2,1,0.96815,11.019067
3,11,1,0.402535,13.720394
4,12,1,0.662319,12.698714
5,13,1,0.15179,11.044997
6,14,1,0.118141,20.197099
7,0,2,0.729354,14.512959
8,1,2,0.108056,9.742377
9,2,2,0.103644,16.895573


In [49]:
# use the .head() method to see the first few rows; try .tail() to see the final few lines
df_input.head()

Unnamed: 0,future_id,strategy_id,input_1,input_2
0,0,1,0.77787,5.700752
1,1,1,0.511656,6.81565
2,2,1,0.96815,11.019067
3,11,1,0.402535,13.720394
4,12,1,0.662319,12.698714


In [50]:
# use .iloc to pull based on row/columns number

# loc uses the names of rows/columns 
df_input.loc[0:4]


Unnamed: 0,future_id,strategy_id,input_1,input_2
0,0,1,0.77787,5.700752
1,1,1,0.511656,6.81565
2,2,1,0.96815,11.019067
3,11,1,0.402535,13.720394
4,12,1,0.662319,12.698714


In [51]:
# iloc uses the index of rows and columns, which is not necessarily the same
df_input.iloc[0:4]

Unnamed: 0,future_id,strategy_id,input_1,input_2
0,0,1,0.77787,5.700752
1,1,1,0.511656,6.81565
2,2,1,0.96815,11.019067
3,11,1,0.402535,13.720394


In [55]:
# 
df2 = (
    df_input[
        (df_input[field_key_future] > 1) 
        & df_input[field_key_strategy].isin([2, 3])
    ].copy()
)
df2["new"] = 1
#df2 = df2.reset_index(drop = True)
df2.reset_index(drop = True, inplace = True)
df2

Unnamed: 0,future_id,strategy_id,input_1,input_2,new
0,2,2,0.103644,16.895573,1
1,11,2,0.819713,17.418069,1
2,12,2,1.047212,8.537533,1
3,13,2,0.499851,12.649008,1
4,14,2,0.776957,12.235339,1


# Now, 

In [57]:
# perform an inner join
df_inner = pd.merge(df_input, df_output)
# explicit
#df_inner = pd.merge(df_input, df_output, how = "inner", on = [field_key_future, field_key_strategy])
# explicity without shared fields 
# df_inner = pd.merge(df_input, df_output, how = "inner"), on = [])

df_inner

Unnamed: 0,future_id,strategy_id,input_1,input_2,metric_1,metric_2
0,0,1,0.77787,5.700752,9.284793,0.44426
1,1,1,0.511656,6.81565,12.024651,0.199272
2,2,1,0.96815,11.019067,0.266153,0.567921
3,0,2,0.729354,14.512959,-9.107368,3.04592
4,1,2,0.108056,9.742377,-0.278723,3.294757
5,2,2,0.103644,16.895573,2.455628,3.323217
6,0,4,-0.127865,13.827356,0.90602,0.183149
7,1,4,2.501769,18.53824,0.490319,0.347297
8,2,4,1.295715,18.783368,0.645601,0.1052


In [19]:
# perform an outer join
df_outer = pd.merge(df_input, df_output, how = "outer", on = [field_key_future, field_key_strategy])
df_outer


Unnamed: 0,future_id,strategy_id,input_1,input_2,metric_1,metric_2
0,0,1,0.77787,5.700752,9.284793,0.44426
1,1,1,0.511656,6.81565,12.024651,0.199272
2,2,1,0.96815,11.019067,0.266153,0.567921
3,11,1,0.402535,13.720394,,
4,12,1,0.662319,12.698714,,
5,13,1,0.15179,11.044997,,
6,14,1,0.118141,20.197099,,
7,0,2,0.729354,14.512959,-9.107368,3.04592
8,1,2,0.108056,9.742377,-0.278723,3.294757
9,2,2,0.103644,16.895573,2.455628,3.323217


In [34]:
# perform an left join; not
df_left = pd.merge(df_input, df_output, how = "left", on = [field_key_future, field_key_strategy])
df_left

Unnamed: 0,future_id,strategy_id,input_1,input_2,metric_1,metric_2
0,0,1,0.77787,5.700752,9.284793,0.44426
1,1,1,0.511656,6.81565,12.024651,0.199272
2,2,1,0.96815,11.019067,0.266153,0.567921
3,11,1,0.402535,13.720394,,
4,12,1,0.662319,12.698714,,
5,13,1,0.15179,11.044997,,
6,14,1,0.118141,20.197099,,
7,0,2,0.729354,14.512959,-9.107368,3.04592
8,1,2,0.108056,9.742377,-0.278723,3.294757
9,2,2,0.103644,16.895573,2.455628,3.323217


In [36]:
# for fun, confirm that the left join contains all values of future/strategy

# this creates a set (unique values) of (future_id, strategy_id) tuples contained in df_left, which performed a left join of df_output to df_input
fs_left = set([tuple(x) for x in np.array(df_left[[field_key_future, field_key_strategy]])])
# this creates a set (unique values) of (future_id, strategy_id) tuples contained in df_input, which df_output was left-joined to to create df_left
fs_input = set([tuple(x) for x in np.array(df_input[[field_key_future, field_key_strategy]])])

# are the sets of future/strategy combinations the same in df_left and df_input?
fs_input == fs_left


True

In [21]:
##  perform a right join
df_right = pd.merge(df_input, df_output, how = "right", on = [field_key_future, field_key_strategy])
df_right

Unnamed: 0,future_id,strategy_id,input_1,input_2,metric_1,metric_2
0,0,1,0.77787,5.700752,9.284793,0.44426
1,1,1,0.511656,6.81565,12.024651,0.199272
2,2,1,0.96815,11.019067,0.266153,0.567921
3,0,2,0.729354,14.512959,-9.107368,3.04592
4,1,2,0.108056,9.742377,-0.278723,3.294757
5,2,2,0.103644,16.895573,2.455628,3.323217
6,0,3,,,-1.881055,1.655185
7,1,3,,,1.99922,2.115266
8,2,3,,,3.462683,-0.114622
9,0,4,-0.127865,13.827356,0.90602,0.183149


##  Use the pd.DataFrame.join() method to join data frames on row indices
- contrasts with pd.merge(), which merges DataFrames on columns
- .join() merges on
    - a single column OR
    - index, which can be a multi-index
- .join() can be faster for joining a list of data frames that have low-dimensional indices


In [37]:
fields_in = [field_key_future, field_key_strategy]

df_in = df_input.copy()
df_in.set_index(fields_in, inplace = True)
df_out = df_output.copy()
df_out.set_index(fields_in, inplace = True)

# join types are the same
df_in.join(
    df_out,
    how = "inner"
)

Unnamed: 0_level_0,Unnamed: 1_level_0,input_1,input_2,metric_1,metric_2
future_id,strategy_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,0.77787,5.700752,9.284793,0.44426
1,1,0.511656,6.81565,12.024651,0.199272
2,1,0.96815,11.019067,0.266153,0.567921
0,2,0.729354,14.512959,-9.107368,3.04592
1,2,0.108056,9.742377,-0.278723,3.294757
2,2,0.103644,16.895573,2.455628,3.323217
0,4,-0.127865,13.827356,0.90602,0.183149
1,4,2.501769,18.53824,0.490319,0.347297
2,4,1.295715,18.783368,0.645601,0.1052


In [99]:
#
#    perform some aggregations on grouped data frames
#    grouped dataframes group similar values across the fields specified in .groupby(fields_group)
#

# see what a grouped dataframe looks like when you apply the same function ("sum") to all non-grouped fields
df_output.groupby([field_key_future]).aggregate("sum")

Unnamed: 0_level_0,strategy_id,metric_1,metric_2
future_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,10,-0.797609,5.328513
1,10,14.235466,5.956591
2,10,6.830065,3.881717


In [110]:
# if desired, the index can be dropped 
df_output.groupby([field_key_future]).aggregate("sum").reset_index(drop = True)


Unnamed: 0,strategy_id,metric_1,metric_2
0,10,-0.797609,5.328513
1,10,14.235466,5.956591
2,10,6.830065,3.881717


In [28]:
#
#    alternatively, specify different 
#


# set up the aggregation dictionary: this tells pandas what function to apply to grouped fields when 
dict_agg = {field_key_strategy: "first", "metric_1": "sum", "metric_2": "mean"}

# note that this gives the sum of the field "metric_1" as grouped by strategy id and the mean of "metric_2" as grouped by strategy_id
df_output[list(dict_agg.keys())].groupby([field_key_strategy]).agg(dict_agg).reset_index(drop = True)

Unnamed: 0,strategy_id,metric_1,metric_2
0,1,21.575597,0.403818
1,2,-6.930463,3.221298
2,3,3.580848,1.21861
3,4,2.04194,0.211882


In [111]:
# an aggregated dataframe 
df_output[list(dict_agg.keys())].groupby([field_key_strategy]).aggregate(dict_agg).reset_index(drop = True)


Unnamed: 0,strategy_id,metric_1,metric_2
0,1,21.575597,0.403818
1,2,-6.930463,3.221298
2,3,3.580848,1.21861
3,4,2.04194,0.211882


## Practice some different ways to aggregate and join data frames

Assignment: files located in `./data_examples/joins` were produced using a large ensemble model. There are two sets of files--data and costs--each indexed by the primary key. We need to:

- Use the file system to identify which data files are available
    - Data files have form `data_##.csv`
    - Cost files have form `costs_##.csv`
- Read the data and cost data in
- Iterate over all primary_ids
- Join them together by primary_id and year to create a single dataframe
- Write this to a file

In [16]:
all_data = [x for x in os.listdir(dir_joins) if ("data_" in x) and (".csv" in x)]
all_costs = [x for x in os.listdir(dir_joins) if ("costs_" in x) and (".csv" in x)]

primary_keys_data = [int(x.split("_")[1].split(".")[0]) for x in all_data]
primary_keys_costs = [int(x.split("_")[1].split(".")[0]) for x in all_costs]

primary_ids = set(primary_keys_data) & set(primary_keys_costs)
primary_ids = sorted(list(primary_ids))



In [None]:
# method 1: append merge

df_out = []
primary_iterator = primary_ids
for i in enumerate(primary_iterator):

    ind, i = i
    # read in data and costs
    df_data = pd.read_csv(os.path.join(dir_joins, f"data_{i}.csv"))
    df_costs = pd.read_csv(os.path.join(dir_joins, f"costs_{i}.csv"))

    df_join = pd.merge(df_data, df_costs, how = "inner")

    if len(df_out) == 0:
        header = list(df_join.columns)
    df_out.append(df_join[header])

df_out = pd.concat(df_out, axis = 0).reset_index(drop = True)
df_out.to_csv(os.path.join(dir_project, "out_tmp.csv"), index = None, encoding = "UTF-8")

In [150]:
tup = (1, 2, 3, 9)
x, y, z, b = tup
print(x)
print(y)
print(z)
print(b)

1
2
3
9


In [148]:
for i in enumerate(primary_ids[30:]):
    print(i)
    x, y = i
    print(f"x:\t{x}\ny:\t{y}")
    print("")

(0, 30)
x:	0
y:	30

(1, 31)
x:	1
y:	31

(2, 32)
x:	2
y:	32

(3, 33)
x:	3
y:	33

(4, 47)
x:	4
y:	47

(5, 48)
x:	5
y:	48

(6, 49)
x:	6
y:	49



In [151]:
# method 2: allocation merge

df_out = []
primary_iterator = primary_ids
for enum in enumerate(primary_iterator):
    ind, i = enum
    # read in data and costs
    df_data = pd.read_csv(os.path.join(dir_joins, f"data_{i}.csv"))
    df_costs = pd.read_csv(os.path.join(dir_joins, f"costs_{i}.csv"))

    df_join = pd.merge(df_data, df_costs, how = "inner")

    if len(df_out) == 0:
        df_out = [df_join for x in primary_iterator]
        header = list(df_join.columns)
    else:
        df_out[ind] = df_join[header]

df_out = pd.concat(df_out, axis = 0).reset_index(drop = True)
df_out.to_csv(os.path.join(dir_project, "out_tmp.csv"), index = None, encoding = "UTF-8")

In [135]:
df_out

Unnamed: 0,primary_id,measure,year,field_1,field_2,field_3,field_4,field_5,field_6,field_7,...,cost_16,cost_17,cost_18,cost_19,cost_20,cost_21,cost_22,cost_23,cost_24,cost_25
0,0,measure_1,2022,0.995115,0.328997,0.317190,0.513936,0.970623,0.146586,0.216258,...,0.505722,0.945395,0.932059,0.715464,0.445840,0.426677,0.585624,0.129022,0.915180,0.488896
1,0,measure_1,2027,0.220008,0.094852,0.048479,0.506765,0.672168,0.622690,0.596176,...,0.829814,0.922322,0.285770,0.337601,0.470058,0.139796,0.536592,0.726086,0.007179,0.600679
2,0,measure_1,2032,0.376586,0.442025,0.433175,0.187461,0.523007,0.512473,0.498643,...,0.267855,0.148166,0.509212,0.857140,0.839486,0.348313,0.993532,0.463831,0.415010,0.153674
3,0,measure_1,2037,0.841287,0.510223,0.696173,0.023933,0.989354,0.968695,0.870554,...,0.341096,0.185644,0.019710,0.931788,0.282361,0.046773,0.174225,0.094647,0.832900,0.863558
4,0,measure_1,2042,0.295740,0.825153,0.941475,0.522561,0.511173,0.410016,0.537463,...,0.637391,0.833589,0.085055,0.782604,0.312977,0.964890,0.291057,0.773394,0.018417,0.156551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,49,measure_30,2027,0.997323,0.329980,0.777601,0.839730,0.613855,0.227249,0.061970,...,0.377459,0.568470,0.558003,0.960115,0.351779,0.019712,0.852452,0.841003,0.398385,0.183482
8996,49,measure_30,2032,0.488467,0.163077,0.568130,0.789814,0.310816,0.476588,0.433338,...,0.019844,0.643130,0.480064,0.597215,0.251843,0.653591,0.473823,0.766119,0.404827,0.389927
8997,49,measure_30,2037,0.428221,0.067823,0.325075,0.638990,0.109130,0.008784,0.110172,...,0.269311,0.470424,0.227329,0.811294,0.923770,0.953740,0.449054,0.334014,0.635072,0.967398
8998,49,measure_30,2042,0.718034,0.895582,0.555679,0.693872,0.037384,0.231688,0.592243,...,0.754182,0.077419,0.876369,0.889060,0.383725,0.715192,0.111391,0.765037,0.766165,0.604989


In [91]:
# method 2: allocate and replace



In [23]:
# method 3: write as you go

