<a href="https://colab.research.google.com/github/kevinvbc/deeproad/blob/main/Gorilla_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Gorilla Data Engineer Assignment

In [1]:
import pandas as pd
import numpy as np

In [2]:
# get data from repo
!wget "https://raw.githubusercontent.com/kevinvbc/gorilla/master/gorilla_test_data.xlsx"
xls = pd.ExcelFile('gorilla_test_data.xlsx')
meter_list = pd.read_excel(xls, 'meter_list')
forecast_table = pd.read_excel(xls, 'forecast_table')
rate_table = pd.read_excel(xls, 'rate_table')

--2023-03-14 20:30:45--  https://raw.githubusercontent.com/kevinvbc/gorilla/master/gorilla_test_data.xlsx
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 121386 (119K) [application/octet-stream]
Saving to: ‘gorilla_test_data.xlsx.5’


2023-03-14 20:30:46 (8.18 MB/s) - ‘gorilla_test_data.xlsx.5’ saved [121386/121386]



##Exploratory Data Analysis + Preprocessing


In [3]:
# EDA
forecast_table

Unnamed: 0,meter_id,date,kwh
0,14676236,2020-06-01,22.070768
1,14676236,2020-06-02,19.170720
2,14676236,2020-06-03,23.555111
3,14676236,2020-06-04,18.220712
4,14676236,2020-06-05,14.196134
...,...,...,...
3407,88357331,2022-09-27,441.014725
3408,88357331,2022-09-28,441.512055
3409,88357331,2022-09-29,437.240657
3410,88357331,2022-09-30,517.608354


In [4]:
# aq_max_kwh column seems to have NaNs
rate_table['aq_max_kwh'].unique()
rate_table[(rate_table['aq_max_kwh'].isna())]

Unnamed: 0,date,exit_zone,aq_min_kwh,aq_max_kwh,rate_p_per_kwh
2,2020-04-01,EA1,732000,,0.2875
5,2020-04-01,EA2,732000,,0.1849
8,2020-04-01,EA3,732000,,0.2492
11,2020-04-01,EA4,732000,,0.2889
14,2020-04-01,EM1,732000,,0.2734
...,...,...,...,...,...
1127,2024-10-01,WA1,732000,,0.8735
1130,2024-10-01,WA2,732000,,0.5638
1133,2024-10-01,WM1,732000,,0.6461
1136,2024-10-01,WM2,732000,,0.7534


In [9]:
# fill NaNs with very large value for aq_max_kwh 
#max_day_forecast = forecast_table['kwh'].max() + 100
large_value = 5000000
rate_table['aq_max_kwh'] = rate_table['aq_max_kwh'].fillna(large_value)

# aq_max_kwh can be an integer, just like aq_min_kwh
rate_table['aq_max_kwh'] = rate_table['aq_max_kwh'].astype('int64')
rate_table

Unnamed: 0,date,exit_zone,aq_min_kwh,aq_max_kwh,rate_p_per_kwh
0,2020-04-01,EA1,0,73200,0.2652
1,2020-04-01,EA1,73200,732000,0.1980
2,2020-04-01,EA1,732000,5000000,0.2875
3,2020-04-01,EA2,0,73200,0.2970
4,2020-04-01,EA2,73200,732000,0.1524
...,...,...,...,...,...
1135,2024-10-01,WM2,73200,732000,0.4537
1136,2024-10-01,WM2,732000,5000000,0.7534
1137,2024-10-01,WM3,0,73200,0.7263
1138,2024-10-01,WM3,73200,732000,0.6109


In [10]:
# join forecast and meter table to access exit_zone information
# set index and using .join() is optimal way of joining pandas dataframes
forecast_per_zone = forecast_table.set_index('meter_id').join(meter_list.set_index('meter_id'), on = "meter_id", how = "left")
forecast_per_zone



Unnamed: 0_level_0,date,kwh,aq_kwh,exit_zone
meter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14676236,2020-06-01,22.070768,28978,EA1
14676236,2020-06-02,19.170720,28978,EA1
14676236,2020-06-03,23.555111,28978,EA1
14676236,2020-06-04,18.220712,28978,EA1
14676236,2020-06-05,14.196134,28978,EA1
...,...,...,...,...
88357331,2022-09-27,441.014725,484399,SE2
88357331,2022-09-28,441.512055,484399,SE2
88357331,2022-09-29,437.240657,484399,SE2
88357331,2022-09-30,517.608354,484399,SE2


Unnamed: 0,date,exit_zone,aq_min_kwh,aq_max_kwh,rate_p_per_kwh
0,2020-04-01,EA1,0,73200.000000,0.2652
1,2020-04-01,EA1,73200,732000.000000,0.1980
2,2020-04-01,EA1,732000,1269.400411,0.2875
3,2020-04-01,EA2,0,73200.000000,0.2970
4,2020-04-01,EA2,73200,732000.000000,0.1524
...,...,...,...,...,...
1135,2024-10-01,WM2,73200,732000.000000,0.4537
1136,2024-10-01,WM2,732000,1269.400411,0.7534
1137,2024-10-01,WM3,0,73200.000000,0.7263
1138,2024-10-01,WM3,73200,732000.000000,0.6109


In [25]:
def compare_annual_quantity(row):
    # Get the relevant rows from rate_table
    mask = (rate_table['date'] == row['date']) & (rate_table['exit_zone'] == row['exit_zone']) & (rate_table['aq_min_kwh'] <= row['aq_kwh']) & (rate_table['aq_max_kwh'] > row['aq_kwh'])
    relevant_rows = rate_table.loc[mask]

    # Return the 'other_field' values
    if len(relevant_rows) == 0:
        return None
    else:
        #return relevant_rows['rate_p_per_kwh'].values.tolist()
        return relevant_rows['rate_p_per_kwh']
        #return relevant_rows['rate_p_per_kwh']#.to_numpy()

# Apply the function to each row of df1
forecast_per_zone['relevant_rate_p_per_kwh'] = forecast_per_zone.apply(compare_annual_quantity, axis=1).astype('float')

In [26]:
forecast_per_zone['relevant_rate_p_per_kwh'].dtype

dtype('float64')

In [30]:
forecast_per_zone['relevant_rate_p_per_kwh'].ffil(axis=0)

AttributeError: ignored

In [1]:
forecast_per_zone

NameError: ignored

In [37]:
forecast_per_zone

Unnamed: 0_level_0,date,kwh,aq_kwh,exit_zone,relevant_rate
meter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14676236,2020-06-01,22.070768,28978,EA1,
14676236,2020-06-02,19.170720,28978,EA1,
14676236,2020-06-03,23.555111,28978,EA1,
14676236,2020-06-04,18.220712,28978,EA1,
14676236,2020-06-05,14.196134,28978,EA1,
...,...,...,...,...,...
88357331,2022-09-27,441.014725,484399,SE2,
88357331,2022-09-28,441.512055,484399,SE2,
88357331,2022-09-29,437.240657,484399,SE2,
88357331,2022-09-30,517.608354,484399,SE2,


In [27]:
rate_table['date'] = rate_table['date'].astype('datetime64[ns]')
rate_table['aq_max_kwh'] = rate_table['aq_max_kwh'].astype('int64')
forecast_table['date'] = forecast_table['date'].astype('datetime64[ns]')
rate_table['exit_zone'].astype('string')

IntCastingNaNError: ignored

In [25]:
daily_charge_table = forecast_table.set_index('date').join(rate_table.set_index('date'), how = "left")



In [26]:
daily_charge_table

Unnamed: 0_level_0,meter_id,kwh,exit_zone,aq_min_kwh,aq_max_kwh,rate_p_per_kwh
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01,,,EA1,0,73200.0,0.2652
2020-04-01,,,EA1,73200,732000.0,0.1980
2020-04-01,,,EA1,732000,,0.2875
2020-04-01,,,EA2,0,73200.0,0.2970
2020-04-01,,,EA2,73200,732000.0,0.1524
...,...,...,...,...,...,...
2024-10-01,,,WM2,73200,732000.0,0.4537
2024-10-01,,,WM2,732000,,0.7534
2024-10-01,,,WM3,0,73200.0,0.7263
2024-10-01,,,WM3,73200,732000.0,0.6109


In [12]:
forecast_table.dtypes

meter_id             int64
date        datetime64[ns]
kwh                float64
dtype: object

In [17]:
rate_table.dtypes

date              datetime64[ns]
exit_zone                 object
aq_min_kwh                 int64
aq_max_kwh               float64
rate_p_per_kwh           float64
dtype: object

In [None]:
# ... Vectorized operation:
df["ratio"] = 100 * (df["x"] / df["y"])

# ... Non-vectorized operation:
def calc_ratio(row):
    return 100 * (row["x"] / row["y"])

In [None]:
meter_list