In [60]:
from pathlib import Path
from gcp_interaction import read_blob_to_pandas, list_buckets, list_blobs
import json 
import numpy as np

In [30]:
with open("excess-energy-prediction-393ec78547e4.json", "r") as f:
    gcp_login_info = json.load(f)
with open("gcp_info.json", "r") as f:
    bucket_name = json.load(f)['bucket_name']

In [3]:
buckets = list_buckets(gcp_login_info['project_id'])
buckets

Buckets:


['excess-energy-raw-data']

In [7]:
blobs = list_blobs(project_id = gcp_login_info['project_id'], bucket_name = buckets[0])
blobs[0]

Blobs:


'Raw_Data/EirGrid/ALL/demandActual/2017/demandActual_ALL_2017_Apr.csv'

In [24]:
folders = []
for blob in blobs:
    blob = Path(blob)
    for idx, i in enumerate(blob.parts):
        if Path(i).suffix != '.csv':
            folders.append(str("-"*idx + i)) #"-" gives an idea of folder level 

In [27]:
list(dict.fromkeys(folders)) #removing duplicates whilst maintatining order

['Raw_Data',
 '-EirGrid',
 '--ALL',
 '---demandActual',
 '----2017',
 '----2018',
 '----2019',
 '----2020',
 '---generationActual',
 '---windActual',
 '--NI',
 '--ROI',
 '-met',
 '--BELMULLET',
 '---Data_Licence.pdf',
 '---Data_Licence.txt',
 '---KeyHourly.txt',
 '--CORK AIRPORT',
 '--DUBLIN AIRPORT',
 '--SHANNON AIRPORT']

This gives us a rough idea of the folder structure. I know that everything under -EirGrid -- ALL is repeated for --NI and --ROI. And everything under the Belmullet folder is repeated in Cork Dublin, and Shannon. 

# EIRGrid

## Demand

In [32]:
eir_grid_df = read_blob_to_pandas(bucket_name, blobs[0])

In [34]:
eir_grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            2880 non-null   int64 
 1   DATE & TIME           2880 non-null   object
 2    ACTUAL DEMAND(MW)    2880 non-null   int64 
 3    FORECAST DEMAND(MW)  2880 non-null   object
 4    REGION               2880 non-null   object
dtypes: int64(2), object(3)
memory usage: 112.6+ KB


Although it looks like there is no missing data, the 'FORECAST DEMAND(MW)' column is actually empty. Will need to replace these with NA's during cleaning, and transform the 'DATE & TIME' to proper datetime objects. 

In [35]:
eir_grid_df.head()

Unnamed: 0.1,Unnamed: 0,DATE & TIME,ACTUAL DEMAND(MW),FORECAST DEMAND(MW),REGION
0,0,1 April 2017 00:00,3575,-,All Island
1,1,1 April 2017 00:15,3568,-,All Island
2,2,1 April 2017 00:30,3508,-,All Island
3,3,1 April 2017 00:45,3430,-,All Island
4,4,1 April 2017 01:00,3389,-,All Island


In [42]:
eir_grid_df.columns

Index(['Unnamed: 0', 'DATE & TIME', ' ACTUAL DEMAND(MW)',
       ' FORECAST DEMAND(MW)', ' REGION'],
      dtype='object')

Note the whitespace at the start of the columns, this should be fixed during cleaning.

In [43]:
eir_grid_df.describe()[' ACTUAL DEMAND(MW)']

count    2880.000000
mean     3978.523611
std       680.850631
min      2655.000000
25%      3312.250000
50%      4067.000000
75%      4633.250000
max      5169.000000
Name:  ACTUAL DEMAND(MW), dtype: float64

In [46]:
eir_grid_df.nunique()

Unnamed: 0              2880
DATE & TIME             2880
 ACTUAL DEMAND(MW)      1558
 FORECAST DEMAND(MW)       1
 REGION                    1
dtype: int64

Looks like it's only for one region (all of Ireland) as expected. 

In [47]:
print(eir_grid_df['DATE & TIME'].min())
print(eir_grid_df['DATE & TIME'].max())

1 April 2017 00:00
9 April 2017 23:45


Looks like we have data for every 15 mins ranging from 1 April 2017 to 9 April 2017. 
Let's compare the later dataframes in this folder to check the structure looks the same. 

In [54]:
eir_grid_df = read_blob_to_pandas(bucket_name, blobs[30])

In [55]:
eir_grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            2880 non-null   int64 
 1   DATE & TIME           2880 non-null   object
 2    ACTUAL DEMAND(MW)    2880 non-null   object
 3    FORECAST DEMAND(MW)  2880 non-null   object
 4    REGION               2880 non-null   object
dtypes: int64(1), object(4)
memory usage: 112.6+ KB


Looks like Demand here is an object not an int, something to convert during cleaning. This is due to the presence of missing values denoted "-". Lets see how many values we're missing. 

In [63]:
eir_grid_df = eir_grid_df.replace({"-":np.nan})
eir_grid_df.isna().sum()

Unnamed: 0                 0
DATE & TIME                0
 ACTUAL DEMAND(MW)         1
 FORECAST DEMAND(MW)    2880
 REGION                    0
dtype: int64

Only a single value of actual demand is missing, not a big problem. 

In [62]:
eir_grid_df.head()

Unnamed: 0.1,Unnamed: 0,DATE & TIME,ACTUAL DEMAND(MW),FORECAST DEMAND(MW),REGION
0,0,1 June 2019 00:00,3466,,All Island
1,1,1 June 2019 00:15,3407,,All Island
2,2,1 June 2019 00:30,3323,,All Island
3,3,1 June 2019 00:45,3259,,All Island
4,4,1 June 2019 01:00,3197,,All Island


In [58]:
eir_grid_df.columns

Index(['Unnamed: 0', 'DATE & TIME', ' ACTUAL DEMAND(MW)',
       ' FORECAST DEMAND(MW)', ' REGION'],
      dtype='object')

The rest of this looks the same as the original dataframe, enough so to assume they are the same structure. 

## Generation

In [67]:
#getting first generationActual blob name
generation_actual_blob_name = [x for x in blobs if 'generationActual' in x][0] 
generation_actual_df = read_blob_to_pandas(bucket_name, generation_actual_blob_name)

In [69]:
generation_actual_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              2880 non-null   int64 
 1   DATE & TIME             2880 non-null   object
 2    ACTUAL GENERATION(MW)  2880 non-null   int64 
 3    REGION                 2880 non-null   object
dtypes: int64(2), object(2)
memory usage: 90.1+ KB
