In [1]:
from pathlib import Path
from gcp_interaction import read_blob_to_pandas, list_buckets, list_blobs
import json
import numpy as np

In [2]:
with open("excess-energy-prediction-393ec78547e4.json", "r") as f:
    gcp_login_info = json.load(f)
with open("gcp_info.json", "r") as f:
    bucket_name = json.load(f)["bucket_name"]

In [3]:
buckets = list_buckets(gcp_login_info["project_id"])
buckets

['excess-energy-raw-data']

In [4]:
blobs = list_blobs(project_id=gcp_login_info["project_id"], bucket_name=buckets[0])
blobs[0]

'Raw_Data/EirGrid/ALL/demandActual/2017/demandActual_ALL_2017_Apr.csv'

In [5]:
folders = []
for blob in blobs:
    blob = Path(blob)
    for idx, i in enumerate(blob.parts):
        if Path(i).suffix != ".csv":
            folders.append(str("-" * idx + i))  # "-" gives an idea of folder level

In [6]:
list(dict.fromkeys(folders))  # removing duplicates whilst maintatining order

['Raw_Data',
 '-EirGrid',
 '--ALL',
 '---demandActual',
 '----2017',
 '----2018',
 '----2019',
 '----2020',
 '---generationActual',
 '---windActual',
 '--NI',
 '--ROI',
 '-met',
 '--BELMULLET',
 '---Data_Licence.pdf',
 '---Data_Licence.txt',
 '---KeyHourly.txt',
 '--CORK AIRPORT',
 '--DUBLIN AIRPORT',
 '--SHANNON AIRPORT']

This gives us a rough idea of the folder structure. I know that everything under -EirGrid -- ALL is repeated for --NI and --ROI. And everything under the Belmullet folder is repeated in Cork Dublin, and Shannon. 

# EIRGrid

## Demand

In [15]:
eir_grid_df = read_blob_to_pandas(bucket_name, blobs[0])

In [16]:
eir_grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            2880 non-null   int64 
 1   DATE & TIME           2880 non-null   object
 2    ACTUAL DEMAND(MW)    2880 non-null   int64 
 3    FORECAST DEMAND(MW)  2880 non-null   object
 4    REGION               2880 non-null   object
dtypes: int64(2), object(3)
memory usage: 112.6+ KB


Although it looks like there is no missing data, the 'FORECAST DEMAND(MW)' column is actually empty. Will need to replace these with NA's during cleaning, and transform the 'DATE & TIME' to proper datetime objects. 

In [17]:
eir_grid_df.head()

Unnamed: 0.1,Unnamed: 0,DATE & TIME,ACTUAL DEMAND(MW),FORECAST DEMAND(MW),REGION
0,0,1 April 2017 00:00,3575,-,All Island
1,1,1 April 2017 00:15,3568,-,All Island
2,2,1 April 2017 00:30,3508,-,All Island
3,3,1 April 2017 00:45,3430,-,All Island
4,4,1 April 2017 01:00,3389,-,All Island


In [18]:
eir_grid_df.columns

Index(['Unnamed: 0', 'DATE & TIME', ' ACTUAL DEMAND(MW)',
       ' FORECAST DEMAND(MW)', ' REGION'],
      dtype='object')

Note the whitespace at the start of the columns, this should be fixed during cleaning.

In [19]:
eir_grid_df.describe()[" ACTUAL DEMAND(MW)"]

count    2880.000000
mean     3978.523611
std       680.850631
min      2655.000000
25%      3312.250000
50%      4067.000000
75%      4633.250000
max      5169.000000
Name:  ACTUAL DEMAND(MW), dtype: float64

In [20]:
eir_grid_df.nunique()

Unnamed: 0              2880
DATE & TIME             2880
 ACTUAL DEMAND(MW)      1558
 FORECAST DEMAND(MW)       1
 REGION                    1
dtype: int64

Looks like it's only for one region (all of Ireland) as expected. 

In [21]:
print(eir_grid_df["DATE & TIME"].min())
print(eir_grid_df["DATE & TIME"].max())

1 April 2017 00:00
9 April 2017 23:45


Looks like we have data for every 15 mins ranging from 1 April 2017 to 9 April 2017. 
Let's compare the later dataframes in this folder to check the structure looks the same. 

In [22]:
eir_grid_df = read_blob_to_pandas(bucket_name, blobs[30])

In [23]:
eir_grid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            2880 non-null   int64 
 1   DATE & TIME           2880 non-null   object
 2    ACTUAL DEMAND(MW)    2880 non-null   object
 3    FORECAST DEMAND(MW)  2880 non-null   object
 4    REGION               2880 non-null   object
dtypes: int64(1), object(4)
memory usage: 112.6+ KB


Looks like Demand here is an object not an int, something to convert during cleaning. This is due to the presence of missing values denoted "-". Lets see how many values we're missing. 

In [24]:
eir_grid_df = eir_grid_df.replace({"-": np.nan})
eir_grid_df.isna().sum()

Unnamed: 0                 0
DATE & TIME                0
 ACTUAL DEMAND(MW)         1
 FORECAST DEMAND(MW)    2880
 REGION                    0
dtype: int64

Only a single value of actual demand is missing, not a big problem. 

In [25]:
eir_grid_df.head()

Unnamed: 0.1,Unnamed: 0,DATE & TIME,ACTUAL DEMAND(MW),FORECAST DEMAND(MW),REGION
0,0,1 June 2019 00:00,3466,,All Island
1,1,1 June 2019 00:15,3407,,All Island
2,2,1 June 2019 00:30,3323,,All Island
3,3,1 June 2019 00:45,3259,,All Island
4,4,1 June 2019 01:00,3197,,All Island


In [26]:
eir_grid_df.columns

Index(['Unnamed: 0', 'DATE & TIME', ' ACTUAL DEMAND(MW)',
       ' FORECAST DEMAND(MW)', ' REGION'],
      dtype='object')

The rest of this looks the same as the original dataframe, enough so to assume they are the same structure. 

## Generation

In [27]:
# getting first generationActual blob name
generation_actual_blob_name = [x for x in blobs if "generationActual" in x][0]
generation_actual_df = read_blob_to_pandas(bucket_name, generation_actual_blob_name)

In [28]:
generation_actual_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              2880 non-null   int64 
 1   DATE & TIME             2880 non-null   object
 2    ACTUAL GENERATION(MW)  2880 non-null   int64 
 3    REGION                 2880 non-null   object
dtypes: int64(2), object(2)
memory usage: 90.1+ KB


In [29]:
generation_actual_df.head()

Unnamed: 0.1,Unnamed: 0,DATE & TIME,ACTUAL GENERATION(MW),REGION
0,0,1 April 2017 00:00,3818,All Island
1,1,1 April 2017 00:15,3823,All Island
2,2,1 April 2017 00:30,3763,All Island
3,3,1 April 2017 00:45,3746,All Island
4,4,1 April 2017 01:00,3635,All Island


In [30]:
generation_actual_df.isna().sum()

Unnamed: 0                0
DATE & TIME               0
 ACTUAL GENERATION(MW)    0
 REGION                   0
dtype: int64

## Actual

In [31]:
# getting first generationActual blob name
wind_actual_blob_name = [x for x in blobs if "windActual" in x][0]
wind_actual_df = read_blob_to_pandas(bucket_name, wind_actual_blob_name)

In [32]:
wind_actual_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2880 entries, 0 to 2879
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          2880 non-null   int64 
 1   DATE & TIME         2880 non-null   object
 2    FORECAST WIND(MW)  2880 non-null   int64 
 3     ACTUAL WIND(MW)   2880 non-null   int64 
 4    REGION             2880 non-null   object
dtypes: int64(3), object(2)
memory usage: 112.6+ KB


In [33]:
wind_actual_df.head()

Unnamed: 0.1,Unnamed: 0,DATE & TIME,FORECAST WIND(MW),ACTUAL WIND(MW),REGION
0,0,1 April 2017 00:00,232,121,All Island
1,1,1 April 2017 00:15,249,112,All Island
2,2,1 April 2017 00:30,266,106,All Island
3,3,1 April 2017 00:45,283,94,All Island
4,4,1 April 2017 01:00,204,140,All Island


In [34]:
wind_actual_df.isna().sum()

Unnamed: 0            0
DATE & TIME           0
 FORECAST WIND(MW)    0
  ACTUAL WIND(MW)     0
 REGION               0
dtype: int64

# Met 

## Belmullet

The second half of our data includes weather info gathered from various airports around Ireland. 

In [28]:
belmullet_dir = r"Raw_Data/met/BELMULLET/belmullet.csv"
belmullet_df = read_blob_to_pandas(
    bucket_name, belmullet_dir, header=20, low_memory=False
)
belmullet_key_df = read_blob_to_pandas(
    bucket_name, belmullet_dir, sep="delimiter", header=None
)

  return pd.read_csv(StringIO(data_str), **kwargs)


The first 20 rows of the df contain info on the column headers and general weather station info. 

In [29]:
belmullet_info = belmullet_key_df[:20][0].to_list()
del belmullet_key_df
belmullet_info

['Station Name: BELMULLET',
 'Station Height: 9 M',
 'Latitude:54.228  ,Longitude: -10.007',
 'date:  -  Date and Time (utc)',
 'rain:  -  Precipitation Amount (mm)',
 'temp:  -  Air Temperature (C)',
 'wetb:  -  Wet Bulb Temperature (C)',
 'dewpt: -  Dew Point Temperature (C)',
 'rhum:  -  Relative Humidity (%)',
 'vappr: -  Vapour Pressure (hPa)',
 'msl:   -  Mean Sea Level Pressure (hPa)',
 'wdsp:  -  Mean Wind Speed (knot)',
 'wddir: -  Predominant Wind Direction (degree)',
 'ww:    -  Synop code for Present Weather',
 'w:     -  Synop code for Past Weather',
 'sun:   -  Sunshine duration (hours)',
 'vis:   -  Visibility (m)',
 "clht:  -  Cloud height (100's of ft) - 999 if none",
 'clamt: -  Cloud amount',
 'ind:   -  Indicator']

We aren't interested in some of these features and can drop them when we're cleaning. 

In [30]:
belmullet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581842 entries, 0 to 581841
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    581842 non-null  object 
 1   ind     581842 non-null  int64  
 2   rain    581842 non-null  float64
 3   ind.1   581842 non-null  int64  
 4   temp    581842 non-null  float64
 5   ind.2   581842 non-null  int64  
 6   wetb    581842 non-null  float64
 7   dewpt   581842 non-null  float64
 8   vappr   581842 non-null  float64
 9   rhum    581842 non-null  int64  
 10  msl     581842 non-null  float64
 11  ind.3   581842 non-null  int64  
 12  wdsp    581842 non-null  object 
 13  ind.4   581842 non-null  int64  
 14  wddir   581842 non-null  object 
 15  ww      581842 non-null  object 
 16  w       581842 non-null  object 
 17  sun     581842 non-null  object 
 18  vis     581842 non-null  object 
 19  clht    581842 non-null  object 
 20  clamt   581842 non-null  object 
dtypes: float64

In [21]:
belmullet_df.head()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,...,ind.3,wdsp,ind.4,wddir,ww,w,sun,vis,clht,clamt
0,16-sep-1956 15:00,0,0.0,0,14.4,0,12.1,10.0,12.4,76,...,1,0,1,0,1,2,0.1,16000,200,5
1,16-sep-1956 16:00,0,0.0,0,13.9,0,11.6,9.4,11.9,75,...,1,0,1,0,1,1,0.1,24000,999,4
2,16-sep-1956 17:00,0,0.0,0,14.2,0,12.0,10.0,12.3,76,...,1,0,1,0,3,1,0.4,19000,90,5
3,16-sep-1956 18:00,0,0.0,0,13.0,0,11.6,10.0,12.6,85,...,1,0,1,0,1,1,0.0,30000,45,4
4,16-sep-1956 19:00,0,0.0,0,12.6,0,11.7,11.1,13.1,90,...,1,0,1,0,3,2,0.0,28000,40,7


In [31]:
print(belmullet_df["date"].min())
print(belmullet_df["date"].max())

01-apr-1957 00:00
31-oct-2022 23:00


We have hourly data from 1957 to 2022. 

## Dublin

In [33]:
dublin_dir = r"Raw_Data/met/DUBLIN AIRPORT/dublin.csv"
dublin_df = read_blob_to_pandas(bucket_name, dublin_dir, header=20, low_memory=False)
dublin_key_df = read_blob_to_pandas(
    bucket_name, dublin_dir, sep="delimiter", header=None
)

  return pd.read_csv(StringIO(data_str), **kwargs)


In [36]:
dublin_info = dublin_key_df[:20][0].to_list()
del dublin_key_df
dublin_info

['Station Name: DUBLIN AIRPORT',
 'Station Height: 71 M',
 'Latitude:53.428  ,Longitude: -6.241',
 'date:  -  Date and Time (utc)',
 'rain:  -  Precipitation Amount (mm)',
 'temp:  -  Air Temperature (C)',
 'wetb:  -  Wet Bulb Temperature (C)',
 'dewpt: -  Dew Point Temperature (C)',
 'rhum:  -  Relative Humidity (%)',
 'vappr: -  Vapour Pressure (hPa)',
 'msl:   -  Mean Sea Level Pressure (hPa)',
 'wdsp:  -  Mean Wind Speed (knot)',
 'wddir: -  Predominant Wind Direction (degree)',
 'ww:    -  Synop code for Present Weather',
 'w:     -  Synop code for Past Weather',
 'sun:   -  Sunshine duration (hours)',
 'vis:   -  Visibility (m)',
 "clht:  -  Cloud height (100's of ft) - 999 if none",
 'clamt: -  Cloud amount',
 'ind:   -  Indicator']

In [37]:
dublin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701936 entries, 0 to 701935
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    701936 non-null  object 
 1   ind     701936 non-null  int64  
 2   rain    701936 non-null  float64
 3   ind.1   701936 non-null  int64  
 4   temp    701936 non-null  float64
 5   ind.2   701936 non-null  int64  
 6   wetb    701936 non-null  float64
 7   dewpt   701936 non-null  float64
 8   vappr   701936 non-null  object 
 9   rhum    701936 non-null  object 
 10  msl     701936 non-null  float64
 11  ind.3   701936 non-null  int64  
 12  wdsp    701936 non-null  int64  
 13  ind.4   701936 non-null  int64  
 14  wddir   701936 non-null  object 
 15  ww      701936 non-null  int64  
 16  w       701936 non-null  int64  
 17  sun     701936 non-null  float64
 18  vis     701936 non-null  object 
 19  clht    701936 non-null  object 
 20  clamt   701936 non-null  object 
dtypes: float64

In [38]:
dublin_df.head()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,...,ind.3,wdsp,ind.4,wddir,ww,w,sun,vis,clht,clamt
0,01-jan-1943 00:00,0,0.4,0,7.2,0,6.8,6.1,9.5,93,...,1,13,1,240,61,6,0.0,10000,9,8
1,01-jan-1943 01:00,0,0.7,0,7.8,0,7.6,7.2,10.2,96,...,1,19,1,240,61,6,0.0,10000,8,8
2,01-jan-1943 02:00,0,0.5,0,8.7,0,8.3,7.7,10.7,95,...,1,24,1,250,51,6,0.0,7000,7,8
3,01-jan-1943 03:00,2,0.0,0,9.1,0,8.7,8.3,11.0,95,...,1,24,1,270,50,6,0.0,10000,9,7
4,01-jan-1943 04:00,2,0.0,0,9.4,0,8.8,8.3,10.9,93,...,1,24,1,270,50,5,0.0,10000,8,8


In [39]:
print(dublin_df["date"].min())
print(dublin_df["date"].max())

01-apr-1943 00:00
31-oct-2022 23:00
