Going to start with cleaning the weather airport data first, because it's simpler than the energy data. 

In [45]:
from pathlib import Path
from gcp_interaction import read_blob_to_pandas, list_buckets, list_blobs
import json 
import numpy as np
import pandas as pd

In [46]:
with open("excess-energy-prediction-393ec78547e4.json", "r") as f:
    gcp_login_info = json.load(f)
with open("gcp_info.json", "r") as f:
    bucket_name = json.load(f)['bucket_name']

In [47]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [48]:
def clean_weather_df(df:pd.DataFrame) -> pd.DataFrame:
    MIN_DATE = "2017-01-01 00:00:00"
    df = reduce_mem_usage(df)
    df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True, errors='ignore')
    # fix space value
    df['wdsp'] = df['wdsp'].replace(to_replace= ' ', value=0)
    df['wdsp'] = df['wdsp'].astype('int32')
    # Reduce timeframe
    df = df[(df['date'] >= MIN_DATE)]
    return df

# Belmullet

In [49]:
belmullet_dir = r'Raw_Data/met/BELMULLET/belmullet.csv'
belmullet_df = read_blob_to_pandas(bucket_name, belmullet_dir, header=20,low_memory=False)
belmullet_df = clean_weather_df(belmullet_df)

Memory usage of dataframe is 93.22 MB
Memory usage after optimization is: 37.23 MB
Decreased by 60.1%


In [50]:
belmullet_df.head()

Unnamed: 0,date,ind,rain,ind.1,temp,ind.2,wetb,dewpt,vappr,rhum,...,ind.3,wdsp,ind.4,wddir,ww,w,sun,vis,clht,clamt
528513,2017-01-01 00:00:00,0,0.0,0,5.199219,0,3.900391,2.0,7.101562,79,...,2,13,2,340,,,,,,
528514,2017-01-01 01:00:00,0,0.5,0,4.699219,0,3.400391,1.400391,6.800781,78,...,2,15,2,350,,,,,,
528515,2017-01-01 02:00:00,0,0.0,0,5.699219,0,3.800781,0.700195,6.398438,70,...,2,16,2,360,,,,,,
528516,2017-01-01 03:00:00,0,0.399902,0,5.601562,0,3.300781,-0.600098,5.898438,64,...,2,19,2,360,,,,,,
528517,2017-01-01 04:00:00,0,0.600098,0,4.699219,0,3.099609,0.5,6.300781,74,...,2,20,2,10,,,,,,


In [51]:
belmullet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53329 entries, 528513 to 581841
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    53329 non-null  datetime64[ns]
 1   ind     53329 non-null  int8          
 2   rain    53329 non-null  float16       
 3   ind.1   53329 non-null  int8          
 4   temp    53329 non-null  float16       
 5   ind.2   53329 non-null  int8          
 6   wetb    53329 non-null  float16       
 7   dewpt   53329 non-null  float16       
 8   vappr   53329 non-null  float16       
 9   rhum    53329 non-null  int8          
 10  msl     53329 non-null  float16       
 11  ind.3   53329 non-null  int8          
 12  wdsp    53329 non-null  int32         
 13  ind.4   53329 non-null  int8          
 14  wddir   53329 non-null  category      
 15  ww      53329 non-null  category      
 16  w       53329 non-null  category      
 17  sun     53329 non-null  category      
 18  

# Dublin

In [52]:
dublin_dir = r"Raw_Data/met/DUBLIN AIRPORT/dublin.csv"
dublin_df = read_blob_to_pandas(bucket_name, dublin_dir, header=20,low_memory=False)
dublin_df = clean_weather_df(dublin_df)

Memory usage of dataframe is 112.46 MB
Memory usage after optimization is: 42.25 MB
Decreased by 62.4%


In [53]:
dublin_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53240 entries, 648696 to 701935
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    53240 non-null  datetime64[ns]
 1   ind     53240 non-null  int8          
 2   rain    53240 non-null  float16       
 3   ind.1   53240 non-null  int8          
 4   temp    53240 non-null  float16       
 5   ind.2   53240 non-null  int8          
 6   wetb    53240 non-null  float16       
 7   dewpt   53240 non-null  float16       
 8   vappr   53240 non-null  category      
 9   rhum    53240 non-null  category      
 10  msl     53240 non-null  float16       
 11  ind.3   53240 non-null  int8          
 12  wdsp    53240 non-null  int32         
 13  ind.4   53240 non-null  int8          
 14  wddir   53240 non-null  category      
 15  ww      53240 non-null  int8          
 16  w       53240 non-null  int8          
 17  sun     53240 non-null  float16       
 18  

# Shannon

In [54]:
shannon_dir = r"Raw_Data/met/SHANNON AIRPORT/shannon.csv"
shannon_df = read_blob_to_pandas(bucket_name, shannon_dir, header=20,low_memory=False)
shannon_df = clean_weather_df(shannon_df)

Memory usage of dataframe is 108.72 MB
Memory usage after optimization is: 41.39 MB
Decreased by 61.9%


In [55]:
shannon_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53240 entries, 625319 to 678558
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    53240 non-null  datetime64[ns]
 1   ind     53240 non-null  int8          
 2   rain    53240 non-null  float16       
 3   ind.1   53240 non-null  int8          
 4   temp    53240 non-null  float16       
 5   ind.2   53240 non-null  int8          
 6   wetb    53240 non-null  category      
 7   dewpt   53240 non-null  float16       
 8   vappr   53240 non-null  float16       
 9   rhum    53240 non-null  int8          
 10  msl     53240 non-null  float16       
 11  ind.3   53240 non-null  int8          
 12  wdsp    53240 non-null  int32         
 13  ind.4   53240 non-null  int8          
 14  wddir   53240 non-null  category      
 15  ww      53240 non-null  category      
 16  w       53240 non-null  category      
 17  sun     53240 non-null  float16       
 18  

# Cork

In [56]:
cork_dir = r"Raw_Data/met/CORK AIRPORT/cork.csv"
cork_df = read_blob_to_pandas(bucket_name, cork_dir, header=20,low_memory=False)
cork_df = clean_weather_df(cork_df)

Memory usage of dataframe is 85.78 MB
Memory usage after optimization is: 36.07 MB
Decreased by 57.9%


In [57]:
cork_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53240 entries, 482135 to 535374
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    53240 non-null  datetime64[ns]
 1   ind     53240 non-null  int8          
 2   rain    53240 non-null  category      
 3   ind.1   53240 non-null  int8          
 4   temp    53240 non-null  float16       
 5   ind.2   53240 non-null  int8          
 6   wetb    53240 non-null  category      
 7   dewpt   53240 non-null  float16       
 8   vappr   53240 non-null  category      
 9   rhum    53240 non-null  category      
 10  msl     53240 non-null  float16       
 11  ind.3   53240 non-null  int8          
 12  wdsp    53240 non-null  int32         
 13  ind.4   53240 non-null  int8          
 14  wddir   53240 non-null  int16         
 15  ww      53240 non-null  int8          
 16  w       53240 non-null  int8          
 17  sun     53240 non-null  category      
 18  

# Combining

In [58]:
belmullet_df.set_index(['date']);
dublin_df.set_index(['date']);
shannon_df.set_index(['date']);
cork_df.set_index(['date']);

In [72]:
left = belmullet_df.copy()
right = dublin_df.copy()

joined_df = left.join(right, lsuffix="_BEL", rsuffix="_DUB")

right = shannon_df.copy()
joined_df = joined_df.join(right, lsuffix="", rsuffix="_SHA")

right = cork_df.copy()
joined_df = joined_df.join(right, lsuffix="", rsuffix="_COR")

joined_df = joined_df.reset_index()

In [73]:
joined_df.shape

(53329, 85)

In [74]:
joined_df.head()

Unnamed: 0,index,date_BEL,ind_BEL,rain_BEL,ind.1_BEL,temp_BEL,ind.2_BEL,wetb_BEL,dewpt_BEL,vappr_BEL,...,ind.3_COR,wdsp_COR,ind.4_COR,wddir_COR,ww_COR,w_COR,sun_COR,vis_COR,clht_COR,clamt_COR
0,528513,2017-01-01 00:00:00,0,0.0,0,5.199219,0,3.900391,2.0,7.101562,...,2.0,7.0,2.0,200.0,61.0,66.0,0.0,20000,50,7
1,528514,2017-01-01 01:00:00,0,0.5,0,4.699219,0,3.400391,1.400391,6.800781,...,2.0,7.0,2.0,160.0,61.0,66.0,0.0,18000,4,7
2,528515,2017-01-01 02:00:00,0,0.0,0,5.699219,0,3.800781,0.700195,6.398438,...,2.0,11.0,2.0,170.0,21.0,62.0,0.0,30000,6,7
3,528516,2017-01-01 03:00:00,0,0.399902,0,5.601562,0,3.300781,-0.600098,5.898438,...,2.0,10.0,2.0,180.0,60.0,62.0,0.1,30000,7,7
4,528517,2017-01-01 04:00:00,0,0.600098,0,4.699219,0,3.099609,0.5,6.300781,...,2.0,12.0,2.0,190.0,21.0,62.0,0.2,30000,9,7


In [75]:
joined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53329 entries, 0 to 53328
Data columns (total 85 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   index      53329 non-null  int64         
 1   date_BEL   53329 non-null  datetime64[ns]
 2   ind_BEL    53329 non-null  int8          
 3   rain_BEL   53329 non-null  float16       
 4   ind.1_BEL  53329 non-null  int8          
 5   temp_BEL   53329 non-null  float16       
 6   ind.2_BEL  53329 non-null  int8          
 7   wetb_BEL   53329 non-null  float16       
 8   dewpt_BEL  53329 non-null  float16       
 9   vappr_BEL  53329 non-null  float16       
 10  rhum_BEL   53329 non-null  int8          
 11  msl_BEL    53329 non-null  float16       
 12  ind.3_BEL  53329 non-null  int8          
 13  wdsp_BEL   53329 non-null  int32         
 14  ind.4_BEL  53329 non-null  int8          
 15  wddir_BEL  53329 non-null  category      
 16  ww_BEL     53329 non-null  category     