In [None]:
import pandas as pd
import json
import datetime
import time
import os
import numpy as np
import lightgbm as lgb
from config import DATA_WEATHER_RAW_MAIN_FILE, DATA_WEATHER_RAW_AUXILIARY_FILE, DATA_BIM_RAW_FOLDER
from lgbm_imputer import imputer
import matplotlib

In [None]:
# Load the main station's raw weather file, parse its timestamps and turn the
# -999 placeholder values into NaN so they are handled as missing data.
weather_data = (
    pd.read_csv(DATA_WEATHER_RAW_MAIN_FILE)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .replace(-999, np.nan)
)
weather_data

In [None]:
# Load the auxiliary weather file (used later to back-fill the main station).
secondary_data = pd.read_csv(DATA_WEATHER_RAW_AUXILIARY_FILE)
# Parse the auxiliary file's OWN timestamps. Previously this line parsed
# `weather_data['timestamp']`, which stamped the auxiliary rows with the main
# file's timestamps (matched by row number) instead of their real ones.
# NOTE(review): assumes the auxiliary CSV has a 'timestamp' column like the
# main file - confirm against the raw data.
secondary_data['timestamp'] = pd.to_datetime(secondary_data['timestamp'])
# -999 is replaced with NaN so it is treated as missing, as for the main file.
secondary_data = secondary_data.replace(-999, np.nan)
secondary_data

In [None]:
# Bounds of the hourly grid: both values are passed as start/end to
# pd.date_range when the main and auxiliary frames are reindexed below.
earliest_date = '2018-01-01 00:00:00'
latest_date = '2019-12-31 23:00:00'

In [None]:
# Put the main station data on a gap-free hourly grid: any hour between
# earliest_date and latest_date that is absent from the file becomes an
# all-NaN row, and the result gets a plain 0..N-1 integer index.
init_cols = list(weather_data.columns)
df_clean = weather_data.set_index('timestamp', drop=False).sort_index()

# complete hourly index covering the whole period
full_index = pd.date_range(start=earliest_date, end=latest_date, freq='H')
df = (
    df_clean
    .reindex(full_index)
    .assign(timestamp=full_index)   # inserted rows need a timestamp too
    .reset_index(drop=True)
    .loc[:, init_cols]              # restore the original column order
)
df['RH (%)'].plot()

In [None]:
# Same gap-filling as for the main data, applied to the auxiliary frame:
# hours missing from the file become all-NaN rows on the common hourly grid.
init_cols = list(secondary_data.columns)
df_clean = secondary_data.set_index('timestamp', drop=False).sort_index()

# complete hourly index covering the whole period
full_index = pd.date_range(start=earliest_date, end=latest_date, freq='H')
df_secondary = (
    df_clean
    .reindex(full_index)
    .assign(timestamp=full_index)   # inserted rows need a timestamp too
    .reset_index(drop=True)
    .loc[:, init_cols]              # restore the original column order
)
df_secondary['RH (%)'].plot()

In [None]:
# Replace missing values in the main data with those of the auxiliary dataset.
#
# The auxiliary values are taken from `df_secondary`, i.e. the auxiliary frame
# AFTER it was reindexed onto the same hourly grid as `df`, so row i of both
# frames refers to the same grid position. The previous version read the raw
# `secondary_data` by row number, which pairs up unrelated rows as soon as
# either raw file has a missing hour.
MJ_TO_WH = 277.778  # 1 MJ = 277.778 Wh

# Radiation is only taken from the auxiliary file; convert MJ/m² to Wh/m².
df['global_rad_Whm2'] = df_secondary['Global Rad (MJ/m²)'] * MJ_TO_WH
df['diffuse_rad_Whm2'] = df_secondary['Diffuse Rad (MJ/m²)'] * MJ_TO_WH

# Keep the main reading where present, otherwise fall back to the auxiliary
# one. fillna aligns on the shared 0..N-1 index, replacing the row-by-row loop.
for col in ['Temperature (C)', 'RH (%)', 'Scalar Mean Wind Speed (m/s)']:
    df[col] = df[col].fillna(df_secondary[col])
df['RH (%)'].plot()

In [None]:
# Discard or clamp readings the notebook treats as implausible.
# Temperatures <= 0 and relative humidity <= 60 are blanked to NaN.
# NOTE(review): the 0 C and 60 % cut-offs are hard-coded; presumably chosen
# for this station's climate - confirm.
df['Temperature (C)'] = df['Temperature (C)'].mask(df['Temperature (C)'] <= 0.0)
df['RH (%)'] = df['RH (%)'].mask(df['RH (%)'] <= 60.0)

# Radiation cannot be negative: values <= 0 are clamped to 0.0 (not NaN).
for rad_col in ('global_rad_Whm2', 'diffuse_rad_Whm2'):
    df[rad_col] = df[rad_col].mask(df[rad_col] <= 0.0, 0.0)

df.reset_index(drop=True, inplace=True)
df

In [None]:
%load_ext autoreload
%autoreload 2
df['Station'] = 'S44'
# lgb needs values that start in 0 if they are categorical
df['year'] = np.array(df['timestamp'].dt.year, dtype=np.uint16)
df['month'] = np.array(df['timestamp'].dt.month, dtype=np.uint8) - 1
df['dayofweek'] = np.array(df['timestamp'].dt.dayofweek, dtype=np.uint8)
df['dayofyear'] = np.array(df['timestamp'].dt.dayofyear, dtype=np.uint16) - 1
df['weekofyear'] = np.array(df['timestamp'].dt.weekofyear, dtype=np.uint8) - 1

target_feature_name = 'Temperature (C)'
numerical_features_list = ['RH (%)',
                             'Scalar Mean Wind Speed (m/s)',
                             'global_rad_Whm2',
                          'diffuse_rad_Whm2']
categorical_features_list = ['month',
                             'dayofyear',
                             'weekofyear',
                             'dayofweek']

for c in categorical_features_list:
    df[c] = df[c].astype('category')
    
id_column = 'Station'
get_best_parameters = False
window = 7
params = {'learning_rate': 0.1,
          'num_leaves': 255,
          'min_data_in_leaf': 50,
          'num_iterations': 2000,
          'objective': 'rmse',
          'metric': 'rmse'}
restored = imputer(df=df,
                   timestamp_feature_name='timestamp',
                   target_feature_name = target_feature_name,
                   numerical_features_list = numerical_features_list,
                   categorical_features_list = categorical_features_list,
                   id_column = id_column,
                   window = window,
                   get_best_parameters =get_best_parameters,
                   params=params)

In [None]:
# Plot the temperature target column against its '<target>_imputed'
# counterpart from the imputer output.
font = {'family': 'Arial',
        'size': 18}
# Font settings must be applied BEFORE the figure is created: rc changes only
# affect text created afterwards, so calling this after plotting (as before)
# left the figure drawn in this cell with the previous font.
matplotlib.rc('font', **font)

visual = restored.set_index('timestamp')
visual = visual[[target_feature_name, f'{target_feature_name}_imputed']]
ax = visual.plot()
ax.set_xlabel("")
ax.set_ylabel("Daily Temperature [C]")
ax.legend(["Measured", "Imputed"]);

In [None]:
# Imputation stage 2: relative humidity.
# Starts from a copy of the stage-1 output (`restored`) and calls the
# project's `imputer` with 'RH (%)' as the target.
# (autoreload is already enabled by the first imputation cell, so the magics
# are not repeated here.)
df = restored.copy()
df['Station'] = 'S44'

# Calendar features derived from the timestamp.
# lgb needs values that start in 0 if they are categorical
df['year'] = np.array(df['timestamp'].dt.year, dtype=np.uint16)
df['month'] = np.array(df['timestamp'].dt.month, dtype=np.uint8) - 1
df['dayofweek'] = np.array(df['timestamp'].dt.dayofweek, dtype=np.uint8)
df['dayofyear'] = np.array(df['timestamp'].dt.dayofyear, dtype=np.uint16) - 1
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` returns the same ISO week number.
df['weekofyear'] = df['timestamp'].dt.isocalendar().week.to_numpy(dtype=np.uint8) - 1

target_feature_name = 'RH (%)'
numerical_features_list = ['Temperature (C)',
                           'Scalar Mean Wind Speed (m/s)',
                           'global_rad_Whm2',
                           'diffuse_rad_Whm2']
categorical_features_list = ['month',
                             'dayofyear',
                             'weekofyear',
                             'dayofweek']

for c in categorical_features_list:
    df[c] = df[c].astype('category')

id_column = 'Station'
get_best_parameters = False
window = 7
# Passed through to `imputer` unchanged.
params = {'learning_rate': 0.1,
          'num_leaves': 255,
          'min_data_in_leaf': 50,
          'num_iterations': 2000,
          'objective': 'rmse',
          'metric': 'rmse'}
restored2 = imputer(df=df,
                    timestamp_feature_name='timestamp',
                    target_feature_name=target_feature_name,
                    numerical_features_list=numerical_features_list,
                    categorical_features_list=categorical_features_list,
                    id_column=id_column,
                    window=window,
                    get_best_parameters=get_best_parameters,
                    params=params)

In [None]:
# Plot the relative-humidity target column against its '<target>_imputed'
# counterpart from the imputer output.
font = {'family': 'Arial',
        'size': 18}
# Font settings must be applied BEFORE the figure is created: rc changes only
# affect text created afterwards, so calling this after plotting (as before)
# had no effect on the figure drawn in this cell.
matplotlib.rc('font', **font)

visual = restored2.set_index('timestamp')
visual = visual[[target_feature_name, f'{target_feature_name}_imputed']]
ax = visual.plot()
ax.set_xlabel("")
ax.set_ylabel("Relative Humidity [%]")
ax.legend(["Measured", "Imputed"]);

In [None]:
# Imputation stage 3: wind speed.
# Starts from a copy of the stage-2 output (`restored2`) and calls the
# project's `imputer` with 'Scalar Mean Wind Speed (m/s)' as the target.
# (autoreload is already enabled by the first imputation cell, so the magics
# are not repeated here.)
df = restored2.copy()
df['Station'] = 'S44'

# Calendar features derived from the timestamp.
# lgb needs values that start in 0 if they are categorical
df['year'] = np.array(df['timestamp'].dt.year, dtype=np.uint16)
df['month'] = np.array(df['timestamp'].dt.month, dtype=np.uint8) - 1
df['dayofweek'] = np.array(df['timestamp'].dt.dayofweek, dtype=np.uint8)
df['dayofyear'] = np.array(df['timestamp'].dt.dayofyear, dtype=np.uint16) - 1
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` returns the same ISO week number.
df['weekofyear'] = df['timestamp'].dt.isocalendar().week.to_numpy(dtype=np.uint8) - 1

target_feature_name = 'Scalar Mean Wind Speed (m/s)'
numerical_features_list = ['Temperature (C)',
                           'RH (%)',
                           'global_rad_Whm2',
                           'diffuse_rad_Whm2']
categorical_features_list = ['month',
                             'dayofyear',
                             'weekofyear',
                             'dayofweek']
for c in categorical_features_list:
    df[c] = df[c].astype('category')
id_column = 'Station'
get_best_parameters = False
window = 3
# Passed through to `imputer` unchanged.
params = {'learning_rate': 0.1,
          'num_leaves': 255,
          'min_data_in_leaf': 50,
          'num_iterations': 2000,
          'objective': 'rmse',
          'metric': 'rmse'}
restored3 = imputer(df=df,
                    timestamp_feature_name='timestamp',
                    target_feature_name=target_feature_name,
                    numerical_features_list=numerical_features_list,
                    categorical_features_list=categorical_features_list,
                    id_column=id_column,
                    window=window,
                    get_best_parameters=get_best_parameters,
                    params=params)

In [None]:
# Plot the wind-speed target column against its '<target>_imputed'
# counterpart from the imputer output.
font = {'family': 'Arial',
        'size': 18}
# Font settings must be applied BEFORE the figure is created: rc changes only
# affect text created afterwards, so calling this after plotting (as before)
# had no effect on the figure drawn in this cell.
matplotlib.rc('font', **font)

visual = restored3.set_index('timestamp')
visual = visual[[target_feature_name, f'{target_feature_name}_imputed']]
ax = visual.plot()
ax.set_xlabel("")
ax.set_ylabel("Daily Wind Speed [m/s]")
ax.legend(["Measured", "Imputed"]);

In [None]:
# Imputation stage 4: global radiation.
# Starts from a copy of the current `restored3` and calls the project's
# `imputer` with 'global_rad_Whm2' as the target.
# NOTE: the result is bound to `restored3` again, replacing the frame this
# cell started from; re-running the cell therefore starts from its own
# previous output rather than from the wind-speed stage.
# (autoreload is already enabled by the first imputation cell, so the magics
# are not repeated here.)
df = restored3.copy()
df['Station'] = 'S44'

# Calendar features derived from the timestamp.
# lgb needs values that start in 0 if they are categorical
df['year'] = np.array(df['timestamp'].dt.year, dtype=np.uint16)
df['month'] = np.array(df['timestamp'].dt.month, dtype=np.uint8) - 1
df['dayofweek'] = np.array(df['timestamp'].dt.dayofweek, dtype=np.uint8)
df['dayofyear'] = np.array(df['timestamp'].dt.dayofyear, dtype=np.uint16) - 1
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` returns the same ISO week number.
df['weekofyear'] = df['timestamp'].dt.isocalendar().week.to_numpy(dtype=np.uint8) - 1

target_feature_name = 'global_rad_Whm2'
numerical_features_list = ['Temperature (C)',
                           'RH (%)',
                           'Scalar Mean Wind Speed (m/s)',
                           'diffuse_rad_Whm2']
categorical_features_list = ['month',
                             'dayofyear',
                             'weekofyear',
                             'dayofweek']
for c in categorical_features_list:
    df[c] = df[c].astype('category')
id_column = 'Station'
get_best_parameters = False
window = 3
# Passed through to `imputer` unchanged.
params = {'learning_rate': 0.1,
          'num_leaves': 255,
          'min_data_in_leaf': 50,
          'num_iterations': 2000,
          'objective': 'rmse',
          'metric': 'rmse'}
restored3 = imputer(df=df,
                    timestamp_feature_name='timestamp',
                    target_feature_name=target_feature_name,
                    numerical_features_list=numerical_features_list,
                    categorical_features_list=categorical_features_list,
                    id_column=id_column,
                    window=window,
                    get_best_parameters=get_best_parameters,
                    params=params)

In [None]:
# Plot the global-radiation target column against its '<target>_imputed'
# counterpart from the imputer output.
font = {'family': 'Arial',
        'size': 18}
# Font settings must be applied BEFORE the figure is created: rc changes only
# affect text created afterwards, so calling this after plotting (as before)
# had no effect on the figure drawn in this cell.
matplotlib.rc('font', **font)

visual = restored3.set_index('timestamp')
visual = visual[[target_feature_name, f'{target_feature_name}_imputed']]
ax = visual.plot()
ax.set_xlabel("")
ax.set_ylabel("global_rad_Whm2")
ax.legend(["Measured", "Imputed"]);

In [None]:
# Imputation stage 5: diffuse radiation.
# Starts from a copy of the current `restored3` and calls the project's
# `imputer` with 'diffuse_rad_Whm2' as the target.
# NOTE: the result is bound to `restored3` again, replacing the frame this
# cell started from; re-running the cell therefore starts from its own
# previous output rather than from the global-radiation stage.
# (autoreload is already enabled by the first imputation cell, so the magics
# are not repeated here.)
df = restored3.copy()
df['Station'] = 'S44'

# Calendar features derived from the timestamp.
# lgb needs values that start in 0 if they are categorical
df['year'] = np.array(df['timestamp'].dt.year, dtype=np.uint16)
df['month'] = np.array(df['timestamp'].dt.month, dtype=np.uint8) - 1
df['dayofweek'] = np.array(df['timestamp'].dt.dayofweek, dtype=np.uint8)
df['dayofyear'] = np.array(df['timestamp'].dt.dayofyear, dtype=np.uint16) - 1
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` returns the same ISO week number.
df['weekofyear'] = df['timestamp'].dt.isocalendar().week.to_numpy(dtype=np.uint8) - 1

target_feature_name = 'diffuse_rad_Whm2'
numerical_features_list = ['Temperature (C)',
                           'RH (%)',
                           'global_rad_Whm2',
                           'Scalar Mean Wind Speed (m/s)']
categorical_features_list = ['month',
                             'dayofyear',
                             'weekofyear',
                             'dayofweek']
for c in categorical_features_list:
    df[c] = df[c].astype('category')
id_column = 'Station'
get_best_parameters = False
window = 3
# Passed through to `imputer` unchanged.
params = {'learning_rate': 0.1,
          'num_leaves': 255,
          'min_data_in_leaf': 50,
          'num_iterations': 2000,
          'objective': 'rmse',
          'metric': 'rmse'}
restored3 = imputer(df=df,
                    timestamp_feature_name='timestamp',
                    target_feature_name=target_feature_name,
                    numerical_features_list=numerical_features_list,
                    categorical_features_list=categorical_features_list,
                    id_column=id_column,
                    window=window,
                    get_best_parameters=get_best_parameters,
                    params=params)

In [None]:
months = [12, 11, 10, 9]  # not referenced by any cell visible in this notebook

# Short, export-friendly aliases for the three main weather variables.
restored3['Temperature_C'] = restored3['Temperature (C)']
restored3['RH_%'] = restored3['RH (%)']
restored3['Wind_ms'] = restored3['Scalar Mean Wind Speed (m/s)']

# Overwrite every row from label 14500 to the end with the values found at
# positions 5740..8759 of the same column. 14500 - 5740 = 8760 = 365 * 24.
# NOTE(review): presumably this substitutes the tail of 2019 with the same
# hours of 2018, assuming restored3 still follows the hourly grid starting at
# 2018-01-01 with a 0..N-1 index - confirm.
patch_from = 14500
donor_rows = slice(5740, 8760)
for patched_col in ('Temperature_C',
                    'RH_%',
                    'Wind_ms',
                    'global_rad_Whm2',
                    'diffuse_rad_Whm2'):
    restored3.loc[patch_from:, patched_col] = restored3[patched_col].iloc[donor_rows].to_numpy()

# Where diffuse radiation exceeds global radiation, diffuse is set to 0.0;
# direct radiation is then the difference global - diffuse.
diffuse_above_global = restored3['diffuse_rad_Whm2'] > restored3['global_rad_Whm2']
restored3.loc[diffuse_above_global, 'diffuse_rad_Whm2'] = 0.0
restored3['direct_rad_Whm2'] = restored3['global_rad_Whm2'].sub(restored3['diffuse_rad_Whm2'])

In [None]:
# Write one CSV per calendar year (weather2018.csv, weather2019.csv) into
# DATA_BIM_RAW_FOLDER, keeping only the export columns and no index column.
export_columns = ['Temperature_C', 'RH_%', 'Wind_ms', 'timestamp',
                  'global_rad_Whm2', 'diffuse_rad_Whm2', 'direct_rad_Whm2']
for export_year in (2018, 2019):
    rows_of_year = restored3.loc[restored3['year'] == export_year, export_columns]
    target_path = os.path.join(DATA_BIM_RAW_FOLDER, f"weather{export_year}.csv")
    rows_of_year.to_csv(target_path, index=False)

In [None]:
# Quick look at the 2019 rows: original wind-speed column next to the
# patched temperature column.
restored3.loc[restored3['year'] == 2019, ['Scalar Mean Wind Speed (m/s)', 'Temperature_C']]