In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [2]:
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,5 days 10 hours 2 mins
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.4
H2O_cluster_version_age:,"21 days, 5 hours and 49 minutes"
H2O_cluster_name:,H2O_from_python_per_christian_43xdwe
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.609 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [3]:
train_a = pd.read_parquet('A/train_targets.parquet')
train_b = pd.read_parquet('B/train_targets.parquet')
train_c = pd.read_parquet('C/train_targets.parquet')

In [4]:
X_train_estimated_a = pd.read_parquet('A/X_train_estimated.parquet')
X_train_estimated_b = pd.read_parquet('B/X_train_estimated.parquet')
X_train_estimated_c = pd.read_parquet('C/X_train_estimated.parquet')


In [5]:
X_train_observed_a = pd.read_parquet('A/X_train_observed.parquet')
X_train_observed_b = pd.read_parquet('B/X_train_observed.parquet')
X_train_observed_c = pd.read_parquet('C/X_train_observed.parquet')



In [6]:
X_test_estimated_a = pd.read_parquet('A/X_test_estimated.parquet')
X_test_estimated_b = pd.read_parquet('B/X_test_estimated.parquet')
X_test_estimated_c = pd.read_parquet('C/X_test_estimated.parquet')


# Things to Explore

1. Location B and C yields almost exact results on first model -> Are the two datasets very similar?
2. y_train_c has a lot of NaN values compared to A and B
3. Cyclic encoding

# Data Preprocessing 
(Move this to seperate notebook)

**1. Merge observed and estimated training data**

**2. Handle NaN values**
- Look through all features and handle seperately, according to nature of the feature

**3.  Aggregate weather data from every 15 minute to hourly.**
- Average, sum, max/min, first/last etc. depending on the nature of the metric.
- Ensure the timestamps between weather data and power production align.

**4. Handle missing solar power measurements from 2022-10-21 01:00 - 2022-10-28 21:00**

### 1. Merge observed and estimated training data**

In [7]:
def merge_training_data(location):
    if location.lower() == 'a':
        X_train = pd.concat([X_train_observed_a,X_train_estimated_a])
    elif location.lower() == 'b':
        X_train = pd.concat([X_train_observed_b,X_train_estimated_b])
    elif location.lower() == 'c':
        X_train = pd.concat([X_train_observed_c,X_train_estimated_c])
    else:
        raise ValueError(f"Invalid location: {location}")
    return X_train

In [8]:
#Consider removing date_calc column
X_train_a = merge_training_data('a')
X_train_b = merge_training_data('b')
X_train_c = merge_training_data('c')

In [9]:
X_train_a

Unnamed: 0,date_forecast,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,date_calc
0,2019-06-02 22:00:00,7.7,1.230,1744.900024,0.0,0.0,1744.900024,0.0,280.299988,0.0,0.0,0.0,0.0,98.699997,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1006.799988,0.0,0.0,994.200012,1000.299988,0.0,0.0,73.099998,1006.299988,,0.0,0.0,-0.0,0.1,342.834015,-3.202,0.0,285.899994,100.000000,39640.101562,3.7,-3.6,-0.8,-0.0,NaT
1,2019-06-02 22:15:00,7.7,1.229,1734.000000,0.0,0.0,1734.000000,0.0,280.299988,0.0,0.0,0.0,0.0,99.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1006.500000,0.0,0.0,993.900024,999.900024,0.0,0.0,72.199997,1006.000000,,0.0,0.0,-0.0,0.2,346.294006,-3.650,0.0,286.100006,100.000000,40123.898438,3.6,-3.6,-0.6,-0.0,NaT
2,2019-06-02 22:30:00,7.7,1.228,1723.500000,0.0,0.0,1723.500000,0.0,280.299988,0.0,0.0,0.0,0.0,99.199997,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1006.099976,0.0,0.0,993.599976,999.599976,0.0,0.0,71.199997,1005.599976,,0.0,0.0,-0.0,0.2,349.768005,-3.998,0.0,286.299988,100.000000,40628.300781,3.6,-3.6,-0.4,-0.0,NaT
3,2019-06-02 22:45:00,7.7,1.226,1713.400024,0.0,0.0,1713.400024,0.0,280.299988,0.0,0.0,0.0,0.0,99.400002,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1005.799988,0.0,0.0,993.299988,999.299988,0.0,0.0,70.199997,1005.299988,,0.0,0.0,-0.0,0.2,353.251007,-4.247,0.0,286.600006,100.000000,41153.601562,3.5,-3.5,-0.2,-0.0,NaT
4,2019-06-02 23:00:00,7.7,1.225,1703.599976,0.0,0.0,1703.599976,0.0,280.299988,0.0,0.0,0.0,0.0,99.599998,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1005.500000,0.0,0.0,993.000000,999.000000,0.0,0.0,69.199997,1005.000000,,0.0,0.0,-0.0,0.2,356.742004,-4.393,0.0,286.799988,100.000000,41699.898438,3.5,-3.5,0.0,-0.0,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17571,2023-04-30 22:45:00,4.5,1.281,1173.900024,0.0,0.0,536.500000,0.0,272.299988,0.0,0.0,0.0,0.0,99.599998,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1014.299988,0.0,0.0,1001.099976,1007.400024,0.0,0.0,79.599998,1013.700012,,0.0,0.0,-0.0,0.1,352.920013,-11.731,0.1,274.200012,99.900002,11629.299805,3.9,2.9,2.5,-0.0,2023-04-29 07:00:05
17572,2023-04-30 23:00:00,4.5,1.281,1054.199951,0.0,0.0,542.400024,0.0,272.200012,0.0,0.0,0.0,0.0,100.000000,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1014.299988,0.0,0.0,1001.099976,1007.299988,0.0,0.0,80.000000,1013.599976,,0.0,0.0,-0.0,0.1,356.634003,-11.884,0.1,274.200012,100.000000,9923.200195,3.7,2.8,2.4,-0.0,2023-04-29 07:00:05
17573,2023-04-30 23:15:00,4.5,1.281,1435.800049,0.0,0.0,531.799988,0.0,272.299988,0.0,0.0,0.0,0.0,99.400002,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1014.099976,0.0,0.0,1000.900024,1007.200012,0.0,0.0,79.900002,1013.500000,,0.0,0.0,-0.0,0.1,0.352000,-11.928,0.1,274.200012,99.900002,11230.799805,3.7,2.7,2.5,-0.0,2023-04-29 07:00:05
17574,2023-04-30 23:30:00,4.5,1.281,1817.400024,0.0,0.0,521.200012,0.0,272.299988,0.0,0.0,0.0,0.0,98.099998,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1014.000000,0.0,0.0,1000.799988,1007.000000,0.0,0.0,79.800003,1013.299988,,0.0,0.0,-0.0,0.0,4.070000,-11.864,0.1,274.200012,99.599998,12526.099609,3.8,2.7,2.6,-0.0,2023-04-29 07:00:05


### 2. Handle NaN values

In [10]:
##

### 3. Aggregate weather data from every 15 minute to hourly averages

In [11]:
agg_dict1 = {'clear_sky_energy_1h:J': 'sum',
 'diffuse_rad_1h:J': 'sum',
 'direct_rad_1h:J': 'sum',
 'precip_5min:mm': 'sum',
 'fresh_snow_1h:cm': 'sum',
 'fresh_snow_3h:cm': 'sum',
 'snow_melt_10min:mm': 'sum',
 'rain_water:kgm2': 'sum',
 'super_cooled_liquid_water:kgm2': 'sum',
 'wind_speed_10m:ms': 'max',
 'dew_or_rime:idx': 'sum',
 'is_day:idx': 'sum',
 'is_in_shadow:idx': 'sum',
 'snow_drift:idx': 'sum',
 'precip_type_5min:idx': 'sum',
 'absolute_humidity_2m:gm3': 'mean',
 'air_density_2m:kgm3': 'mean',
 'ceiling_height_agl:m': 'mean',
 'clear_sky_rad:W': 'mean',
 'cloud_base_agl:m': 'mean',
 'dew_point_2m:K': 'mean',
 'diffuse_rad:W': 'mean',
 'direct_rad:W': 'mean',
 'effective_cloud_cover:p': 'mean',
 'elevation:m': 'mean',
 'msl_pressure:hPa': 'mean',
 'pressure_100m:hPa': 'mean',
 'pressure_50m:hPa': 'mean',
 'prob_rime:p': 'mean',
 'relative_humidity_1000hPa:p': 'mean',
 'sfc_pressure:hPa': 'mean',
 'snow_density:kgm3': 'mean',
 'snow_depth:cm': 'mean',
 'sun_azimuth:d': 'mean',
 'sun_elevation:d': 'mean',
 't_1000hPa:K': 'mean',
 'total_cloud_cover:p': 'mean',
 'visibility:m': 'mean',
 'wind_speed_u_10m:ms': 'mean',
 'wind_speed_v_10m:ms': 'mean',
 'wind_speed_w_1000hPa:ms': 'mean',
 'fresh_snow_12h:cm': 'last',
 'snow_water:kgm2': 'mean',
 'fresh_snow_24h:cm':'last',
 'fresh_snow_6h:cm':'last'}

agg_dict2 = {
 'absolute_humidity_2m:gm3': 'mean',
 'air_density_2m:kgm3': 'mean',
 'ceiling_height_agl:m': 'mean',
 'clear_sky_energy_1h:J': 'sum',
 'clear_sky_rad:W': 'mean',
 'cloud_base_agl:m': 'mean',
 'dew_or_rime:idx': 'mode',
 'dew_point_2m:K': 'mean',
 'diffuse_rad:W': 'mean',
 'diffuse_rad_1h:J': 'sum',
 'direct_rad:W': 'mean',
 'direct_rad_1h:J': 'sum',
 'effective_cloud_cover:p': 'mean',
 'elevation:m': 'mean',
 'fresh_snow_12h:cm': 'sum',
 'fresh_snow_1h:cm': 'sum',
 'fresh_snow_24h:cm': 'sum',
 'fresh_snow_3h:cm': 'sum',
 'fresh_snow_6h:cm': 'sum',
 'is_day:idx': 'max',
 'is_in_shadow:idx': 'max',
 'msl_pressure:hPa': 'mean',
 'precip_5min:mm': 'sum',
 'precip_type_5min:idx': 'mode',
 'pressure_100m:hPa': 'mean',
 'pressure_50m:hPa': 'mean',
 'prob_rime:p': 'mean',
 'rain_water:kgm2': 'sum',
 'relative_humidity_1000hPa:p': 'mean',
 'sfc_pressure:hPa': 'mean',
 'snow_density:kgm3': 'mean',
 'snow_depth:cm': 'mean',
 'snow_drift:idx': 'mode',
 'snow_melt_10min:mm': 'sum',
 'snow_water:kgm2': 'sum',
 'sun_azimuth:d': 'mean',
 'sun_elevation:d': 'mean',
 'super_cooled_liquid_water:kgm2': 'max',
 't_1000hPa:K': 'mean',
 'total_cloud_cover:p': 'mean',
 'visibility:m': 'mean',
 'wind_speed_10m:ms': 'mean',
 'wind_speed_u_10m:ms': 'mean',
 'wind_speed_v_10m:ms': 'mean',
 'wind_speed_w_1000hPa:ms': 'mean'
}


remaining = list(set(X_train_b.columns) - set(agg_dict2.keys()))


In [14]:
def transform_to_hourly(X, agg_dict=None):
    """
    Transforms the given dataframe to hourly frequency based on the 'date_forecast' column.
    
    Returns:
    - pd.DataFrame: Dataframe resampled to hourly frequency.
    """
    if agg_dict = None:
        return X.resample('H', on = 'date_forecast').mean()
    return X.resample('H', on = 'date_forecast').agg(agg_dict)

#Remove rows in X_train that has timestamp that does not exist in train_loc, and visa_verca
#e.g missing solar power measurements from 2022-10-21 01:00 - 2022-10-28 21:00
def align_X_y(X_train,y_train):
    """
    Aligns two dataframes based on the index of X and the 'time' column of y,
    ensuring that only rows with matching time values are retained.

    Parameters:
    - X (pd.DataFrame): The first dataframe with time as the index.
    - y (pd.DataFrame): The second dataframe with time in the 'time' column.

    Returns:
    - tuple: A tuple containing the aligned dataframes.
    """
    common_times = X_train.index.intersection(y_train['time'])
    X_aligned = X_train.loc[common_times]
    y_aligned = y_train[y_train['time'].isin(common_times)]

    return X_aligned, y_aligned

SyntaxError: invalid syntax (3996755757.py, line 8)

In [15]:
X_train_a_hourly = transform_to_hourly(X_train_a, agg_dict1) 
X_train_b_hourly = transform_to_hourly(X_train_b, agg_dict1) 
X_train_c_hourly = transform_to_hourly(X_train_c, agg_dict1) 

X_test_a = transform_to_hourly(X_test_estimated_a, agg_dict1) 
X_test_b = transform_to_hourly(X_test_estimated_b, agg_dict1) 
X_test_c = transform_to_hourly(X_test_estimated_c, agg_dict1) 


NameError: name 'mean' is not defined

In [None]:
X_train_a, y_train_a = align_X_y(X_train_a_hourly, train_a)
X_train_b, y_train_b = align_X_y(X_train_b_hourly, train_b)
X_train_c, y_train_c = align_X_y(X_train_c_hourly, train_c)

In [None]:
X_train_a

### 

# Exploratory Data Analysis

## Analysis of Target variable  - Looking at PV_measurement
1. Handle constant measurments over longer periods of time. Likely caused by sensor malfunction, data logging issues, or other external factors.
2. Handle longer periods of missing data:
    - Interpolate 
    - Copy from previous year
    - Copy solar production at missing time from another location

In [None]:
# Time-Series plot of PV_measurement 
def solar_prod_plot(y_train, years_per_plot = 5):
    df = y_train.copy()
    # Get the unique years
    unique_years = df['time'].dt.year.unique()

    # Loop over the years in chunks
    for i in range(0, len(unique_years), years_per_plot):
        subset_years = unique_years[i:i+years_per_plot]

        # Filter the dataframe for the current chunk of years
        subset_df = df[df['time'].dt.year.isin(subset_years)]

        plt.figure(figsize=(15, 6))
        plt.plot(subset_df['time'], subset_df['pv_measurement'])

        title = f"Solar Power Production for {subset_years[0]}"
        if len(subset_years) > 1:
            title += f" to {subset_years[-1]}"

        plt.title(title)
        plt.xlabel("Time")
        plt.ylabel("PV Measurement")
        plt.show();

        
def nan_vals(y_train):
    return y_train[y_train['pv_measurement'].isna()]

def remove_constant_intervals(y_train, low_thresh, upp_thresh):
    """
    Identify and remove intervals of constant PV readings that exceed a specified duration. 
    Constant readings may indicate sensor malfunctions or data logging issues.
    
    Parameters:
    ----------
    y_train : pd.DataFrame
        Dataframe containing the time-series data of solar power production.
    threshold : int
        The minimum duration required for an interval to be considered for removal.
        
    Returns:
    -------
    pd.DataFrame
        The input dataframe with intervals of constant readings (exceeding the duration threshold) removed.
    """
    df = y_train.copy()
    
    # Calculate the difference in production values
    df['diff'] = df['pv_measurement'].diff()

    # Identify where the difference is zero
    df['zero_diff'] = df['diff'].abs() < 1e-5

    # Identify groups of consecutive zero differences
    df['group'] = (df['zero_diff'] != df['zero_diff'].shift()).cumsum()

    # Filter out only the groups with consecutive zero differences
    constant_intervals = df[df['zero_diff']].groupby('group').agg(start=('time', 'min'), 
                                                                  end=('time', 'max'),
                                                                  duration=('time', 'size'))
    
    # Filter intervals based on the threshold
    interval_df_thresh = constant_intervals[(constant_intervals['duration'] > low_thresh) & (constant_intervals['duration'] <upp_thresh)]
    
    # Remove rows from the main dataframe that fall within these intervals
    for _, row in interval_df_thresh.iterrows():
        start_time, end_time = row['start'], row['end']
        df = df[(df['time'] < start_time) | (df['time'] > end_time)]
    
    # Drop the added columns used for calculations
    df.drop(columns=['diff', 'zero_diff', 'group'], inplace=True)
    
    return df, constant_intervals


def get_time_interval(df, start_time = '2020-08-01 00:00:00', end_time = '2021-01-01 00:00:00'):
    # Filter rows based on the time period
    filtered_df = df[(df['time'] >= start_time) & (df['time'] <= end_time)]
    return filtered_df

**Consider making function that removes groups of constant values BELOW a cerain threshold. For longer periods of time it would make more sense to create new values instead of removing so many rows of data (e.g. by copying from previous years or another location, or interpolate)**

In [None]:
solar_prod_plot(y_train_a,1)

In [None]:
#Removed all constant values with duration > 24 hours
#-> Resulted in very poor model performance Got a public score of 370 on kaggle

#Remove rows in groups of constant values, where duration of constant measurements is > 1 day (24 hours)
#y_train_a_filtered, const_interval_a = remove_constant_intervals(y_train_a,24,10**6)

#update X_train_a by removing coresponding rows that have been filtered here
#X_train_a, y_train_a = align_X_y(X_train_a, y_train_a_filtered)

#Remove rows in groups of constant values, where duration of constant measurements is > 1 day (24 hours)
#y_train_b_filtered, const_interval_b = remove_constant_intervals(y_train_b,24,10**6)

#update X_train_a by removing coresponding rows that have been filtered here
#X_train_b, y_train_b = align_X_y(X_train_b, y_train_b_filtered)

#Remove rows in groups of constant values, where duration of constant measurements is > 1 day (24 hours)
#y_train_c_filtered, const_interval_c = remove_constant_intervals(y_train_c,24,10**6)

#update X_train_a by removing coresponding rows that have been filtered here
#X_train_c, y_train_c = align_X_y(X_train_c, y_train_c_filtered)

## Analysis of Decision Variables - Weather featueres
1. Look at each feature isolated (missing and supsicious values) 
2. Look at features vs solar power production (how can we best capture the relationships in our model?)

In [None]:
def plot_feature(feature_name,seaborn_style = True):
    if seaborn_style:
        sns.set_style("whitegrid")
    else:
        seaborn.reset_defaults()
    fig, axs = plt.subplots(3, 1, figsize=(20, 10), sharex=True)
    X_train_observed_a[['date_forecast', feature_name]].set_index('date_forecast').plot(ax=axs[0], title='Train/Test A', color='red')
    X_train_estimated_a[['date_forecast', feature_name]].set_index('date_forecast').plot(ax=axs[0], title='Train/Test A', color='blue')
    X_test_estimated_a[['date_forecast', feature_name]].set_index('date_forecast').plot(ax=axs[0], title='Train/Test  A', color='green')

    X_train_observed_b[['date_forecast', feature_name]].set_index('date_forecast').plot(ax=axs[1], title='Train/Test  B', color='red')
    X_train_estimated_b[['date_forecast', feature_name]].set_index('date_forecast').plot(ax=axs[1], title='Train/Test  B', color='blue')
    X_test_estimated_b[['date_forecast', feature_name]].set_index('date_forecast').plot(ax=axs[1], title='Train/Test  B', color='green')

    X_train_observed_c[['date_forecast', feature_name]].set_index('date_forecast').plot(ax=axs[2], title='Train/Test  C', color='red')
    X_train_estimated_c[['date_forecast', feature_name]].set_index('date_forecast').plot(ax=axs[2], title='Train/Test  C', color='blue')
    X_test_estimated_c[['date_forecast', feature_name]].set_index('date_forecast').plot(ax=axs[2], title='Train/Test  C', color='green')

In [None]:
feats = list(X_train_observed_a.columns)
plot_feature(feats[2])

In [None]:

#Here we are plotting on the modified dataset
def time_series_plot(feature):
    fig, ax1 = plt.subplots(figsize=(15, 6))
    ax1.set_xlabel('Time')
    ax1.set_ylabel('Solar Power Production', color='tab:blue')
    ax1.plot(y_train_b['time'], y_train_b['pv_measurement'], color='tab:blue', label='Solar Power Production')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    ax2 = ax1.twinx()  
    ax2.set_ylabel(feature, color='tab:red')  
    ax2.plot(X_train_b.index, X_train_b[feature], color='tab:red', label=feature)
    ax2.tick_params(axis='y', labelcolor='tab:red')

    fig.tight_layout()
    plt.title(f'Time Series Plot of Solar Power Production and {feature}')
    plt.show()

time_series_plot('direct_rad:W')

# Feature Engineering

In [None]:
#Create time-based features for hour,day, month
def add_time_feat(X):
    X_frame = X.copy()
    X_frame['year'] = X_frame.index.year
    X_frame['month'] = X_frame.index.month
    X_frame['day'] = X_frame.index.day
    X_frame['weekday'] = X_frame.index.weekday
    X_frame['hour'] = X_frame.index.hour
    return X_frame

In [None]:

X_train_a = add_time_feat(X_train_a)
X_test_a = add_time_feat(X_test_a)

X_train_b = add_time_feat(X_train_b)
X_test_b = add_time_feat(X_test_b)

X_train_c = add_time_feat(X_train_c)
X_test_c = add_time_feat(X_test_c)

# Test Model With Henning's Cleaned data

In [None]:
x_train_a1 = pd.read_csv('cleaned_data/A/x_train_a.csv')
x_train_b1 = pd.read_csv('cleaned_data/B/x_train_b.csv')
x_train_c1 = pd.read_csv('cleaned_data/C/x_train_c.csv')

x_test_a1 = pd.read_csv('cleaned_data/A/x_test_a.csv')
x_test_b1 = pd.read_csv('cleaned_data/B/x_test_b.csv')
x_test_c1 = pd.read_csv('cleaned_data/C/x_test_c.csv')

y_train_a1 = pd.read_csv('cleaned_data/A/train_a.csv')
y_train_b1 = pd.read_csv('cleaned_data/B/train_b.csv')
y_train_c1 = pd.read_csv('cleaned_data/C/train_c.csv')


In [None]:
x_train_a1

In [None]:
solar_prod_plot(x_train_a1)

In [None]:
X_train_a

# Build H2O Model 

In [None]:
from h2o.automl import H2OAutoML

def build_model(X_train,y_train,location):
    merged_data = X_train.copy()
    merged_data['pv_measurement'] = y_train['pv_measurement'].values
    
    y = 'pv_measurement'
    x = list(X_train.columns)
    train = h2o.H2OFrame(merged_data)
    
    aml = H2OAutoML(
        max_models = 10,
        max_runtime_secs = 120,
        #exclude_algos =['DeepLearning'],
        seed = 1,
        # stopping_metric ='logloss',
        sort_metric ='mae',
        balance_classes = False,
        project_name ='Project1'
    )

    aml.train(x=x, y=y, training_frame=train)
    
    lb = aml.leaderboard
    leader = aml.leader
    print(lb.head(rows=lb.nrows))
    
    h2o.save_model(leader, path=f'Saved_models/{location.upper()}', force = True)

    return lb,leader

def predict(model, X_test):
    
    test = pd.read_csv('test.csv')
    pred_time_stamps = test['time'].unique()
    X_test = X_test[X_test.index.isin(pred_time_stamps)]
    preds = model.predict(h2o.H2OFrame(X_test))
    
    return preds


In [None]:
test = pd.read_csv('test.csv')
pred_time_stamps = test['time'].unique()
t=X_test_a[X_test_a.index.isin(pred_time_stamps)]

In [None]:
lb_a,model_a = build_model(X_train_a,y_train_a,'A')

In [None]:

lb_b,model_b = build_model(X_train_b,y_train_b,'B')
lb_c,model_c = build_model(X_train_c,y_train_c,'C')


In [None]:
pred_a = predict(model_a,X_test_a)

In [None]:
#Use model to make predictions on test_data
pred_a = predict(model_a,X_test_a)
pred_b = predict(model_b,X_test_b)
pred_c = predict(model_c, X_test_b)

In [None]:
def create_submission(preds_a,preds_b,preds_c,sub_name):
    submission = pd.read_csv('sample_submission.csv')
    predictions = pred_a.rbind(pred_b).rbind(pred_c)
    submission['prediction'] = predictions['predict'].as_data_frame()["predict"].values
    submission.loc[submission['prediction'] < 0, 'prediction'] = 0
    submission.to_csv(f'Submissions/{sub_name}.csv', index=False)
    return submission
    

In [None]:
sub = create_submission(pred_a,pred_b,pred_c,'fourth_sub')
sub

### Analysis of models

In [None]:
model_a_first = h2o.load_model('Saved_models/A/GBM_4_AutoML_1_20231020_170352')
model_b_first = h2o.load_model('Saved_models/B/GBM_8_AutoML_2_20231020_170654')
model_c_first = h2o.load_model('Saved_models/C/GBM_8_AutoML_2_20231020_170654')

In [None]:
pred_a_first = model_a_first.predict(h2o.H2OFrame(t))

In [None]:
lb_a

In [None]:
model_a_first

In [None]:
model_a

In [None]:
y_pred1 = pred_a_first.as_data_frame()
y_pred2 = pred_a.as_data_frame()

In [None]:
plt.figure(figsize=(10, 8))
    
# Scatter plot
plt.scatter(y_pred1['predict'], y_pred2['predict'], alpha=0.5)

# Line of equality (for reference)
plt.plot([y_pred1['predict'].min(), y_pred1['predict'].max()],
         [y_pred2['predict'].min(), y_pred2['predict'].max()],
         color='red', linestyle='--')

# Labels and title
plt.xlabel('Predictions from Model 1')
plt.ylabel('Predictions from Model 2')
plt.title('Comparison of Predictions from Two Models')

# Show plot
plt.grid(True)
plt.show()

In [None]:
model_a

In [None]:
# 2. Feature Importance
def feat_importance(model, n_feats):
    feature_importance = model.varimp(use_pandas=True)
    n_top_feats = feature_importance.iloc[:n_feats,:]
    return n_top_feats

In [None]:
top_12_feats_a = feat_importance(model_a,12)
top_12_feats_a

In [None]:
feat_importance(model_a_first,12)

In [None]:
feat_importance(model_b,12)

In [None]:
feat_importance(model_b_first,12)

In [None]:
feat_importance(model_c,12)

In [None]:
"""pred_df = X_test_b.copy()
pred_df["prediction"] = preds.as_data_frame()["predict"].values


plt.figure(figsize=(10, 6))
plt.plot(pred_df.index.values, pred_df['prediction'], '-o', label='Predictions', color='blue')
plt.title('Predicted Solar Energy Production')
plt.xlabel('Timestamp')
plt.ylabel('Solar Energy Production pv')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.xticks(rotation=45)
plt.show()"""

In [None]:
"""import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("notebook")
sns.scatterplot(x='date_forecast', y='prediction', data=pred_df.reset_index(), hue='prediction', palette="coolwarm", size='prediction', sizes=(20, 200))
plt.title("Scatter Plot of Predictions");"""

In [None]:
"""sns.histplot(pred_df['prediction'], bins=10, kde=True, color='dodgerblue')
plt.title("Distribution of Predictions");"""

In [None]:
highly_relevant = [
    'clear_sky_rad:W', 
    'direct_rad:W', 
    'direct_rad_1h:J', 
    'diffuse_rad:W', 
    'diffuse_rad_1h:J',
    'effective_cloud_cover:p', 
    'is_day:idx', 
    'is_in_shadow:idx', 
    'sun_elevation:d', 
    'sun_azimuth:d',
    'total_cloud_cover:p',
    'clear_sky_energy_1h:J'
]

moderately_relevant = [
    'fresh_snow_6h:cm'
    'fresh_snow_12h:cm',
    'fresh_snow_24h:cm',
    'ceiling_height_agl:m', 
    'cloud_base_agl:m', 
    'snow_depth:cm', 
    'fresh_snow_1h:cm', 
    'fresh_snow_3h:cm',
    'wind_speed_10m:ms', 
    'wind_speed_u_10m:ms', 
    'wind_speed_v_10m:ms', 
    'visibility:m'
]

less_relevant = [
    'absolute_humidity_2m:gm3', 
    'dew_point_2m:K', 
    'msl_pressure:hPa', 
    'precip_5min:mm', 
    'rain_water:kgm2',
    'sfc_pressure:hPa', 
    'snow_density:kgm3', 
    'snow_melt_10min:mm',
    'air_density_2m:kgm3'
]

not_included = ['elevation:m',
 'pressure_50m:hPa',
 'super_cooled_liquid_water:kgm2',
 'precip_type_5min:idx',
 'dew_or_rime:idx',
 'snow_water:kgm2',
 'prob_rime:p',
 'snow_drift:idx',
 't_1000hPa:K',
 'wind_speed_w_1000hPa:ms',
 'relative_humidity_1000hPa:p',
 'date_calc',
 'pressure_100m:hPa']


#not_included = list(set(X_train_b.columns)-set(highly_relevant) - set(moderately_relevant)-set(less_relevant))


#len(highly_relevant)+len(moderately_relevant)+len(less_relevant) + len(not_included)