In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [2]:
x_train_a = pd.read_csv('cleaned_data/A/x_train_a.csv')
x_train_b = pd.read_csv('cleaned_data/B/x_train_b.csv')
x_train_c = pd.read_csv('cleaned_data/C/x_train_c.csv')

x_test_a = pd.read_csv('cleaned_data/A/x_test_a.csv')
x_test_b = pd.read_csv('cleaned_data/B/x_test_b.csv')
x_test_c = pd.read_csv('cleaned_data/C/x_test_c.csv')

train_a = pd.read_csv('cleaned_data/A/train_a.csv')
train_b = pd.read_csv('cleaned_data/B/train_b.csv')
train_c = pd.read_csv('cleaned_data/C/train_c.csv')

In [3]:
train_a['time'] = pd.to_datetime(train_a['time'])
train_b['time'] = pd.to_datetime(train_b['time'])
train_c['time'] = pd.to_datetime(train_c['time'])

In [4]:
x_test_a = x_test_a.drop(columns = ['date_forecast'])
x_test_b = x_test_b.drop(columns = ['date_forecast'])
x_test_c = x_test_c.drop(columns = ['date_forecast'])


In [5]:
#Remove rows in X_train that has timestamp that does not exist in train_loc, and visa_verca
#e.g missing solar power measurements from 2022-10-21 01:00 - 2022-10-28 21:00
def align_X_y(x_train,y_train, x_date_column='date_forecast', y_date_column='time'):
    """
    Aligns two dataframes based on the 'date_forecast' column of X and the 'time' column of y,
    ensuring that only rows with matching time values are retained.

    Parameters:
    - X (pd.DataFrame): The first dataframe with time in the 'date_forecast'
    - y (pd.DataFrame): The second dataframe with time in the 'time' column.

    Returns:
    - tuple: A tuple containing the aligned dataframes.
    """
    # Convert date columns to datetime format for easier comparison
    x_train[x_date_column] = pd.to_datetime(x_train[x_date_column])
    y_train[y_date_column] = pd.to_datetime(y_train[y_date_column])
    
    # Find common dates
    common_dates = x_train[x_date_column][x_train[x_date_column].isin(y_train[y_date_column])]
    
    # Filter both datasets based on common dates
    x_train_synced = x_train[x_train[x_date_column].isin(common_dates)]
    y_train_synced = y_train[y_train[y_date_column].isin(common_dates)]
    
    return x_train_synced, y_train_synced


# Analysis of Target variable  - Looking at PV_measurement
1. Handle constant measurments over longer periods of time. Likely caused by sensor malfunction, data logging issues, or other external factors.
    - Handeled by removing all constant values lasting more than 24 hours 
2. Add cyclical features 
2. Handle longer periods of missing data:
    - Remove (currently tested)
    - Interpolate 
    - Copy from previous year
    - Copy solar production at missing time from another location

### 1. Handle constant PV measurements 

In [6]:
# Time-Series plot of PV_measurement 

def solar_prod_plot(y_train, resolution='year', chunks=5):
    df = y_train.copy()
    
    # Determine the plotting resolution based on the 'resolution' argument
    # Chunks = number of year/months/days in each plot
    if resolution == 'year':
        unique_values = df['time'].dt.year.unique()
        label = 'Year'
    elif resolution == 'month':
        df['year_month'] = df['time'].dt.to_period('M')
        unique_values = df['year_month'].unique()
        label = 'Month'
    elif resolution == 'week':
        df['year_week'] = df['time'].dt.to_period('W')
        unique_values = df['year_week'].unique()
        label = 'Week'
    elif resolution == 'day':
        df['date'] = df['time'].dt.date
        unique_values = df['date'].unique()
        label = 'Day'
    else:
        raise ValueError("Invalid resolution. Choose from 'year', 'month', 'week', or 'day'.")
    
    # Loop over the unique values in chunks
    for i in range(0, len(unique_values), chunks):
        subset_values = unique_values[i:i+chunks]
        
        if resolution == 'year':
            subset_df = df[df['time'].dt.year.isin(subset_values)]
        elif resolution == 'month':
            subset_df = df[df['year_month'].isin(subset_values)]
        elif resolution == 'week':
            subset_df = df[df['year_week'].isin(subset_values)]
        elif resolution == 'day':
            subset_df = df[df['date'].isin(subset_values)]
        
        plt.figure(figsize=(15, 6))
        plt.plot(subset_df['time'], subset_df['pv_measurement'])

        title = f"Solar Power Production for {label}: {subset_values[0]}"
        if len(subset_values) > 1:
            title += f" to {subset_values[-1]}"

        plt.title(title)
        plt.xlabel("Time")
        plt.ylabel("PV Measurement")
        plt.show()

def remove_constant_intervals(y_train, low_thresh, upp_thresh):
    """
    Identify and remove intervals of constant PV readings that exceed a specified duration. 
    Constant readings may indicate sensor malfunctions or data logging issues.
    
    Parameters:
    ----------
    y_train : pd.DataFrame
        Dataframe containing the time-series data of solar power production.
    threshold : int
        The minimum duration required for an interval to be considered for removal.
        
    Returns:
    -------
    pd.DataFrame
        The input dataframe with intervals of constant readings (exceeding the duration threshold) removed.
    """
    df = y_train.copy()
    
    # Calculate the difference in production values
    df['diff'] = df['pv_measurement'].diff()

    # Identify where the difference is zero
    df['zero_diff'] = df['diff'].abs() < 1e-5

    # Identify groups of consecutive zero differences
    df['group'] = (df['zero_diff'] != df['zero_diff'].shift()).cumsum()

    # Filter out only the groups with consecutive zero differences
    constant_intervals = df[df['zero_diff']].groupby('group').agg(start=('time', 'min'), 
                                                                  end=('time', 'max'),
                                                                  duration=('time', 'size'))
    
    # Filter intervals based on the threshold
    interval_df_thresh = constant_intervals[(constant_intervals['duration'] > low_thresh) & (constant_intervals['duration'] <upp_thresh)]
    
    # Remove rows from the main dataframe that fall within these intervals
    for _, row in interval_df_thresh.iterrows():
        start_time, end_time = row['start'], row['end']
        df = df[(df['time'] < start_time) | (df['time'] > end_time)]
    
    # Drop the added columns used for calculations
    df.drop(columns=['diff', 'zero_diff', 'group'], inplace=True)
    
    return df, constant_intervals


def get_time_interval(df, start_time = '2020-08-01 00:00:00', end_time = '2021-01-01 00:00:00'):
    # Filter rows based on the time period
    filtered_df = df[(df['time'] >= start_time) & (df['time'] <= end_time)]
    return filtered_df

In [7]:
#Removed all constant values with duration > 24 hours

train_a, const_interval_a = remove_constant_intervals(train_a,24,10**6)

#update X_train_a by removing coresponding rows that have been filtered here
x_train_a, train_a = align_X_y(x_train_a, train_a)

In [8]:
rows_removed_a = np.sum(const_interval_a[const_interval_a['duration']>24]['duration'])
print(f'total number of rows removed {rows_removed_a}')
const_interval_a[const_interval_a['duration']>24]

total number of rows removed 42


Unnamed: 0_level_0,start,end,duration
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
434,2020-01-04 15:00:00,2020-01-06 08:00:00,42


In [9]:
#Remove rows in groups of constant values, where duration of constant measurements is > 1 day (24 hours)
train_b, const_interval_b = remove_constant_intervals(train_b,24,10**6)

#update X_train_a by removing coresponding rows that have been filtered here
x_train_b, train_b = align_X_y(x_train_b, train_b)

In [10]:
rows_removed = np.sum(const_interval_b[const_interval_b['duration']>24]['duration'])
print(f'total number of rows removed {rows_removed}')
const_interval_b[const_interval_b['duration']>24]

total number of rows removed 6865


Unnamed: 0_level_0,start,end,duration
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
32,2019-01-14 15:00:00,2019-01-18 11:00:00,93
36,2019-01-19 13:00:00,2019-01-26 08:00:00,164
40,2019-01-27 11:00:00,2019-01-28 13:00:00,27
74,2019-02-10 16:00:00,2019-02-13 07:00:00,64
160,2019-03-23 18:00:00,2019-03-26 06:00:00,61
302,2019-05-31 08:00:00,2019-06-03 12:00:00,77
606,2019-10-28 14:00:00,2019-10-30 22:00:00,57
674,2019-12-01 13:00:00,2019-12-04 08:00:00,68
682,2019-12-07 14:00:00,2019-12-11 08:00:00,91
700,2019-12-18 14:00:00,2019-12-20 09:00:00,44


In [11]:
#Remove rows in groups of constant values, where duration of constant measurements is > 1 day (24 hours)
train_c, const_interval_c = remove_constant_intervals(train_c,24,10**6)

#update X_train_a by removing coresponding rows that have been filtered here
x_train_c, train_c = align_X_y(x_train_c, train_c)

In [12]:
rows_removed = np.sum(const_interval_c[const_interval_c['duration']>24]['duration'])
print(f'total number of rows removed {rows_removed}')
const_interval_c[const_interval_c['duration']>24]

total number of rows removed 4926


Unnamed: 0_level_0,start,end,duration
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2019-09-04 10:00:00,2019-09-05 12:00:00,27
180,2019-11-11 12:00:00,2019-11-13 08:00:00,45
230,2019-11-28 15:00:00,2019-12-05 09:00:00,163
240,2019-12-07 14:00:00,2019-12-13 09:00:00,140
256,2019-12-16 14:00:00,2019-12-21 09:00:00,116
276,2019-12-25 13:00:00,2019-12-30 09:00:00,117
290,2020-01-02 14:00:00,2020-01-07 09:00:00,116
340,2020-01-23 15:00:00,2020-01-26 08:00:00,66
376,2020-02-05 14:00:00,2020-02-10 07:00:00,114
414,2020-02-23 17:00:00,2020-03-08 08:00:00,328


In [13]:
merged_a = pd.merge(x_train_a, train_a, left_on='date_forecast', right_on='time', how='inner')
merged_b = pd.merge(x_train_b, train_b, left_on='date_forecast', right_on='time', how='inner')
merged_c = pd.merge(x_train_c, train_c, left_on='date_forecast', right_on='time', how='inner')

### Add Cyclical Features

In [None]:
# Creating cyclical features for hour of the day
def cyclic_hourly(x):
    train_data = x.copy()
    train_data['hour_sin'] = np.sin(2 * np.pi * train_data['hour'] / 24)
    train_data['hour_cos'] = np.cos(2 * np.pi * train_data['hour'] / 24)
    return train_data


# Creating cyclical features for month of the year
def cyclic_monthly(x):
    train_data = x.copy()
    train_data['month_sin'] = np.sin(2 * np.pi * train_data['month'] / 12)
    train_data['month_cos'] = np.cos(2 * np.pi * train_data['month'] / 12)
    return train_data

In [None]:

x_train_a = cyclic_hourly(x_train_a)
x_train_a = cyclic_monthly(x_train_a)

x_test_a = cyclic_hourly(x_test_a)
x_test_a = cyclic_monthly(x_test_a)

x_train_b = cyclic_hourly(x_train_b)
x_train_b = cyclic_monthly(x_train_b)

x_test_b = cyclic_hourly(x_test_b)
x_test_b = cyclic_monthly(x_test_b)

x_train_c = cyclic_hourly(x_train_c)
x_train_c = cyclic_monthly(x_train_c)

x_test_c = cyclic_hourly(x_test_c)
x_test_c = cyclic_monthly(x_test_c)


### Remove outliers during night

In [None]:
def plot_hourly_avg(y_train):
    # Grouping by hour and calculating the average PV measurement for each hour
    train_data = y_train.copy()
    train_data['hour'] = y_train['time'].dt.hour
    hourly_avg = train_data.groupby('hour')['pv_measurement'].mean()

    # Plotting the average PV production for each hour
    plt.figure(figsize=(12, 6))
    hourly_avg.plot(kind='bar', color='skyblue')
    plt.title('Average PV Production by Hour')
    plt.xlabel('Hour of the Day')
    plt.ylabel('Average PV Production')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

def plot_dist_hour(y_train, hour):
    train_data = y_train.copy()
    train_data['hour'] = y_train['time'].dt.hour
    
    # Filtering the data for the given hour
    hour_data = train_data[train_data['hour'] == hour]
    
    # Plotting the distribution of PV measurements for 1 am
    plt.figure(figsize=(12, 6))
    plt.hist(hour_data['pv_measurement'], bins=50, color='teal', alpha=0.7)
    plt.title(f'Distribution of PV Measurements at {hour}')
    plt.xlabel('PV Measurement')
    plt.ylabel('Frequency')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
    print(hour_data['pv_measurement'].value_counts())
#train_c[(train_c['time'].dt.hour == 2) &(train_c['pv_measurement'] == 9.8)]

def get_nighttime_stats(y_train,night_start,night_end):
    train_data = y_train.copy()
    train_data['hour'] = y_train['time'].dt.hour

    # Filtering the data for nighttime hours (8 pm to 4 am)
    nighttime_data = train_data[(train_data['hour'] >= night_start) | (train_data['hour'] <= night_end)]

    # Descriptive statistics for nighttime PV measurements
    nighttime_stats = nighttime_data['pv_measurement'].describe()

    # Plotting the distribution of nighttime PV measurements
    plt.figure(figsize=(12, 6))
    plt.hist(nighttime_data['pv_measurement'], bins=50, color='purple', alpha=0.7)
    plt.axvline(nighttime_stats['75%'], color='red', linestyle='dashed', label='75th Percentile')
    plt.axvline(nighttime_stats['max'], color='green', linestyle='dashed', label='Max Value')
    plt.title('Distribution of Nighttime PV Measurements')
    plt.xlabel('PV Measurement')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    print(nighttime_stats)
    
def set_nighttime_to_zero(y_train, night_start,night_end, thresh):
    df = y_train.copy()
    df['hour'] = y_train['time'].dt.hour
    mask = (df['hour'] >= 23) | (df['hour'] <= 3) & (df['pv_measurement'] > thresh)
    df.loc[mask, 'pv_measurement'] = 0
    df = df.drop(columns = ['hour'])
    return df

#train_a[(train_a['time'].dt.hour == 2) &(train_a['pv_measurement'] >0)]
#train_a = set_nighttime_to_zero(train_a,23,3,0)
#train_b = set_nighttime_to_zero(train_b,23,3,0)
#train_c = set_nighttime_to_zero(train_c,23,3,0)
#train_a[(train_a['time'].dt.hour == 2) &(train_a['pv_measurement'] >0)]

In [None]:
def remove_rad_null(merged_df):
    merged_data = merged_df.copy()
    merged_data['clear_sky_rad:W'].fillna(0, inplace=True)
    merged_data['clear_sky_rad:W'].fillna(0, inplace=True)
    merged_data['direct_rad:W'].fillna(0, inplace=True)
    merged_data['direct_rad_1h:J'].fillna(0, inplace=True)
    return merged_data
"""
m_a = remove_rad_null(merged_a)
m_b = remove_rad_null(merged_b)
m_c = remove_rad_null(merged_c)
"""

In [None]:
def get_percentiles_df(merged_df):
    merged_data = merged_df.copy()
    merged_data['clear_sky_rad:W'].fillna(0, inplace=True)
    merged_data['clear_sky_rad:W'].fillna(0, inplace=True)
    merged_data['direct_rad:W'].fillna(0, inplace=True)
    merged_data['direct_rad_1h:J'].fillna(0, inplace=True)

    # Calculate and display percentiles
    percentiles = [50,60,70,80,85,90,95]
    percentile_values_direct_rad= np.percentile(merged_data['direct_rad:W'], percentiles)
    percentile_values_direct_rad_1h = np.percentile(merged_data['direct_rad_1h:J'], percentiles)
    percentile_values_clear_sky_rad = np.percentile(merged_data['clear_sky_rad:W'], percentiles)
    percentile_values_clear_sky_energy = np.percentile(merged_data['clear_sky_energy_1h:J'], percentiles)
    percentile_values_df = pd.DataFrame({
        'Percentile': percentiles,
        'direct_rad:W':percentile_values_direct_rad,
        'direct_rad_1h:J': percentile_values_direct_rad_1h,
        'clear_sky_rad:W': percentile_values_clear_sky_rad,
        'clear_sky_energy_1h:J': percentile_values_clear_sky_energy
        })
    
    return percentile_values_df

In [None]:
percentile_a = get_percentiles_df(merged_a)
percentile_a

In [None]:
def get_anomals(merged_data,feature,percentile): 
    #identify the rows where the "direct_rad:W" column in x_train_a is high
    #but the PV measurement in train_a is zero -> Indicates wrong
    
    percentile_df = get_percentiles_df(merged_data)
    
    # Define a threshold for high solar radiation
    threshold = percentile_df[percentile_df['Percentile']==percentile][feature].values[0],

    # Find rows where 'direct_rad:W' is high but PV measurement is zero
    anomalous_rows = merged_data[(merged_data[feature] > threshold) & (merged_data['pv_measurement'] == 0)]
    
    
    # Display the anomalous rows
    return anomalous_rows

get_anomals(merged_c,'direct_rad:W',90)


In [None]:
merged_a1 = merged_a.copy().drop(get_anomals(merged_a,'clear_sky_rad:W',90).index)
merged_b1 = merged_b.copy().drop(get_anomals(merged_b,'direct_rad:W',90).index)
merged_c1 = merged_c.copy().drop(get_anomals(merged_c,'direct_rad_1h:J',90).index)

In [None]:
def split_dataset(train_data, estimated_column='estimated',val_size = 0.1,val = False):
    """
    Splits the dataset into a training set and a validation set.
    The validation set contains the last half of the rows where observed = 0,
    and the training set contains the rest.

    :param train_data: The original training dataset as a pandas DataFrame.
    :param observed_column: The name of the column that indicates if the row is observed.
    :return: A tuple (training_set, validation_set)
    """
    
    if val: 
        estimated_one = train_data[train_data[estimated_column] == 1]

        #Split the filtered dataset into two
        half_index = len(estimated_one) // 2
        validation_set = estimated_one[half_index:]

        # Combine the first half of observed_zero with the rest of the data where observed != 0
        training_set = pd.concat([train_data[train_data[estimated_column] == 0], estimated_one[:half_index]])
    else:
        split_index = int(train_data.shape[0] * (1-val_size))

        # Split the data
        training_set = train_data.iloc[:split_index]
        validation_set = train_data.iloc[split_index:]

    # Filter rows where observed = 0
   
    return training_set, validation_set


In [None]:
training_set, validation_set = split_dataset(merged_b)
len(training_set)

# Build Catboost model 

In [16]:
def split_dataset(train_data, val_size=0.1, val = False, estimated_column = 'estimated'):
    if val: 
        estimated_one = train_data[train_data[estimated_column] == 1]

        #Split the filtered dataset into two
        half_index = len(estimated_one) // 2
        validation_set = estimated_one[half_index:]

        # Combine the first half of observed_zero with the rest of the data where observed != 0
        training_set = pd.concat([train_data[train_data[estimated_column] == 0], estimated_one[:half_index]])
    else:
        split_index = int(train_data.shape[0] * (1 - val_size))
        training_set = train_data.iloc[:split_index]
        validation_set = train_data.iloc[split_index:]
    return training_set, validation_set

def build_catboost(merged_df, val_size=0.1, randomized=False):
    merged_df = merged_df.drop(columns=['date_forecast','time'])
    if randomized:
        X = merged_df.drop(columns=['pv_measurement'])
        y = merged_df['pv_measurement']
        X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.8, random_state=42)
    else:
        training_set, validation_set = split_dataset(merged_df, val_size, True)
        X_train = training_set.drop(columns=['pv_measurement'])
        y_train = training_set['pv_measurement']
        X_validation = validation_set.drop(columns=['pv_measurement'])
        y_validation = validation_set['pv_measurement']
    
    catboost_model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=42,
        verbose=200
    )
    
    catboost_model.fit(X_train, y_train, eval_set=(X_validation, y_validation), use_best_model=True)
    return catboost_model


In [17]:
model_a = build_catboost(merged_a,0.1)

0:	learn: 600.5894516	test: 558.0483834	best: 558.0483834 (0)	total: 67.2ms	remaining: 1m 7s
200:	learn: 185.5510441	test: 173.5042243	best: 173.5042243 (200)	total: 2.22s	remaining: 8.82s
400:	learn: 173.2445959	test: 168.7063903	best: 168.6684925 (388)	total: 4.34s	remaining: 6.49s
600:	learn: 166.1482163	test: 167.5894432	best: 166.5839045 (435)	total: 6.4s	remaining: 4.25s
800:	learn: 159.1514533	test: 167.1701680	best: 166.5839045 (435)	total: 8.42s	remaining: 2.09s
999:	learn: 154.7516374	test: 167.3482256	best: 166.5839045 (435)	total: 10.4s	remaining: 0us

bestTest = 166.5839045
bestIteration = 435

Shrink model to first 436 iterations.


In [18]:
model_b = build_catboost(merged_b,0.1)

0:	learn: 101.0737028	test: 91.9392713	best: 91.9392713 (0)	total: 15.1ms	remaining: 15.1s
200:	learn: 25.2960206	test: 29.8616912	best: 29.7580680 (196)	total: 1.91s	remaining: 7.58s
400:	learn: 23.6623041	test: 30.1180718	best: 29.7580680 (196)	total: 3.74s	remaining: 5.59s
600:	learn: 22.7529511	test: 30.1512024	best: 29.7580680 (196)	total: 5.56s	remaining: 3.69s
800:	learn: 22.0695991	test: 30.3945765	best: 29.7580680 (196)	total: 7.4s	remaining: 1.84s
999:	learn: 21.2544385	test: 30.4785549	best: 29.7580680 (196)	total: 9.14s	remaining: 0us

bestTest = 29.75806801
bestIteration = 196

Shrink model to first 197 iterations.


In [19]:
model_c = build_catboost(merged_c,0.2)

0:	learn: 91.5728187	test: 74.2043316	best: 74.2043316 (0)	total: 16.2ms	remaining: 16.2s
200:	learn: 21.8707706	test: 35.5554436	best: 34.3911878 (19)	total: 1.75s	remaining: 6.94s
400:	learn: 20.2638081	test: 35.7763126	best: 34.3911878 (19)	total: 3.38s	remaining: 5.04s
600:	learn: 19.3393926	test: 35.5532212	best: 34.3911878 (19)	total: 5.01s	remaining: 3.33s
800:	learn: 18.4469416	test: 35.8884802	best: 34.3911878 (19)	total: 6.61s	remaining: 1.64s
999:	learn: 17.6737007	test: 35.9224432	best: 34.3911878 (19)	total: 8.19s	remaining: 0us

bestTest = 34.39118778
bestIteration = 19

Shrink model to first 20 iterations.


In [None]:
def get_feat_importance(model):
    feats = {'feature':merged_a.drop(columns =['pv_measurement']).columns,
         'importance':model.get_feature_importance()}
    df = pd.DataFrame(feats).sort_values('importance',ascending = False)
    return df

In [None]:
get_feat_importance(model_c)

In [22]:
x_test_a

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,year,month,day,hour,estimated
0,4.325,1.28675,912.7000,0.00,0.000,1061.5500,0.0,271.65002,0.000,0.00,0.000,0.0,74.950,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1013.675,0.0,0.0,1000.550,1006.800,0.0,0.0,80.275,1013.100,,0.0,0.0,0.0,0.0,16.02650,-10.54100,0.000,273.80000,74.950,29907.500,3.950,2.100,3.350,0.0,2023,5,1,0,1
1,4.275,1.28600,1482.1000,0.00,0.000,1075.1001,0.0,271.45000,0.000,0.00,0.000,0.0,77.475,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1013.150,0.0,0.0,1000.050,1006.300,0.0,0.0,79.825,1012.600,,0.0,0.0,0.0,0.0,30.49725,-7.89450,0.000,273.80000,77.475,29519.074,3.825,1.925,3.300,0.0,2023,5,1,1,1
2,4.150,1.28375,1791.3000,0.00,0.000,1200.4000,0.0,271.05000,0.000,0.00,0.000,0.0,88.100,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1012.675,0.0,0.0,999.500,1005.800,0.0,0.0,78.000,1012.050,,0.0,0.0,0.0,0.0,44.51725,-3.81550,0.000,273.84998,88.100,31009.125,3.650,1.750,3.200,0.0,2023,5,1,2,1
3,4.025,1.28200,2312.8750,40497.70,11.675,1179.8500,0.0,270.65000,9.375,67380.91,2.100,15061.4,68.600,6.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1012.175,0.0,0.0,998.975,1005.225,0.0,0.0,75.625,1011.525,,0.0,0.0,0.0,0.0,58.08300,1.41250,0.000,273.90000,68.600,34552.500,3.500,1.450,3.150,0.0,2023,5,1,3,1
4,3.900,1.28100,2198.2998,566994.40,76.875,920.0500,0.0,270.37500,47.400,408838.80,25.450,198284.8,66.300,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1011.725,0.0,0.0,998.550,1004.750,0.0,0.0,74.225,1011.050,,0.0,0.0,0.0,0.0,71.34100,7.46850,0.000,273.92500,66.300,35483.875,3.325,1.300,3.050,0.0,2023,5,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,8.350,1.19725,3638.9000,1908372.80,85.100,2013.7500,0.0,281.57500,33.625,675098.20,13.425,203853.0,85.575,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,992.100,0.0,0.0,979.600,985.425,0.0,0.0,69.325,991.325,,0.0,0.0,0.0,0.0,306.68700,8.15300,0.100,287.02500,86.150,44056.375,2.450,2.075,-1.350,0.0,2023,7,3,19,1
716,8.525,1.20050,3552.1000,737342.70,24.800,1610.9000,0.0,281.85000,14.325,345284.50,3.550,122263.5,74.600,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,992.550,0.0,0.0,980.025,985.900,0.0,0.0,72.900,991.800,,0.0,0.0,0.0,0.0,319.70400,3.26800,0.000,286.60000,75.325,44017.176,2.450,2.100,-1.250,0.0,2023,7,3,20,1
717,8.825,1.20450,2315.0000,149717.31,1.275,1622.8000,0.0,282.32500,1.300,112676.60,0.000,25639.6,76.125,6.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,992.900,0.0,0.0,980.350,986.250,0.0,0.0,78.050,992.150,,0.0,0.0,0.0,0.0,333.04000,-0.44325,0.000,286.10000,76.850,43302.050,2.575,2.150,-1.400,0.0,2023,7,3,21,1
718,9.025,1.20700,2202.8000,1440.20,0.000,1767.5500,0.0,282.67502,0.000,9402.90,0.000,0.0,98.225,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,993.225,0.0,0.0,980.700,986.600,0.0,0.0,81.750,992.525,,0.0,0.0,0.0,0.0,346.68600,-2.75050,0.075,285.55000,98.325,40505.850,2.250,1.800,-1.350,0.0,2023,7,3,22,1


In [20]:
pred_a = model_a.predict(x_test_a)
pred_b = model_b.predict(x_test_b)
pred_c = model_c.predict(x_test_c)

In [21]:
def create_sub(pred_a,pred_b,pred_c):
    submission = pd.read_csv('sample_submission.csv')
    submission['prediction'] = np.concatenate([pred_a,pred_b,pred_c])
    submission.loc[submission['prediction'] < 0, 'prediction'] = 0
    return submission


In [22]:
sub = create_sub(pred_a,pred_b,pred_c)
sub.to_csv(f'Submissions/fifthCatboost.csv', index=False)

In [None]:



def build_model(X_train,y_train,location):
    
    merged_data = X_train.copy()
    merged_data['pv_measurement'] = y_train['pv_measurement'].values
    
    y = 'pv_measurement'
    x = list(X_train.columns)
    train = h2o.H2OFrame(merged_data)
    
    aml = H2OAutoML(
        max_models = 10,
        max_runtime_secs = 60,
        exclude_algos =['DeepLearning'],
        seed = 1,
        # stopping_metric ='logloss',
        sort_metric ='mae',
        balance_classes = False,
        project_name = location
    )

    aml.train(x=x, y=y, training_frame=train)
    
    lb = aml.leaderboard
    leader = aml.leader
    print(lb.head(rows=lb.nrows))
    
    h2o.save_model(leader, path=f'Saved_models/{location.upper()}', force = True)

    return lb,leader

In [None]:
lb_a,model_a = build_model(x_train_a,train_a,'A')
lb_b,model_b = build_model(x_train_b,train_b,'B')
lb_c,model_c = build_model(x_train_c,train_c,'C')

In [None]:
preds_a = model_a.predict(h2o.H2OFrame(x_test_a))
preds_b = model_b.predict(h2o.H2OFrame(x_test_b))
preds_c = model_c.predict(h2o.H2OFrame(x_test_c))

In [None]:
model_a_first = h2o.load_model('Saved_models/A/GBM_4_AutoML_1_20231020_170352')
model_b_first = h2o.load_model('Saved_models/B/GBM_8_AutoML_2_20231020_170654')
model_c_first = h2o.load_model('Saved_models/C/GBM_8_AutoML_2_20231020_170654')

In [None]:
preds_a_original = model_a_first.predict(h2o.H2OFrame(X_test_a))
preds_b_original = model_b_first.predict(h2o.H2OFrame(X_test_b))
preds_c_original = model_c_first.predict(h2o.H2OFrame(X_test_c))

In [None]:
def compare_two_preds(pred1,pred2):
    pred1 = preds_a_original.as_data_frame()
    pred2 = preds_a2.as_data_frame()

    plt.figure(figsize=(10, 8))

    # Scatter plot
    plt.scatter(y_pred1['predict'], y_pred2['predict'], alpha=0.5)

    # Line of equality (for reference)
    plt.plot([y_pred1['predict'].min(), y_pred1['predict'].max()],
             [y_pred2['predict'].min(), y_pred2['predict'].max()],
             color='red', linestyle='--')

    # Labels and title
    plt.xlabel('Predictions from First Model')
    plt.ylabel('Predictions from New model')
    plt.title('Comparison of Predictions from Two Models')

    # Show plot
    plt.grid(True)
    plt.show()

In [None]:
compare_two_preds(preds_a,preds_a_original)

In [None]:
# 2. Feature Importance
def feat_importance(model, n_feats):
    feature_importance = model.varimp(use_pandas=True)
    n_top_feats = feature_importance.iloc[:n_feats,:]
    return n_top_feats

print(feat_importance(model_a, 15))

In [None]:
def plot_prediction(preds):
    test = pd.read_csv('test.csv')
    predictions= preds['predict'].as_data_frame()
    predictions['time'] = test['time'].unique()
    fig, ax1 = plt.subplots(figsize=(15, 6))
    ax1.set_xlabel('Time')
    ax1.set_ylabel('Prediction', color='tab:blue')
    ax1.plot(predictions['time'], predictions['predict'], color='tab:blue', label='Solar Power Production')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    fig.tight_layout()
    plt.title(f'Time Series Plot of prediction')
    plt.show()

In [None]:
plot_prediction(preds_a)

In [None]:
plot_prediction(preds_c_original)

In [None]:
def create_sub(preds_a,preds_b,preds_c):
    submission = pd.read_csv('sample_submission.csv')
    predictions = preds_a.rbind(preds_b).rbind(preds_c)
    predictions_df = predictions['predict'].as_data_frame()
    predictions_df.loc[predictions_df['predict'] < 0, 'predict'] = 0
    submission['prediction'] = predictions_df['predict']
    return submission 


In [None]:
sub1 = create_sub(preds_a,preds_b,preds_c)


In [None]:
sub1.to_csv(f'Submissions/removedOutliersNight.csv', index=False)

In [None]:
  """# Plot the distribution of "direct_rad:W"
    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['direct_rad:W'], bins=50, kde=True)
    plt.title('Distribution of "direct_rad:W"')
    plt.xlabel('Direct Radiation (W)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['clear_sky_rad:W'], bins=50, kde=True)
    plt.title('Distribution of "clear_sky_rad:W"')
    plt.xlabel('Direct Radiation (W)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['direct_rad_1h:J'], bins=50, kde=True)
    plt.title('Distribution of "direct_rad_1h:J"')
    plt.xlabel('Radiation 1h(J)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['clear_sky_energy_1h:J'], bins=50, kde=True)
    plt.title('Distribution of "clear_sky_energy_1h:J"')
    plt.xlabel('Radiation 1h(J)')
    plt.ylabel('Frequency')
    plt.show()"""

    