In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [2]:
# Read the per-location CSVs (locations A, B, C) from the cleaned_data/ directory.
# Paths are relative to the notebook's working directory.

# Training features: carry a 'date_forecast' column that is later matched against 'time'.
x_train_a = pd.read_csv('cleaned_data/A/x_train_a.csv')
x_train_b = pd.read_csv('cleaned_data/B/x_train_b.csv')
x_train_c = pd.read_csv('cleaned_data/C/x_train_c.csv')

# Test features used for the final predictions.
x_test_a = pd.read_csv('cleaned_data/A/x_test_a.csv')
x_test_b = pd.read_csv('cleaned_data/B/x_test_b.csv')
x_test_c = pd.read_csv('cleaned_data/C/x_test_c.csv')

# Targets: 'time' and 'pv_measurement' columns are read from these frames further down.
train_a = pd.read_csv('cleaned_data/A/train_a.csv')
train_b = pd.read_csv('cleaned_data/B/train_b.csv')
train_c = pd.read_csv('cleaned_data/C/train_c.csv')

In [3]:
"""
x_train_a = pd.read_csv('cleaned_data_Henning/A/x_train_a.csv')
x_train_b = pd.read_csv('cleaned_data_Henning/B/x_train_b.csv')
x_train_c = pd.read_csv('cleaned_data_Henning/C/x_train_c.csv')

x_test_a = pd.read_csv('cleaned_data_Henning/A/x_test_a.csv')
x_test_b = pd.read_csv('cleaned_data_Henning/B/x_test_b.csv')
x_test_c = pd.read_csv('cleaned_data_Henning/C/x_test_c.csv')

train_a = pd.read_csv('cleaned_data_Henning/A/train_a.csv')
train_b = pd.read_csv('cleaned_data_Henning/B/train_b.csv')
train_c = pd.read_csv('cleaned_data_Henning/C/train_c.csv')
"""

"\nx_train_a = pd.read_csv('cleaned_data_Henning/A/x_train_a.csv')\nx_train_b = pd.read_csv('cleaned_data_Henning/B/x_train_b.csv')\nx_train_c = pd.read_csv('cleaned_data_Henning/C/x_train_c.csv')\n\nx_test_a = pd.read_csv('cleaned_data_Henning/A/x_test_a.csv')\nx_test_b = pd.read_csv('cleaned_data_Henning/B/x_test_b.csv')\nx_test_c = pd.read_csv('cleaned_data_Henning/C/x_test_c.csv')\n\ntrain_a = pd.read_csv('cleaned_data_Henning/A/train_a.csv')\ntrain_b = pd.read_csv('cleaned_data_Henning/B/train_b.csv')\ntrain_c = pd.read_csv('cleaned_data_Henning/C/train_c.csv')\n"

In [4]:
def add_week_feat(df):
    """Add an ISO week-number column derived from 'date_forecast'.

    The frame is modified in place: 'date_forecast' is converted to datetime and a
    'week' column holding the ISO calendar week is added. The same frame object is
    returned for convenience.
    """
    forecast_ts = pd.to_datetime(df['date_forecast'])
    df['date_forecast'] = forecast_ts

    # isocalendar() yields year/week/day columns; only the week number is kept
    df['week'] = forecast_ts.dt.isocalendar().week
    return df

"""
x_train_a = add_week_feat(x_train_a)
x_train_b = add_week_feat(x_train_b)
x_train_c = add_week_feat(x_train_c)

x_test_a = add_week_feat(x_test_a)
x_test_b = add_week_feat(x_test_b)
x_test_c = add_week_feat(x_test_c)
"""

'\nx_train_a = add_week_feat(x_train_a)\nx_train_b = add_week_feat(x_train_b)\nx_train_c = add_week_feat(x_train_c)\n\nx_test_a = add_week_feat(x_test_a)\nx_test_b = add_week_feat(x_test_b)\nx_test_c = add_week_feat(x_test_c)\n'

In [5]:
# Parse the target timestamps to datetime so they can be compared with the
# 'date_forecast' column and used with the .dt accessor in the helpers below.
train_a['time'] = pd.to_datetime(train_a['time'])
train_b['time'] = pd.to_datetime(train_b['time'])
train_c['time'] = pd.to_datetime(train_c['time'])

In [6]:
# Drop the timestamp column from the test features; build_catboost likewise drops
# 'date_forecast' from the training frame before fitting.
x_test_a = x_test_a.drop(columns = ['date_forecast'])
x_test_b = x_test_b.drop(columns = ['date_forecast'])
x_test_c = x_test_c.drop(columns = ['date_forecast'])


In [7]:
#Remove rows in X_train whose timestamp does not exist in train_loc, and vice versa
#e.g missing solar power measurements from 2022-10-21 01:00 - 2022-10-28 21:00
def align_X_y(x_train,y_train, x_date_column='date_forecast', y_date_column='time'):
    """
    Keep only the rows of the feature and target frames whose timestamps appear in both.

    Note that both inputs are modified in place: their date columns are converted
    to datetime before the comparison.

    Parameters:
    - x_train (pd.DataFrame): Feature frame with timestamps in `x_date_column`.
    - y_train (pd.DataFrame): Target frame with timestamps in `y_date_column`.
    - x_date_column (str): Name of the timestamp column in `x_train`.
    - y_date_column (str): Name of the timestamp column in `y_train`.

    Returns:
    - tuple: (x_train_synced, y_train_synced), the filtered frames.
    """
    # Normalise both timestamp columns (written back onto the caller's frames)
    x_dates = pd.to_datetime(x_train[x_date_column])
    y_dates = pd.to_datetime(y_train[y_date_column])
    x_train[x_date_column] = x_dates
    y_train[y_date_column] = y_dates

    # Timestamps of the feature frame that also occur in the target frame
    x_has_match = x_dates.isin(y_dates)
    shared_dates = x_dates[x_has_match]

    return x_train[x_has_match], y_train[y_dates.isin(shared_dates)]


# Analysis of Target variable  - Looking at PV_measurement
1. Handle constant measurements over longer periods of time. Likely caused by sensor malfunction, data logging issues, or other external factors.
    - Handled by removing all constant values lasting more than 24 hours 
2. Add cyclical features 
3. Handle longer periods of missing data:
    - Remove (currently tested)
    - Interpolate 
    - Copy from previous year
    - Copy solar production at missing time from another location

### 1. Handle constant PV measurements 

In [8]:
# Time-Series plot of PV_measurement 

def solar_prod_plot(y_train, resolution='year', chunks=5):
    """Plot 'pv_measurement' against 'time', one figure per group of consecutive periods.

    Parameters
    ----------
    y_train : pd.DataFrame
        Frame with a datetime 'time' column and a 'pv_measurement' column.
    resolution : str
        One of 'year', 'month', 'week' or 'day'; the calendar unit used to bucket rows.
    chunks : int
        Number of distinct years/months/weeks/days shown in each figure.

    Raises
    ------
    ValueError
        If `resolution` is not one of the supported values.
    """
    # resolution -> (label used in the title, function mapping timestamps to a period key)
    bucketers = {
        'year': ('Year', lambda ts: ts.dt.year),
        'month': ('Month', lambda ts: ts.dt.to_period('M')),
        'week': ('Week', lambda ts: ts.dt.to_period('W')),
        'day': ('Day', lambda ts: ts.dt.date),
    }
    if resolution not in bucketers:
        raise ValueError("Invalid resolution. Choose from 'year', 'month', 'week', or 'day'.")

    label, to_period_key = bucketers[resolution]
    period_keys = to_period_key(y_train['time'])
    distinct_periods = period_keys.unique()

    # One figure per window of `chunks` consecutive periods
    for first in range(0, len(distinct_periods), chunks):
        window = distinct_periods[first:first + chunks]
        window_rows = y_train[period_keys.isin(window)]

        plt.figure(figsize=(15, 6))
        plt.plot(window_rows['time'], window_rows['pv_measurement'])

        title = f"Solar Power Production for {label}: {window[0]}"
        if len(window) > 1:
            title += f" to {window[-1]}"

        plt.title(title)
        plt.xlabel("Time")
        plt.ylabel("PV Measurement")
        plt.show()

def remove_constant_intervals(y_train, low_thresh, upp_thresh):
    """
    Drop stretches of unchanged PV readings whose length lies strictly between two bounds.

    A row counts as "unchanged" when the absolute difference to the previous row's
    'pv_measurement' is below 1e-5. Consecutive unchanged rows form one interval; its
    duration is the number of such rows (the row that first shows the value is not
    counted, because its own difference is non-zero).

    Parameters:
    ----------
    y_train : pd.DataFrame
        Frame with 'time' and 'pv_measurement' columns. It is not modified.
    low_thresh : int
        Intervals are removed only if their duration is greater than this.
    upp_thresh : int
        Intervals are removed only if their duration is smaller than this.

    Returns:
    -------
    tuple
        (filtered frame, frame of ALL detected constant intervals with
        'start', 'end' and 'duration' columns, indexed by 'group').
    """
    work = y_train.copy()

    # Helper columns: row-to-row change, "no change" flag, and a run id that
    # increases every time the flag flips.
    work['diff'] = work['pv_measurement'].diff()
    work['zero_diff'] = work['diff'].abs() < 1e-5
    flag_flipped = work['zero_diff'] != work['zero_diff'].shift()
    work['group'] = flag_flipped.cumsum()

    # Summarise every run of unchanged rows
    flat_rows = work[work['zero_diff']]
    constant_intervals = flat_rows.groupby('group').agg(start=('time', 'min'),
                                                        end=('time', 'max'),
                                                        duration=('time', 'size'))

    durations = constant_intervals['duration']
    to_remove = constant_intervals[(durations > low_thresh) & (durations < upp_thresh)]

    # A row survives only if it lies outside every selected [start, end] span
    stamps = work['time']
    keep = np.ones(len(work), dtype=bool)
    for start_time, end_time in zip(to_remove['start'], to_remove['end']):
        keep &= ((stamps < start_time) | (stamps > end_time)).to_numpy()

    cleaned = work[keep].drop(columns=['diff', 'zero_diff', 'group'])
    return cleaned, constant_intervals


def get_time_interval(df, start_time = '2020-08-01 00:00:00', end_time = '2021-01-01 00:00:00'):
    """Return the rows of `df` whose 'time' lies between start_time and end_time, both inclusive."""
    in_window = df['time'].between(start_time, end_time)
    return df[in_window]

In [9]:
# Location A: remove every constant run whose duration (number of unchanged rows)
# is above 24; the upper bound 10**6 is only there to leave the range open-ended.

train_a, const_interval_a = remove_constant_intervals(train_a,24,10**6)

# Re-sync x_train_a with train_a by dropping the feature rows whose timestamps were just removed
x_train_a, train_a = align_X_y(x_train_a, train_a)

In [10]:
rows_removed_a = np.sum(const_interval_a[const_interval_a['duration']>24]['duration'])
print(f'total number of rows removed {rows_removed_a}')
const_interval_a[const_interval_a['duration']>24]

total number of rows removed 42


Unnamed: 0_level_0,start,end,duration
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
434,2020-01-04 15:00:00,2020-01-06 08:00:00,42


In [11]:
# Location B: remove rows in groups of constant values where the run lasts more than 24 rows (> 1 day)
train_b, const_interval_b = remove_constant_intervals(train_b,24,10**6)

# Re-sync x_train_b with train_b by dropping the feature rows whose timestamps were just removed
x_train_b, train_b = align_X_y(x_train_b, train_b)

In [12]:
rows_removed = np.sum(const_interval_b[const_interval_b['duration']>24]['duration'])
print(f'total number of rows removed {rows_removed}')
const_interval_b[const_interval_b['duration']>24]

total number of rows removed 6865


Unnamed: 0_level_0,start,end,duration
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
32,2019-01-14 15:00:00,2019-01-18 11:00:00,93
36,2019-01-19 13:00:00,2019-01-26 08:00:00,164
40,2019-01-27 11:00:00,2019-01-28 13:00:00,27
74,2019-02-10 16:00:00,2019-02-13 07:00:00,64
160,2019-03-23 18:00:00,2019-03-26 06:00:00,61
302,2019-05-31 08:00:00,2019-06-03 12:00:00,77
606,2019-10-28 14:00:00,2019-10-30 22:00:00,57
674,2019-12-01 13:00:00,2019-12-04 08:00:00,68
682,2019-12-07 14:00:00,2019-12-11 08:00:00,91
700,2019-12-18 14:00:00,2019-12-20 09:00:00,44


In [13]:
# Location C: remove rows in groups of constant values where the run lasts more than 24 rows (> 1 day)
train_c, const_interval_c = remove_constant_intervals(train_c,24,10**6)

# Re-sync x_train_c with train_c by dropping the feature rows whose timestamps were just removed
x_train_c, train_c = align_X_y(x_train_c, train_c)

In [14]:
rows_removed = np.sum(const_interval_c[const_interval_c['duration']>24]['duration'])
print(f'total number of rows removed {rows_removed}')
const_interval_c[const_interval_c['duration']>24]

total number of rows removed 4926


Unnamed: 0_level_0,start,end,duration
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2019-09-04 10:00:00,2019-09-05 12:00:00,27
180,2019-11-11 12:00:00,2019-11-13 08:00:00,45
230,2019-11-28 15:00:00,2019-12-05 09:00:00,163
240,2019-12-07 14:00:00,2019-12-13 09:00:00,140
256,2019-12-16 14:00:00,2019-12-21 09:00:00,116
276,2019-12-25 13:00:00,2019-12-30 09:00:00,117
290,2020-01-02 14:00:00,2020-01-07 09:00:00,116
340,2020-01-23 15:00:00,2020-01-26 08:00:00,66
376,2020-02-05 14:00:00,2020-02-10 07:00:00,114
414,2020-02-23 17:00:00,2020-03-08 08:00:00,328


### Merge x_train and y_train

In [15]:
# Join features and targets on their timestamps. The inner join keeps only timestamps present
# on both sides; the merged frames carry both 'date_forecast' and 'time' columns.
merged_a = pd.merge(x_train_a, train_a, left_on='date_forecast', right_on='time', how='inner')
merged_b = pd.merge(x_train_b, train_b, left_on='date_forecast', right_on='time', how='inner')
merged_c = pd.merge(x_train_c, train_c, left_on='date_forecast', right_on='time', how='inner')

In [None]:
#Here we are plotting on the modified dataset
def time_series_plot(feature,merged_data):
    """Overlay the PV production and one feature on a shared time axis.

    'pv_measurement' is drawn against 'time' on the left y-axis (blue) and
    `feature` against 'date_forecast' on a twin right y-axis (red).
    """
    pv_color, feature_color = 'tab:blue', 'tab:red'
    pv_label = 'Solar Power Production'

    fig, ax1 = plt.subplots(figsize=(15, 6))
    ax1.plot(merged_data['time'], merged_data['pv_measurement'], color=pv_color, label=pv_label)
    ax1.set_xlabel('Time')
    ax1.set_ylabel(pv_label, color=pv_color)
    ax1.tick_params(axis='y', labelcolor=pv_color)

    # Second y-axis sharing the same x-axis, so both series keep their own scale
    ax2 = ax1.twinx()
    ax2.plot(merged_data['date_forecast'], merged_data[feature], color=feature_color, label=feature)
    ax2.set_ylabel(feature, color=feature_color)
    ax2.tick_params(axis='y', labelcolor=feature_color)

    fig.tight_layout()
    plt.title(f'Time Series Plot of Solar Power Production and {feature}')
    plt.show()

### Add avg pv_feature

In [None]:
import pandas as pd

def add_average_pv_feature(merged_df, test_df,time_group):
    """Attach the mean 'pv_measurement' per `time_group` to both the training and test frames.

    The means are computed on `merged_df` only and left-joined onto both frames as
    'average_pv_measurement'. Groups absent from the training data yield NaN in the
    test frame. Neither input is modified.

    Parameters:
    merged_df (pd.DataFrame): Training frame containing 'pv_measurement' and the grouping column(s).
    test_df (pd.DataFrame): Test frame containing the grouping column(s).
    time_group (str or list of str): Column name(s) to group by.

    Returns:
    tuple: (training frame with the new column, test frame with the new column).
    """
    group_means = (
        merged_df.groupby(time_group)['pv_measurement']
        .mean()
        .rename('average_pv_measurement')
        .reset_index()
    )

    train_with_avg = merged_df.merge(group_means, on=time_group, how='left')
    test_with_avg = test_df.merge(group_means, on=time_group, how='left')
    return train_with_avg, test_with_avg



In [None]:
time_group_1 = ['month', 'day', 'hour']
time_group_2 = 'week'
time_group_3 = 'month'

In [None]:
merged_a_avg, x_test_a_avg = add_average_pv_feature(merged_a,x_test_a,time_group_1)
merged_b_avg, x_test_b_avg = add_average_pv_feature(merged_b,x_test_b, time_group_1)
merged_c_avg, x_test_c_avg = add_average_pv_feature(merged_c,x_test_c,time_group_1)

merged_a_avg[(merged_a_avg['month']==6) & (merged_a_avg['day']==4) & (merged_a_avg['hour']==16)][['year','month','week','day','hour','pv_measurement','average_pv_measurement']]

### Add lag features

In [None]:
def add_lag_feature(data, lag_hours, column_name='pv_measurement'):
    """
    Return a copy of `data` with a lagged version of one column appended.

    The lag is applied by shifting rows, so it only equals `lag_hours` hours when
    the rows are hourly and without gaps.

    Parameters:
    data (pd.DataFrame): The original dataset (left untouched).
    lag_hours (int): Number of rows to shift the column by.
    column_name (str): Column to build the lag feature from.

    Returns:
    pd.DataFrame: Copy of the dataset with the extra column
    '<column_name>_lag_<lag_hours>h'; its first `lag_hours` values are NaN.
    """
    lag_feature_name = f"{column_name}_lag_{lag_hours}h"
    lagged_values = data[column_name].shift(lag_hours)
    return data.assign(**{lag_feature_name: lagged_values})

In [None]:
# Single source of truth for the lag so the training column and the test placeholder
# always carry the same name (previously the training frames used a 24 h lag while the
# test frames received a column named 'pv_measurement_lag_1h').
LAG_HOURS = 24
LAG_COLUMN = f'pv_measurement_lag_{LAG_HOURS}h'

laged_a = add_lag_feature(merged_a, LAG_HOURS)
laged_b = add_lag_feature(merged_b, LAG_HOURS)
laged_c = add_lag_feature(merged_c, LAG_HOURS)

x_test_a_laged = x_test_a.copy()
x_test_b_laged = x_test_b.copy()
x_test_c_laged = x_test_c.copy()


# Placeholder for the lag feature in the test sets; predict_with_lag fills it in row by row.
# NaN (rather than None) keeps the column numeric.
x_test_a_laged[LAG_COLUMN] = np.nan
x_test_b_laged[LAG_COLUMN] = np.nan
x_test_c_laged[LAG_COLUMN] = np.nan

### Handle NaN values

In [None]:
merged_a['ceiling_height_agl:m'] = merged_a['ceiling_height_agl:m'].fillna(value = 0)
merged_b['ceiling_height_agl:m'] = merged_b['ceiling_height_agl:m'].fillna(value = 0)
merged_c['ceiling_height_agl:m'] = merged_c['ceiling_height_agl:m'].fillna(value = 0)

### Add Cyclical Features

In [None]:
# Creating cyclical features for hour of the day
def cyclic_hourly(x):
    """Return a copy of `x` with 'hour_sin' and 'hour_cos' columns encoding 'hour' on a 24-step circle."""
    hour_angle = 2 * np.pi * x['hour'] / 24
    return x.assign(hour_sin=np.sin(hour_angle), hour_cos=np.cos(hour_angle))


# Creating cyclical features for month of the year
def cyclic_monthly(x):
    """Return a copy of `x` with 'month_sin' and 'month_cos' columns encoding 'month' on a 12-step circle."""
    month_angle = 2 * np.pi * x['month'] / 12
    return x.assign(month_sin=np.sin(month_angle), month_cos=np.cos(month_angle))

"""
x_train_a = cyclic_hourly(x_train_a)
x_train_a = cyclic_monthly(x_train_a)

x_test_a = cyclic_hourly(x_test_a)
x_test_a = cyclic_monthly(x_test_a)

x_train_b = cyclic_hourly(x_train_b)
x_train_b = cyclic_monthly(x_train_b)

x_test_b = cyclic_hourly(x_test_b)
x_test_b = cyclic_monthly(x_test_b)

x_train_c = cyclic_hourly(x_train_c)
x_train_c = cyclic_monthly(x_train_c)

x_test_c = cyclic_hourly(x_test_c)
x_test_c = cyclic_monthly(x_test_c)
"""

### Remove outliers during night

In [None]:
def plot_hourly_avg(y_train):
    """Draw a bar chart of the mean 'pv_measurement' for each hour of the day (taken from 'time')."""
    with_hour = y_train.assign(hour=y_train['time'].dt.hour)
    hourly_avg = with_hour.groupby('hour')['pv_measurement'].mean()

    # One bar per hour present in the data
    plt.figure(figsize=(12, 6))
    hourly_avg.plot(kind='bar', color='skyblue')
    plt.title('Average PV Production by Hour')
    plt.xlabel('Hour of the Day')
    plt.ylabel('Average PV Production')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

def plot_dist_hour(y_train, hour):
    """Plot a histogram of 'pv_measurement' for one hour of the day and print its value counts.

    Parameters:
    y_train (pd.DataFrame): Frame with a datetime 'time' column and 'pv_measurement'.
    hour (int): Hour of the day to select rows for.
    """
    # Rows recorded at the requested hour
    at_hour = y_train['time'].dt.hour == hour
    pv_at_hour = y_train.loc[at_hour, 'pv_measurement']

    plt.figure(figsize=(12, 6))
    plt.hist(pv_at_hour, bins=50, color='teal', alpha=0.7)
    plt.title(f'Distribution of PV Measurements at {hour}')
    plt.xlabel('PV Measurement')
    plt.ylabel('Frequency')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
    print(pv_at_hour.value_counts())
#train_c[(train_c['time'].dt.hour == 2) &(train_c['pv_measurement'] == 9.8)]

def get_nighttime_stats(y_train,night_start,night_end):
    """Plot and print summary statistics of 'pv_measurement' during the night.

    A row is treated as night-time when its hour is >= night_start OR <= night_end,
    i.e. the window is expected to wrap around midnight (e.g. 23 to 3).

    Parameters:
    y_train (pd.DataFrame): Frame with a datetime 'time' column and 'pv_measurement'.
    night_start (int): First hour of the night (evening side).
    night_end (int): Last hour of the night (morning side).
    """
    hours = y_train['time'].dt.hour
    is_night = (hours >= night_start) | (hours <= night_end)
    night_pv = y_train.loc[is_night, 'pv_measurement']

    # count / mean / std / min / quartiles / max of the night-time readings
    nighttime_stats = night_pv.describe()

    plt.figure(figsize=(12, 6))
    plt.hist(night_pv, bins=50, color='purple', alpha=0.7)
    plt.axvline(nighttime_stats['75%'], color='red', linestyle='dashed', label='75th Percentile')
    plt.axvline(nighttime_stats['max'], color='green', linestyle='dashed', label='Max Value')
    plt.title('Distribution of Nighttime PV Measurements')
    plt.xlabel('PV Measurement')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    print(nighttime_stats)
    
def set_nighttime_to_zero(y_train, night_start,night_end, thresh):
    """Set night-time PV readings above a threshold to zero.

    Fixes two defects of the earlier version: the `night_start`/`night_end` arguments
    were ignored (23 and 3 were hard-coded), and missing parentheses around the hour
    test made every row at hour >= 23 be zeroed regardless of `thresh`.

    Parameters:
    y_train (pd.DataFrame): Frame with a datetime 'time' column and 'pv_measurement'.
        It is not modified.
    night_start (int): First hour (0-23) of the night window, inclusive.
    night_end (int): Last hour (0-23) of the night window, inclusive.
    thresh (float): Only night-time readings strictly greater than this are zeroed.

    Returns:
    pd.DataFrame: Copy of `y_train` with the selected readings set to 0. No columns
    are added or removed.
    """
    df = y_train.copy()
    hours = df['time'].dt.hour

    if night_start <= night_end:
        # Window inside a single day, e.g. 0 -> 4
        is_night = (hours >= night_start) & (hours <= night_end)
    else:
        # Window wrapping around midnight, e.g. 23 -> 3
        is_night = (hours >= night_start) | (hours <= night_end)

    above_thresh = df['pv_measurement'] > thresh
    df.loc[is_night & above_thresh, 'pv_measurement'] = 0
    return df

#train_a[(train_a['time'].dt.hour == 2) &(train_a['pv_measurement'] >0)]
#train_a = set_nighttime_to_zero(train_a,23,3,0)
#train_b = set_nighttime_to_zero(train_b,23,3,0)
#train_c = set_nighttime_to_zero(train_c,23,3,0)
#train_a[(train_a['time'].dt.hour == 2) &(train_a['pv_measurement'] >0)]

### Remove rows with high rad values and zero PV 

In [None]:
def remove_rad_null(merged_df):
    """Return a copy of `merged_df` with missing radiation values replaced by 0.

    The columns 'clear_sky_rad:W', 'direct_rad:W' and 'direct_rad_1h:J' are filled.
    The fill is written back with an explicit column assignment rather than
    `df[col].fillna(..., inplace=True)`, because that chained in-place form does not
    update the frame when pandas Copy-on-Write is active.

    Raises:
    KeyError: If one of the radiation columns is missing.
    """
    rad_columns = ['clear_sky_rad:W', 'direct_rad:W', 'direct_rad_1h:J']
    merged_data = merged_df.copy()
    merged_data[rad_columns] = merged_data[rad_columns].fillna(0)
    return merged_data
"""
m_a = remove_rad_null(merged_a)
m_b = remove_rad_null(merged_b)
m_c = remove_rad_null(merged_c)
"""

def get_percentiles_df(merged_df):
    """Tabulate selected percentiles of the four radiation/energy columns.

    Missing values are treated as 0 before the percentiles are computed. Earlier,
    'clear_sky_rad:W' was filled twice while 'clear_sky_energy_1h:J' was never filled,
    so a single NaN in that column turned all of its percentiles into NaN. The fill is
    also done by assignment instead of chained `inplace=True`, which is a no-op under
    pandas Copy-on-Write.

    Parameters:
    merged_df (pd.DataFrame): Frame containing 'direct_rad:W', 'direct_rad_1h:J',
        'clear_sky_rad:W' and 'clear_sky_energy_1h:J'. It is not modified.

    Returns:
    pd.DataFrame: One row per percentile (50, 60, 70, 80, 85, 90, 95) with a
    'Percentile' column followed by one column per input column.
    """
    rad_columns = ['direct_rad:W', 'direct_rad_1h:J', 'clear_sky_rad:W', 'clear_sky_energy_1h:J']
    filled = merged_df[rad_columns].fillna(0)

    percentiles = [50,60,70,80,85,90,95]
    table = {'Percentile': percentiles}
    for column in rad_columns:
        table[column] = np.percentile(filled[column], percentiles)

    return pd.DataFrame(table)

def get_anomals(merged_data,feature,percentile):
    """Return rows where `feature` is high but the PV measurement is exactly zero.

    "High" means strictly above the given percentile of `feature`, as tabulated by
    get_percentiles_df. Such rows suggest a faulty PV reading.

    Parameters:
    merged_data (pd.DataFrame): Merged feature/target frame containing `feature`,
        'pv_measurement' and the columns required by get_percentiles_df.
    feature (str): One of the columns tabulated by get_percentiles_df.
    percentile (int): One of the percentiles tabulated by get_percentiles_df.

    Returns:
    pd.DataFrame: The anomalous rows, keeping the original index so they can be
    passed to `DataFrame.drop`.

    Raises:
    IndexError: If `percentile` is not one of the tabulated percentiles.
    """
    percentile_df = get_percentiles_df(merged_data)

    # Scalar threshold for "high" radiation. (A stray trailing comma previously turned
    # this into a 1-tuple, which only worked through NumPy broadcasting.)
    at_percentile = percentile_df['Percentile'] == percentile
    threshold = percentile_df.loc[at_percentile, feature].values[0]

    is_high = merged_data[feature] > threshold
    is_zero_pv = merged_data['pv_measurement'] == 0
    return merged_data[is_high & is_zero_pv]
"""
merged_a1 = merged_a.copy().drop(get_anomals(merged_a,'clear_sky_rad:W',90).index)
merged_b1 = merged_b.copy().drop(get_anomals(merged_b,'direct_rad:W',90).index)
merged_c1 = merged_c.copy().drop(get_anomals(merged_c,'direct_rad_1h:J',90).index)
"""

### Add avg pv at this time over the past week or month.

In [None]:
resampled_df = merged_a.resample('7D', on='date_forecast',).mean()
resampled_df['pv_measurement']

In [None]:
def calculate_rolling_same_time_average(merged_df, period='7D'):
    """Return the per-period means of `merged_df`, bucketed on 'date_forecast'.

    Despite the name, no rolling or same-time-of-day logic is applied: the frame is
    resampled into consecutive bins of length `period` and every column is averaged
    within each bin. The result is indexed by bin start, not by the original rows.

    Parameters:
    merged_df (pd.DataFrame): Frame with a datetime 'date_forecast' column. Not modified.
    period (str): Resampling frequency, e.g. '7D'.

    Returns:
    pd.DataFrame: One row per bin holding the column means.
    """
    period_bins = merged_df.resample(period, on='date_forecast')
    return period_bins.mean()

In [None]:
# NOTE(review): calculate_rolling_same_time_average returns a whole resampled DataFrame
# (one row per 7-day bin, one column per input column), not a per-row Series. Assigning it
# to a single column presumably fails or misaligns with merged_a's index — confirm and
# select/reindex the intended values before relying on this feature.
merged_a['weekly_avg_pv_hourly'] = calculate_rolling_same_time_average(merged_a, '7D')

# Calculate the rolling average at the same time over the past month
#merged_a['monthly_avg_same_time'] = calculate_rolling_same_time_average(merged_a, '30D')

In [None]:
merged_a

### Add direct_rad * sun_elevation feature

In [None]:
#Did not improve kaggle score
def add_rad_x_sun(merged_data):
    """Return a copy of `merged_data` with 'rad_x_sun_elevation' = 'direct_rad:W' * 'sun_elevation:d'."""
    interaction = merged_data['direct_rad:W'] * merged_data['sun_elevation:d']
    return merged_data.assign(rad_x_sun_elevation=interaction)
"""
mod_a = add_rad_x_sun(merged_a)
mod_b = add_rad_x_sun(merged_b)
mod_c = add_rad_x_sun(merged_c)

x_test_a_mod = add_rad_x_sun(x_test_a)
x_test_b_mod = add_rad_x_sun(x_test_b)
x_test_c_mod = add_rad_x_sun(x_test_c)
"""

In [None]:
# Build the interaction feature on the spot: the previous version referenced an undefined
# `df`, which raises NameError on a fresh kernel (the `mod_a` frame above is commented out).
time_series_plot('rad_x_sun_elevation', add_rad_x_sun(merged_a))

### Categorical Feats

In [None]:
import pandas as pd

def convert_columns_to_cat(merged_data, cat_features):
    """Return a copy of `merged_data` in which every column listed in `cat_features` is cast to str.

    The input frame is left unchanged.
    """
    string_casts = {col: str for col in cat_features}
    return merged_data.astype(string_casts)
cat_features=['estimated','dew_or_rime:idx','is_day:idx','is_in_shadow:idx','precip_type_5min:idx','snow_drift:idx']
cat_features1 = ['estimated']

In [None]:
merged_a_cat = convert_columns_to_cat(merged_a,cat_features1)
merged_b_cat = convert_columns_to_cat(merged_b,cat_features1)
merged_c_cat = convert_columns_to_cat(merged_c,cat_features1)

In [None]:
x_test_a_cat = convert_columns_to_cat(x_test_a,cat_features1)
x_test_b_cat = convert_columns_to_cat(x_test_b,cat_features1)
x_test_c_cat = convert_columns_to_cat(x_test_c,cat_features1)

## Build Catboost model 

In [23]:
import itertools
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

def split_dataset(train_data, val_size=0.1, val = False, estimated_column = 'estimated'):
    """Split a frame into training and validation parts without shuffling.

    val=False: the last `val_size` fraction of rows becomes the validation set.
    val=True:  `val_size` is ignored. The second half of the rows whose
               `estimated_column` equals 1 becomes the validation set; the training
               set is every row where it equals 0 plus the first half of the rows
               where it equals 1.

    Returns:
    tuple: (training_set, validation_set).
    """
    if not val:
        # Plain positional split
        split_index = int(train_data.shape[0] * (1 - val_size))
        return train_data.iloc[:split_index], train_data.iloc[split_index:]

    estimated_rows = train_data[train_data[estimated_column] == 1]
    observed_rows = train_data[train_data[estimated_column] == 0]

    # Estimated rows are cut in the middle: first half trains, second half validates
    midpoint = len(estimated_rows) // 2
    training_set = pd.concat([observed_rows, estimated_rows[:midpoint]])
    validation_set = estimated_rows[midpoint:]
    return training_set, validation_set


def find_best_categorical_combination(df, cat_features, target_col='pv_measurement', iterations=2000, learning_rate=0.1, depth=6, random_seed=42):
    """Search every non-empty subset of `cat_features` for the lowest validation MAE.

    For each subset a CatBoost regressor is trained on a random 80/20 split with the
    subset's columns cast to str and declared as categorical.

    Two defects of the earlier version are fixed here:
    * the string casts were applied to `df` itself and therefore accumulated across
      subsets (and leaked into the caller's frame), leaving columns as strings that
      the current subset did not declare as categorical;
    * `target_col` was ignored in favour of a hard-coded 'pv_measurement'.

    Parameters:
    df (pd.DataFrame): Training frame including the target column. Not modified.
    cat_features (list of str): Candidate categorical columns.
    target_col (str): Name of the target column.
    iterations, learning_rate, depth, random_seed: Passed to CatBoostRegressor.

    Returns:
    tuple: (best fitted model, best feature subset, best validation MAE).
    """
    best_mae = float('inf')
    best_combination = []
    best_model = None

    features = df.drop(columns=[target_col])
    y = df[target_col]

    # Iterate over all combinations of categorical features
    for subset_size in range(1, len(cat_features) + 1):
        for subset in itertools.combinations(cat_features, subset_size):
            print(f"Testing combination: {subset}")

            # Cast only this subset's columns, on a fresh frame for every subset
            X = features.astype({col: str for col in subset})
            X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.8, random_state=42)

            # Initialize and fit the CatBoost model
            catboost_model = CatBoostRegressor(
                cat_features=list(subset),
                iterations=iterations,
                learning_rate=learning_rate,
                depth=depth,
                loss_function='MAE',
                eval_metric='MAE',
                random_seed=random_seed,
                verbose=200
            )

            catboost_model.fit(X_train, y_train, eval_set=(X_validation, y_validation), use_best_model=True, early_stopping_rounds=200)

            # Evaluate the model
            predictions = catboost_model.predict(X_validation)
            mae = mean_absolute_error(y_validation, predictions)

            # Update the best combination if the current one is better
            if mae < best_mae:
                best_mae = mae
                best_combination = subset
                best_model = catboost_model

    return best_model, best_combination, best_mae


In [24]:
cat_features=['estimated','dew_or_rime:idx','is_day:idx','is_in_shadow:idx','precip_type_5min:idx','snow_drift:idx']

In [None]:
model_a, best_combination, best_mae = find_best_categorical_combination(merged_a,cat_features)

Testing combination: ('estimated',)
0:	learn: 592.3538894	test: 591.1188704	best: 591.1188704 (0)	total: 69ms	remaining: 2m 17s
200:	learn: 186.7626008	test: 182.9270753	best: 182.9270753 (200)	total: 2.32s	remaining: 20.8s
400:	learn: 173.6238514	test: 175.6059344	best: 175.6059344 (400)	total: 4.53s	remaining: 18.1s
600:	learn: 165.6572712	test: 172.5737280	best: 172.5724128 (599)	total: 6.67s	remaining: 15.5s
800:	learn: 159.7170395	test: 170.8376062	best: 170.7929389 (798)	total: 8.75s	remaining: 13.1s
1000:	learn: 153.8277129	test: 169.3620220	best: 169.3598594 (998)	total: 10.9s	remaining: 10.8s
1200:	learn: 149.6575256	test: 168.7140546	best: 168.7140546 (1200)	total: 12.9s	remaining: 8.59s
1400:	learn: 146.0052578	test: 168.0433364	best: 168.0238633 (1399)	total: 15s	remaining: 6.41s
1600:	learn: 142.1223233	test: 167.3706376	best: 167.3706376 (1600)	total: 17.1s	remaining: 4.25s
1800:	learn: 139.3374800	test: 166.8226123	best: 166.8226123 (1800)	total: 19.3s	remaining: 2.13s
1

In [19]:
def split_dataset(train_data, val_size=0.1, val = False, estimated_column = 'estimated'):
    """Split a frame into training and validation parts without shuffling.

    val=False: the last `val_size` fraction of rows becomes the validation set.
    val=True:  `val_size` is ignored. The second half of the rows whose
               `estimated_column` equals 1 becomes the validation set; the training
               set is every row where it equals 0 plus the first half of the rows
               where it equals 1.

    Returns:
    tuple: (training_set, validation_set).
    """
    if not val:
        # Plain positional split
        split_index = int(train_data.shape[0] * (1 - val_size))
        return train_data.iloc[:split_index], train_data.iloc[split_index:]

    estimated_rows = train_data[train_data[estimated_column] == 1]
    observed_rows = train_data[train_data[estimated_column] == 0]

    # Estimated rows are cut in the middle: first half trains, second half validates
    midpoint = len(estimated_rows) // 2
    training_set = pd.concat([observed_rows, estimated_rows[:midpoint]])
    validation_set = estimated_rows[midpoint:]
    return training_set, validation_set

def build_catboost(merged_df, val_size=0.1, randomized=False):
    """Fit a CatBoost MAE regressor on a merged feature/target frame.

    Parameters:
    merged_df (pd.DataFrame): Frame containing 'date_forecast', 'time' (both dropped
        before fitting), the target 'pv_measurement' and the feature columns,
        including 'estimated', which is declared categorical.
    val_size (float): Fraction of rows held out for validation when `randomized` is
        True. Previously this was ignored and a fixed 80/20 split was used.
        When `randomized` is False, split_dataset is called with val=True, which
        does not use `val_size`.
    randomized (bool): True for a seeded random split; False for the split based
        on the 'estimated' flag.

    Returns:
    CatBoostRegressor: The fitted model (best iteration on the validation set).
    """
    merged_df = merged_df.drop(columns=['date_forecast','time'])
    if randomized:
        X = merged_df.drop(columns=['pv_measurement'])
        y = merged_df['pv_measurement']
        # Honour the caller's validation fraction instead of a hard-coded train_size=0.8
        X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=val_size, random_state=42)
    else:
        training_set, validation_set = split_dataset(merged_df, val_size, True)
        X_train = training_set.drop(columns=['pv_measurement'])
        y_train = training_set['pv_measurement']
        X_validation = validation_set.drop(columns=['pv_measurement'])
        y_validation = validation_set['pv_measurement']

    catboost_model = CatBoostRegressor(
        cat_features=['estimated'],
        iterations=2000,
        learning_rate=0.1,
        depth=6,
        loss_function='MAE',
        eval_metric='MAE',
        random_seed=42,
        verbose=200
    )

    # Early stopping on the validation MAE; the best iteration is kept
    catboost_model.fit(X_train, y_train, eval_set=(X_validation, y_validation), use_best_model=True, early_stopping_rounds=200)
    return catboost_model


In [None]:
model_a = build_catboost(merged_a_cat,0.2, True)
model_b = build_catboost(merged_b_cat,0.2, True)
model_c = build_catboost(merged_c_cat,0.125, True)

In [None]:
model_a = build_catboost(merged_a_avg,0.2, True)
model_b = build_catboost(merged_b_avg,0.2, True)
model_c = build_catboost(merged_c_avg,0.125, True)

In [None]:
laged_model_a = build_catboost(laged_a,0.2, True)
laged_model_b = build_catboost(laged_b,0.2, True)
laged_model_c = build_catboost(laged_c,0.125, True)

### Predict Lag

In [None]:
import pandas as pd

def predict_with_lag(model, test_data, initial_lag_value, lag_hours=1, column_name='pv_measurement'):
    """
    Predict row by row with a model that needs a lagged-target feature.

    For each row, the lag feature is set to the target value `lag_hours` rows earlier:
    a seed value for the first `lag_hours` rows, afterwards the model's own earlier
    prediction. The previous version always fed the immediately preceding prediction,
    so any `lag_hours` greater than 1 silently behaved like a lag of 1. For
    `lag_hours=1` the behavior is unchanged.

    Parameters:
    model (model object): Trained model exposing `predict(frame)`.
    test_data (pd.DataFrame): The test dataset without the target column. Not modified.
    initial_lag_value (float or sequence): Seed for the first `lag_hours` rows. A scalar
        is reused for each of them; a sequence must hold exactly `lag_hours` known
        target values, oldest first.
    lag_hours (int): The number of rows to lag (>= 1).
    column_name (str): The name of the target column; the lag feature is
        '<column_name>_lag_<lag_hours>h'.

    Returns:
    pd.Series: Predictions indexed like `test_data`.

    Raises:
    ValueError: If `lag_hours` < 1 or a seed sequence has the wrong length.
    """
    if lag_hours < 1:
        raise ValueError("lag_hours must be at least 1")

    if np.ndim(initial_lag_value) == 0:
        seed_values = [initial_lag_value] * lag_hours
    else:
        seed_values = list(initial_lag_value)
        if len(seed_values) != lag_hours:
            raise ValueError("initial_lag_value must contain exactly lag_hours values")

    predictions = []
    lag_feature_name = f"{column_name}_lag_{lag_hours}h"

    for step, (_, row) in enumerate(test_data.iterrows()):
        # Value of the target `lag_hours` rows back: seed first, own predictions later
        if step < lag_hours:
            row[lag_feature_name] = seed_values[step]
        else:
            row[lag_feature_name] = predictions[step - lag_hours]

        # The model expects a one-row frame
        prediction = model.predict(row.to_frame().transpose())[0]
        predictions.append(prediction)

    return pd.Series(predictions, index=test_data.index)

# Seed value for each location: the entry 24 rows before the end of that location's frame.
# NOTE(review): column position 52 is presumably 'pv_measurement' — confirm, or select by name.
initial_lag_val_a = merged_a.tail(24).iloc[0,52]
initial_lag_val_b = merged_b.tail(24).iloc[0,52]
initial_lag_val_c = merged_c.tail(24).iloc[0,52]

# Then, use the function to make predictions. Each location is seeded with its own value;
# B and C were previously seeded with location A's value by mistake.
laged_pred_a = np.array(predict_with_lag(model=laged_model_a, test_data=x_test_a_laged, 
                               initial_lag_value=initial_lag_val_a, lag_hours=24))
laged_pred_b = np.array(predict_with_lag(model=laged_model_b, test_data=x_test_b_laged, 
                               initial_lag_value=initial_lag_val_b, lag_hours=24))
laged_pred_c = np.array(predict_with_lag(model=laged_model_c, test_data=x_test_c_laged, 
                               initial_lag_value=initial_lag_val_c, lag_hours=24))

### Predict and Submit model

In [None]:
pred_a = model_a.predict(x_test_a_avg)
pred_b = model_b.predict(x_test_b_avg)
pred_c = model_c.predict(x_test_c_avg)

In [None]:
pred_a = model_a.predict(x_test_a_cat)
pred_b = model_b.predict(x_test_b_cat)
pred_c = model_c.predict(x_test_c_cat)

In [None]:
def create_sub(pred_a,pred_b,pred_c):
    """
    Build a submission frame from the per-location predictions.

    The template 'sample_submission.csv' is read from the working directory and
    its 'prediction' column is filled with the predictions stacked in the order
    A, B, C. Negative predictions are replaced by 0.

    Parameters:
    pred_a (array-like): Predictions for location A.
    pred_b (array-like): Predictions for location B.
    pred_c (array-like): Predictions for location C.

    Returns:
    pd.DataFrame: The template with its 'prediction' column filled in.
    """
    template = pd.read_csv('sample_submission.csv')
    template['prediction'] = np.concatenate([pred_a, pred_b, pred_c])

    # Clamp at zero: only values strictly below 0 are replaced.
    is_negative = template['prediction'] < 0
    template['prediction'] = template['prediction'].mask(is_negative, 0)
    return template

# Build the submission from whichever pred_a / pred_b / pred_c were computed last.
sub = create_sub(pred_a,pred_b,pred_c)
# Alternative kept for reference: submit the lag-based predictions instead.
#sub = create_sub(laged_pred_a,laged_pred_b,laged_pred_c)

In [None]:
# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs('Submissions', exist_ok=True)
sub.to_csv('Submissions/catgoricalVar.csv', index=False)

In [None]:
def save_model(model,model_name,location):
    """
    Store a trained model as 'Saved_models/<LOCATION>/<model_name>.cbm'.

    Parameters:
    model (model object): Object exposing `save_model(path)`.
    model_name (str): File name to use, without the '.cbm' extension.
    location (str): Location label; it is upper-cased to form the sub-folder name.

    Returns:
    None. The destination path is printed once the model has been written.
    """
    target_dir = f'Saved_models/{location.upper()}'
    # exist_ok=True makes this a no-op when the folder is already there.
    os.makedirs(target_dir, exist_ok=True)

    destination = os.path.join(target_dir, f'{model_name}.cbm')
    model.save_model(destination)

    print(f"Model successfully saved at {destination}")
    
# Persist the model of each location under the same file name, one sub-folder per location.
save_model(model_a,'base_catBoost','A')
save_model(model_b,'base_catBoost','B')
save_model(model_c,'base_catBoost','C')

### Model Evaluation

In [None]:
def get_feat_importance(model):
    """
    Build a table of feature importances for a trained model, most important first.

    Parameters:
    model (model object): Fitted model exposing `get_feature_importance()` and,
        ideally, a `feature_names_` attribute.

    Returns:
    pd.DataFrame: Columns 'feature' and 'importance', sorted by descending importance.
    """
    # Label importances with the names the model itself reports, so the table
    # stays aligned for models fitted on a different feature set than merged_a.
    feature_names = getattr(model, 'feature_names_', None)
    if feature_names is None:
        # Fallback: feature columns of the merged training frame for location A.
        feature_names = merged_a.drop(columns =['date_forecast','time','pv_measurement']).columns
    feats = {'feature': list(feature_names),
             'importance': model.get_feature_importance()}
    df = pd.DataFrame(feats).sort_values('importance',ascending = False)
    return df

In [None]:
# Show the importance table for the location-A model (rendered as the cell output).
get_feat_importance(model_a)

In [None]:
def compare_two_preds(pred1,pred2):
    """
    Scatter-plot the predictions of two submissions against each other.

    Parameters:
    pred1 (DataFrame-like): First set of predictions; its 'prediction' column is the x-axis.
    pred2 (DataFrame-like): Second set of predictions; its 'prediction' column is the y-axis.

    Returns:
    None. The figure is shown with plt.show().
    """
    plt.figure(figsize=(10, 8))

    # Scatter plot
    plt.scatter(pred1['prediction'], pred2['prediction'], alpha=0.5)

    # Line of equality (y = x), spanning the combined range of both prediction sets.
    # Both endpoints use the same value on each axis; using each model's own
    # min/max would tilt the line away from y = x whenever the ranges differ.
    lower = min(pred1['prediction'].min(), pred2['prediction'].min())
    upper = max(pred1['prediction'].max(), pred2['prediction'].max())
    plt.plot([lower, upper], [lower, upper], color='red', linestyle='--')

    # Labels and title
    plt.xlabel('Predictions from First Model')
    plt.ylabel('Predictions from New model')
    plt.title('Comparison of Predictions from Two Models')

    # Show plot
    plt.grid(True)
    plt.show()

In [None]:
# Compare the current submission with the lag-based one.
# NOTE(review): `sub_lag` is not assigned in the visible part of the notebook -
# presumably create_sub(laged_pred_a, laged_pred_b, laged_pred_c); confirm it exists on a fresh run.
compare_two_preds(sub,sub_lag)

In [None]:
def plot_prediction(preds):
    """
    Draw the predicted values as a line over the unique timestamps of 'test.csv'.

    Parameters:
    preds: Object whose 'predict' entry provides `as_data_frame()`.
        NOTE(review): `as_data_frame` is not a pandas method - presumably an
        H2O-style frame is expected here; confirm before passing a pandas submission.

    Returns:
    None. The figure is shown with plt.show().
    """
    test_frame = pd.read_csv('test.csv')
    pred_frame = preds['predict'].as_data_frame()
    # Pair every prediction row with one of the unique 'time' values of the test file.
    pred_frame['time'] = test_frame['time'].unique()

    fig, axis = plt.subplots(figsize=(15, 6))
    axis.set_xlabel('Time')
    axis.set_ylabel('Prediction', color='tab:blue')
    axis.plot(
        pred_frame['time'],
        pred_frame['predict'],
        color='tab:blue',
        label='Solar Power Production',
    )
    axis.tick_params(axis='y', labelcolor='tab:blue')

    fig.tight_layout()
    plt.title('Time Series Plot of prediction')
    plt.show()

### Post-Processing

In [None]:
# Load 'merged_average2.csv' and set every prediction below 8 to 0.
df = pd.read_csv('merged_average2.csv')
df['prediction'] = df['prediction'].mask(df['prediction'] < 8, 0)

In [None]:
# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs('Submissions', exist_ok=True)
df.to_csv('Submissions/merged_models3.csv', index=False)

In [None]:
# Largest pv_measurement found across the three training sets.
maks = max(
    frame['pv_measurement'].max()
    for frame in (train_a, train_b, train_c)
)


In [None]:
  """# Plot the distribution of "direct_rad:W"
    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['direct_rad:W'], bins=50, kde=True)
    plt.title('Distribution of "direct_rad:W"')
    plt.xlabel('Direct Radiation (W)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['clear_sky_rad:W'], bins=50, kde=True)
    plt.title('Distribution of "clear_sky_rad:W"')
    plt.xlabel('Direct Radiation (W)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['direct_rad_1h:J'], bins=50, kde=True)
    plt.title('Distribution of "direct_rad_1h:J"')
    plt.xlabel('Radiation 1h(J)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['clear_sky_energy_1h:J'], bins=50, kde=True)
    plt.title('Distribution of "clear_sky_energy_1h:J"')
    plt.xlabel('Radiation 1h(J)')
    plt.ylabel('Frequency')
    plt.show()"""

    