In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
%matplotlib inline

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [2]:
x_train_a = pd.read_csv('cleaned_data/A/x_train_a.csv')
x_train_b = pd.read_csv('cleaned_data/B/x_train_b.csv')
x_train_c = pd.read_csv('cleaned_data/C/x_train_c.csv')

x_test_a = pd.read_csv('cleaned_data/A/x_test_a.csv')
x_test_b = pd.read_csv('cleaned_data/B/x_test_b.csv')
x_test_c = pd.read_csv('cleaned_data/C/x_test_c.csv')

train_a = pd.read_csv('cleaned_data/A/train_a.csv')
train_b = pd.read_csv('cleaned_data/B/train_b.csv')
train_c = pd.read_csv('cleaned_data/C/train_c.csv')

In [3]:
train_a['time'] = pd.to_datetime(train_a['time'])
train_b['time'] = pd.to_datetime(train_b['time'])
train_c['time'] = pd.to_datetime(train_c['time'])

In [4]:
x_test_a = x_test_a.drop(columns = ['date_forecast'])
x_test_b = x_test_b.drop(columns = ['date_forecast'])
x_test_c = x_test_c.drop(columns = ['date_forecast'])


In [5]:
#Remove rows in X_train that has timestamp that does not exist in train_loc, and visa_verca
#e.g missing solar power measurements from 2022-10-21 01:00 - 2022-10-28 21:00
def align_X_y(x_train,y_train, x_date_column='date_forecast', y_date_column='time'):
    """
    Aligns two dataframes based on the 'date_forecast' column of X and the 'time' column of y,
    ensuring that only rows with matching time values are retained.

    Parameters:
    - X (pd.DataFrame): The first dataframe with time in the 'date_forecast'
    - y (pd.DataFrame): The second dataframe with time in the 'time' column.

    Returns:
    - tuple: A tuple containing the aligned dataframes.
    """
    # Convert date columns to datetime format for easier comparison
    x_train[x_date_column] = pd.to_datetime(x_train[x_date_column])
    y_train[y_date_column] = pd.to_datetime(y_train[y_date_column])
    
    # Find common dates
    common_dates = x_train[x_date_column][x_train[x_date_column].isin(y_train[y_date_column])]
    
    # Filter both datasets based on common dates
    x_train_synced = x_train[x_train[x_date_column].isin(common_dates)]
    y_train_synced = y_train[y_train[y_date_column].isin(common_dates)]
    
    return x_train_synced, y_train_synced


# Analysis of Target variable  - Looking at PV_measurement
1. Handle constant measurments over longer periods of time. Likely caused by sensor malfunction, data logging issues, or other external factors.
    - Handeled by removing all constant values lasting more than 24 hours 
2. Add cyclical features 
2. Handle longer periods of missing data:
    - Remove (currently tested)
    - Interpolate 
    - Copy from previous year
    - Copy solar production at missing time from another location

### 1. Handle constant PV measurements 

In [6]:
# Time-Series plot of PV_measurement 

def solar_prod_plot(y_train, resolution='year', chunks=5):
    df = y_train.copy()
    
    # Determine the plotting resolution based on the 'resolution' argument
    # Chunks = number of year/months/days in each plot
    if resolution == 'year':
        unique_values = df['time'].dt.year.unique()
        label = 'Year'
    elif resolution == 'month':
        df['year_month'] = df['time'].dt.to_period('M')
        unique_values = df['year_month'].unique()
        label = 'Month'
    elif resolution == 'week':
        df['year_week'] = df['time'].dt.to_period('W')
        unique_values = df['year_week'].unique()
        label = 'Week'
    elif resolution == 'day':
        df['date'] = df['time'].dt.date
        unique_values = df['date'].unique()
        label = 'Day'
    else:
        raise ValueError("Invalid resolution. Choose from 'year', 'month', 'week', or 'day'.")
    
    # Loop over the unique values in chunks
    for i in range(0, len(unique_values), chunks):
        subset_values = unique_values[i:i+chunks]
        
        if resolution == 'year':
            subset_df = df[df['time'].dt.year.isin(subset_values)]
        elif resolution == 'month':
            subset_df = df[df['year_month'].isin(subset_values)]
        elif resolution == 'week':
            subset_df = df[df['year_week'].isin(subset_values)]
        elif resolution == 'day':
            subset_df = df[df['date'].isin(subset_values)]
        
        plt.figure(figsize=(15, 6))
        plt.plot(subset_df['time'], subset_df['pv_measurement'])

        title = f"Solar Power Production for {label}: {subset_values[0]}"
        if len(subset_values) > 1:
            title += f" to {subset_values[-1]}"

        plt.title(title)
        plt.xlabel("Time")
        plt.ylabel("PV Measurement")
        plt.show()

def remove_constant_intervals(y_train, low_thresh, upp_thresh):
    """
    Identify and remove intervals of constant PV readings that exceed a specified duration. 
    Constant readings may indicate sensor malfunctions or data logging issues.
    
    Parameters:
    ----------
    y_train : pd.DataFrame
        Dataframe containing the time-series data of solar power production.
    threshold : int
        The minimum duration required for an interval to be considered for removal.
        
    Returns:
    -------
    pd.DataFrame
        The input dataframe with intervals of constant readings (exceeding the duration threshold) removed.
    """
    df = y_train.copy()
    
    # Calculate the difference in production values
    df['diff'] = df['pv_measurement'].diff()

    # Identify where the difference is zero
    df['zero_diff'] = df['diff'].abs() < 1e-5

    # Identify groups of consecutive zero differences
    df['group'] = (df['zero_diff'] != df['zero_diff'].shift()).cumsum()

    # Filter out only the groups with consecutive zero differences
    constant_intervals = df[df['zero_diff']].groupby('group').agg(start=('time', 'min'), 
                                                                  end=('time', 'max'),
                                                                  duration=('time', 'size'))
    
    # Filter intervals based on the threshold
    interval_df_thresh = constant_intervals[(constant_intervals['duration'] > low_thresh) & (constant_intervals['duration'] <upp_thresh)]
    
    # Remove rows from the main dataframe that fall within these intervals
    for _, row in interval_df_thresh.iterrows():
        start_time, end_time = row['start'], row['end']
        df = df[(df['time'] < start_time) | (df['time'] > end_time)]
    
    # Drop the added columns used for calculations
    df.drop(columns=['diff', 'zero_diff', 'group'], inplace=True)
    
    return df, constant_intervals


def get_time_interval(df, start_time = '2020-08-01 00:00:00', end_time = '2021-01-01 00:00:00'):
    # Filter rows based on the time period
    filtered_df = df[(df['time'] >= start_time) & (df['time'] <= end_time)]
    return filtered_df

In [7]:
#Removed all constant values with duration > 24 hours

train_a, const_interval_a = remove_constant_intervals(train_a,24,10**6)

#update X_train_a by removing coresponding rows that have been filtered here
x_train_a, train_a = align_X_y(x_train_a, train_a)

In [8]:
rows_removed_a = np.sum(const_interval_a[const_interval_a['duration']>24]['duration'])
print(f'total number of rows removed {rows_removed_a}')
const_interval_a[const_interval_a['duration']>24]

total number of rows removed 42


Unnamed: 0_level_0,start,end,duration
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
434,2020-01-04 15:00:00,2020-01-06 08:00:00,42


In [9]:
#Remove rows in groups of constant values, where duration of constant measurements is > 1 day (24 hours)
train_b, const_interval_b = remove_constant_intervals(train_b,24,10**6)

#update X_train_a by removing coresponding rows that have been filtered here
x_train_b, train_b = align_X_y(x_train_b, train_b)

In [10]:
rows_removed = np.sum(const_interval_b[const_interval_b['duration']>24]['duration'])
print(f'total number of rows removed {rows_removed}')
const_interval_b[const_interval_b['duration']>24]

total number of rows removed 6865


Unnamed: 0_level_0,start,end,duration
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
32,2019-01-14 15:00:00,2019-01-18 11:00:00,93
36,2019-01-19 13:00:00,2019-01-26 08:00:00,164
40,2019-01-27 11:00:00,2019-01-28 13:00:00,27
74,2019-02-10 16:00:00,2019-02-13 07:00:00,64
160,2019-03-23 18:00:00,2019-03-26 06:00:00,61
302,2019-05-31 08:00:00,2019-06-03 12:00:00,77
606,2019-10-28 14:00:00,2019-10-30 22:00:00,57
674,2019-12-01 13:00:00,2019-12-04 08:00:00,68
682,2019-12-07 14:00:00,2019-12-11 08:00:00,91
700,2019-12-18 14:00:00,2019-12-20 09:00:00,44


In [11]:
#Remove rows in groups of constant values, where duration of constant measurements is > 1 day (24 hours)
train_c, const_interval_c = remove_constant_intervals(train_c,24,10**6)

#update X_train_a by removing coresponding rows that have been filtered here
x_train_c, train_c = align_X_y(x_train_c, train_c)

In [12]:
rows_removed = np.sum(const_interval_c[const_interval_c['duration']>24]['duration'])
print(f'total number of rows removed {rows_removed}')
const_interval_c[const_interval_c['duration']>24]

total number of rows removed 4926


Unnamed: 0_level_0,start,end,duration
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,2019-09-04 10:00:00,2019-09-05 12:00:00,27
180,2019-11-11 12:00:00,2019-11-13 08:00:00,45
230,2019-11-28 15:00:00,2019-12-05 09:00:00,163
240,2019-12-07 14:00:00,2019-12-13 09:00:00,140
256,2019-12-16 14:00:00,2019-12-21 09:00:00,116
276,2019-12-25 13:00:00,2019-12-30 09:00:00,117
290,2020-01-02 14:00:00,2020-01-07 09:00:00,116
340,2020-01-23 15:00:00,2020-01-26 08:00:00,66
376,2020-02-05 14:00:00,2020-02-10 07:00:00,114
414,2020-02-23 17:00:00,2020-03-08 08:00:00,328


### Add Cyclical Features

In [None]:
# Creating cyclical features for hour of the day
def cyclic_hourly(x):
    train_data = x.copy()
    train_data['hour_sin'] = np.sin(2 * np.pi * train_data['hour'] / 24)
    train_data['hour_cos'] = np.cos(2 * np.pi * train_data['hour'] / 24)
    return train_data


# Creating cyclical features for month of the year
def cyclic_monthly(x):
    train_data = x.copy()
    train_data['month_sin'] = np.sin(2 * np.pi * train_data['month'] / 12)
    train_data['month_cos'] = np.cos(2 * np.pi * train_data['month'] / 12)
    return train_data

In [None]:

x_train_a = cyclic_hourly(x_train_a)
x_train_a = cyclic_monthly(x_train_a)

x_test_a = cyclic_hourly(x_test_a)
x_test_a = cyclic_monthly(x_test_a)

x_train_b = cyclic_hourly(x_train_b)
x_train_b = cyclic_monthly(x_train_b)

x_test_b = cyclic_hourly(x_test_b)
x_test_b = cyclic_monthly(x_test_b)

x_train_c = cyclic_hourly(x_train_c)
x_train_c = cyclic_monthly(x_train_c)

x_test_c = cyclic_hourly(x_test_c)
x_test_c = cyclic_monthly(x_test_c)


### Remove outliers during night

In [None]:
def plot_hourly_avg(y_train):
    # Grouping by hour and calculating the average PV measurement for each hour
    train_data = y_train.copy()
    train_data['hour'] = y_train['time'].dt.hour
    hourly_avg = train_data.groupby('hour')['pv_measurement'].mean()

    # Plotting the average PV production for each hour
    plt.figure(figsize=(12, 6))
    hourly_avg.plot(kind='bar', color='skyblue')
    plt.title('Average PV Production by Hour')
    plt.xlabel('Hour of the Day')
    plt.ylabel('Average PV Production')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

def plot_dist_hour(y_train, hour):
    train_data = y_train.copy()
    train_data['hour'] = y_train['time'].dt.hour
    
    # Filtering the data for the given hour
    hour_data = train_data[train_data['hour'] == hour]
    
    # Plotting the distribution of PV measurements for 1 am
    plt.figure(figsize=(12, 6))
    plt.hist(hour_data['pv_measurement'], bins=50, color='teal', alpha=0.7)
    plt.title(f'Distribution of PV Measurements at {hour}')
    plt.xlabel('PV Measurement')
    plt.ylabel('Frequency')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
    print(hour_data['pv_measurement'].value_counts())
#train_c[(train_c['time'].dt.hour == 2) &(train_c['pv_measurement'] == 9.8)]

def get_nighttime_stats(y_train,night_start,night_end):
    train_data = y_train.copy()
    train_data['hour'] = y_train['time'].dt.hour

    # Filtering the data for nighttime hours (8 pm to 4 am)
    nighttime_data = train_data[(train_data['hour'] >= night_start) | (train_data['hour'] <= night_end)]

    # Descriptive statistics for nighttime PV measurements
    nighttime_stats = nighttime_data['pv_measurement'].describe()

    # Plotting the distribution of nighttime PV measurements
    plt.figure(figsize=(12, 6))
    plt.hist(nighttime_data['pv_measurement'], bins=50, color='purple', alpha=0.7)
    plt.axvline(nighttime_stats['75%'], color='red', linestyle='dashed', label='75th Percentile')
    plt.axvline(nighttime_stats['max'], color='green', linestyle='dashed', label='Max Value')
    plt.title('Distribution of Nighttime PV Measurements')
    plt.xlabel('PV Measurement')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    print(nighttime_stats)
    
def set_nighttime_to_zero(y_train, night_start,night_end, thresh):
    df = y_train.copy()
    df['hour'] = y_train['time'].dt.hour
    mask = (df['hour'] >= 23) | (df['hour'] <= 3) & (df['pv_measurement'] > thresh)
    df.loc[mask, 'pv_measurement'] = 0
    df = df.drop(columns = ['hour'])
    return df

#train_a[(train_a['time'].dt.hour == 2) &(train_a['pv_measurement'] >0)]
#train_a = set_nighttime_to_zero(train_a,23,3,0)
#train_b = set_nighttime_to_zero(train_b,23,3,0)
#train_c = set_nighttime_to_zero(train_c,23,3,0)
#train_a[(train_a['time'].dt.hour == 2) &(train_a['pv_measurement'] >0)]

In [13]:
merged_a = pd.merge(x_train_a, train_a, left_on='date_forecast', right_on='time', how='inner')
merged_b = pd.merge(x_train_b, train_b, left_on='date_forecast', right_on='time', how='inner')
merged_c = pd.merge(x_train_c, train_c, left_on='date_forecast', right_on='time', how='inner')

In [None]:
def get_

In [14]:

def anomals(merged_data,percentile):
    """# Plot the distribution of "direct_rad:W"
    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['direct_rad:W'], bins=50, kde=True)
    plt.title('Distribution of "direct_rad:W"')
    plt.xlabel('Direct Radiation (W)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['clear_sky_rad:W'], bins=50, kde=True)
    plt.title('Distribution of "clear_sky_rad:W"')
    plt.xlabel('Direct Radiation (W)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['direct_rad_1h:J'], bins=50, kde=True)
    plt.title('Distribution of "direct_rad_1h:J"')
    plt.xlabel('Radiation 1h(J)')
    plt.ylabel('Frequency')
    plt.show()

    plt.figure(figsize=(12, 6))
    sns.histplot(merged_data['clear_sky_energy_1h:J'], bins=50, kde=True)
    plt.title('Distribution of "clear_sky_energy_1h:J"')
    plt.xlabel('Radiation 1h(J)')
    plt.ylabel('Frequency')
    plt.show()"""

    merged_data['clear_sky_rad:W'].fillna(0, inplace=True)
    merged_data['clear_sky_rad:W'].fillna(0, inplace=True)
    merged_data['direct_rad:W'].fillna(0, inplace=True)
    merged_data['direct_rad_1h:J'].fillna(0, inplace=True)

    # Calculate and display percentiles
    percentiles = [50,60,70,80,85,90,95]
    percentile_values_direct_rad= np.percentile(merged_data['direct_rad:W'], percentiles)
    percentile_values_direct_rad_1h = np.percentile(merged_data['direct_rad_1h:J'], percentiles)
    percentile_values_clear_sky_rad = np.percentile(merged_data['clear_sky_rad:W'], percentiles)
    percentile_values_clear_sky_energy = np.percentile(merged_data['clear_sky_energy_1h:J'], percentiles)
    percentile_values_df = pd.DataFrame({
        'Percentile': percentiles,
        'direct_rad:W':percentile_values_direct_rad,
        'direct_rad_1h:J': percentile_values_direct_rad_1h,
        'clear_sky_rad:W': percentile_values_clear_sky_rad,
        'clear_sky_energy_1h:J': percentile_values_clear_sky_energy
        })
    
    print(percentile_values_df)

    thresholds = {
    'direct_rad:W': percentile_values_df[percentile_values_df['Percentile']==percentile]['direct_rad:W'].values[0],
    'clear_sky_rad:W': percentile_values_df[percentile_values_df['Percentile']==percentile]['clear_sky_rad:W'].values[0],
    'direct_rad_1h:J': percentile_values_df[percentile_values_df['Percentile']==percentile]['direct_rad_1h:J'].values[0],
    'clear_sky_energy_1h:J': percentile_values_df[percentile_values_df['Percentile']==percentile]['clear_sky_energy_1h:J'].values[0]
    }

    # Find rows where all three variables are above their respective thresholds, but the PV measurement is zero
    anomalous_rows_all = merged_data[
        ((merged_data['direct_rad:W'] > thresholds['direct_rad:W']) |
        (merged_data['clear_sky_rad:W'] > thresholds['clear_sky_rad:W']) |
        (merged_data['direct_rad_1h:J'] > thresholds['direct_rad_1h:J'])) &
        (merged_data['pv_measurement'] == 0)
    ]
    merged = merged_data.copy().drop(anomalous_rows_all.index)
    # Display the anomalous rows
    return merged

ready_a= anomals(merged_a,95)
ready_b= anomals(merged_b,95)
ready_c= anomals(merged_c,95)

   Percentile  direct_rad:W  direct_rad_1h:J  clear_sky_rad:W  \
0          50        0.0000           515.60           3.5500   
1          60        1.6000         40750.16          47.0800   
2          70       15.2350        252837.66         167.1850   
3          80       65.8000        977565.78         344.6800   
4          85      120.3175       1731214.44         450.8175   
5          90      203.9650       2893228.80         570.6650   
6          95      334.6200       4739360.35         697.2575   

   clear_sky_energy_1h:J  
0               87470.60  
1              701348.70  
2             2331257.30  
3             5067434.40  
4             6636714.95  
5             8329340.60  
6            10158741.50  
   Percentile  direct_rad:W  direct_rad_1h:J  clear_sky_rad:W  \
0          50      0.000000          469.050           2.6500   
1          60      1.600000        39915.760          41.7250   
2          70     15.825001       261525.530         144.3000   
3  

In [None]:
def anomals1(merged_data,solar_radiation_threshold): 
    #identify the rows where the "direct_rad:W" column in x_train_a is high
    #but the PV measurement in train_a is zero -> Indicates wrong

    # Define a threshold for high solar radiation

    # Find rows where 'direct_rad:W' is high but PV measurement is zero
    anomalous_rows = merged_data[(merged_data['direct_rad:W'] > solar_radiation_threshold) & (merged_data['pv_measurement'] == 0)]
    
    
    # Display the anomalous rows
    return anomalous_rows

len(anomals1(merged_b,345))


In [None]:
ready_a1 = merged_a.copy().drop(anomals1(merged_a,334).index)
ready_b1 = merged_b.copy().drop(anomals1(merged_b,345).index)
ready_c1 = merged_c.copy().drop(anomals1(merged_c,339).index)

# Build Catboost model 

In [15]:
def build_catboost(merged_df):
    # Merge the datasets on the timestamp
    merged_df.drop(columns=['time', 'date_forecast'], inplace=True)

    # Separate features and target variable
    X = merged_df.drop(columns=['pv_measurement'])
    y = merged_df['pv_measurement']

    # Split the data into training and validation sets
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.8, random_state=42)

    # Initialize the CatBoost regressor
    catboost_model = CatBoostRegressor(
        iterations=1000, 
        learning_rate=0.1, 
        depth=6, 
        loss_function='MAE', 
        eval_metric='MAE', 
        random_seed=42,
        verbose=200
    )

    # Fit the model
    catboost_model.fit(X_train, y_train, eval_set=(X_validation, y_validation), use_best_model=True)
    return catboost_model

In [16]:
model_a = build_catboost(ready_a)
model_b = build_catboost(ready_b)
model_c = build_catboost(ready_c)

0:	learn: 600.2367172	test: 570.4899278	best: 570.4899278 (0)	total: 63.5ms	remaining: 1m 3s
200:	learn: 185.0709292	test: 183.3104343	best: 183.3104343 (200)	total: 2.13s	remaining: 8.46s
400:	learn: 169.6563925	test: 174.5714369	best: 174.5699932 (399)	total: 4.13s	remaining: 6.16s
600:	learn: 161.6542036	test: 170.9663541	best: 170.9486287 (599)	total: 6.05s	remaining: 4.01s
800:	learn: 155.4821487	test: 168.6165435	best: 168.6165435 (800)	total: 8.28s	remaining: 2.06s
999:	learn: 151.1582250	test: 167.4239106	best: 167.4239106 (999)	total: 10.2s	remaining: 0us

bestTest = 167.4239106
bestIteration = 999

0:	learn: 101.0242985	test: 101.1390660	best: 101.1390660 (0)	total: 12.1ms	remaining: 12.1s
200:	learn: 24.0864663	test: 28.2220523	best: 28.2220523 (200)	total: 1.79s	remaining: 7.12s
400:	learn: 22.3952769	test: 27.4820479	best: 27.4817351 (399)	total: 3.53s	remaining: 5.28s
600:	learn: 21.5732680	test: 27.2303217	best: 27.2294678 (587)	total: 5.23s	remaining: 3.47s
800:	learn: 

In [None]:
def get_feat_importance(model,merged):
    feats = {'feature':merged.drop(columns =['date_forecast','time','pv_measurement']).columns,
         'importance':model.get_feature_importance()}
    print(feats)
    df = pd.DataFrame(feats).sort_values('importance',ascending = False)
    return df

In [None]:
get_feat_importance(model_b,merged_b)

In [17]:
pred_a = model_a.predict(x_test_a)
pred_b = model_b.predict(x_test_b)
pred_c = model_c.predict(x_test_c)

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="2023-05-01 00:00:00": Cannot convert 'b'2023-05-01 00:00:00'' to float

In [None]:
def create_sub(pred_a,pred_b,pred_c):
    submission = pd.read_csv('sample_submission.csv')
    submission['prediction'] = np.concatenate([pred_a,pred_b,pred_c])
    submission.loc[submission['prediction'] < 0, 'prediction'] = 0
    return submission


In [None]:
sub = create_sub(pred_a,pred_b,pred_c)
sub.to_csv(f'Submissions/secondCatboost.csv', index=False)

In [None]:



def build_model(X_train,y_train,location):
    
    merged_data = X_train.copy()
    merged_data['pv_measurement'] = y_train['pv_measurement'].values
    
    y = 'pv_measurement'
    x = list(X_train.columns)
    train = h2o.H2OFrame(merged_data)
    
    aml = H2OAutoML(
        max_models = 10,
        max_runtime_secs = 60,
        exclude_algos =['DeepLearning'],
        seed = 1,
        # stopping_metric ='logloss',
        sort_metric ='mae',
        balance_classes = False,
        project_name = location
    )

    aml.train(x=x, y=y, training_frame=train)
    
    lb = aml.leaderboard
    leader = aml.leader
    print(lb.head(rows=lb.nrows))
    
    h2o.save_model(leader, path=f'Saved_models/{location.upper()}', force = True)

    return lb,leader

In [None]:
lb_a,model_a = build_model(x_train_a,train_a,'A')
lb_b,model_b = build_model(x_train_b,train_b,'B')
lb_c,model_c = build_model(x_train_c,train_c,'C')

In [None]:
preds_a = model_a.predict(h2o.H2OFrame(x_test_a))
preds_b = model_b.predict(h2o.H2OFrame(x_test_b))
preds_c = model_c.predict(h2o.H2OFrame(x_test_c))

In [None]:
model_a_first = h2o.load_model('Saved_models/A/GBM_4_AutoML_1_20231020_170352')
model_b_first = h2o.load_model('Saved_models/B/GBM_8_AutoML_2_20231020_170654')
model_c_first = h2o.load_model('Saved_models/C/GBM_8_AutoML_2_20231020_170654')

In [None]:
preds_a_original = model_a_first.predict(h2o.H2OFrame(X_test_a))
preds_b_original = model_b_first.predict(h2o.H2OFrame(X_test_b))
preds_c_original = model_c_first.predict(h2o.H2OFrame(X_test_c))

In [None]:
def compare_two_preds(pred1,pred2):
    pred1 = preds_a_original.as_data_frame()
    pred2 = preds_a2.as_data_frame()

    plt.figure(figsize=(10, 8))

    # Scatter plot
    plt.scatter(y_pred1['predict'], y_pred2['predict'], alpha=0.5)

    # Line of equality (for reference)
    plt.plot([y_pred1['predict'].min(), y_pred1['predict'].max()],
             [y_pred2['predict'].min(), y_pred2['predict'].max()],
             color='red', linestyle='--')

    # Labels and title
    plt.xlabel('Predictions from First Model')
    plt.ylabel('Predictions from New model')
    plt.title('Comparison of Predictions from Two Models')

    # Show plot
    plt.grid(True)
    plt.show()

In [None]:
compare_two_preds(preds_a,preds_a_original)

In [None]:
# 2. Feature Importance
def feat_importance(model, n_feats):
    feature_importance = model.varimp(use_pandas=True)
    n_top_feats = feature_importance.iloc[:n_feats,:]
    return n_top_feats

print(feat_importance(model_a, 15))

In [None]:
def plot_prediction(preds):
    test = pd.read_csv('test.csv')
    predictions= preds['predict'].as_data_frame()
    predictions['time'] = test['time'].unique()
    fig, ax1 = plt.subplots(figsize=(15, 6))
    ax1.set_xlabel('Time')
    ax1.set_ylabel('Prediction', color='tab:blue')
    ax1.plot(predictions['time'], predictions['predict'], color='tab:blue', label='Solar Power Production')
    ax1.tick_params(axis='y', labelcolor='tab:blue')

    fig.tight_layout()
    plt.title(f'Time Series Plot of prediction')
    plt.show()

In [None]:
plot_prediction(preds_a)

In [None]:
plot_prediction(preds_c_original)

In [None]:
def create_sub(preds_a,preds_b,preds_c):
    submission = pd.read_csv('sample_submission.csv')
    predictions = preds_a.rbind(preds_b).rbind(preds_c)
    predictions_df = predictions['predict'].as_data_frame()
    predictions_df.loc[predictions_df['predict'] < 0, 'predict'] = 0
    submission['prediction'] = predictions_df['predict']
    return submission 


In [None]:
sub1 = create_sub(preds_a,preds_b,preds_c)


In [None]:
sub1.to_csv(f'Submissions/removedOutliersNight.csv', index=False)