In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from autogluon.tabular import TabularPredictor
from sklearn.model_selection._split import _BaseKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor

# Raise pandas' display limits so wide and long frames are rendered in full.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200


## Import the datasets

In [2]:
# PV targets for each location folder (A, B, C).
train_a, train_b, train_c = (
    pd.read_parquet(f'{location}/train_targets.parquet') for location in ('A', 'B', 'C')
)

# Training features, "estimated" files.
x_train_estimated_a, x_train_estimated_b, x_train_estimated_c = (
    pd.read_parquet(f'{location}/X_train_estimated.parquet') for location in ('A', 'B', 'C')
)

# Training features, "observed" files.
x_train_observed_a, x_train_observed_b, x_train_observed_c = (
    pd.read_parquet(f'{location}/X_train_observed.parquet') for location in ('A', 'B', 'C')
)

# Test features, "estimated" files.
x_test_estimated_a, x_test_estimated_b, x_test_estimated_c = (
    pd.read_parquet(f'{location}/X_test_estimated.parquet') for location in ('A', 'B', 'C')
)

## Merge x_train observed and x_train_estimated

In [3]:
# Stack the observed rows on top of the estimated rows for every location.
x_train_a, x_train_b, x_train_c = (
    pd.concat([observed, estimated])
    for observed, estimated in (
        (x_train_observed_a, x_train_estimated_a),
        (x_train_observed_b, x_train_estimated_b),
        (x_train_observed_c, x_train_estimated_c),
    )
)

## Part 1: Data Preprocessing

1. Aggregate the 15-minute data to hourly intervals:
   - Method 1: Take the mean of the four 15-minute recordings in each hour, resulting in hourly measurements.
   - Method 2: Create a separate column for each 15-minute value, effectively quadrupling the number of columns.
2. Handle missing values
   - Remove rows with NaN values in pv_measurement
   - Remove rows with timestamps that are not present in both x and y
3. Feature engineering (date/time components, an observed flag, and cyclical encodings)


### 1. Remove NaN rows and columns

In [4]:
# Drop unwanted (mostly-NaN) feature columns such as snow_density:kgm3.
def remove_nan_cols(x,remove_cols):
    """Return a new DataFrame equal to ``x`` without the columns in ``remove_cols``.

    ``x`` itself is left untouched. A ``KeyError`` is raised by pandas if any
    of the requested columns does not exist.
    """
    return x.drop(columns=remove_cols)

In [5]:
# CatBoost feature sets: only snow_density:kgm3 is removed.
x_train_a1, x_train_b1, x_train_c1 = (
    remove_nan_cols(frame, ['snow_density:kgm3'])
    for frame in (x_train_a, x_train_b, x_train_c)
)

x_test_a1, x_test_b1, x_test_c1 = (
    remove_nan_cols(frame, ['snow_density:kgm3'])
    for frame in (x_test_estimated_a, x_test_estimated_b, x_test_estimated_c)
)

In [6]:
# AutoGluon feature sets: snow_density:kgm3 and cloud_base_agl:m are removed.
x_train_a2, x_train_b2, x_train_c2 = (
    remove_nan_cols(frame, ['snow_density:kgm3', 'cloud_base_agl:m'])
    for frame in (x_train_a, x_train_b, x_train_c)
)

x_test_a2, x_test_b2, x_test_c2 = (
    remove_nan_cols(frame, ['snow_density:kgm3', 'cloud_base_agl:m'])
    for frame in (x_test_estimated_a, x_test_estimated_b, x_test_estimated_c)
)

### 1. Transform data to hourly

**Method 1: Take the mean of the four 15-minute recordings in each hour, resulting in hourly measurements.**

In [7]:
def resample_to_hourly(x):
    """Aggregate a frame to hourly rows by averaging within each hour.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain a datetime ``date_forecast`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per hour between the first and last timestamp, with
        ``date_forecast`` restored as a regular column. Hours that have no
        source rows appear with NaN values.
    """
    # The lowercase 'h' alias replaces 'H', which is deprecated since pandas 2.2.
    # numeric_only=False keeps non-numeric columns in the aggregation.
    return (
        x.set_index('date_forecast')
        .resample('h')
        .mean(numeric_only=False)
        .reset_index()
    )

# Hourly means of the CatBoost training features.
x_train_a_hourly, x_train_b_hourly, x_train_c_hourly = (
    resample_to_hourly(frame) for frame in (x_train_a1, x_train_b1, x_train_c1)
)

# Hourly means of the CatBoost test features.
x_test_a_hourly, x_test_b_hourly, x_test_c_hourly = (
    resample_to_hourly(frame) for frame in (x_test_a1, x_test_b1, x_test_c1)
)

In [8]:
# Keep only the hourly test rows whose timestamp is listed in test.csv.
test = pd.read_csv('test.csv')
pred_time_stamps = test['time'].unique()
x_test_a1, x_test_b1, x_test_c1 = (
    hourly.loc[hourly['date_forecast'].isin(pred_time_stamps)]
    for hourly in (x_test_a_hourly, x_test_b_hourly, x_test_c_hourly)
)

**Method 2: Create a separate column for each 15-minute value, quadrupling the number of columns.**

In [9]:
def resample_to_hourly_quarters(x, date_column='date_forecast', exclude_column='date_calc'):
    """Reshape 15-minute rows into one row per hour with a column per quarter.

    Every feature ``f`` becomes up to four columns ``f_Q0`` .. ``f_Q3`` holding
    the value recorded in the corresponding quarter of the hour.

    Parameters
    ----------
    x : pandas.DataFrame
        Input frame; it is copied and never modified.
    date_column : str
        Column with the timestamps (parsed with ``pd.to_datetime``).
    exclude_column : str
        Column that is not pivoted; its first value in each hour is kept.

    Returns
    -------
    pandas.DataFrame
        ``date_column``, ``exclude_column`` and the pivoted feature columns,
        inner-joined on the hour.
    """
    df = x.copy()
    # Ensure the date column is datetime and use it as the index.
    df[date_column] = pd.to_datetime(df[date_column])
    df = df.set_index(date_column)

    # The excluded column is reduced separately: first value of every hour.
    # ('h' replaces the 'H' alias, which is deprecated since pandas 2.2.)
    excluded_data = df[[exclude_column]].resample('h').first().reset_index()

    # Everything else is pivoted on the quarter of the hour (0, 1, 2, 3).
    features = df.drop(columns=[exclude_column])
    features['quarter'] = features.index.minute // 15

    # 'first' is enough because each (hour, quarter) pair should be unique.
    df_pivot = features.pivot_table(index=features.index.floor('h'),
                                    columns='quarter',
                                    aggfunc='first')

    # Flatten the (feature, quarter) column MultiIndex into '<feature>_Q<quarter>'.
    df_pivot.columns = ['{}_Q{}'.format(feature, quarter) for feature, quarter in df_pivot.columns]
    df_pivot = df_pivot.reset_index()

    # Bring the excluded column back next to the pivoted features.
    return pd.merge(excluded_data, df_pivot, on=date_column)

# Make sure to pass the column name that contains the datetime information
x_train_a_hourly2 = resample_to_hourly_quarters(x_train_a2, date_column='date_forecast')
x_train_b_hourly2 = resample_to_hourly_quarters(x_train_b2, date_column='date_forecast')
x_train_c_hourly2 = resample_to_hourly_quarters(x_train_c2, date_column='date_forecast')

x_test_a_hourly2 = resample_to_hourly_quarters(x_test_a2, date_column='date_forecast')
x_test_b_hourly2 = resample_to_hourly_quarters(x_test_b2, date_column='date_forecast')
# Location C must be built from its own test frame (the original passed x_test_b2 here).
x_test_c_hourly2 = resample_to_hourly_quarters(x_test_c2, date_column='date_forecast')

## 2. Handle consecutive constant PV measurements

In [10]:
# Filters out rows where 'pv_measurement' repeats the previous value for longer than a threshold.
def remove_constant_intervals(y_train, low_thresh, upp_thresh = 10**6):
    """
    Remove runs of constant PV readings whose length lies strictly between two bounds.

    A row counts as "flat" when its ``pv_measurement`` differs from the previous
    row by less than 1e-5. Consecutive flat rows form a run; a run is dropped
    when ``low_thresh < length < upp_thresh``. The first reading of a constant
    stretch differs from its predecessor, so it is not part of the run and is
    kept. Constant readings may indicate sensor malfunctions or logging issues.

    Parameters
    ----------
    y_train : pandas.DataFrame
        Frame with a ``pv_measurement`` column, in chronological row order.
        It is not modified.
    low_thresh : int
        Runs must be longer than this (in rows) to be removed.
    upp_thresh : int, optional
        Runs must be shorter than this (in rows) to be removed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``y_train`` without the rows of the selected runs; the original
        index labels are preserved.
    """
    # Flag rows that (almost) repeat the previous measurement.
    is_flat = y_train['pv_measurement'].diff().abs() < 1e-5

    # A new run id starts every time the flag flips.
    run_id = (is_flat != is_flat.shift()).cumsum()

    # Number of rows in every flat run.
    run_lengths = run_id[is_flat].value_counts()
    within_bounds = (run_lengths > low_thresh) & (run_lengths < upp_thresh)
    faulty_runs = run_lengths[within_bounds].index

    # One boolean mask replaces re-filtering the whole frame once per interval.
    drop_mask = is_flat & run_id.isin(faulty_runs)
    return y_train.loc[~drop_mask].copy()

In [11]:
# Drop runs of constant measurements that last longer than one day (24 hourly rows).
train_a, train_b, train_c = (
    remove_constant_intervals(targets, 24) for targets in (train_a, train_b, train_c)
)

## 3. Handle missing values and rows
   - Remove NaN pv measurement values from train a/b/c
   - Remove rows that have timestamps that are not present in both x and y

In [12]:

# Index labels of the target rows whose 'pv_measurement' is NaN
nan_indices_a = train_a.index[train_a['pv_measurement'].isna().to_numpy()]
nan_indices_b = train_b.index[train_b['pv_measurement'].isna().to_numpy()]
nan_indices_c = train_c.index[train_c['pv_measurement'].isna().to_numpy()]

# Remove those rows and renumber the remaining ones from zero
train_a = train_a.drop(index=nan_indices_a).reset_index(drop=True)
train_b = train_b.drop(index=nan_indices_b).reset_index(drop=True)
train_c = train_c.drop(index=nan_indices_c).reset_index(drop=True)



In [13]:
# Remove all rows with date-time values that are not present in both x and y in order to synchronize x and its labels.
def remove_non_synchronous_rows(x_train, y_train, x_date_column='date_forecast', y_date_column='time'):
    """Keep only the rows of ``x_train`` and ``y_train`` whose timestamps occur in both.

    Parameters
    ----------
    x_train, y_train : pandas.DataFrame
        Feature and target frames. Neither is modified: the datetime
        conversion is applied to copies, so passing a slice of another frame
        no longer writes into that slice.
    x_date_column, y_date_column : str
        Names of the timestamp columns in ``x_train`` and ``y_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train_synced, y_train_synced)`` with datetime-typed date columns
        and the original index labels of the surviving rows.
    """
    # Convert the date columns on copies so the callers' frames stay untouched.
    x_frame = x_train.assign(**{x_date_column: pd.to_datetime(x_train[x_date_column])})
    y_frame = y_train.assign(**{y_date_column: pd.to_datetime(y_train[y_date_column])})

    # Timestamps of x that also occur in y.
    in_both = x_frame[x_date_column].isin(y_frame[y_date_column])
    common_dates = x_frame.loc[in_both, x_date_column]

    # Filter both frames down to the shared timestamps.
    x_train_synced = x_frame.loc[x_frame[x_date_column].isin(common_dates)]
    y_train_synced = y_frame.loc[y_frame[y_date_column].isin(common_dates)]

    return x_train_synced, y_train_synced

# Align the hourly-mean features (Method 1) with the targets of each location.
(x_train_a1, train_a), (x_train_b1, train_b), (x_train_c1, train_c) = (
    remove_non_synchronous_rows(features, targets)
    for features, targets in (
        (x_train_a_hourly, train_a),
        (x_train_b_hourly, train_b),
        (x_train_c_hourly, train_c),
    )
)

# Align the per-quarter features (Method 2) with the already filtered targets.
(x_train_a2, train_a), (x_train_b2, train_b), (x_train_c2, train_c) = (
    remove_non_synchronous_rows(features, targets)
    for features, targets in (
        (x_train_a_hourly2, train_a),
        (x_train_b_hourly2, train_b),
        (x_train_c_hourly2, train_c),
    )
)

In [14]:
# CatBoost frames: discard rows that have no 'diffuse_rad:W' value.
x_train_a1, x_train_b1, x_train_c1 = (
    frame.dropna(subset=['diffuse_rad:W']) for frame in (x_train_a1, x_train_b1, x_train_c1)
)

x_test_a1, x_test_b1, x_test_c1 = (
    frame.dropna(subset=['diffuse_rad:W']) for frame in (x_test_a1, x_test_b1, x_test_c1)
)

In [15]:
# AutoGluon frames: discard rows that have no 'diffuse_rad:W_Q1' value.
x_train_a2, x_train_b2, x_train_c2 = (
    frame.dropna(subset=['diffuse_rad:W_Q1']) for frame in (x_train_a2, x_train_b2, x_train_c2)
)

# The test frames are taken from the per-quarter hourly frames.
x_test_a2, x_test_b2, x_test_c2 = (
    frame.dropna(subset=['diffuse_rad:W_Q1'])
    for frame in (x_test_a_hourly2, x_test_b_hourly2, x_test_c_hourly2)
)

### 3. Feature Engineering 
   - Add time features: hour, day, month, year
   - Add binary observed column
   - Add cyclical features
   - Add direct_rad x sun_elevation feature

In [16]:
# Derives calendar features and an 'observed' flag, then drops 'date_calc'
def extract_date_features(X):
    """Return a copy of ``X`` with calendar columns and an ``observed`` flag.

    Adds ``year``, ``month``, ``day`` and ``hour`` taken from ``date_forecast``
    and ``observed``, which is the string '1' where ``date_calc`` is missing
    and '0' otherwise. The ``date_calc`` column is removed from the result.
    """
    stamps = X['date_forecast'].dt
    observed_flag = X['date_calc'].isna().astype(int).astype(str)

    enriched = X.assign(
        year=stamps.year,
        month=stamps.month,
        day=stamps.day,
        hour=stamps.hour,
        observed=observed_flag,
    )
    return enriched.drop(columns=['date_calc'])

In [17]:
# Calendar features and observed flag for the CatBoost frames.
x_train_a1, x_train_b1, x_train_c1 = (
    extract_date_features(frame) for frame in (x_train_a1, x_train_b1, x_train_c1)
)

x_test_a1, x_test_b1, x_test_c1 = (
    extract_date_features(frame) for frame in (x_test_a1, x_test_b1, x_test_c1)
)


In [18]:
# Calendar features and observed flag for the AutoGluon frames.
x_train_a2, x_train_b2, x_train_c2 = (
    extract_date_features(frame) for frame in (x_train_a2, x_train_b2, x_train_c2)
)

x_test_a2, x_test_b2, x_test_c2 = (
    extract_date_features(frame) for frame in (x_test_a2, x_test_b2, x_test_c2)
)

In [19]:
# Sine/cosine encodings of the hour of day and the month of year
def add_cyclic(x_train):
    """Return a copy of ``x_train`` with four cyclical encoding columns appended.

    ``hour_sin`` / ``hour_cos`` use a 24-step cycle on ``hour``;
    ``month_sin`` / ``month_cos`` use a 12-step cycle on ``month - 1``.
    The source ``hour`` and ``month`` columns are kept.
    """
    hour_angle = 2 * np.pi * x_train['hour'] / 24
    month_angle = 2 * np.pi * (x_train['month'] - 1) / 12

    return x_train.assign(
        hour_sin=np.sin(hour_angle),
        hour_cos=np.cos(hour_angle),
        month_sin=np.sin(month_angle),
        month_cos=np.cos(month_angle),
    )

# Cyclical encodings for the CatBoost frames.
x_train_a1, x_train_b1, x_train_c1 = (
    add_cyclic(frame) for frame in (x_train_a1, x_train_b1, x_train_c1)
)
x_test_a1, x_test_b1, x_test_c1 = (
    add_cyclic(frame) for frame in (x_test_a1, x_test_b1, x_test_c1)
)

# Cyclical encodings for the AutoGluon frames.
x_train_a2, x_train_b2, x_train_c2 = (
    add_cyclic(frame) for frame in (x_train_a2, x_train_b2, x_train_c2)
)
x_test_a2, x_test_b2, x_test_c2 = (
    add_cyclic(frame) for frame in (x_test_a2, x_test_b2, x_test_c2)
)

In [20]:
# Remove the timestamp column from every test frame.
x_test_a1, x_test_b1, x_test_c1 = (
    frame.drop(columns=['date_forecast']) for frame in (x_test_a1, x_test_b1, x_test_c1)
)

x_test_a2, x_test_b2, x_test_c2 = (
    frame.drop(columns=['date_forecast']) for frame in (x_test_a2, x_test_b2, x_test_c2)
)

## Part 2: Model Building 

### 1. Catboost

In [None]:
# Join the CatBoost features with their targets on the timestamp (inner join).
merged_a1, merged_b1, merged_c1 = (
    pd.merge(features, targets, left_on='date_forecast', right_on='time', how='inner')
    for features, targets in (
        (x_train_a1, train_a),
        (x_train_b1, train_b),
        (x_train_c1, train_c),
    )
)

In [22]:
def build_catboost_multiple_seed(merged_df,x_test,number_of_models):
    """Train one CatBoost regressor per seed and average their test predictions.

    For every seed in ``range(number_of_models)`` the data is split 80/20 with
    that seed, a model is fitted with early stopping on the validation part,
    and ``x_test`` is predicted.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Features plus ``pv_measurement``; the ``date_forecast`` and ``time``
        columns are dropped before training.
    x_test : pandas.DataFrame
        Features to predict.
    number_of_models : int
        Number of seeds, i.e. number of models.

    Returns
    -------
    tuple
        ``(averaged_predictions, models, average_score)``: the mean prediction
        over all models, the fitted models, and the mean best validation MAE.
    """
    features = merged_df.drop(columns=['date_forecast', 'time', 'pv_measurement'])
    target = merged_df['pv_measurement']

    models, predictions, scores = [], [], []

    for seed in range(number_of_models):
        # Seed-dependent 80/20 split; the 20% part drives early stopping.
        X_train, X_validation, y_train, y_validation = train_test_split(
            features, target, train_size=0.8, random_state=seed)

        catboost_model = CatBoostRegressor(
            cat_features=['observed'],
            iterations=10000,
            learning_rate=0.1,
            depth=6,
            loss_function='MAE',
            eval_metric='MAE',
            random_seed=seed,
            verbose=1000
        )
        catboost_model.fit(
            X_train,
            y_train,
            eval_set=(X_validation, y_validation),
            use_best_model=True,
            early_stopping_rounds=200,
        )

        # Report the best validation MAE reached with this seed.
        score = catboost_model.get_best_score()['validation']['MAE']
        print(f"Best validation MAE for seed {seed}: {score}")

        scores.append(score)
        predictions.append(catboost_model.predict(x_test))
        models.append(catboost_model)

    # Average the per-model predictions and the per-model scores.
    averaged_predictions = np.mean(predictions, axis=0)
    average_score = np.mean(scores, axis = 0)

    return averaged_predictions,models, average_score

In [23]:
# Ten seeds (models) per location.
pred_a1, models_a, avg_a = build_catboost_multiple_seed(
    merged_df=merged_a1, x_test=x_test_a1, number_of_models=10)
pred_b1, models_b, avg_b = build_catboost_multiple_seed(
    merged_df=merged_b1, x_test=x_test_b1, number_of_models=10)
pred_c1, models_c, avg_c = build_catboost_multiple_seed(
    merged_df=merged_c1, x_test=x_test_c1, number_of_models=10)

0:	learn: 583.7446175	test: 601.9980832	best: 601.9980832 (0)	total: 67.4ms	remaining: 11m 14s
200:	learn: 185.8090543	test: 194.9324312	best: 194.9324312 (200)	total: 2.24s	remaining: 1m 49s
400:	learn: 171.4713252	test: 187.4856701	best: 187.4856701 (400)	total: 4.42s	remaining: 1m 45s
600:	learn: 163.7304040	test: 184.3851517	best: 184.3850770 (599)	total: 6.52s	remaining: 1m 41s
800:	learn: 158.2052404	test: 182.3577627	best: 182.3577627 (800)	total: 8.58s	remaining: 1m 38s
1000:	learn: 153.5286275	test: 181.3651376	best: 181.3651376 (1000)	total: 10.6s	remaining: 1m 35s
1200:	learn: 149.8737580	test: 180.2530526	best: 180.2530526 (1200)	total: 12.6s	remaining: 1m 32s
1400:	learn: 145.7199321	test: 179.3874434	best: 179.3856318 (1385)	total: 14.7s	remaining: 1m 30s
1600:	learn: 142.6327839	test: 178.7861887	best: 178.7835656 (1584)	total: 16.7s	remaining: 1m 27s
1800:	learn: 139.9537399	test: 178.1622931	best: 178.1616605 (1793)	total: 18.7s	remaining: 1m 25s
2000:	learn: 137.49055

NameError: name 'x_test_b' is not defined

In [None]:
# Mean best-validation MAE over all seeds for locations A, B and C.
print(avg_a, avg_b, avg_c)

In [None]:
def create_sub(pred_a,pred_b,pred_c):
    """Build a submission frame from the per-location predictions.

    Reads ``sample_submission.csv``, writes the predictions of A, B and C
    (concatenated in that order) into its ``prediction`` column and replaces
    negative predictions by 0.
    """
    submission = pd.read_csv('sample_submission.csv')
    stacked = np.concatenate([pred_a, pred_b, pred_c])
    submission['prediction'] = stacked
    # Negative values are set to zero.
    is_negative = submission['prediction'] < 0
    submission['prediction'] = submission['prediction'].mask(is_negative, 0)
    return submission

In [None]:
# Combine the CatBoost predictions for A, B and C into one submission frame.
cat_pred = create_sub(pred_a1,pred_b1,pred_c1)

### 2. AutoGluon

In [None]:
# Join the AutoGluon features with their targets on the timestamp (inner join).
merged_a2, merged_b2, merged_c2 = (
    pd.merge(features, targets, left_on='date_forecast', right_on='time', how='inner')
    for features, targets in (
        (x_train_a2, train_a),
        (x_train_b2, train_b),
        (x_train_c2, train_c),
    )
)

In [None]:
# `random` is not among the notebook's top-level imports, so it is imported
# here; without it the random.seed call below raises NameError.
import random

seed_value = 42  # Replace with your desired seed value
# Seed Python's and NumPy's global random number generators.
random.seed(seed_value)
np.random.seed(seed_value)

In [None]:
def build_autogluon(merged_data, time_limit,location):
    """Fit an AutoGluon ``TabularPredictor`` on one location's training data.

    Parameters
    ----------
    merged_data : pandas.DataFrame
        Features plus the ``pv_measurement`` label; ``date_forecast`` and
        ``time`` are dropped before fitting.
    time_limit : int or float
        Passed to ``fit`` as ``time_limit``.
    location : str
        Used as the sub-directory name under ``AutgluonModels/``.

    Returns
    -------
    TabularPredictor
        The fitted predictor.
    """
    training_frame = merged_data.drop(columns=['date_forecast', 'time'])

    predictor = TabularPredictor(
        label='pv_measurement',
        eval_metric='mean_absolute_error',
        path=f'AutgluonModels/{location}',
    )
    predictor.fit(
        train_data=training_frame,
        verbosity=2,
        presets='best_quality',
        time_limit=time_limit,
    )
    return predictor

In [None]:
# Train on the AutoGluon frames merged_a2/b2/c2 (merged_a/b/c were never defined).
model_a2 = build_autogluon(merged_a2,1500,'A')
model_b2 = build_autogluon(merged_b2,1500,'B')
model_c2 = build_autogluon(merged_c2,1500,'C')

In [None]:
# Predict with the AutoGluon test frames (x_test_a/b/c were never defined).
pred_a2 = model_a2.predict(x_test_a2)
pred_b2 = model_b2.predict(x_test_b2)
pred_c2 = model_c2.predict(x_test_c2)

# Build the submission from the AutoGluon predictions computed just above.
gluon_sub = create_sub(pred_a2,pred_b2,pred_c2)

## Part 3: Blend predictions and submit

In [None]:
def weighted_avg(sub1,sub2, w1, w2, zero_below=8):
    """Blend two submissions by a weighted sum of their predictions.

    Parameters
    ----------
    sub1, sub2 : pandas.DataFrame
        Submission frames with ``id`` and ``prediction`` columns; they are
        joined on ``id``.
    w1, w2 : float
        Weights applied to the predictions of ``sub1`` and ``sub2``.
    zero_below : float, optional
        Blended predictions strictly below this value are set to 0. The
        default of 8 is the cut-off that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The merged frame with a single blended ``prediction`` column.
    """
    merged_df = pd.merge(sub1, sub2, on=['id'])
    blended = merged_df['prediction_x']*w1 + merged_df['prediction_y']*w2

    final_df = merged_df.drop(columns=['prediction_x', 'prediction_y'])
    # Small blended values are replaced by exactly zero.
    final_df['prediction'] = blended.mask(blended < zero_below, 0)
    return final_df

In [None]:
# Blend the CatBoost (60%) and AutoGluon (40%) submissions; sub1/sub2 were never defined.
final_df = weighted_avg(cat_pred,gluon_sub,0.6,0.4)
# Create the output folder first so to_csv does not fail on a fresh checkout.
os.makedirs('Final_subs', exist_ok=True)
final_df.to_csv('Final_subs/firstShortNotebook.csv', index=False)