In [9]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy import stats
from scipy.stats import skew, kurtosis
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

In [None]:
# Read the CSV at `input_file` and replace every missing value with 0 in one pass.
input_file = r'/Users/pjam/Desktop/School/year4/sem1/thesis/Luzon/LUZ_Daily_Complete.csv'
data = pd.read_csv(input_file).fillna(0)

In [61]:
def process_columns(df, value, substrings_ffill, substrings_interpolate):
    """Replace a sentinel value with NaN and fill two groups of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a replaced copy is used.
    value : scalar
        Sentinel that is replaced by NaN in every column of ``df``.
    substrings_ffill : iterable of str
        A column is forward-filled when its name contains any of these
        substrings (case-insensitive, literal match).
    substrings_interpolate : iterable of str
        A column is linearly interpolated when its name contains any of
        these substrings (case-insensitive, literal match).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(ffill_cols, interpolate_cols)``: the forward-filled columns and
        the interpolated columns. Either frame has zero columns when its
        substring list is empty or matches nothing.
    """
    def _columns_matching(frame, substrings):
        # Literal substring test instead of a '|'-joined regex: an empty list
        # then selects nothing (the regex '' would match every column) and
        # characters such as '.' or '(' are not treated as regex syntax.
        needles = [str(item).lower() for item in substrings]
        mask = [
            any(needle in str(name).lower() for needle in needles)
            for name in frame.columns
        ]
        return frame.loc[:, mask]

    # Turn the sentinel into NaN so the fill methods below can see the gaps.
    df = df.replace(value, np.nan)

    ffill_cols = _columns_matching(df, substrings_ffill)
    if not ffill_cols.empty:
        ffill_cols = ffill_cols.ffill()

    interpolate_cols = _columns_matching(df, substrings_interpolate)
    if not interpolate_cols.empty:
        interpolate_cols = interpolate_cols.interpolate(method='linear')

    return ffill_cols, interpolate_cols

# Restrict the clean-up to columns in which the -999 placeholder occurs at least once.
columns_with_minus_999 = data.loc[:, data.eq(-999).any(axis=0)]

# Rainfall columns are forward-filled; tmax / tmin columns are linearly interpolated.
rainfall_cols, temp_cols_interpolated = process_columns(
    df=columns_with_minus_999,
    value=-999,
    substrings_ffill=['rainfall'],
    substrings_interpolate=['tmax', 'tmin'],
)

# Feature frame: a copy of the loaded data with the cleaned columns written back.
X = data.copy()
for _cleaned in (rainfall_cols, temp_cols_interpolated):
    X[_cleaned.columns] = _cleaned

# Label frame: the two price columns taken from the loaded data.
y = data[['GWAP', 'LWAP']]

In [62]:
# Contiguous, unshuffled split by row position:
#   first 60% -> training, next 20% -> validation, remaining rows (about 20%) -> testing.
train_size = int(0.6 * len(X))
val_size = int(0.20 * len(X))
test_size = len(X) - train_size - val_size

train_data, train_labels = X[:train_size], y[:train_size]

val_data, val_labels = (
    X[train_size:train_size + val_size],
    y[train_size:train_size + val_size],
)

test_data, test_labels = X[train_size + val_size:], y[train_size + val_size:]

In [63]:
def classify_features(data):
    """Sort the columns of ``data`` into three scaling families.

    For every column the sample skewness and kurtosis (pandas ``skew()`` /
    ``kurtosis()``) are computed and the column is assigned as follows:

    * both statistics within [-1, 1]      -> MinMax family
    * otherwise, all values strictly > 0  -> Box-Cox family
    * otherwise                           -> Yeo-Johnson family

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are classified. It is not modified.

    Returns
    -------
    tuple of (list, list, list)
        ``(minmax, boxcox, yeojohnson)`` column labels, each list keeping the
        column order of ``data``.
    """
    minmax, boxcox, yeojohnson = [], [], []
    for column in data.columns:
        col_data = data[column]
        skewness = col_data.skew()
        kurt = col_data.kurtosis()

        if -1 <= skewness <= 1 and -1 <= kurt <= 1:
            minmax.append(column)
        elif (col_data > 0).all():
            # Box-Cox is only defined for strictly positive data.
            boxcox.append(column)
        else:
            yeojohnson.append(column)
    return minmax, boxcox, yeojohnson


# Features are classified on the cleaned frame X rather than on the raw `data`,
# so the -999 placeholders do not distort skewness, kurtosis or the positivity
# check for the columns the transformers are later fitted on.
minmax_cols, boxcox_cols, yeojohnson_cols = classify_features(X)

# Same classification for the label columns.
minmax_colsy, boxcox_colsy, yeojohnson_colsy = classify_features(y)


In [64]:
# One DataFrame per split so columns can be selected by name below.
train_data_df, val_data_df, test_data_df = (
    pd.DataFrame(split) for split in (train_data, val_data, test_data)
)

# --- MinMax family: fitted on the training split, then applied to all three splits.
minmax_test = MinMaxScaler(feature_range=(0, 1))
minmaxfit = minmax_test.fit(train_data_df[minmax_cols])
joblib.dump(minmaxfit, 'minmax_scaler.pkl')
train_data_minmax, val_data_minmax, test_data_minmax = (
    minmaxfit.transform(split[minmax_cols])
    for split in (train_data_df, val_data_df, test_data_df)
)

# --- Box-Cox family: power transform followed by scaling to [0, 1].
boxcox_pipeline = Pipeline(steps=[
    ('boxcox', PowerTransformer(method='box-cox', standardize=False)),
    ('minmax', MinMaxScaler(feature_range=(0, 1))),
])
bc = boxcox_pipeline.fit(train_data_df[boxcox_cols])
joblib.dump(bc, 'boxcox_pipeline.pkl')
train_data_bc, val_data_bc, test_data_bc = (
    bc.transform(split[boxcox_cols])
    for split in (train_data_df, val_data_df, test_data_df)
)

# --- Yeo-Johnson family: power transform followed by scaling to [0, 1].
yeojohnson_pipeline = Pipeline(steps=[
    ('yeojohnson', PowerTransformer(method='yeo-johnson', standardize=False)),
    ('minmax', MinMaxScaler(feature_range=(0, 1))),
])
yj = yeojohnson_pipeline.fit(train_data_df[yeojohnson_cols])
joblib.dump(yj, 'yeojohnson_pipeline.pkl')
train_data_yj, val_data_yj, test_data_yj = (
    yj.transform(split[yeojohnson_cols])
    for split in (train_data_df, val_data_df, test_data_df)
)

In [65]:
# Join the three transformed families side by side for each split
# (column order: MinMax, Box-Cox, Yeo-Johnson).
train_data_transformed = np.concatenate(
    (train_data_minmax, train_data_bc, train_data_yj), axis=1
)
val_data_transformed = np.concatenate(
    (val_data_minmax, val_data_bc, val_data_yj), axis=1
)
test_data_transformed = np.concatenate(
    (test_data_minmax, test_data_bc, test_data_yj), axis=1
)



In [66]:
# Stack train, validation and test rows back together in that order and label
# the columns in the same family order used when the arrays were joined.
transformed_data = np.concatenate(
    (train_data_transformed, val_data_transformed, test_data_transformed), axis=0
)
transformed_data_df = pd.DataFrame(
    transformed_data,
    columns=[*minmax_cols, *boxcox_cols, *yeojohnson_cols],
)

transformed_data_df.to_csv('luz_transformed_data.csv', index=False)

In [67]:
# One DataFrame per label split so columns can be selected by name below.
train_labels_df = pd.DataFrame(train_labels)
val_labels_df = pd.DataFrame(val_labels)
test_labels_df = pd.DataFrame(test_labels)

# Box-Cox + MinMaxScaler for the label columns, fitted on the training labels
# only and then applied to every label split. The label frames are used here
# (not the feature frames) so this step does not rely on the target columns
# also being present among the features.
# NOTE: this rebinds the name `boxcox_pipeline` to a new, label-specific pipeline.
boxcox_pipeline = Pipeline([
    ('boxcox', PowerTransformer(method='box-cox', standardize=False)),
    ('minmax', MinMaxScaler(feature_range=(0, 1)))
])
bcy = boxcox_pipeline.fit(train_labels_df[boxcox_colsy])
train_labels_bc = bcy.transform(train_labels_df[boxcox_colsy])
val_labels_bc = bcy.transform(val_labels_df[boxcox_colsy])
test_labels_bc = bcy.transform(test_labels_df[boxcox_colsy])
joblib.dump(bcy, 'boxcox_pipeliney.pkl')

# Only the Box-Cox family is transformed for the labels: any label column
# listed in minmax_colsy or yeojohnson_colsy is not part of these arrays.
train_labels_transformed = np.hstack([train_labels_bc])
val_labels_transformed = np.hstack([val_labels_bc])
test_labels_transformed = np.hstack([test_labels_bc])

In [50]:
# Stack the transformed label splits back together (train, validation, test).
transformed_label = np.vstack([train_labels_transformed, val_labels_transformed, test_labels_transformed])

# The stacked arrays hold only the Box-Cox-transformed label columns, so the
# column names must come from boxcox_colsy alone; adding the other two family
# lists would raise a length mismatch whenever either of them is non-empty.
transformed_label_df = pd.DataFrame(transformed_label, columns=list(boxcox_colsy))
display(transformed_label_df)

Unnamed: 0,GWAP,LWAP
0,0.000000,0.000457
1,0.005839,0.000000
2,0.208342,0.202951
3,0.222435,0.142095
4,0.299150,0.230127
...,...,...
725,0.528816,0.503483
726,0.511256,0.486112
727,0.524511,0.502653
728,0.514392,0.497723
