In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy import stats
from scipy.stats import skew, kurtosis
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

In [14]:
# Location of the daily CSV that the rest of the notebook works from.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
input_file = r'/Users/jojielyn/Desktop/School/04 Senior/Thesis/SARIMAX/Mindanao/MIN_Daily_Complete.csv'

# Read the file and put 0 into every missing cell in one chained expression.
# NOTE(review): the 0-fill happens before the -999 handling further down, so
# cells that were truly empty in the CSV end up as 0 rather than being
# forward-filled / interpolated - confirm that is intended.
data = pd.read_csv(input_file).fillna(0)

In [21]:
def process_columns(df, value, substrings_ffill, substrings_interpolate):
    """Replace a sentinel value with NaN and repair two groups of columns.

    Columns are picked by a case-insensitive, *literal* substring match on the
    column name. (Previously the substrings were joined into a regular
    expression, so characters such as ``(`` or ``.`` were interpreted as regex
    syntax and an empty list of substrings matched every column.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; ``replace`` returns a new frame.
    value : scalar
        Sentinel that marks a missing observation (the caller in this
        notebook passes -999).
    substrings_ffill : iterable of str
        A column whose name contains any of these is forward-filled.
    substrings_interpolate : iterable of str
        A column whose name contains any of these is linearly interpolated.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(ffill_cols, interpolate_cols)``: the selected columns after the
        respective repair. Either frame has zero columns when nothing matched.
        Leading NaNs (no earlier value to fill or interpolate from) are left
        as NaN by both repairs.
    """

    def _select(frame, substrings):
        # Literal, case-insensitive containment test; empty needles are
        # dropped so they cannot silently select every column.
        needles = [str(s).lower() for s in substrings if str(s)]
        mask = [
            any(needle in str(col).lower() for needle in needles)
            for col in frame.columns
        ]
        return frame.loc[:, mask]

    # Turn the sentinel into NaN so pandas' gap-filling routines can see it.
    df = df.replace(value, np.nan)

    # Forward fill: carry the last valid observation downwards.
    ffill_cols = _select(df, substrings_ffill)
    if ffill_cols.shape[1] > 0:
        ffill_cols = ffill_cols.ffill()

    # Linear interpolation between the neighbouring valid observations.
    # Skipped for an empty selection because some pandas versions raise on
    # interpolating a frame that has no columns.
    interpolate_cols = _select(df, substrings_interpolate)
    if interpolate_cols.shape[1] > 0:
        interpolate_cols = interpolate_cols.interpolate(method='linear')

    return ffill_cols, interpolate_cols

# Keep only the columns in which the -999 marker occurs at least once.
columns_with_minus_999 = data.loc[:, data.eq(-999).any(axis=0)]

# Repair those columns: rainfall-named ones are forward-filled, tmax/tmin-named
# ones are linearly interpolated.
# NOTE(review): a column that contains -999 but matches none of these name
# fragments is not returned by process_columns and therefore keeps its -999
# entries in X - confirm no such column exists.
rainfall_cols, temp_cols_interpolated = process_columns(
    columns_with_minus_999,
    -999,
    substrings_ffill=['rainfall'],
    substrings_interpolate=['tmax', 'tmin'],
)

# Feature frame: a copy of the loaded data with the repaired columns laid on top.
X = data.copy()
X[temp_cols_interpolated.columns] = temp_cols_interpolated
X[rainfall_cols.columns] = rainfall_cols

# Targets are the two price columns, taken from the loaded frame.
y = data[['GWAP', 'LWAP']]

In [22]:
# Chronological split: the first 80 % of the rows are used for training and
# the remainder is held out. No shuffling is applied.
train_size = int(len(X) * 0.8)
test_size = len(X) - train_size

# Positional slices of the feature and target frames.
train_data = X.iloc[:train_size]
train_labels = y.iloc[:train_size]

In [23]:
# Print the three lists of column names, one list per scaling family.
# NOTE(review): these lists are first assigned in a cell that appears further
# down in this notebook, so on a fresh kernel this cell raises NameError unless
# that later cell has already been executed.
print(minmax_cols)
print(boxcox_cols)
print(yeojohnson_cols)

['FLOW_MIN', 'Hourly Demand', 'TMIN_Davao City', 'TMAX_Surigao', 'TMIN_Surigao', 'TMIN_Zamboanga', 'TMIN_Butuan', 'TMAX_Malaybalay', 'TMAX_General Santos']
['GWAP', 'LWAP', 'TMAX_Davao City', 'TMAX_Zamboanga', 'TMAX_Dipolog', 'TMIN_Dipolog', 'TMAX_Butuan', 'TMIN_Malaybalay', 'TMAX_Cotabato', 'TMIN_Cotabato']
['RESERVE_GWAP_Fr', 'RESERVE_GWAP_Ru', 'RESERVE_GWAP_Rd', 'RESERVE_GWAP_Dr', 'RAINFALL_Davao City', 'RAINFALL_Surigao', 'RAINFALL_Zamboanga', 'RAINFALL_Dipolog', 'RAINFALL_Butuan', 'RAINFALL_Malaybalay', 'RAINFALL_General Santos', 'TMIN_General Santos', 'RAINFALL_Cotabato']


In [24]:
def classify_features(data):
    """Assign every column of ``data`` to one of three scaling families.

    The rule, applied per column using pandas' ``skew()`` and ``kurtosis()``:

    * skewness and kurtosis both within [-1, 1]  -> MinMax family
    * otherwise, every value strictly positive   -> Box-Cox family
    * otherwise                                  -> Yeo-Johnson family

    A column whose skewness or kurtosis is NaN fails the first test and is
    decided by the positivity check.

    This single definition replaces two copies of the same function that
    differed only in which global lists they appended to; the second copy
    silently shadowed the first.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    tuple of (list, list, list)
        ``(minmax, boxcox, yeojohnson)`` lists of column names, each in the
        order the columns appear in ``data``.
    """
    minmax, boxcox, yeojohnson = [], [], []

    for column in data.columns:
        col_data = data[column]
        skewness = col_data.skew()
        kurt = col_data.kurtosis()

        if -1 <= skewness <= 1 and -1 <= kurt <= 1:
            minmax.append(column)
        elif np.all(col_data > 0):
            # Box-Cox is only used when the whole column is strictly positive.
            boxcox.append(column)
        else:
            yeojohnson.append(column)

    return minmax, boxcox, yeojohnson


# Families for the feature columns.
# NOTE(review): this classifies `data` (all rows, as loaded) rather than the
# repaired frame `X` or the training slice, so any -999 entries still present
# in `data` and the held-out rows both influence the statistics - confirm.
minmax_cols, boxcox_cols, yeojohnson_cols = classify_features(data)

# Families for the target columns in `y`.
minmax_colsy, boxcox_colsy, yeojohnson_colsy = classify_features(y)


In [25]:
# --- Fit the feature transformers on the training rows and persist them ---
train_data_df = pd.DataFrame(train_data)

# Declare the three transformers up front: a plain [0, 1] scaler, and two
# pipelines that apply a power transform (not standardised) followed by the
# same [0, 1] scaling.
minmax_test = MinMaxScaler(feature_range=(0, 1))
boxcox_pipeline = Pipeline(steps=[
    ('boxcox', PowerTransformer(method='box-cox', standardize=False)),
    ('minmax', MinMaxScaler(feature_range=(0, 1))),
])
yeojohnson_pipeline = Pipeline(steps=[
    ('yeojohnson', PowerTransformer(method='yeo-johnson', standardize=False)),
    ('minmax', MinMaxScaler(feature_range=(0, 1))),
])

# MinMax family: fit, transform, save.
minmaxfit = minmax_test.fit(train_data_df[minmax_cols])
train_data_minmax = minmaxfit.transform(train_data_df[minmax_cols])
joblib.dump(minmaxfit, 'minmax_scaler.pkl')

# Box-Cox family: fit, transform, save.
bc = boxcox_pipeline.fit(train_data_df[boxcox_cols])
train_data_bc = bc.transform(train_data_df[boxcox_cols])
joblib.dump(bc, 'boxcox_pipeline.pkl')

# Yeo-Johnson family: fit, transform, save.
yj = yeojohnson_pipeline.fit(train_data_df[yeojohnson_cols])
train_data_yj = yj.transform(train_data_df[yeojohnson_cols])
joblib.dump(yj, 'yeojohnson_pipeline.pkl')

# Column order of the result: MinMax columns, then Box-Cox, then Yeo-Johnson.
train_data_transformed = np.hstack((train_data_minmax, train_data_bc, train_data_yj))


In [27]:
# --- Fit and persist the transformer for the target columns ---
train_labels_df = pd.DataFrame(train_labels)

# Same recipe as the feature Box-Cox pipeline: power transform without
# standardisation, followed by scaling to [0, 1]. Note that this rebinds the
# name `boxcox_pipeline` to a new, separate Pipeline object.
boxcox_pipeline = Pipeline([
    ('boxcox', PowerTransformer(method='box-cox', standardize=False)),
    ('minmax', MinMaxScaler(feature_range=(0, 1)))
])

# Fit on the label frame itself. Previously this was fitted on
# `train_data_df`, which only worked because the feature frame happens to
# carry the target columns as well.
# NOTE(review): only the targets listed in `boxcox_colsy` are handled here;
# any target classified into the other two families would be left out.
bcy = boxcox_pipeline.fit(train_labels_df[boxcox_colsy])
joblib.dump(bcy, 'boxcox_pipeliney.pkl')

['boxcox_pipeliney.pkl']

In [28]:
# Apply the fitted `bcy` pipeline to the training-label columns listed in
# `boxcox_colsy`; the result replaces any earlier `train_labels_transformed`.
train_labels_transformed = bcy.transform(train_labels_df[boxcox_colsy])

In [29]:
# Wrap each transformed training array in a DataFrame and write it to CSV
# without the index column. No column names are supplied, so the CSV headers
# are the default integer positions.

# Labels first ...
train_labels_transformed = pd.DataFrame(data=train_labels_transformed)
train_labels_transformed.to_csv('train_labels_transformed.csv', index=False)

# ... then the features.
train_data_transformed = pd.DataFrame(data=train_data_transformed)
train_data_transformed.to_csv('train_data_transformed.csv', index=False)

In [30]:
# Held-out portion: every row from position `train_size` onwards, for both the
# feature frame and the target frame.
test_data = X.iloc[train_size:]
test_labels = y.iloc[train_size:]

In [31]:

# Write the held-out targets and features (the untransformed slices) to CSV
# in the current working directory, omitting the index column.
test_labels.to_csv('test_labels.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

In [32]:
# Write the untransformed training feature rows to CSV, omitting the index column.
train_data.to_csv('train_data.csv', index=False)

In [33]:
# --- Transform the held-out feature rows ---
#
# The held-out rows are mapped with the transformers that were fitted on the
# training rows (`minmaxfit`, `bc`, `yj`). Previously this cell fitted three
# brand-new transformers on the test rows, which (a) put test features on a
# different scale from the training features and (b) overwrote the
# train-fitted pickles on disk with test-fitted ones.
#
# Consequences of the fix:
#   * nothing is fitted or written to disk here;
#   * scaled test values may fall slightly outside [0, 1], because the range
#     was learned from the training rows only.
#
# This cell must run after the training-transform cell and before any cell
# that rebinds `minmaxfit`, `bc` or `yj` to differently fitted objects.
test_data = X[train_size:]
test_data_df = pd.DataFrame(test_data)

test_data_minmax = minmaxfit.transform(test_data_df[minmax_cols])
test_data_bc = bc.transform(test_data_df[boxcox_cols])
test_data_yj = yj.transform(test_data_df[yeojohnson_cols])

# Same column order as the training matrix: MinMax, Box-Cox, Yeo-Johnson.
test_data_transformed = np.hstack([test_data_minmax, test_data_bc, test_data_yj])

In [34]:
# Wrap the stacked test array in a DataFrame and write it to CSV without the
# index column. No column names are passed, so the header row holds the
# default integer column labels.
test_data_transformed = pd.DataFrame(test_data_transformed)
test_data_transformed.to_csv('test_data_transformed.csv', index=False)

In [35]:
# --- Fit and apply the three transformers on the complete dataset ---
# NOTE(review): this works on `data` (the frame as loaded), not on the
# repaired frame `X`, so any -999 entries still present in `data` take part in
# the fit - confirm that is intended.
# NOTE(review): the three joblib.dump calls reuse the file names written by
# the training-transform cell, so after this cell the pickles on disk hold the
# full-data fits - confirm which fit downstream code expects to load.
data_df = pd.DataFrame(data)

# Declare the transformers first: a plain [0, 1] scaler plus two pipelines of
# (non-standardised power transform -> [0, 1] scaling).
minmax_test = MinMaxScaler(feature_range=(0, 1))
boxcox_pipeline = Pipeline(steps=[
    ('boxcox', PowerTransformer(method='box-cox', standardize=False)),
    ('minmax', MinMaxScaler(feature_range=(0, 1))),
])
yeojohnson_pipeline = Pipeline(steps=[
    ('yeojohnson', PowerTransformer(method='yeo-johnson', standardize=False)),
    ('minmax', MinMaxScaler(feature_range=(0, 1))),
])

# MinMax family.
minmaxfit = minmax_test.fit(data_df[minmax_cols])
data_minmax = minmaxfit.transform(data_df[minmax_cols])
joblib.dump(minmaxfit, 'minmax_scaler.pkl')

# Box-Cox family.
bc = boxcox_pipeline.fit(data_df[boxcox_cols])
data_bc = bc.transform(data_df[boxcox_cols])
joblib.dump(bc, 'boxcox_pipeline.pkl')

# Yeo-Johnson family.
yj = yeojohnson_pipeline.fit(data_df[yeojohnson_cols])
data_yj = yj.transform(data_df[yeojohnson_cols])
joblib.dump(yj, 'yeojohnson_pipeline.pkl')

# Column order of the result: MinMax columns, then Box-Cox, then Yeo-Johnson.
data_transformed = np.hstack((data_minmax, data_bc, data_yj))

In [36]:
# Wrap the stacked full-dataset array in a DataFrame and write it to CSV
# without the index column. No column names are passed, so the header row
# holds the default integer column labels.
data_transformed = pd.DataFrame(data_transformed)
data_transformed.to_csv('data_transformed.csv', index=False)
                        