In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import power_transform, StandardScaler
from sklearn.mixture import GaussianMixture

In [2]:
df = pd.read_parquet('../../data_parquet/train_data.parquet')
df_copy = df.copy()

**Procesamos las columnas con mayor significancia (Baseline) en base al precio**

In [3]:
df_copy = df_copy[['generation_fossil_gas','generation_fossil_hard_coal','total_load_actual',
                   'generation_nuclear','generation_hydro_run_of_river_and_poundage',
                   'generation_other_renewable','generation_waste','generation_fossil_oil',
                   'generation_other','generation_hydro_water_reservoir','generation_biomass',
                   'generation_solar','pressure','generation_wind_onshore','generation_hydro_pumped_storage_consumption',
                   'generation_fossil_brown_coal_lignite','temp_min','wind_speed','temp','temp_max',
                   'price_actual', 'time_hourly']]

In [4]:
def lag_price_date(df_copy):
    df_copy['lag_1'] = df_copy['price_actual'].shift(1)
    df_copy['lag_2'] = df_copy['price_actual'].shift(2)
    df_copy['lag_3'] = df_copy['price_actual'].shift(3)
    df_copy.dropna(inplace=True)
    return df_copy

In [5]:
df_copy = lag_price_date(df_copy)

In [8]:
def diff_price_date(df_copy):
    df_copy['diff_1'] = df_copy['price_actual'].diff(1)
    df_copy['diff_2'] = df_copy['price_actual'].diff(2)
    df_copy.dropna(inplace=True)
    return df_copy

In [9]:
df_copy = diff_price_date(df_copy)

In [11]:
def rolling_mean(df_copy):
    df_copy['rolling_mean_3'] = df_copy['price_actual'].rolling(window=3).mean()
    df_copy['rolling_mean_7'] = df_copy['price_actual'].rolling(window=7).mean()
    df_copy.dropna(inplace=True)
    return df_copy

In [12]:
df_copy = rolling_mean(df_copy)

In [4]:
# generation_fossil_gas, eliminar valores atípicos
def outliers_generation_fossil_gas(df_copy):
    # Deteccion de atípicos 
    q1 = df_copy['generation_fossil_gas'].quantile(0.25)
    q3 = df_copy['generation_fossil_gas'].quantile(0.75)
    iqr = q3-q1
    Lower_tail = q1 - 1.5 * iqr
    Upper_tail = q3 + 1.5 * iqr

    print('Antes de eliminar atípicos:',len(df_copy['generation_fossil_gas']))
    
    # Filtramos en pandas extrayendo los valores entre los quantiles
    filtered_df = df_copy[(df_copy['generation_fossil_gas'] >= Lower_tail)&(df_copy['generation_fossil_gas'] <= Upper_tail)]

    print('Despues de eliminar atípicos:',len(filtered_df))
    return filtered_df

In [5]:
df_copy=outliers_generation_fossil_gas(df_copy)

Antes de eliminar atípicos: 160564
Despues de eliminar atípicos: 150578


In [8]:
# Clase perteneciente a generation_fossil_hard_coal, kmeans y transformacion por distancias al centroide
class distance_transform_generation_fossil_hard_coal:
    def __init__(self, df_copy):
        self.df_copy = df_copy
    
    def scaler(self):
        scaler = StandardScaler()
        df_copy['generation_fossil_hard_coal']=scaler.fit_transform(self.df_copy[['generation_fossil_hard_coal']])
        return self.df_copy

    def kmeans_transform(self):
        self.scaler() # Escalamos en funcion
        # Clustering
        kmeans = KMeans(n_clusters=2, random_state=42)
        kmeans.fit(self.df_copy[['generation_fossil_hard_coal']])

        # Extraemos distancias
        distances = kmeans.transform(self.df_copy[['generation_fossil_hard_coal']])

        # Añadimos a la columna los nuevos valores
        self.df_copy['generation_fossil_hard_coal'] = distances
        
        return self.df_copy

In [9]:
distance_transform = distance_transform_generation_fossil_hard_coal(df_copy)
df_copy = distance_transform.kmeans_transform()

In [18]:
def gmm_total_load_actual(df_copy):
    data = df_copy['total_load_actual'].values.reshape(-1, 1)
    
    # Ajustar el Modelo de Mezcla Gaussiana
    gm = GaussianMixture(n_components=2, random_state=0).fit(data)
    labels = gm.predict(data)

    df_copy['total_load_actual'] = labels
    return df_copy

In [19]:
df_copy = gmm_total_load_actual(df_copy)

In [21]:
# generation_hydro_water_reservoir, transformación logarítmica
def logarithm_generation_hydro_run_of_river_and_poundage(df_copy):
    array1d = df_copy['generation_hydro_run_of_river_and_poundage'].values
    array2d = array1d.reshape(-1,1)
    df_copy['generation_hydro_run_of_river_and_poundage'] = power_transform(array2d, method='yeo-johnson', standardize=False)
    return df_copy

In [22]:
df_copy = logarithm_generation_hydro_run_of_river_and_poundage(df_copy)

In [25]:
def gmm_generation_other_renewable(df_copy):
    data = df_copy['generation_other_renewable'].values.reshape(-1, 1)
    
    # Ajustar el Modelo de Mezcla Gaussiana
    gm = GaussianMixture(n_components=2, random_state=0).fit(data)
    labels = gm.predict(data)

    df_copy['generation_other_renewable'] = labels
    return df_copy

In [26]:
df_copy = gmm_generation_other_renewable(df_copy)

In [27]:
# generation_waste, transformación logarítmica
def logarithm_generation_waste(df_copy):
    array1d = df_copy['generation_waste'].values
    array2d = array1d.reshape(-1,1)
    df_copy['generation_waste'] = power_transform(array2d, method='yeo-johnson', standardize=False)
    return df_copy

In [28]:
df_copy=logarithm_generation_waste(df_copy)

In [30]:
# generation_fossil_oil, eliminar valores atípicos
def outliers_generation_fossil_oil(df_copy):
    # Deteccion de atípicos 
    q1 = df_copy['generation_fossil_oil'].quantile(0.25)
    q3 = df_copy['generation_fossil_oil'].quantile(0.75)
    iqr = q3-q1
    Lower_tail = q1 - 1.5 * iqr
    Upper_tail = q3 + 1.5 * iqr

    print('Antes de eliminar atípicos:',len(df_copy['generation_fossil_oil']))
    
    # Filtramos en pandas extrayendo los valores entre los quantiles
    filtered_df = df_copy[(df_copy['generation_fossil_oil'] >= Lower_tail)&(df_copy['generation_fossil_oil'] <= Upper_tail)]

    print('Despues de eliminar atípicos:',len(filtered_df))
    return filtered_df

In [31]:
df_copy = outliers_generation_fossil_oil(df_copy)

Antes de eliminar atípicos: 150578
Despues de eliminar atípicos: 149702


In [32]:
def gmm_generation_other(df_copy):
    data = df_copy['generation_other'].values.reshape(-1, 1)
    
    # Ajustar el Modelo de Mezcla Gaussiana
    gm = GaussianMixture(n_components=3, random_state=0).fit(data)
    labels = gm.predict(data)

    df_copy['generation_other'] = labels
    return df_copy

In [33]:
df_copy = gmm_generation_other(df_copy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['generation_other'] = labels


In [35]:
# generation_hydro_water_reservoir, transformación logarítmica
def logarithm_generation_hydro_water_reservoirl(df_copy):
    array1d = df_copy['generation_hydro_water_reservoir'].values
    array2d = array1d.reshape(-1,1)
    df_copy['generation_hydro_water_reservoir'] = power_transform(array2d, method='box-cox', standardize=False)
    return df_copy

In [36]:
df_copy = logarithm_generation_hydro_water_reservoirl(df_copy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['generation_hydro_water_reservoir'] = power_transform(array2d, method='box-cox', standardize=False)


In [38]:
# generation_wind_onshore, transformación logarítmica
def logarithm_generation_wind_onshore(df_copy):
    array1d = df_copy['generation_wind_onshore'].values
    array2d = array1d.reshape(-1,1)
    df_copy['generation_wind_onshore'] = power_transform(array2d, method='yeo-johnson', standardize=False)
    return df_copy

In [39]:
df_copy = logarithm_generation_wind_onshore(df_copy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_copy['generation_wind_onshore'] = power_transform(array2d, method='yeo-johnson', standardize=False)


In [40]:
df_copy.head()

Unnamed: 0,generation_fossil_gas,generation_fossil_hard_coal,total_load_actual,generation_nuclear,generation_hydro_run_of_river_and_poundage,generation_other_renewable,generation_waste,generation_fossil_oil,generation_other,generation_hydro_water_reservoir,generation_biomass,generation_solar,pressure,generation_wind_onshore,generation_hydro_pumped_storage_consumption,generation_fossil_brown_coal_lignite,temp_min,wind_speed,temp,temp_max
0,3836,1.538978,0,5064,20.123681,0,988778.9,284,0,16.372457,372,56,1018,55.519595,1,463,14.0,3.6,15.2,17.0
1,4345,1.465943,0,6727,17.001332,1,1039426.0,284,1,15.990127,572,3155,1017,55.127503,1473,203,23.0,3.6,28.5,40.0
2,4078,0.390265,1,7103,21.075366,0,1269682.0,324,0,21.918898,274,4111,1026,63.440782,192,0,14.0,10.8,15.0,16.0
3,7496,2.156171,1,6081,21.328854,0,1691404.0,362,0,21.770474,277,16,1017,66.081575,0,622,27.0,10.8,27.0,27.0
4,5167,0.1187,1,6838,16.865613,0,1316829.0,250,2,15.416567,347,2987,1010,67.090034,912,393,33.0,10.8,33.4,34.0
