In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Directorio donde están los CSV
folder_path = r"C:\Users\Ibon\PycharmProjects\Microgrid\objective2_data_cleaned"

# Ubicaciones y tipos de archivos
locations = ["Cocoa", "Eugene", "Golden"]
file_types = ["aSiMicro03036", "aSiTandem72-46", "aSiTriple28324", "CdTe75638", "CIGS8-001", "CIGS39017", "HIT05667", "mSi0166", "mSi0188", "mSi460A8", "xSi12922"]

# Diccionarios para almacenar los datos
data = {loc: [] for loc in locations}

def load_data():
    for file in os.listdir(folder_path):
        for loc in locations:
            if file.startswith(loc):
                df = pd.read_csv(os.path.join(folder_path, file))
                df["source"] = file  # Agregar columna de origen para trazabilidad
                data[loc].append(df)

# Cargar los datos
load_data()

# Separar train y test para cada ubicación
train_data, test_data = {}, {}
for loc in locations:
    combined_df = pd.concat(data[loc], ignore_index=True)
    if "Pmp (W)" not in combined_df.columns:
        raise ValueError(f"La columna objetivo 'Pmp (W)' no se encuentra en los datos de {loc}")
    
    X = combined_df.drop(columns=["Pmp (W)"])
    y = combined_df["Pmp (W)"]
    
    # Se usa shuffle en lugar de stratify si hay pocas muestras en algunas clases
    train, test = train_test_split(combined_df, test_size=0.2, random_state=42, shuffle=True)
    train_data[loc] = train
    test_data[loc] = test

# Guardar archivos
for loc in locations:
    train_data[loc].to_csv(f"train_{loc.lower()}.csv", index=False)
    test_data[loc].to_csv(f"test_{loc.lower()}.csv", index=False)

# Crear los conjuntos X e y
X_train = pd.concat([train_data[loc].drop(columns=["Pmp (W)"]) for loc in locations])
X_test = pd.concat([test_data[loc].drop(columns=["Pmp (W)"]) for loc in locations])
y_train = pd.concat([train_data[loc]["Pmp (W)"] for loc in locations])
y_test = pd.concat([test_data[loc]["Pmp (W)"] for loc in locations])


In [5]:
train_cocoa = pd.read_csv("train_cocoa.csv")
test_cocoa = pd.read_csv("test_cocoa.csv")
train_eugene = pd.read_csv("train_eugene.csv")
test_eugene = pd.read_csv("test_eugene.csv")
train_gold = pd.read_csv("train_golden.csv")
test_gold = pd.read_csv("test_golden.csv")

print(len(train_cocoa))
print(len(test_cocoa))
print(len(train_eugene))
print(len(test_eugene))
print(len(train_gold))
print(len(test_gold))

321317
80330
346356
86590
68556
17139


In [6]:
# Función para mostrar la distribución de 'source'
def print_source_distribution(df, name):
    print(f"\nDistribución de 'source' en {name}:")
    print(df['source'].value_counts())
    print(f"Número de fuentes únicas: {df['source'].nunique()}")

# Aplicar la función a cada conjunto
print_source_distribution(train_cocoa, "train_cocoa")
print_source_distribution(test_cocoa, "test_cocoa")

print_source_distribution(train_eugene, "train_eugene")
print_source_distribution(test_eugene, "test_eugene")

print_source_distribution(train_gold, "train_golden")
print_source_distribution(test_gold, "test_golden")


Distribución de 'source' en train_cocoa:
source
Cocoa_aSiTandem72-46.csv    29914
Cocoa_mSi0188.csv           29834
Cocoa_CdTe75638.csv         29775
Cocoa_aSiMicro03036.csv     29766
Cocoa_xSi12922.csv          29693
Cocoa_mSi460A8.csv          29654
Cocoa_CIGS8-001.csv         29603
Cocoa_aSiTriple28324.csv    29326
Cocoa_HIT05667.csv          29249
Cocoa_mSi0166.csv           27958
Cocoa_CIGS39017.csv         26545
Name: count, dtype: int64
Número de fuentes únicas: 11

Distribución de 'source' en test_cocoa:
source
Cocoa_CIGS8-001.csv         7498
Cocoa_xSi12922.csv          7438
Cocoa_mSi460A8.csv          7435
Cocoa_CdTe75638.csv         7433
Cocoa_aSiMicro03036.csv     7414
Cocoa_mSi0188.csv           7410
Cocoa_aSiTandem72-46.csv    7406
Cocoa_aSiTriple28324.csv    7371
Cocoa_HIT05667.csv          7300
Cocoa_mSi0166.csv           7022
Cocoa_CIGS39017.csv         6603
Name: count, dtype: int64
Número de fuentes únicas: 11

Distribución de 'source' en train_eugene:
source
Eugene