In [1]:
import sys

# Add the parent directory to the module search path so modules can be imported from there
sys.path.insert(0, "..")

import pandas as pd
from likelihood import Pipeline

# Location of the JSON file that configures the pipeline
config_path = "pipeline_config.json"

# Read the raw input frame (swap in your own data source here)
df = pd.read_parquet("pipeline_data.parquet")

# Build the pipeline from its configuration; no data is processed yet
pipeline = Pipeline(config_path)




In [2]:
# Preview the raw frame (at most the first 15 rows) before any processing
df.head(15)

Unnamed: 0,Name,Age,Income,CreditScore,Target,DebtRatio
0,John,25,50000,680.0,1,0.25
1,Alice,30,60000,720.0,0,0.18
2,Bob,22,45000,650.0,1,0.35
3,Diana,28,52000,700.0,0,0.22
4,Ethan,35,75000,750.0,1,0.15
5,Sophia,29,62000,,0,
6,Michael,33,70000,670.0,1,0.28
7,Olivia,27,48000,,0,
8,Liam,31,55000,690.0,1,0.19
9,Emma,26,59000,,0,


In [3]:
# --- Fit ---------------------------------------------------------------------
# "Name" is dropped before fitting and re-attached afterwards so the printed
# preview stays readable.
train_features = df.drop(columns=["Name"])
X_train, y_train, importances = pipeline.fit(train_features)
print("Cleaned Features (X_train):")
# Inserting a Series aligns on the row index.
# NOTE(review): assumes fit() keeps the original row index -- confirm.
X_train.insert(0, "Name", df["Name"])
print(X_train.head())
print("\nTarget Vector (y_train):", y_train)
if importances is not None:
    print("\nFeature Importances:", importances)

# --- Persist and reload ------------------------------------------------------
# From here on `pipeline` refers to the instance reloaded from "./pipe".
pipeline.save("./pipe")
pipeline = Pipeline.load("./pipe")

# --- Transform new data (e.g., validation/test set) --------------------------
# No "Target" column here; None marks missing CreditScore/DebtRatio values.
new_data = pd.DataFrame(
    {
        "Name": ["Bob", "Alice", "John", "Sophia", "Liam", "Emma", "Noah", "Ava", "James"],
        "Age": [35, 28, 42, 30, 38, 19, 31, 29, 40],
        "Income": [75000, 64000, 82000, 69000, 73000, 67000, 71000, 66000, 78000],
        "CreditScore": [700.0, 680.0, 710.0, None, 690.0, None, 705.0, None, 715.0],
        "DebtRatio": [0.18, 0.22, 0.15, None, 0.20, None, 0.19, None, 0.16],
    }
)

new_features = new_data.drop(columns=["Name"])
X_new = pipeline.transform(new_features)
X_new.insert(0, "Name", new_data["Name"])
print("\nTransformed New Data:")
print(X_new.head(10))

Cleaned Features (X_train):
    Name  CreditScore  DebtRatio Age_range  45000-74999  75000-104999
0   John    -0.333333   0.047619     22-31          1.0           0.0
1  Alice     0.333333  -0.619048     22-31          1.0           0.0
2    Bob    -0.833333   1.000000     22-31          1.0           0.0
3  Diana     0.000000  -0.238095     22-31          1.0           0.0
4  Ethan     0.833333  -0.904762     32-41          0.0           1.0

Target Vector (y_train): [1 0 1 0 1 0 1 0 1 0 1 0 1]

Feature Importances:    CreditScore  DebtRatio  45000-74999  75000-104999
0     0.137664   0.189795     0.274358      0.398183

Transformed New Data:
     Name  CreditScore  DebtRatio Age_range  45000-74999  75000-104999
0     Bob     0.000000  -0.619048     32-41          0.0           1.0
1   Alice    -0.333333  -0.238095     22-31          1.0           0.0
2    John     0.166667  -0.904762      > 41          0.0           1.0
3  Sophia    -1.000000  -0.142857     22-31          1.0       

In [4]:
# Generate the pipeline's report; the recorded output shows it being saved
# to 'data_processing_report.html'.
pipeline.get_doc()

✅ Report saved to 'data_processing_report.html'
