In [None]:
# Optional one-time setup: uncomment to install Featuretools into the kernel's
# environment (the run recorded in this notebook reports version 1.31.0).
# %pip install featuretools



In [3]:
import sys, os
from pathlib import Path

# 1) Resolve the project directory (one level above notebooks/).
#    NOTE: Path() is the current working directory, so this assumes the kernel
#    was started from notebooks/.
project_root = Path().resolve().parent

# 2a) Option A: make the project root importable so "src" is seen as a package.
#     Any earlier copy of the entry is dropped first so that re-running this
#     cell does not pile up duplicates; the path is then placed at the front so
#     it keeps import priority.
project_root_str = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != project_root_str]
sys.path.insert(0, project_root_str)

# 2b) Option B (more direct): point straight at src/ and import from eda:
# sys.path.insert(0, str(project_root / "src"))

print("PYTHONPATH:", sys.path[:3])

PYTHONPATH: ['C:\\Users\\leona\\source\\repos\\fau_predic_salario', 'c:\\Users\\leona\\anaconda3\\envs\\funda_apre_auto\\python310.zip', 'c:\\Users\\leona\\anaconda3\\envs\\funda_apre_auto\\DLLs']


In [4]:
import featuretools as ft
from featuretools.selection import (
    remove_low_information_features,
    remove_highly_correlated_features
)
import pandas as pd
from src.eda.eda import merge_multiple_dataframes
# Pandas display settings for easier inspection: no truncation of columns,
# rows or cell width, three decimals of precision, matplotlib for plotting.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None
pd.options.display.precision = 3
pd.options.plotting.backend = 'matplotlib'
# Turn off the chained-assignment warning.
pd.set_option('mode.chained_assignment', None)

# Informational only; remove for production.
print(f'Pandas Version: {pd.__version__}') 

# Informational only; remove for production.
print(f'Featuretools Version: {ft.__version__}') 

Pandas Version: 2.1.4
Featuretools Version: 1.31.0


In [5]:
# Load the data: each raw table on its own, plus one merged frame.
df_descriptions = pd.read_csv("../data/descriptions.csv")
df_people = pd.read_csv("../data/people.csv")
df_salary = pd.read_csv("../data/salary.csv")

# The project helper receives the three CSV paths and the column name 'id'
# (presumably the join key — confirm against src/eda/eda.py).
file_paths = [
    '../data/descriptions.csv',
    '../data/people.csv',
    '../data/salary.csv',
]
merged_df = merge_multiple_dataframes(file_paths, 'id')

In [6]:
# Inspect which columns the merged frame contains.
merged_df.columns

Index(['id', 'Description', 'Age', 'Gender', 'Education Level', 'Job Title',
       'Years of Experience', 'Salary'],
      dtype='object')

In [8]:
# 1) Load the CSVs fresh for this cell.
#    NOTE(review): presumably reloaded here (instead of reusing frames read in
#    an earlier cell) so that re-running this cell starts from untouched
#    frames; add_dataframe(make_index=True) may add the index column to the
#    frame it is given — confirm before removing the reload.
df_people       = pd.read_csv("../data/people.csv")
df_descriptions = pd.read_csv("../data/descriptions.csv")
df_salary       = pd.read_csv("../data/salary.csv")

# people.id is used below as the parent table's primary key, so fail early
# with a clear message if it is not unique.
assert df_people["id"].is_unique, "people.id must be unique to serve as the EntitySet index"

# 2) Create the EntitySet
es = ft.EntitySet(id="employee_data")

# 3) Add the parent table "people", using its own primary key "id"
es = es.add_dataframe(
    dataframe_name="people",
    dataframe=df_people,
    index="id"                     # unique primary key of the parent table
)

# 4) Add each child table with a newly created unique index
es = es.add_dataframe(
    dataframe_name="descriptions",
    dataframe=df_descriptions,
    make_index=True,               # ask Featuretools to create an auto-numbered column
    index="descriptions_index"     # name of that new unique index
)

es = es.add_dataframe(
    dataframe_name="salary",
    dataframe=df_salary,
    make_index=True,
    index="salary_index"
)

# 5) Define the relationships through each child's "id" column
es = es.add_relationship(
    parent_dataframe_name="people",
    parent_column_name="id",
    child_dataframe_name="descriptions",
    child_column_name="id"         # "id" acts as the foreign key matching people.id
)

es = es.add_relationship(
    parent_dataframe_name="people",
    parent_column_name="id",
    child_dataframe_name="salary",
    child_column_name="id"
)

# 6) Run Deep Feature Synthesis with "people" as the target table
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="people",
    agg_primitives=["mean", "count", "max", "min"],
    max_depth=1
)

# NOTE(review): the resulting matrix contains MAX/MEAN/MIN(salary.Salary).
# If Salary is the value this project predicts, those columns leak the target
# and should be dropped (or excluded from DFS) before modelling — confirm.
feature_matrix.head()


  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


    Age  Gender Education Level  Years of Experience  COUNT(descriptions)  \
id                                                                          
0    32    Male      Bachelor's                  5.0                    1   
1    28  Female        Master's                  3.0                    1   
2    45    Male             PhD                 15.0                    1   
3    36  Female      Bachelor's                  7.0                    1   
4    52    Male        Master's                 20.0                    1   

    COUNT(salary)  MAX(salary.Salary)  MEAN(salary.Salary)  MIN(salary.Salary)  
id                                                                              
0               1             90000.0              90000.0             90000.0  
1               1             65000.0              65000.0             65000.0  
2               1            150000.0             150000.0            150000.0  
3               1             60000.0              6000