# 🔧 PASO 1 – Preparar dataset
Búsqueda de nuevas variables mediante feature engineering (FE)

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Función para crear features avanzadas
def create_advanced_features(df, columnas_tn):
    """
    Crea features avanzadas para regresión lineal
    """
    df_features = df.copy()
    
    # 1. FEATURES POLINÓMICAS
    print("1. Creando features polinómicas...")
    for col in columnas_tn:
        # Cuadráticas
        df_features[f'{col}_squared'] = df_features[col] ** 2
        # Cúbicas
        df_features[f'{col}_cubed'] = df_features[col] ** 3
        # Raíz cuadrada (para valores positivos)
        df_features[f'{col}_sqrt'] = np.sqrt(np.abs(df_features[col]))
        # Logarítmicas (para valores positivos)
        df_features[f'{col}_log'] = np.log1p(np.abs(df_features[col]))
    
    # 2. FEATURES DE INTERACCIÓN
    print("2. Creando features de interacción...")
    for i, col1 in enumerate(columnas_tn):
        for j, col2 in enumerate(columnas_tn[i+1:], i+1):
            # Multiplicación
            df_features[f'{col1}_x_{col2}'] = df_features[col1] * df_features[col2]
            # División (evitando división por cero)
            df_features[f'{col1}_div_{col2}'] = df_features[col1] / (df_features[col2] + 1e-8)
            # Suma
            df_features[f'{col1}_plus_{col2}'] = df_features[col1] + df_features[col2]
            # Diferencia
            df_features[f'{col1}_minus_{col2}'] = df_features[col1] - df_features[col2]
    
    # 3. FEATURES ESTADÍSTICAS
    print("3. Creando features estadísticas...")
    # Media de todas las columnas
    df_features['mean_all'] = df_features[columnas_tn].mean(axis=1)
    # Mediana
    df_features['median_all'] = df_features[columnas_tn].median(axis=1)
    # Desviación estándar
    df_features['std_all'] = df_features[columnas_tn].std(axis=1)
    # Máximo
    df_features['max_all'] = df_features[columnas_tn].max(axis=1)
    # Mínimo
    df_features['min_all'] = df_features[columnas_tn].min(axis=1)
    # Rango
    df_features['range_all'] = df_features['max_all'] - df_features['min_all']
    # Suma total
    df_features['sum_all'] = df_features[columnas_tn].sum(axis=1)
    # Coeficiente de variación
    df_features['cv_all'] = df_features['std_all'] / (df_features['mean_all'] + 1e-8)
    
    # 4. FEATURES DE RANKING
    print("4. Creando features de ranking...")
    for col in columnas_tn:
        # Rank de cada columna
        df_features[f'{col}_rank'] = df_features[col].rank()
        # Percentil
        df_features[f'{col}_percentile'] = df_features[col].rank(pct=True)
    
    # 5. FEATURES BINARIAS/CATEGÓRICAS
    print("5. Creando features binarias...")
    for col in columnas_tn:
        # Indicador si es mayor que la media
        df_features[f'{col}_above_mean'] = (df_features[col] > df_features[col].mean()).astype(int)
        # Indicador si es positivo
        df_features[f'{col}_positive'] = (df_features[col] > 0).astype(int)
        # Cuartiles
        df_features[f'{col}_quartile'] = pd.qcut(df_features[col], 4, labels=[1,2,3,4])
    
    # 6. FEATURES DE CLUSTERING/AGRUPACIÓN
    print("6. Creando features de clustering...")
    # Distancia euclidiana desde el centroide
    centroid = df_features[columnas_tn].mean()
    df_features['distance_from_centroid'] = np.sqrt(((df_features[columnas_tn] - centroid) ** 2).sum(axis=1))
    
    # 7. FEATURES TEMPORALES (si hay columna de fecha)
    if 'ds' in df_features.columns:
        print("7. Creando features temporales...")
        df_features['ds'] = pd.to_datetime(df_features['ds'])
        df_features['year'] = df_features['ds'].dt.year
        df_features['month'] = df_features['ds'].dt.month
        df_features['quarter'] = df_features['ds'].dt.quarter
        df_features['day_of_year'] = df_features['ds'].dt.dayofyear
        df_features['month_sin'] = np.sin(2 * np.pi * df_features['month'] / 12)
        df_features['month_cos'] = np.cos(2 * np.pi * df_features['month'] / 12)
    
    return df_features

# Función para seleccionar las mejores features
def select_best_features(X, y, max_features=50):
    """
    Selecciona las mejores features usando correlación
    """
    # Calcular correlaciones con la variable objetivo
    correlations = X.corrwith(y).abs().sort_values(ascending=False)
    
    # Seleccionar las top features
    best_features = correlations.head(max_features).index.tolist()
    
    print(f"Top 10 features por correlación:")
    for i, (feature, corr) in enumerate(correlations.head(10).items()):
        print(f"{i+1:2d}. {feature}: {corr:.4f}")
    
    return best_features

# Función para entrenar modelo con features avanzadas
def train_advanced_linear_regression(df_train, df_test, columnas_tn, target='clase'):
    """
    Entrena regresión lineal con features avanzadas
    """
    print("=== ENTRENANDO MODELO CON FEATURES AVANZADAS ===")
    
    # Crear features avanzadas
    df_train_features = create_advanced_features(df_train, columnas_tn)
    df_test_features = create_advanced_features(df_test, columnas_tn)
    
    # Obtener todas las columnas numéricas (excluyendo target)
    numeric_cols = df_train_features.select_dtypes(include=[np.number]).columns
    feature_cols = [col for col in numeric_cols if col != target]
    
    print(f"Features disponibles: {len(feature_cols)}")
    
    # Preparar datos
    X_train_all = df_train_features[feature_cols].fillna(0)
    y_train = df_train_features[target]
    
    X_test_all = df_test_features[feature_cols].fillna(0)
    y_test = df_test_features[target]
    
    # Seleccionar mejores features
    best_features = select_best_features(X_train_all, y_train, max_features=30)
    
    X_train = X_train_all[best_features]
    X_test = X_test_all[best_features]
    
    # Escalar features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Entrenar modelo
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    
    # Predicciones
    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)
    
    # Métricas
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    
    print(f"\n=== RESULTADOS ===")
    print(f"Train RMSE: {train_rmse:.4f}")
    print(f"Test RMSE:  {test_rmse:.4f}")
    print(f"Train R²:   {train_r2:.4f}")
    print(f"Test R²:    {test_r2:.4f}")
    
    # Importancia de features
    feature_importance = pd.DataFrame({
        'feature': best_features,
        'coefficient': model.coef_,
        'abs_coefficient': np.abs(model.coef_)
    }).sort_values('abs_coefficient', ascending=False)
    
    print(f"\nTop 10 features más importantes:")
    for i, row in feature_importance.head(10).iterrows():
        print(f"{i+1:2d}. {row['feature']}: {row['coefficient']:8.4f}")
    
    return model, scaler, best_features, feature_importance

# Ejemplo de uso con features polinómicas automáticas
def train_polynomial_regression(df_train, df_test, columnas_tn, target='clase', degree=2):
    """
    Entrena regresión polinómica usando PolynomialFeatures
    """
    print(f"=== ENTRENANDO REGRESIÓN POLINÓMICA (grado {degree}) ===")
    
    # Preparar datos base
    X_train = df_train[columnas_tn].fillna(0)
    y_train = df_train[target]
    X_test = df_test[columnas_tn].fillna(0)
    y_test = df_test[target]
    
    # Crear features polinómicas
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    
    print(f"Features originales: {X_train.shape[1]}")
    print(f"Features polinómicas: {X_train_poly.shape[1]}")
    
    # Escalar
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_poly)
    X_test_scaled = scaler.transform(X_test_poly)
    
    # Entrenar
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)
    
    # Predicciones
    y_pred_train = model.predict(X_train_scaled)
    y_pred_test = model.predict(X_test_scaled)
    
    # Métricas
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    
    print(f"\nResultados:")
    print(f"Train RMSE: {train_rmse:.4f}")
    print(f"Test RMSE:  {test_rmse:.4f}")
    print(f"Train R²:   {train_r2:.4f}")
    print(f"Test R²:    {test_r2:.4f}")
    
    return model, scaler, poly

# EJEMPLO DE USO
print("=== TIPOS DE FEATURES PARA REGRESIÓN LINEAL ===")
print("1. Features Polinómicas: x², x³, √x, log(x)")
print("2. Features de Interacción: x₁*x₂, x₁/x₂, x₁+x₂, x₁-x₂")
print("3. Features Estadísticas: mean, median, std, max, min, range")
print("4. Features de Ranking: rank, percentile")
print("5. Features Binarias: x > mean, x > 0, quartiles")
print("6. Features de Clustering: distancia desde centroide")
print("7. Features Temporales: year, month, quarter, sin/cos")

# Comparación de modelos
def compare_models(df_train, df_test, columnas_tn, target='clase'):
    """
    Compara diferentes enfoques de feature engineering
    """
    results = []
    
    # 1. Modelo base (sin features adicionales)
    X_train = df_train[columnas_tn].fillna(0)
    y_train = df_train[target]
    X_test = df_test[columnas_tn].fillna(0)
    y_test = df_test[target]
    
    model_base = LinearRegression()
    model_base.fit(X_train, y_train)
    y_pred = model_base.predict(X_test)
    rmse_base = np.sqrt(mean_squared_error(y_test, y_pred))
    r2_base = r2_score(y_test, y_pred)
    
    results.append({
        'model': 'Base (sin features)',
        'features': len(columnas_tn),
        'rmse': rmse_base,
        'r2': r2_base
    })
    
    # 2. Modelo con features avanzadas
    try:
        _, _, best_features, _ = train_advanced_linear_regression(df_train, df_test, columnas_tn, target)
        # Aquí agregarías los resultados del modelo avanzado
    except:
        pass
    
    # 3. Modelo polinómico
    try:
        model_poly, _, _ = train_polynomial_regression(df_train, df_test, columnas_tn, target, degree=2)
        # Aquí agregarías los resultados del modelo polinómico
    except:
        pass
    
    print(f"\n=== COMPARACIÓN DE MODELOS ===")
    for result in results:
        print(f"{result['model']:20s} | Features: {result['features']:3d} | RMSE: {result['rmse']:.4f} | R²: {result['r2']:.4f}")
    
    return results

In [47]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Crear función de preparación con lags y clase
def preparar_dataset_para_regresion(df):
    df = df.sort_values(['product_id', 'periodo'])
    
    # Generar lags tn_1 a tn_11 por producto
    for i in range(1, 12):
        df[f'tn_{i}'] = df.groupby('product_id')['tn'].shift(i)
    
    # Agregar campo tn_1 * tn
    # df['tnt1'] = df['tn'] / df['tn_1']
    # df['tn1tn2'] = df['tn_1'] / df['tn_2']

    # Crear campo "clase" con tn en periodo+2
    df['clase'] = df.groupby('product_id')['tn'].shift(-2)
    
    return df

In [48]:
df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
df_prep = preparar_dataset_para_regresion(df)  # Preparar el dataset
df_prep

Unnamed: 0,periodo,product_id,tn,tn_1,tn_2,tn_3,tn_4,tn_5,tn_6,tn_7,tn_8,tn_9,tn_10,tn_11,clase
0,201701,20001,934.77222,,,,,,,,,,,,1303.35771
785,201702,20001,798.01620,934.77222,,,,,,,,,,,1069.96130
1566,201703,20001,1303.35771,798.01620,934.77222,,,,,,,,,,1502.20132
2352,201704,20001,1069.96130,1303.35771,798.01620,934.77222,,,,,,,,,1520.06539
3136,201705,20001,1502.20132,1069.96130,1303.35771,798.01620,934.77222,,,,,,,,1030.67391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
783,201701,21295,0.00699,,,,,,,,,,,,
6435,201708,21296,0.00651,,,,,,,,,,,,
784,201701,21297,0.00579,,,,,,,,,,,,
6436,201708,21298,0.00573,,,,,,,,,,,,


# 📚 PASO 2 – Entrenar regresión lineal

In [49]:
import pandas as pd
import numpy as np

def get_productos_36_meses(magicos, n_random=30, seed=42):
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn': 'sum'}).reset_index()  # Agrupar por periodo y producto

    # Paso 1: calcular la cantidad de registros por product_id
    conteo = df.groupby('product_id').size()

    # Paso 2: quedarte solo con los que tienen 36 registros
    productos_36_meses = conteo[conteo == 36].index.tolist()

    # Paso 3: eliminar productos mágicos
    productos_restantes = list(set(productos_36_meses) - set(magicos))

    # Paso 4: elegir aleatoriamente N productos distintos a los mágicos
    np.random.seed(seed)
    productos_random = np.random.choice(productos_restantes, size=n_random, replace=False).tolist()

    # Paso 5: concatenar mágicos + aleatorios
    productos_finales = magicos + productos_random

    # Paso 6: filtrar el DataFrame con esos productos
    df_filtrado = df[df['product_id'].isin(productos_finales)].copy()

    return df_filtrado


In [50]:
# Lista de productos mágicos
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]


get_productos_36_meses(magicos)['product_id'].unique()

array([20001, 20002, 20003, 20006, 20008, 20010, 20011, 20017, 20018,
       20019, 20021, 20023, 20026, 20028, 20035, 20037, 20039, 20042,
       20044, 20045, 20046, 20051, 20052, 20053, 20055, 20086, 20109,
       20114, 20118, 20158, 20167, 20176, 20180, 20181, 20193, 20272,
       20320, 20321, 20358, 20361, 20400, 20410, 20416, 20434, 20454,
       20465, 20466, 20501, 20532, 20568, 20612, 20637, 20672, 20680,
       20684, 20685, 20807, 20820, 20838, 20864, 20986, 21218, 20049])

In [61]:
# Entrenar regresión lineal en periodo base con productos mágicos
def entrenar_regresion_lineal(df, periodo_base, magicos):
    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)] # + ['tnt1'] + ['tn1tn2']
    
    # Filtrar registros del periodo base y productos mágicos
    df_train = df[(df['periodo'].isin(periodo_base)) & (df['product_id'].isin(magicos))].copy()
    
    # Eliminar filas con datos faltantes
    df_train = df_train.dropna(subset=columnas_tn + ['clase'])

    # Entrenar modelo
    X = df_train[columnas_tn]
    y = df_train['clase']
    model = LinearRegression()
    model.fit(X, y)

    # Mostrar coeficientes
    coef = pd.Series([model.intercept_] + model.coef_.tolist(), index=['intercept'] + columnas_tn)
    return model, coef


# 🔮 PASO 3 – Predecir con el modelo entrenado

In [52]:
def productos_con_historia():
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir

    contador = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
    contador = contador.groupby('product_id').size().reset_index(name='count')
    product_id = contador[contador['count']==12]['product_id'].unique()

    return product_id

In [None]:
# Predecir para un periodo futuro
def predecir_regresion(model):
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn': 'sum'}).reset_index()
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()
    df = df[(df['periodo'] >= 201901) & (df['periodo'] <= 201912)]
    df = df[df['product_id'].isin(productos_con_historia())]
    
    df = df.sort_values(['product_id', 'periodo'])

    # Generar lags tn_1 a tn_11 por producto
    for i in range(1, 12):
        df[f'tn_{i}'] = df.groupby('product_id')['tn'].shift(i)

    # df['tnt1'] = df['tn_1'] / df['tn_6']
    # df['tn1tn2'] = df['tn_1'] / df['tn_2']

    columnas_tn = ['tn'] + [f'tn_{i}' for i in range(1, 12)] # + ['tnt1'] + ['tn1tn2']

    # Filtrar periodo de predicción
    df_pred = df[df['periodo'] == 201912].copy()

    # Asegurar que no haya nulls
    df_pred = df_pred.dropna(subset=columnas_tn)

    # Predecir
    df_pred['pred'] = model.predict(df_pred[columnas_tn])

    return df_pred
 


def predecir_no_completos():
    
    df = pd.read_csv('../../data/raw/sell-in.csv', sep='\t')  # Cargar el dataset
    df = df.groupby(['periodo', 'product_id']).agg({'tn' : 'sum'}).reset_index() # Agrupar por periodo y producto
    productos = pd.read_csv('../../data/raw/product_id_apredecir201912.csv', sep=',')  # Cargar productos
    df = df[df['product_id'].isin(productos['product_id'].unique())].copy()  # Filtrar productos a predecir
    df = df[(df['periodo']>=201901) & (df['periodo']<=201912) ] 
    df = df[~df['product_id'].isin(productos_con_historia())]
    
    df = df.sort_values(['product_id', 'periodo'])
    
    df = df.groupby('product_id').agg({'tn':'mean'}).reset_index()  # Tomar los últimos 12 meses de cada producto
    
    return df

In [49]:
# prods = predecir_regresion(modelo)
# prods.sort_values(['product_id','periodo'], inplace=True)
# prods

# ▶️ PASO 4 – Ejecutar todo el flujo

In [59]:
# Paso A: preparar dataset (con tus datos originales en df)
# df_prep = preparar_dataset_para_regresion(df)

magicos_aux = get_productos_36_meses(magicos,n_random=0)['product_id'].unique()
meses = [201812]
# Paso B: entrenar el modelo con periodo 201812
modelo, coeficientes = entrenar_regresion_lineal(df_prep, periodo_base=meses, magicos=magicos_aux)
print("Coeficientes encontrados:")
print(coeficientes)

# Paso C: predecir para registros del periodo 201912 → objetivo: predecir 202002
df_pred_con_historia = predecir_regresion(modelo)
df_pred_con_historia = df_pred_con_historia[['product_id', 'pred']].rename(columns={'pred': 'tn'})

df_pred_sin_historia = predecir_no_completos()

df_pred = pd.concat([df_pred_con_historia, df_pred_sin_historia], ignore_index=True)

# Paso D: ver primeras predicciones
df_pred

Coeficientes encontrados:
intercept    2.781344
tn          -0.005202
tn_1        -0.000981
tn_2        -0.002027
tn_3         0.007222
tn_4         0.000969
tn_5        -0.002012
tn_6         0.006808
tn_7         0.000858
tn_8        -0.000254
tn_9        -0.000237
tn_10        0.004844
tn_11       -0.003216
dtype: float64


Unnamed: 0,product_id,tn
0,20001,34037.661002
1,20002,341.220255
2,20003,647.248354
3,20004,1073.608948
4,20005,16804.925255
...,...,...
775,21252,0.178011
776,21265,0.089541
777,21266,0.094659
778,21267,0.092835


# 💾 PASO 5 – Exportar a CSV (opcional)

In [60]:
# # Exportar archivo para enviar a Kaggle (ajustar columnas si necesario)
df_pred[['product_id', 'tn']].to_csv("./outputs/predicciones_regresion_lineal_v8_tnt1.csv", index=False)