In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import os

ticker = "AAPL"
print(f"--- Calculando Factores para {ticker} ---\n")

# Read the pre-processed dataset for the ticker and convert both date columns
# to datetime so they can be compared and sorted later on.
df = pd.read_csv(f"../data/processed/{ticker}_ready_for_features.csv")
for date_column in ('date_market', 'date_accounting'):
    df[date_column] = pd.to_datetime(df[date_column])

# Report how much data was loaded and the market-date span it covers.
first_day = df['date_market'].min()
last_day = df['date_market'].max()
print(f"üìä Datos cargados: {len(df)} observaciones")
print(f"   Rango: {first_day} a {last_day}\n")

# --- FETCH HISTORICAL SHARES OUTSTANDING ---
# Attach a 'shares_outstanding' column to df. Preferred source: the
# 'Ordinary Shares Number' row of the yfinance balance sheet, matched to each
# row's date_accounting. Fallbacks: the current share count from
# empresa.info, and finally a hard-coded constant.
fallback_shares = 15_500_000_000  # Rough constant approximation for Apple

print("üîç Obteniendo shares outstanding hist√≥ricos...")
try:
    empresa = yf.Ticker(ticker)
    # Transpose so that each report date becomes a row of the frame.
    balance = empresa.balance_sheet.T
    if 'Ordinary Shares Number' in balance.columns:
        # Drop reports with no share count: otherwise merge_asof would match
        # the NaN row instead of falling back to the previous valid report.
        shares_historicos = balance[['Ordinary Shares Number']].dropna().copy()
        shares_historicos.index = pd.to_datetime(shares_historicos.index)
        shares_historicos.columns = ['shares_outstanding']

        # merge_asof requires the left key to be sorted. Sorting on both date
        # columns keeps the order deterministic: a single-column sort uses a
        # non-stable algorithm by default and could shuffle the daily rows
        # that share the same date_accounting.
        df_sorted = df.sort_values(['date_accounting', 'date_market'])
        shares_sorted = shares_historicos.sort_index()

        df = pd.merge_asof(
            df_sorted,
            shares_sorted,
            left_on='date_accounting',
            right_index=True,
            direction='backward'
        )
        # Restore chronological market order; the shift-based price factors
        # computed later (pct_change / rolling) depend on it.
        df = df.sort_values('date_market', kind='stable').reset_index(drop=True)
        print(f"‚úÖ Shares outstanding hist√≥ricos obtenidos")

        # A backward match leaves NaN for rows dated before the first report;
        # surface that instead of letting it pass silently.
        sin_shares = int(df['shares_outstanding'].isna().sum())
        if sin_shares:
            print(f"   Aviso: {sin_shares} filas sin shares outstanding "
                  "(anteriores al primer balance disponible)")
    else:
        print("‚ö†Ô∏è  Shares outstanding no disponible en balance, usando aproximaci√≥n")
        # `or` also covers the case where the key exists but holds None.
        df['shares_outstanding'] = (
            empresa.info.get('sharesOutstanding') or fallback_shares
        )
except Exception as e:
    # Deliberate best-effort: any download or merge failure degrades to the
    # constant approximation rather than aborting the whole run.
    print(f"‚ö†Ô∏è  Error obteniendo shares: {e}")
    print("   Usando valor aproximado constante")
    df['shares_outstanding'] = fallback_shares

# --- PAPER FACTOR CALCULATION ---
print("\nüßÆ Calculando factores del paper...\n")

# The price-based factors in section 8 are positional shifts over rows
# (pct_change / rolling), so they are only meaningful when rows are in
# market-date order. Earlier steps may reorder the frame (the asof merge
# sorts on date_accounting), so restore chronological order up front.
df = df.sort_values('date_market', kind='stable').reset_index(drop=True)

# 1. MARKET EQUITY (me) - p. 27, Table 8
df['me'] = df['Close'] * df['shares_outstanding']
print("‚úì Market Equity (me)")

# 2. BOOK EQUITY (be) - the paper's more precise definition is
# be = seq + txditc (deferred taxes) - pstk (preferred stock).
# Simplified here to seq (stockholders' equity).
df['be'] = df['seq'].fillna(df['at'] - df['lt'])  # Fallback: assets - liabilities
print("‚úì Book Equity (be)")

# 3. BOOK-TO-MARKET (be_me) - classic value factor
df['be_me'] = df['be'] / df['me']
print("‚úì Book-to-Market (be_me)")

# 4. PROFITABILITY RATIOS
# Gross Profit to Assets (gp_at) - p. 19, Table 6
df['gp'] = df['sale'] - df['cogs']
df['gp_at'] = df['gp'] / df['at']
print("‚úì Gross Profit to Assets (gp_at)")

# Operating Profitability (op_at) - EBIT / assets
df['op_at'] = df['ebit'] / df['at']
print("‚úì Operating Profitability (op_at)")

# Return on Assets (roa)
df['roa'] = df['ni'] / df['at']
print("‚úì Return on Assets (roa)")

# Return on Equity (roe)
df['roe'] = df['ni'] / df['be']
print("‚úì Return on Equity (roe)")

# 5. INVESTMENT FACTORS
# Asset Growth (at_gr1) - growth in total assets from one report to the next.
# Computed on one row per accounting date, not on trading days, so the daily
# repetition of the same report does not produce zero growth.
df_temp = df[['date_accounting', 'at']].drop_duplicates(subset='date_accounting')
df_temp = df_temp.sort_values('date_accounting')
# Previous report; this is a one-year lag only if the reports are annual.
df_temp['at_lag1y'] = df_temp['at'].shift(1)
df_temp['at_gr1'] = (df_temp['at'] / df_temp['at_lag1y']) - 1

# Merge the per-report growth back onto every daily row.
df = df.merge(
    df_temp[['date_accounting', 'at_gr1']],
    on='date_accounting',
    how='left',
    suffixes=('', '_new')
)
# If df already carried an 'at_gr1' column, the merge produced 'at_gr1_new';
# keep the freshly computed values under the canonical name.
if 'at_gr1_new' in df.columns:
    df['at_gr1'] = df['at_gr1_new']
    df.drop('at_gr1_new', axis=1, inplace=True)

print("‚úì Asset Growth (at_gr1)")

# Investment to Assets (inv_at) - CapEx / assets
df['inv_at'] = df['capx'] / df['at']
print("‚úì Investment to Assets (inv_at)")

# 6. LEVERAGE & LIQUIDITY
# Debt to Assets (debt_at)
df['debt_at'] = df['lt'] / df['at']
print("‚úì Debt to Assets (debt_at)")

# Simplified proxy for the current ratio: cash / total assets
df['cash_at'] = df['che'] / df['at']
print("‚úì Cash to Assets (cash_at)")

# 7. EFFICIENCY RATIOS
# Asset Turnover (sale_at)
df['sale_at'] = df['sale'] / df['at']
print("‚úì Asset Turnover (sale_at)")

# Inventory Turnover (sale_inv); a zero denominator becomes NaN instead of inf
df['sale_inv'] = df['sale'] / df['inv'].replace(0, np.nan)
print("‚úì Inventory Turnover (sale_inv)")

# Receivables Turnover (sale_rect); a zero denominator becomes NaN instead of inf
df['sale_rect'] = df['sale'] / df['rect'].replace(0, np.nan)
print("‚úì Receivables Turnover (sale_rect)")

# 8. MOMENTUM & PRICE FACTORS (from market data; rows are in date_market order)
# 1-month return (approx. 21 trading days)
df['ret_1m'] = df['Close'].pct_change(21)
print("‚úì Return 1 Month (ret_1m)")

# 12-month return (252 trading days)
df['ret_12m'] = df['Close'].pct_change(252)
print("‚úì Return 12 Months (ret_12m)")

# 1-month volatility (standard deviation of daily returns over 21 rows)
df['ret_daily'] = df['Close'].pct_change()
df['vol_1m'] = df['ret_daily'].rolling(21).std()
print("‚úì Volatility 1 Month (vol_1m)")

# --- FINAL CLEANUP ---
print("\nüßπ Limpiando datos...")

# Turn every +inf / -inf in the frame into a missing value.
infinite_values = [np.inf, -np.inf]
df = df.replace(infinite_values, np.nan)

# Winsorization at the 1% and 99% percentiles (as in the paper) - optional but recommended
def winsorize(series, lower=0.01, upper=0.99):
    """Cap a series at the values found at two of its own quantiles.

    Args:
        series: pandas Series to cap.
        lower: Quantile (0-1) whose value becomes the floor.
        upper: Quantile (0-1) whose value becomes the ceiling.

    Returns:
        A clipped copy of ``series``. When the series holds no non-missing
        value (empty or all NaN) the very same object is returned untouched.
    """
    # Nothing to compute quantiles from: hand the input back as-is.
    if not series.notna().any():
        return series

    floor, ceiling = (series.quantile(q) for q in (lower, upper))
    return series.clip(lower=floor, upper=ceiling)

def _present_in_df(names):
    """Return the entries of ``names`` that are columns of df, in order."""
    return [name for name in names if name in df.columns]


# Factors whose extreme values get capped
factores_winsorizar = ['be_me', 'gp_at', 'op_at', 'roa', 'roe', 'at_gr1',
                       'inv_at', 'debt_at', 'sale_at', 'sale_inv', 'sale_rect',
                       'ret_1m', 'ret_12m']

print("   Aplicando winsorizaci√≥n (1%-99%) a factores financieros...")
for name in factores_winsorizar:
    if name not in df.columns:
        continue
    uncapped = df[name]
    df[f'{name}_raw'] = uncapped  # Keep the original values alongside
    df[name] = winsorize(uncapped)

# Share of non-missing values per main factor
print("\nüìä Estad√≠sticas de completitud de datos:")
factores_principales = ['me', 'be_me', 'gp_at', 'op_at', 'roa', 'roe',
                        'at_gr1', 'inv_at', 'debt_at', 'ret_1m', 'ret_12m']

total_rows = len(df)
for name in factores_principales:
    if name not in df.columns:
        continue
    pct_valid = (df[name].count() / total_rows) * 100
    print(f"   {name:15s}: {pct_valid:5.1f}% completo")

# --- SAVE RESULTS ---
output_path = f"../data/processed/{ticker}_factors_calculated.csv"
df.to_csv(output_path, index=False)

row_count, column_count = df.shape
print(f"\n‚úÖ Factores calculados y guardados en: {output_path}")
print(f"   Total de columnas: {column_count}")
print(f"   Total de filas: {row_count}")

# --- PREVIEW ---
print("\n--- Vista Previa de Factores (√öltimas 5 observaciones) ---")
cols_vista = ['date_market', 'Close', 'me', 'be_me', 'gp_at', 'roa',
              'at_gr1', 'ret_1m', 'ret_12m']
cols_disponibles = _present_in_df(cols_vista)
preview = df[cols_disponibles].tail()
print(preview.to_string())

# --- STATISTICAL SUMMARY ---
print("\n--- Estad√≠sticas Descriptivas de Factores ---")
factores_stats = _present_in_df(factores_principales)
summary = df[factores_stats].describe()
print(summary.loc[['mean', 'std', 'min', 'max']].to_string())

print("\nüéâ C√°lculo de factores completado!")