# Modelaje de datos
En esta notebook entrenamos el modelo final con los datos resultantes de la capa Gold, en este caso para el activo financiero Apple. Esta notebook debe ser ejecutada en Azure Synapse.

In [None]:
# Notebook parameter: identifier of the financial asset whose Gold table is modelled.
asset = "apple"

In [None]:
# Location of the Gold layer inside the data lake (abfss URI of the 'datalake' container).
gold_folder = "gold"  # final-layer directory
storage_account_name = "nticmasterstg"
data_lake_container = f"abfss://datalake@{storage_account_name}.dfs.core.windows.net"

# Path of the Delta table for the selected asset: <container>/<gold folder>/<asset>.
gold_table_path = "/".join((data_lake_container, gold_folder, asset))

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from azure.storage.blob import BlobServiceClient, ContainerClient
from sklearn.base import clone
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Train and evaluate a Ridge regression on the Gold-layer data of the asset.
#
# Steps: load the Delta table, build log/lag features, tune alpha with a
# time-series cross-validation, report per-fold metrics and print the
# coefficients of the model refitted on the full data set.

# Load the Gold Delta table and bring it into pandas.
# NOTE(review): shift() and TimeSeriesSplit below assume the rows are in
# chronological order, and no sort is applied in this cell -- confirm the Gold
# table ordering or sort on the date column before building the lags.
gold_sdf = spark.read.format("delta").load(gold_table_path)
data = gold_sdf.toPandas()

# Log transformation: new column name -> source column.
log_sources = {
    'Log_Volume': 'Volume',
    'Log_Adj_Close': 'Adj_Close',
    'Log_7_day_moving_avg': '7_day_moving_avg',
    'Log_30_day_moving_avg': '30_day_moving_avg',
}
for log_col, raw_col in log_sources.items():
    data[log_col] = np.log(data[raw_col])

# Lagged variables (1 to 3 rows back) for every log series and the daily return.
lag_sources = [
    'Log_Adj_Close',
    'Log_Volume',
    'Log_7_day_moving_avg',
    'Log_30_day_moving_avg',
    'daily_return',
]
for lag in range(1, 4):
    for col in lag_sources:
        data[f'{col}_Lag{lag}'] = data[col].shift(lag)

# np.log(0) yields -inf, which dropna() would keep and StandardScaler would
# reject; turn infinities into NaN so those rows are dropped with the lag gaps.
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].replace([np.inf, -np.inf], np.nan)
data = data.dropna()

# NOTE(review): 'Three_Component_Index', 'monthly_close' and 'monthly_volume'
# are used without a lag; if they are derived from the same-day close they leak
# the target -- confirm how the Gold layer computes them.
features = ['Log_Adj_Close_Lag1', 'Log_Volume_Lag1',
            'Log_7_day_moving_avg_Lag1',
            'daily_return_Lag1',
            'Three_Component_Index',
            'year', 'month',
            'monthly_close', 'monthly_volume']

X = data[features]
y = data['Log_Adj_Close']

tscv = TimeSeriesSplit(n_splits=5)

# Scaling lives inside the pipeline so that, in every CV split, the scaler is
# fitted on the training part only (fitting it on the whole series beforehand
# leaks future statistics into the validation folds).
alphas = np.logspace(-4, 4, 10)
ridge = Ridge()
pipeline = Pipeline([('scaler', StandardScaler()), ('ridge', ridge)])
parameters = {'ridge__alpha': alphas}

# Hyper-parameter search (alpha tuning); refit=True retrains on all the data.
ridge_cv = GridSearchCV(pipeline, parameters, cv=tscv)
ridge_cv.fit(X, y)

# Print the best alpha found
print(f'Mejor valor de alpha: {ridge_cv.best_params_["ridge__alpha"]}')

# Final estimators, fitted on the full data set with the best alpha.
# `best_model` is still a Ridge that expects inputs scaled by `scaler`.
best_pipeline = ridge_cv.best_estimator_
scaler = best_pipeline.named_steps['scaler']
best_model = best_pipeline.named_steps['ridge']
X_scaled = scaler.transform(X)

# Model evaluation: each fold trains a fresh clone, so the final `best_model`
# is not overwritten by a model trained on a subset of the data.
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    fold_model = clone(best_pipeline).fit(X_train, y_train)
    y_pred = fold_model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print(f'R^2 Score: {r2}')

# Print the coefficients of the final model (standardised feature scale)
coefficients = pd.DataFrame(best_model.coef_, X.columns, columns=['Coefficient'])
print(coefficients)

In [None]:
# Export the trained model as a pickle and upload it to Azure Blob Storage so
# that it stays registered in the data lake.
# NOTE(review): only `best_model` is serialized; the StandardScaler fitted in
# the training cell is not persisted here -- confirm whether consumers of the
# pickle need it uploaded as well.
model_bytes = pickle.dumps(best_model)

# SECURITY: the storage account key was previously hard-coded in this cell.
# That key must be considered compromised and rotated. The connection string
# is now read from the environment and never stored in the notebook.
# NOTE(review): on Synapse a Key Vault secret or linked service may be the
# preferred source -- confirm with the workspace setup.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    raise RuntimeError(
        "Falta la variable de entorno AZURE_STORAGE_CONNECTION_STRING "
        "con la cadena de conexión de la cuenta de almacenamiento"
    )
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Client for the Azure Blob Storage container
container_name = "datalake"
container_client = blob_service_client.get_container_client(container_name)

# The blob name follows the `asset` notebook parameter, so a run for another
# asset does not overwrite the Apple model (unchanged for asset == 'apple').
blob_name = f"/resources/{asset}_ridge_model.pkl"
blob_client = container_client.get_blob_client(blob_name)

# Upload the serialized pickle to Blob Storage, replacing any previous version
blob_client.upload_blob(model_bytes, overwrite=True)

print("Modelo subido exitosamente a Azure Blob Storage")
