# DRILLING AI TRAINING

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import classification_report
import warnings
from sklearn.exceptions import UndefinedMetricWarning
import sys
import os

# Add the project root directory to the Python path
sys.path.append(os.path.abspath('..'))

from src.features import feature_engineering
from src.reporting import generate_impact_report, plot_results

warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

# 1. LOAD DATA
# Read the drilling CSV (path is relative to the current working directory)
# and convert its 'Date' column to pandas datetimes.
try:
    # Only the read can raise FileNotFoundError, so it is the only guarded call.
    df = pd.read_csv('../Data/drilling_data_stuckpipe.csv')
except FileNotFoundError:
    print(" Error: File not found.")
    # `exit()` is a convenience helper installed by the `site` module for
    # interactive sessions, and without an argument it terminates with status
    # 0. Use sys.exit with a non-zero status so the failure is visible to
    # whatever launched the script.
    sys.exit(1)
else:
    df['Date'] = pd.to_datetime(df['Date'])
    print(" Data loaded.")

# 2. FEATURE ENGINEERING
# Build the modelling frame from the raw data (implemented in src.features).
df_model = feature_engineering(df)

# 3. MODEL TRAINING (GRID SEARCH)
# Columns of df_model used as classifier inputs.
features = [
    'ROP_m_hr', 'Torque_ft_lb', 'WOB_klbs', 'RPM', 'SPP_PSI', 'MSE_PSI',
    'Torque_Trend', 'ROP_Trend', 'SPP_Trend', 'Friction_Factor',
]

# Phase_Status values that are labelled as the positive (risk) class.
risk_states = ('Alerta', 'Critica', 'Stuck')


def _risk_flag(status):
    """Return 1 when *status* is one of the risk states, otherwise 0."""
    return int(status in risk_states)


# Binary target derived row by row from Phase_Status.
target = df_model['Phase_Status'].apply(_risk_flag)

# Train/test split: shuffle=False keeps the existing row order, so the final
# 20% of rows become the test set.
# NOTE(review): this presumes df_model is already in chronological order - confirm.
X, y = df_model[features], target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# Grid-search configuration: 2 x 3 x 2 = 12 hyper-parameter combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

# Fixed seed so repeated runs build the same forests.
model = RandomForestClassifier(random_state=42)
# Cross-validation folds that always validate on rows located after the rows
# used for fitting, consistent with the unshuffled train/test split.
tscv = TimeSeriesSplit(n_splits=3)

# Candidates are ranked by weighted F1; n_jobs=-1 uses every available core.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=tscv, scoring='f1_weighted', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# With the default refit=True this is the best configuration refitted on the
# whole training split.
best_model = grid_search.best_estimator_

# The hold-out split was previously created but never scored, so there was no
# out-of-sample estimate for the model that gets saved. Report it here.
print(" Best parameters:", grid_search.best_params_)
print(" Hold-out evaluation:")
print(classification_report(y_test, best_model.predict(X_test)))

# Persist the tuned classifier and its companion settings under ../Models.
# The directory is created first; exist_ok=True makes re-runs harmless.
os.makedirs('../Models', exist_ok=True)
model_path = '../Models/stuckpipe_brain.joblib'
config_path = '../Models/Conf_IA.joblib'

# Serialized classifier
joblib.dump(best_model, model_path)
print(" Model saved successfully.")

# Settings stored next to the model:
#   umbral_corte           -> cut-off threshold (0.75)
#   columnas_entrenamiento -> the feature columns the model was trained on
# NOTE(review): presumably the threshold is applied to the predicted risk
# probability by whatever loads this file - confirm against the consumer.
configuracion = dict(
    umbral_corte=0.75,
    columnas_entrenamiento=features,
)
joblib.dump(configuracion, config_path)

# 4. APPLY MODEL & PREDICT
# Score every row of df_model (this includes the rows used for training).
# predict_proba returns one column per class the model saw during fit, ordered
# as in `classes_`. Indexing column 1 blindly raises IndexError when the
# training slice held a single class, so the positive class (label 1) is
# looked up explicitly.
class_labels = list(best_model.classes_)
class_probs = best_model.predict_proba(df_model[features])
if 1 in class_labels:
    probs_totales = class_probs[:, class_labels.index(1)]
else:
    # The model never saw a positive example, so it assigns it zero probability.
    probs_totales = np.zeros(len(df_model))
df_model['Risk_Probability'] = probs_totales

# 5. ECONOMIC IMPACT REPORT
# Implemented in src.reporting; returns two values that are unpacked below.
# NOTE(review): presumably the dates of the model's first alert and of the
# actual event - confirm against src.reporting.
ai_date, real_date = generate_impact_report(df_model)

# 6. VISUALIZATION
# Plot only when the report returned a truthy value for both dates.
if ai_date and real_date:
    plot_results(df_model, ai_date, real_date)