In [5]:
import numpy as np
import pandas as pd
from datetime import datetime

from typing import Tuple, Union, List

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

class DelayModel:
    """Binary flight-delay classifier backed by an XGBoost model.

    The workflow is ``preprocess`` -> ``fit`` -> ``predict``. A flight is
    labelled as delayed when the difference between its ``'Fecha-O'`` and
    ``'Fecha-I'`` timestamps exceeds 15 minutes.
    """

    def __init__(
        self
    ):
        self._model = xgb.XGBClassifier(random_state=1, learning_rate=0.01) # Model should be saved in this attribute.

    def preprocess(
        self,
        data: pd.DataFrame,
        target_column: Union[str, None] = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
        """
        Prepare raw data for training or predict.

        The ``'min_diff'`` and ``'delay'`` columns are added to ``data`` in
        place, so the caller's DataFrame is modified.

        Args:
            data (pd.DataFrame): raw data. Must contain the columns
                'Fecha-O' and 'Fecha-I' (strings formatted as
                '%Y-%m-%d %H:%M:%S'), 'OPERA', 'TIPOVUELO' and 'MES'.
            target_column (str, optional): accepted for interface
                compatibility; it is not read, the target is always the
                computed 'delay' column.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
            ``(x_train, x_test, y_train, y_test)`` where the features are the
            one-hot encoded 'OPERA', 'TIPOVUELO' and 'MES' columns and the
            target is 'delay'. The split holds out 33% of the rows with a
            fixed random seed.

        Raises:
            KeyError: if a required column is missing from ``data``.
            ValueError: if a timestamp does not match the expected format.
        """

        def get_min_diff(row):
            # Difference between the two timestamps of one row, in minutes.
            fecha_o = datetime.strptime(row['Fecha-O'], '%Y-%m-%d %H:%M:%S')
            fecha_i = datetime.strptime(row['Fecha-I'], '%Y-%m-%d %H:%M:%S')
            return (fecha_o - fecha_i).total_seconds() / 60

        data['min_diff'] = data.apply(get_min_diff, axis=1)

        # Label: 1 when the difference is strictly above the threshold, else 0.
        threshold_in_minutes = 15
        data['delay'] = np.where(data['min_diff'] > threshold_in_minutes, 1, 0)

        # One-hot encode the categorical columns used as model inputs.
        features = pd.concat(
            [
                pd.get_dummies(data['OPERA'], prefix='OPERA'),
                pd.get_dummies(data['TIPOVUELO'], prefix='TIPOVUELO'),
                pd.get_dummies(data['MES'], prefix='MES'),
            ],
            axis=1,
        )

        target = data['delay']

        # Fixed random_state keeps the split reproducible between runs.
        x_train, x_test, y_train, y_test = train_test_split(
            features, target, test_size=0.33, random_state=42
        )

        return x_train, x_test, y_train, y_test

    def fit(
        self,
        features: pd.DataFrame,
        target: pd.DataFrame
    ) -> None:
        """
        Fit model with preprocessed data.

        Args:
            features (pd.DataFrame): preprocessed data.
            target (pd.DataFrame): target.
        """
        # Train on the arguments supplied by the caller, never on
        # module-level variables that may or may not exist.
        self._model.fit(features, target)

    def predict(
        self,
        features: pd.DataFrame
    ) -> List[int]:
        """
        Predict delays for new flights.

        Args:
            features (pd.DataFrame): preprocessed data.

        Returns:
            (List[int]): predicted targets, returned exactly as produced by
            the underlying model's ``predict`` (no conversion is applied).
        """
        # Score the rows the caller passed in rather than a global test set.
        return self._model.predict(features)



In [16]:
# Load the raw records into a pandas DataFrame.
data = pd.read_csv(
    '../data/data.csv',
    low_memory=False,
)

# Instantiate the DelayModel class.
model = DelayModel()

# Preprocess the data; this yields the train/test split of features and target.
(x_train, x_test, y_train, y_test) = model.preprocess(data)

# Train the model on the training portion.
model.fit(x_train, y_train)

# Predict on the held-out portion and display the result.
# new_data = pd.read_csv("data/data.csv")  # Replace with your new data
predictions = model.predict(x_test)
print(predictions)

array([0, 0, 0, ..., 0, 0, 0])