In [0]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn
import warnings
warnings.filterwarnings("ignore")


# Location of the RTOP CSV; it is fetched over the network when this cell runs.
url = 'https://github.com/anfisbena/MIAD-DSA/raw/refs/heads/main/data/RTOP.csv'

# Read the file as ISO-8859-1 and shorten the two verbose headers to the
# column names the rest of the script refers to.
df = pd.read_csv(url, encoding='ISO-8859-1').rename(
    columns={
        'Ship ID - Ship Classification': 'Vessel Type',
        'Ship - Name': 'Ship Name',
    }
)

def clean_df(df, max_duration_days=60):
    """Return the visit rows usable for modelling, with a ``Duration`` column.

    Keeps only the columns listed in ``columns``, drops rows missing any
    required field, removes rows whose status is ``'Cancelled'``, parses the
    arrival/departure dates, and keeps visits lasting between 0 and
    ``max_duration_days`` whole days (both bounds inclusive).

    Args:
        df: Raw visits frame containing at least the columns listed in
            ``columns``. It is not modified.
        max_duration_days: Longest stay, in whole days, that is kept.
            Defaults to 60.

    Returns:
        A new DataFrame with the selected columns, ``Arrival`` and
        ``Departure`` converted to datetimes, and an added ``Duration``
        column holding the stay length in days.
    """
    columns = ['Ship Name', 'Vessel Type', 'Country', 'Location', 'Arrival', 'Departure', 'Status']
    # 'Country' is the only selected column that is allowed to be null.
    required = ['Arrival', 'Departure', 'Ship Name', 'Location', 'Vessel Type', 'Status']

    df_clean = df[columns].dropna(subset=required)
    # Explicit copy: the assignments below must write to an independent frame,
    # not to a filtered slice (which triggers pandas' chained-assignment
    # warning and has ambiguous semantics).
    df_clean = df_clean[df_clean['Status'] != 'Cancelled'].copy()

    # Unparseable dates become NaT instead of raising.
    df_clean['Arrival'] = pd.to_datetime(df_clean['Arrival'], errors='coerce')
    df_clean['Departure'] = pd.to_datetime(df_clean['Departure'], errors='coerce')
    df_clean['Duration'] = (df_clean['Departure'] - df_clean['Arrival']).dt.days

    # NaT dates give a NaN duration, which fails the range check and is
    # therefore dropped together with negative and over-long stays.
    within_range = df_clean['Duration'].between(0, max_duration_days)
    return df_clean[within_range]

def predict_next_location(df_c, ship_name='Trenton'):
    """Fit a logistic-regression location classifier for one ship.

    Filters ``df_c`` to the rows of ``ship_name``, derives date features,
    label-encodes ``Country`` and ``Location``, tunes a ``LogisticRegression``
    with 5-fold grid search on a 90/10 train/test split, prints the held-out
    metrics, and predicts the location for the first held-out row.

    Args:
        df_c: Visits frame with 'Ship Name', 'Country', 'Location',
            'Arrival' and 'Departure' columns.
        ship_name: Value of 'Ship Name' whose rows are used.

    Returns:
        Tuple ``(predicted_location, accuracy, best_model)``: the decoded
        location predicted for the first held-out row, the accuracy on the
        held-out set, and the best estimator found by the grid search.

    Raises:
        ValueError: If ``df_c`` contains no rows for ``ship_name``.
    """
    ship_df = df_c[df_c['Ship Name'] == ship_name].copy()
    if ship_df.empty:
        # Fail early with a clear message instead of a cryptic split error.
        raise ValueError(f"No rows found for ship {ship_name!r}")

    # Make sure both date columns are datetimes (no-op if they already are).
    ship_df['Arrival'] = pd.to_datetime(ship_df['Arrival'])
    ship_df['Departure'] = pd.to_datetime(ship_df['Departure'])

    # Numeric date features.
    ship_df['Arrival_ordinal'] = ship_df['Arrival'].map(pd.Timestamp.toordinal)
    ship_df['Departure_ordinal'] = ship_df['Departure'].map(pd.Timestamp.toordinal)
    ship_df['Arrival_Month'] = ship_df['Arrival'].dt.month
    ship_df['Arrival_Year'] = ship_df['Arrival'].dt.year
    ship_df['Arrival_DayOfWeek'] = ship_df['Arrival'].dt.dayofweek

    # Encode the categorical feature and the target as integers. The target
    # encoder is kept so the prediction can be decoded back to a name.
    ship_df['Country'] = LabelEncoder().fit_transform(ship_df['Country'])
    location_encoder = LabelEncoder()
    ship_df['Location'] = location_encoder.fit_transform(ship_df['Location'])

    feature_columns = [
        'Country', 'Arrival_ordinal', 'Departure_ordinal',
        'Arrival_Month', 'Arrival_Year', 'Arrival_DayOfWeek',
    ]
    X = ship_df[feature_columns]
    y_location = ship_df['Location']

    X_train, X_test, y_train_location, y_test_location = train_test_split(
        X, y_location, test_size=0.10, random_state=42
    )

    # Only valid penalty/solver pairs are searched. A plain cross-product
    # includes combinations scikit-learn rejects (e.g. l1 + lbfgs, elasticnet
    # without l1_ratio, the removed string 'none'); those fits fail and are
    # scored as NaN, wasting time and silently shrinking the search.
    c_values = [0.01, 0.1, 1, 10, 100]
    param_grid = [
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': c_values},
        {'penalty': ['l2'], 'solver': ['lbfgs', 'liblinear', 'saga'], 'C': c_values},
        # elasticnet is only supported by saga and needs an explicit l1_ratio.
        {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5], 'C': c_values},
        # Unpenalised fit: C has no effect, so it is not varied here.
        {'penalty': [None], 'solver': ['lbfgs', 'saga']},
    ]
    # NOTE(review): the ordinal-date features are several orders of magnitude
    # larger than the others and are not scaled; that likely hurts solver
    # convergence. Consider a scaling step if the returned model type may change.

    grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train_location)
    best_model = grid_search.best_estimator_

    # Held-out evaluation. zero_division=0 makes explicit the value used for
    # classes that are never predicted.
    y_pred = best_model.predict(X_test)
    location_accuracy = accuracy_score(y_test_location, y_pred)
    location_precision = precision_score(y_test_location, y_pred, average='weighted', zero_division=0)
    location_recall = recall_score(y_test_location, y_pred, average='weighted', zero_division=0)
    location_f1 = f1_score(y_test_location, y_pred, average='weighted', zero_division=0)

    # The "next location" is the prediction for the first held-out row; no
    # future visit is constructed here. iloc[[0]] keeps a one-row DataFrame so
    # the model receives the same feature names it was fitted with.
    next_data = X_test.iloc[[0]]
    predicted_location = location_encoder.inverse_transform(best_model.predict(next_data))

    print(f"Precisión de Ubicación: {location_accuracy * 100:.2f}%")
    print(f"Precisión (Weighted): {location_precision:.2f}")
    print(f"Recall (Weighted): {location_recall:.2f}")
    print(f"F1 Score (Weighted): {location_f1:.2f}")

    return predicted_location[0], location_accuracy, best_model
#____________________________________________________________________________________________________
# Script entry point: train the model for the default ship and record the result in MLflow.

# Select the MLflow experiment; the returned object supplies the experiment id used below.
experiment = mlflow.set_experiment("/Users/republicacoc@gmail.com/RandClass")
# Training and evaluation happen here, before the MLflow run is opened.
predicted_location, location_accuracy,model = predict_next_location(clean_df(df))
print(f"Próxima ubicación del barco: {predicted_location}")
with mlflow.start_run(experiment_id=experiment.experiment_id):
    # Log the predicted location as a run parameter
    mlflow.log_param("predicted location", predicted_location)
    # Log the fitted model as a run artifact
    mlflow.sklearn.log_model(model, "logistic-regression-model")

    # Log the held-out accuracy as the metric of interest
    mlflow.log_metric("accuracy", location_accuracy)


Uploading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Precisión de Ubicación: 10.00%
Precisión (Weighted): 0.01
Recall (Weighted): 0.10
F1 Score (Weighted): 0.02
Próxima ubicación del barco: Souda Bay




Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

2024/11/11 02:56:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run abrasive-turtle-980 at: https://community.cloud.databricks.com/ml/experiments/3923292881806332/runs/da62024da2ef4d5fae755bb899caa416.
2024/11/11 02:56:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://community.cloud.databricks.com/ml/experiments/3923292881806332.
