In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, mean_squared_error, r2_score


In [2]:
# Read the combined dataset from its CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/result/combined_data.csv')


In [3]:
# Choose the input columns (X) and the regression target (Y).
features = [
    'from',
    'to',
    'flightType',
    'time',
    'distance',
    'agency',
    'date_flight',
]
X = df.loc[:, features]
Y = df.loc[:, 'price_flight']

In [4]:
# One-hot encode the categorical columns. The encoding is rebuilt from
# df[features] rather than from X itself, so this cell can be re-run safely:
# encoding an already-encoded X would fail because the listed columns no
# longer exist in it.
# NOTE(review): 'date_flight' is encoded as one indicator column per distinct
# date string, so a date absent from this dataset has no column at prediction
# time -- confirm this is intended.
X = pd.get_dummies(
    df[features],
    columns=['from', 'to', 'flightType', 'agency', 'date_flight'],
)

In [5]:
# Standardize the feature matrix. Every column of X is passed to the scaler,
# i.e. the one-hot indicator columns as well as 'time' and 'distance'.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, and X_scaled is not referenced by the train/test split, model fit or
# cross-validation cells, which all use X -- confirm whether it is still needed.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [18]:
# Debug aid: show the first encoded feature row as JSON to inspect the column layout.
print(X.iloc[0, :].to_json())

{"time":1.76,"distance":676.53,"from_Aracaju (SE)":false,"from_Brasilia (DF)":false,"from_Campo Grande (MS)":false,"from_Florianopolis (SC)":false,"from_Natal (RN)":false,"from_Recife (PE)":true,"from_Rio de Janeiro (RJ)":false,"from_Salvador (BH)":false,"from_Sao Paulo (SP)":false,"to_Aracaju (SE)":false,"to_Brasilia (DF)":false,"to_Campo Grande (MS)":false,"to_Florianopolis (SC)":true,"to_Natal (RN)":false,"to_Recife (PE)":false,"to_Rio de Janeiro (RJ)":false,"to_Salvador (BH)":false,"to_Sao Paulo (SP)":false,"flightType_economic":false,"flightType_firstClass":true,"flightType_premium":false,"agency_CloudFy":false,"agency_FlyingDrops":true,"agency_Rainbow":false,"date_flight_01\/01\/2021":false,"date_flight_01\/01\/2022":false,"date_flight_01\/01\/2023":false,"date_flight_01\/02\/2020":false,"date_flight_01\/02\/2021":false,"date_flight_01\/02\/2022":false,"date_flight_01\/02\/2023":false,"date_flight_01\/03\/2020":false,"date_flight_01\/03\/2021":false,"date_flight_01\/03\/2022":fal

In [6]:
# Split the data (X and Y are defined in earlier cells): 20% of the rows are
# held out for testing, and the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Decision tree regressor: fit on the training split, evaluate on the test split.
# random_state is pinned (same seed as the train/test split) because
# scikit-learn randomly permutes the candidate features at each split, so an
# unseeded tree -- and therefore the metrics below -- can differ between runs.
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, Y_train)
Y_pred_tree = model_tree.predict(X_test)

# Report the test-set root mean squared error and coefficient of determination.
print('Decision Tree Regression:')
print('RMSE:', np.sqrt(mean_squared_error(Y_test, Y_pred_tree)))
print('R^2:', r2_score(Y_test, Y_pred_tree))


Decision Tree Regression:
RMSE: 0.4903802361750227
R^2: 0.999998179175989


In [8]:
# 5-fold cross-validation of the decision tree over the full feature matrix.
# No `scoring` argument is passed, so each fold reports the estimator's own
# default score (R^2 for a scikit-learn regressor).
scores = cross_val_score(estimator=model_tree, X=X, y=Y, cv=5)
print(f'Cross-Validation Scores: {scores}')
print(f'Mean Cross-Validation Score: {scores.mean()}')

Cross-Validation Scores: [0.9999825  0.99998995 1.         0.99996525 1.        ]
Mean Cross-Validation Score: 0.9999875411656044


In [10]:
# Persist the fitted decision tree (model_tree) to 'model.pkl' in the current
# working directory. joblib is imported in the notebook's top import cell so
# that all dependencies are declared in one place.
# NOTE(review): only the estimator is written; the one-hot column layout of X
# is not stored with it, so code that loads the model must rebuild the same
# columns in the same order -- confirm this is handled where the model is used.
joblib.dump(model_tree, 'model.pkl')


['model.pkl']