In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [21]:
# Load the airports CSV into a DataFrame (path is relative to the notebook's
# working directory).
airport_data = pd.read_csv(
    filepath_or_buffer='./../data/airports-database.csv',
)

In [22]:
# Derive flight-outcome indicator columns (the same steps were already done in
# the data-exploration part).
dep_missing = airport_data['dep_time'].isna()
arr_missing = airport_data['arr_time'].isna()

# cancelled: neither a departure time nor an arrival time was recorded.
airport_data['cancelled'] = (dep_missing & arr_missing).astype(int)
# diverted: the flight was not cancelled, yet no arrival time was recorded.
airport_data['diverted'] = ((airport_data['cancelled'] == 0) & arr_missing).astype(int)
# landed: both the departure time and the arrival time are present.
airport_data['landed'] = (~dep_missing & ~arr_missing).astype(int)

# Keep only the flights that landed. The explicit .copy() makes the filtered
# frame independent of the original, so assigning columns to it later (or
# re-running this cell) does not trigger pandas' SettingWithCopyWarning.
airport_data = airport_data[airport_data['landed'] == 1].copy()

In [23]:
# Preview the first five rows, transposed so that each column is shown as a row.
airport_data.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
year,2013,2013,2013,2013,2013
month,1,1,1,1,1
day,1,1,1,1,1
dep_time,517.0,533.0,542.0,544.0,554.0
sched_dep_time,515,529,540,545,600
dep_delay,2.0,4.0,2.0,-1.0,-6.0
arr_time,830.0,850.0,923.0,1004.0,812.0
sched_arr_time,819,830,850,1022,837
arr_delay,11.0,20.0,33.0,-18.0,-25.0


In [13]:
# Discard every row that has a missing value in any column, then print the
# column dtypes and non-null counts of what remains.
airport_data = airport_data.dropna(axis='index', how='any')
airport_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 327346 entries, 0 to 336769
Data columns (total 24 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              327346 non-null  int64  
 1   year            327346 non-null  int64  
 2   month           327346 non-null  int64  
 3   day             327346 non-null  int64  
 4   dep_time        327346 non-null  float64
 5   sched_dep_time  327346 non-null  int64  
 6   dep_delay       327346 non-null  float64
 7   arr_time        327346 non-null  float64
 8   sched_arr_time  327346 non-null  int64  
 9   arr_delay       327346 non-null  float64
 10  carrier         327346 non-null  object 
 11  flight          327346 non-null  object 
 12  tailnum         327346 non-null  object 
 13  origin          327346 non-null  object 
 14  dest            327346 non-null  object 
 15  air_time        327346 non-null  float64
 16  distance        327346 non-null  int64  
 17  hour           

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [9]:
# Keep only the columns used for modelling: the predictors and the
# arrival-delay target.
feature_columns = ['dep_time', 'dep_delay', 'origin', 'dest', 'carrier', 'distance', 'month']
target_column = 'arr_delay'

# Drop rows with a missing predictor or target here, so this cell does not
# depend on an earlier cell having removed NaNs (LinearRegression rejects NaN
# input). This is a no-op when the frame is already free of missing values.
model_data = airport_data.dropna(subset=feature_columns + [target_column])
X = model_data[feature_columns]
y = model_data[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the predictors into categorical and numeric groups. `month` is stored
# as a number but is deliberately one-hot encoded as a category, so it is
# excluded from the numeric list.
categorical_features = ['carrier', 'origin', 'dest', 'month']
numeric_features = [
    col
    for col in X.select_dtypes(include=['int64', 'float64']).columns
    if col not in categorical_features
]

# Scale numeric columns to [0, 1] and one-hot encode the categorical ones.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising at predict time (e.g. a rare destination that
# lands only in the test split, or new input to the saved model).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Ordinary least-squares linear regression. Note: this is NOT Ridge; there is
# no regularization and no alpha parameter in this pipeline.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Fit on the training split.
model.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred = model.predict(X_test)

# Root mean squared error of the test predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mse = rmse  # legacy name kept for any later cell that reads it; the value is the RMSE, not the MSE
print(f'RMSE:: {rmse:.4f}')

RMSE:: 17.5413


In [24]:
import pickle

# Persist the `model` object to modelo.pkl in the current working directory.
with open('modelo.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)