In [7]:
# Instalação das bibliotecas necessárias
!pip install pandas numpy scikit-learn xgboost category_encoders

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from category_encoders import TargetEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime
from google.colab import drive


# Load the filtered flight-search CSV
caminho_arquivo = '/content/drive/MyDrive/ProjetoIA/voos_filtrados.csv'
df = pd.read_csv(caminho_arquivo)

# Parse the two date columns so the `.dt` accessor can be used on them later
for coluna_data in ('searchDate', 'flightDate'):
    df[coluna_data] = pd.to_datetime(df[coluna_data])

# Feature engineering
def prepare_features(df, train_index=None):
    """Add calendar and target-encoded features to ``df`` and list them.

    ``df`` is modified in place (new columns are added) and also returned.

    Args:
        df: Frame with datetime columns ``searchDate`` and ``flightDate``,
            the categorical columns ``startingAirport``,
            ``destinationAirport`` and ``segmentsAirlineName``, and the
            target column ``totalFare``.
        train_index: Optional index labels of the rows the target encoders
            are allowed to learn from. Pass the training rows here so that
            the fares of test rows never influence the encoded values.
            Assumes the labels identify rows uniquely. When ``None`` (the
            default) the encoders are fitted on every row, which leaks the
            target into the features if the data is split afterwards.

    Returns:
        Tuple ``(df, features)``: the same frame with the added columns, and
        the list of column names to feed to the model.
    """
    flight_date = df['flightDate']
    search_date = df['searchDate']

    # Calendar features
    df['dias_ate_voo'] = (flight_date - search_date).dt.days
    df['mes_voo'] = flight_date.dt.month
    df['dia_semana_voo'] = flight_date.dt.dayofweek
    df['mes_pesquisa'] = search_date.dt.month
    df['dia_semana_pesquisa'] = search_date.dt.dayofweek

    # Target-encode the categorical columns: new column -> source column
    encoded_sources = {
        'startingAirport_encoded': 'startingAirport',
        'destinationAirport_encoded': 'destinationAirport',
        'airline_encoded': 'segmentsAirlineName',
    }
    fit_rows = slice(None) if train_index is None else train_index
    fit_target = df.loc[fit_rows, 'totalFare']
    for encoded_name, source_name in encoded_sources.items():
        # One fresh encoder per column; fitted on the allowed rows only and
        # then applied to every row. Categories absent from the fitting rows
        # fall back to the encoder's unknown-value handling
        # (NOTE(review): the global target mean with default settings -
        # confirm against the installed category_encoders version).
        encoder = TargetEncoder()
        encoder.fit(df.loc[fit_rows, [source_name]], fit_target)
        df[encoded_name] = encoder.transform(df[[source_name]]).iloc[:, 0]

    # Columns used as model inputs
    features = [
        'dias_ate_voo',
        'mes_voo',
        'dia_semana_voo',
        'mes_pesquisa',
        'dia_semana_pesquisa',
        'startingAirport_encoded',
        'destinationAirport_encoded',
        'airline_encoded',
    ]

    return df, features


# Decide which rows are train/test BEFORE encoding, so the target encoders
# never see the fares of test rows. Splitting row positions with the same
# test_size/random_state selects the same rows, in the same order, as
# splitting X and y directly would.
train_pos, test_pos = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Prepare the data (encoders fitted on the training rows only)
df, features = prepare_features(df, train_index=df.index[train_pos])

# Separate features (X) and target (y)
X = df[features]
y = df['totalFare']

# Materialise the train/test partitions from the precomputed positions
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Standardise the features; the scaler statistics come from the training set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Back to DataFrames so the column names are kept
X_train_scaled = pd.DataFrame(X_train_scaled, columns=features)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=features)

print("Forma dos dados de treino:", X_train_scaled.shape)
print("Forma dos dados de teste:", X_test_scaled.shape)

# Persist the prepared arrays for later use (np.save stores the values only;
# column names and index labels are not written)
np.save('/content/drive/MyDrive/ProjetoIA/X_train.npy', X_train_scaled)
np.save('/content/drive/MyDrive/ProjetoIA/X_test.npy', X_test_scaled)
np.save('/content/drive/MyDrive/ProjetoIA/y_train.npy', y_train)
np.save('/content/drive/MyDrive/ProjetoIA/y_test.npy', y_test)

Forma dos dados de treino: (17653510, 8)
Forma dos dados de teste: (4413378, 8)
