In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [2]:
train = pd.read_csv('/kaggle/input/flight-delays-fall-2018/flight_delays_train.csv.zip', compression='zip')
test = pd.read_csv('/kaggle/input/flight-delays-fall-2018/flight_delays_test.csv.zip', compression='zip')

train['dep_delayed_15min'] = train['dep_delayed_15min'].map({"N" : 0, "Y" : 1})
y = train['dep_delayed_15min']
train = train.drop('dep_delayed_15min', axis = 1)

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Month          100000 non-null  object
 1   DayofMonth     100000 non-null  object
 2   DayOfWeek      100000 non-null  object
 3   DepTime        100000 non-null  int64 
 4   UniqueCarrier  100000 non-null  object
 5   Origin         100000 non-null  object
 6   Dest           100000 non-null  object
 7   Distance       100000 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 6.1+ MB


In [4]:
set(train) - set(test)

set()

In [5]:
def data_preparation(train):

    #train['destenation'] = train['Origin'] + "--" + train['Dest']
    train = train.drop(['Origin', 'Dest'], axis = 1)
    
    train[['Month', 'DayofMonth', 'DayOfWeek']] = train[['Month', 'DayofMonth', 'DayOfWeek']].applymap(lambda row: row.strip('c-'))

    train['is_weekend'] = (train['DayOfWeek'] == '6') | (train['DayOfWeek'] == '7')

    train['summer'] = (train['Month'].astype('int64').isin([6, 7, 8]))
    train['autumn'] = (train['Month'].astype('int64').isin([9, 10, 11]))
    train['winter'] = (train['Month'].astype('int64').isin([12, 1, 2]))
    train['spring'] = (train['Month'].astype('int64').isin([3, 4, 5]))

    def time_of_day(hour):
        if 5 <= hour < 10:
            return 'Early Morning'
        elif 10 <= hour < 15:
            return 'Late Morning'
        elif 15 <= hour < 20:
            return 'Afternoon'
        else:
            return 'Evening/Night'
    
    train['TimeOfDay'] = (train['DepTime'] // 100).apply(time_of_day).astype('object')

    cat_feat_idx = [train.columns.get_loc(col) for col in train.select_dtypes(include='object').columns]    
    
    return train, cat_feat_idx

In [6]:
train_data, cat_feat_idx = data_preparation(train)

  train[['Month', 'DayofMonth', 'DayOfWeek']] = train[['Month', 'DayofMonth', 'DayOfWeek']].applymap(lambda row: row.strip('c-'))


In [7]:
X = train_data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [8]:
cat_feat_idx

[0, 1, 2, 4, 11]

In [9]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

model = CatBoostClassifier(cat_features=cat_feat_idx, eval_metric='AUC', verbose=0)

# Кросс-валидация с расчетом AUC
cross_valid_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=kf,
    scoring='roc_auc'  # Используем встроенную метрику roc_auc
)

print(f'Cross-Validation AUC scores: {cross_valid_scores}')
print(f'Mean AUC: {cross_valid_scores.mean():.4f}')


model.fit(X_train, y_train)

y_pred = model.predict_proba(X_test)[:, 1]


# Вычисляем ROC-кривую
fpr, tpr, thresholds = roc_curve(y_test, y_pred)

# Вычисляем AUC
roc_auc = auc(fpr, tpr)

print(f'ROC AUC on test set: {roc_auc:.4f}')

Cross-Validation AUC scores: [0.74802956 0.74112746 0.74566243 0.74789476 0.74300882]
Mean AUC: 0.7451
ROC AUC on test set: 0.7455


In [10]:
test_idx = test.index
test_data, _ = data_preparation(test)

y_pred_test = model.predict_proba(test_data)[:, 1] 

submission = pd.DataFrame({
    'id' : test_idx,
    'dep_delayed_15min' : y_pred_test
})

submission.to_csv('submission.csv', index=False)

  train[['Month', 'DayofMonth', 'DayOfWeek']] = train[['Month', 'DayofMonth', 'DayOfWeek']].applymap(lambda row: row.strip('c-'))


In [11]:
submission.head()

Unnamed: 0,id,dep_delayed_15min
0,0,0.042609
1,1,0.035701
2,2,0.043435
3,3,0.218866
4,4,0.206283
