In [None]:
import pandas as pd
import sklearn.model_selection as ms
import sklearn.metrics as me
import xgboost as xgb

In [None]:
# Seed passed as `random_state` to DataFrame.sample, train_test_split and the
# XGBoost classifier in the cells below.
RANDOM_STATE: int = 42

In [None]:
# Load the four CSV inputs in one pass: the "subspace" rank/stats tables and
# their "test" counterparts. The order of the paths matches the order of the
# names on the left-hand side.
ranked, ranked_test, stats, stats_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../files/flights_subspace_rank.csv",
        "../files/flights_test_rank.csv",
        "../files/flights_subspace_stats.csv",
        "../files/flights_test_stats.csv",
    )
)

In [None]:
# Draw a reproducible 15% random sample of each table.
# NOTE(review): the samples are taken before `is_delayed` is added and are not
# referenced by any of the cells visible here — confirm whether the model was
# meant to be trained on them instead of the full frames.
rank_sample, stats_sample = (
    frame.sample(frac=0.15, random_state=RANDOM_STATE)
    for frame in (ranked, stats)
)

In [None]:
# XGBoost random-forest classifier. The same estimator object is fit twice
# below: first on the ranked features, then on the stats features.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases —
# confirm it is still wanted for the installed version.
rfc = xgb.XGBRFClassifier(
    use_label_encoder=False,
    random_state=RANDOM_STATE,
)

In [None]:
dfs = (ranked, stats)

# Add the binary target `is_delayed`: 1 where arr_delay > 0, otherwise 0.
# The raw `arr_delay` column is NOT dropped here; it is excluded later when
# the feature matrices are built.
# A vectorized comparison replaces the per-row lambda. Missing delays compare
# as False and therefore map to 0, exactly as the lambda did.
for df in dfs:
    df['is_delayed'] = (df['arr_delay'] > 0).astype('int64')

In [None]:
# Features are every column except the date, the raw delay and the target;
# the target is the binary `is_delayed` column.
non_feature_cols = ['fl_date', 'arr_delay', 'is_delayed']

X_rank = ranked.drop(columns=non_feature_cols)
y_rank = ranked['is_delayed']

X_stats = stats.drop(columns=non_feature_cols)
y_stats = stats['is_delayed']

In [None]:
# Hold out 30% of each table, stratified on the target, using the same seed
# for both splits.
split_kwargs = {'test_size': 0.3, 'random_state': RANDOM_STATE}

rank_Xtrain, rank_Xtest, rank_ytrain, rank_ytest = ms.train_test_split(
    X_rank, y_rank, stratify=y_rank, **split_kwargs
)
stats_Xtrain, stats_Xtest, stats_ytrain, stats_ytest = ms.train_test_split(
    X_stats, y_stats, stratify=y_stats, **split_kwargs
)

### Ranked data

In [None]:
# Train on the ranked features and predict hard labels for the hold-out rows.
# `fit` returns the estimator itself (scikit-learn convention), so the calls
# can be chained.
rank_pred = rfc.fit(rank_Xtrain, rank_ytrain).predict(rank_Xtest)

In [None]:
# Class-probability estimates for the held-out ranked rows. This has to run
# before `rfc` is fit again on the stats features further down.
soft_from_rank = rfc.predict_proba(rank_Xtest)

In [None]:
# Text summary of the hold-out metrics for the ranked-feature model.
rank_report = me.classification_report(rank_ytest, rank_pred)
print(rank_report)

In [None]:
# Confusion matrix of hard predictions vs. true labels (ranked features).
me.ConfusionMatrixDisplay.from_predictions(
    y_true=rank_ytest,
    y_pred=rank_pred,
)

In [None]:
# ROC curve for the ranked-feature model.
# A ROC curve needs continuous scores; hard 0/1 labels collapse it to a single
# operating point. Use the predicted probability of the positive class
# (column 1 of predict_proba, i.e. is_delayed == 1) instead of `rank_pred`.
me.RocCurveDisplay.from_predictions(rank_ytest, soft_from_rank[:, 1])

### Stats data

In [None]:
# Fit the same estimator object again, this time on the stats features, and
# predict hard labels for the hold-out rows. `fit` returns the estimator
# itself (scikit-learn convention), so the calls can be chained.
stats_pred = rfc.fit(stats_Xtrain, stats_ytrain).predict(stats_Xtest)

In [None]:
# Class-probability estimates for the held-out stats rows, taken from `rfc`
# after it was fit on the stats training split.
soft_from_stats = rfc.predict_proba(stats_Xtest)

In [None]:
# Text summary of the hold-out metrics for the stats-feature model.
stats_report = me.classification_report(stats_ytest, stats_pred)
print(stats_report)

In [None]:
# Confusion matrix of hard predictions vs. true labels (stats features).
me.ConfusionMatrixDisplay.from_predictions(
    y_true=stats_ytest,
    y_pred=stats_pred,
)

In [None]:
# ROC curve for the stats-feature model.
# A ROC curve needs continuous scores; hard 0/1 labels collapse it to a single
# operating point. Use the predicted probability of the positive class
# (column 1 of predict_proba, i.e. is_delayed == 1) instead of `stats_pred`.
me.RocCurveDisplay.from_predictions(stats_ytest, soft_from_stats[:, 1])

### Predictions

In [None]:
# Keep only the first probability column of each model, as a one-column slice.
# NOTE(review): column 0 is presumably P(is_delayed == 0) — confirm that this,
# and not the positive-class column, is the one wanted here.
# NOTE(review): `stats` is rebound from the DataFrame loaded at the top of the
# notebook to an array, so earlier cells cannot be re-run after this one
# without reloading the data.
rank, stats = (proba[:, :1] for proba in (soft_from_rank, soft_from_stats))