In [74]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score


In [76]:
train_df = pd.read_csv("Desktop/directional-forecasting-in-cryptocurrencies/train.csv")
test_df = pd.read_csv("Desktop/directional-forecasting-in-cryptocurrencies/test.csv")

train_df['datetime'] = pd.to_datetime(train_df['timestamp'], unit='s')
test_df['datetime'] = pd.to_datetime(test_df['timestamp'], unit='s')

###
train_df['year'] = train_df['datetime'].dt.year
train_df['month'] = train_df['datetime'].dt.month
train_df['day'] = train_df['datetime'].dt.day
train_df['day_of_week'] = train_df['datetime'].dt.dayofweek  # Monday=0, Sunday=6
train_df['hour'] = train_df['datetime'].dt.hour
train_df['minute'] = train_df['datetime'].dt.minute
train_df['minute_of_day'] = train_df['hour'] * 60 + train_df['minute']
###

###
test_df['year'] = test_df['datetime'].dt.year
test_df['month'] = test_df['datetime'].dt.month
test_df['day'] = test_df['datetime'].dt.day
test_df['day_of_week'] = test_df['datetime'].dt.dayofweek 
test_df['hour'] = test_df['datetime'].dt.hour
test_df['minute'] = test_df['datetime'].dt.minute
test_df['minute_of_day'] = test_df['hour'] * 60 + test_df['minute']
###

train_df.drop(columns=['timestamp', 'datetime'], inplace=True)
test_df.drop(columns=['timestamp', 'datetime'], inplace=True)

X_train = train_df.drop(columns=['target'])
y_train = train_df['target']

X_test = test_df.drop(columns=['row_id'])
test_row_ids = test_df['row_id']




In [41]:
# Define and initialize XGBClassifier with a parameter grid
xgb_model = XGBClassifier(use_label_encoder= True)
param_grid = {
    'n_estimators': [2500],         # similar to `iterations` in CatBoost
    'learning_rate': [0.25],        # similar to `learning_rate` in CatBoost
    'max_depth': [12],               # similar to `depth` in CatBoost
    'reg_lambda': [20],              # similar to `l2_leaf_reg` in CatBoost
    'subsample': [0.85]              # same as `subsample` in CatBoost
}

# Set up GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=2,
    n_jobs=-1
)

# Fit model on training data
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'learning_rate': 0.25, 'max_depth': 12, 'n_estimators': 2500, 'reg_lambda': 20, 'subsample': 0.85}


In [61]:
import pandas as pd

# Assuming best_model has a feature_importances_ attribute (like CatBoost, RandomForest, etc.)
# and X_train is a DataFrame
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_model.feature_importances_
})

# Sort by importance (descending)
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Print feature importances
print(feature_importances)


                   Feature  Importance
3                    close    0.083017
2                      low    0.075296
1                     high    0.073320
15           minute_of_day    0.067346
8   taker_buy_quote_volume    0.067204
5       quote_asset_volume    0.066148
9                     year    0.063571
4                   volume    0.060896
10                   month    0.060677
7    taker_buy_base_volume    0.060549
6         number_of_trades    0.060388
11                     day    0.057999
14                  minute    0.055726
12             day_of_week    0.055725
0                     open    0.050008
13                    hour    0.042131


In [110]:
params = {'learning_rate': 0.6, 'max_depth': 8, 'n_estimators': 1800, 'reg_lambda': 1, 'reg_alpha': 0.5, 'min_child_weight': 1, 'objective': 'binary:logistic', 'subsample': 0.9}
best_model = XGBClassifier(**params)
best_model = best_model.fit(X_train, y_train)

In [111]:
y_pred = best_model.predict(X_train)
print(f1_score(y_pred, y_train))

0.7876047743306452


In [114]:
# Predict on test data
y_test_pred = best_model.predict(X_test)

In [115]:
# Create a DataFrame for submission
submission = pd.DataFrame({
    'row_id': test_row_ids,
    'predictions': y_test_pred
})

# Save submission as CSV
submission.to_csv('submission_xgb.csv', index=False)
print("Submission saved as 'submission_xgb.csv'.")

Submission saved as 'submission_xgb.csv'.
