In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import kagglehub 

# Download the Kaggle dataset and report the value returned by kagglehub,
# which the rest of the notebook uses as the directory holding the files.
DATASET_HANDLE = "mczielinski/bitcoin-historical-data"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

# Show which files were downloaded (cell output).
os.listdir(path)


In [None]:
# Load the 1-minute BTC/USD candles from the downloaded dataset directory.
file_name = "btcusd_1-min_data.csv"
file_path = os.path.join(path, file_name)

df_min = pd.read_csv(file_path)

# DataFrame.info() prints its summary and returns None, so it is called as a
# statement; the frame preview is left as the cell's last expression so the
# notebook renders it as a table instead of a `(DataFrame, None)` tuple.
df_min.info()
df_min.head()


In [None]:
# Convert the 'Timestamp' column (interpreted as seconds, unit='s') into a
# datetime index so the frame can be resampled by time.
df_min['timestamp'] = pd.to_datetime(df_min['Timestamp'], unit='s')
df_min = df_min.set_index('timestamp')

# Aggregate 1-minute candles into hourly OHLCV bars.
# The lowercase 'h' alias is used because the uppercase 'H' alias is
# deprecated in recent pandas releases and emits a FutureWarning.
df_hour = df_min.resample('h').agg({
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum'
}).dropna()  # drop hourly rows where any aggregated column is NaN

# DataFrame.info() prints and returns None; keep it as a statement and leave
# the preview as the last expression so it renders as a table.
df_hour.info()
df_hour.head()


In [None]:
import matplotlib.pyplot as plt

# Line chart of the hourly close price over the full history.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df_hour.index, df_hour['Close'])
ax.set_title("Hourly Bitcoin Close Price")
ax.set_xlabel("Time")
ax.set_ylabel("Price (USD)")
plt.show()


In [None]:
# Work on a copy so the hourly bars in df_hour stay untouched.
df = df_hour.copy()

# One-hour simple return of the close price.
df['return_1h'] = df['Close'].pct_change()

# Rolling means of the close over 5, 10 and 20 hourly rows.
df['ma_5'] = df['Close'].rolling(5).mean()
df['ma_10'] = df['Close'].rolling(10).mean()
df['ma_20'] = df['Close'].rolling(20).mean()

# Rolling standard deviation of the hourly return (10 rows).
df['volatility_10'] = df['return_1h'].rolling(10).std()

# Rolling mean of traded volume (10 rows).
df['volume_10'] = df['Volume'].rolling(10).mean()

# Short moving average relative to the long one.
df['ma_ratio_5_20'] = df['ma_5'] / df['ma_20']

# Label: 1 when the next row's close is strictly higher than this row's close.
# The final row has no next close (NaN), so it is removed by dropna() below.
df['close_next'] = df['Close'].shift(-1)
df['target'] = (df['close_next'] > df['Close']).astype(int)

# Remove the rolling-window warm-up rows and the unlabeled final row.
df = df.dropna()

# DataFrame.info() prints and returns None; keep it as a statement and leave
# the preview as the last expression so it renders as a table.
df.info()
df.head()


In [None]:
# Columns fed to the classifiers.
feature_cols = [
    'return_1h',
    'ma_5', 'ma_10', 'ma_20',
    'volatility_10',
    'volume_10',
    'ma_ratio_5_20',
]

X, y = df[feature_cols], df['target']

# Chronological split: the first 80% of rows train, the remaining 20% test
# (no shuffling, so the test rows come after the training rows).
train_size = int(0.8 * len(df))
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Logistic regression on standardized features; the scaler lives inside the
# pipeline so it is fitted on the training rows only.
log_reg_clf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
])

# Random forest with a fixed seed for repeatable results.
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)

# fit() returns the fitted estimator, so training and test-set prediction
# are chained per model.
y_pred_lr = log_reg_clf.fit(X_train, y_train).predict(X_test)
y_pred_rf = rf_clf.fit(X_train, y_train).predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate(name, y_true, y_pred):
    """Print accuracy, confusion matrix and classification report for a model.

    Parameters
    ----------
    name : label printed in the header line.
    y_true : ground-truth labels.
    y_pred : predicted labels, aligned with ``y_true``.
    """
    print("====", name, "====")
    # (label, metric) pairs, printed in this order.
    sections = (
        ("Accuracy:", accuracy_score),
        ("Confusion Matrix:\n", confusion_matrix),
        ("Classification Report:\n", classification_report),
    )
    for label, metric in sections:
        print(label, metric(y_true, y_pred))

# Report both classifiers on the held-out rows.
for model_name, predictions in (
    ("Logistic Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    evaluate(model_name, y_test, predictions)

# Accuracy obtained by always predicting the more frequent class in y_test.
up_share = y_test.mean()
baseline = max(up_share, 1 - up_share)
print("Baseline (majority guess):", baseline)


In [None]:
# Horizontal bar chart of the fitted random forest's feature importances,
# one bar per entry of feature_cols.
importances = rf_clf.feature_importances_
fig, ax = plt.subplots(figsize=(6, 4))
ax.barh(feature_cols, importances)
ax.set_title("Random Forest Feature Importance")
plt.show()


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# One confusion-matrix figure per model, each shown before the next is drawn.
for predictions, title in (
    (y_pred_lr, "Logistic Regression Confusion Matrix"),
    (y_pred_rf, "Random Forest Confusion Matrix"),
):
    ConfusionMatrixDisplay.from_predictions(y_test, predictions)
    plt.title(title)
    plt.show()


In [None]:
# Same forest configuration as rf_clf but with tree depth capped at 5.
rf_small = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)

# fit() returns the fitted estimator, so prediction is chained onto it.
y_pred_small = rf_small.fit(X_train, y_train).predict(X_test)

evaluate("Random Forest (max_depth=5)", y_test, y_pred_small)


In [None]:
# Histogram of the hourly return column (50 bins).
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(df['return_1h'], bins=50)
ax.set_title("Distribution of Hourly Returns")
ax.set_xlabel("Return (1h)")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
import seaborn as sns

# Pairwise correlation of every column in df (this includes the raw OHLCV
# columns, close_next and target, not only the model features).
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(), cmap='coolwarm', annot=False, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()
