In [80]:
import ccxt
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

# 初始化ccxt
exchange = ccxt.binance()

# 获取ETH的月度K线数据，时间范围为1个月（30天）
symbol = 'ETH/USDT'
timeframe = '1M'  # 每月的K线数据
limit = 1000  # 获取1000条月度K线数据（尽量多获取数据）

# 获取数据
ohlcv = exchange.fetch_ohlcv(symbol, timeframe, limit=limit)

# 转换为DataFrame
df = pd.DataFrame(ohlcv, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])

# 转换时间戳为日期格式
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

# 提取月份特征
df['month'] = df['timestamp'].dt.month
df['year'] = df['timestamp'].dt.year

print(df)

# 计算月度回报 (monthly_return)
df['monthly_return'] = (df['close'] - df['open']) / df['open']*100

# # 计算最大回撤 (max_drawdown) - 当前月的开盘到最低价格的回撤
# df['max_drawdown'] = (df['open'] - df['low']) / df['open']*100
df['max_drawdown'] = (df['close'] - df['open']) / df['open']*100
#max_drawdown should be negative
# df['max_drawdown'] = -df['max_drawdown']

# 目标值是下一个月的最大回撤，使用shift()将数据向前移动1步
df['next_month_max_drawdown'] = df['max_drawdown'].shift(-1)

# 删除包含NaN的行（由于最后一个月的目标是NaN）
df.dropna(subset=['next_month_max_drawdown'], inplace=True)

# 获取2024年11月的数据作为特征进行预测
november_data = df[df['month'] == 11].iloc[-1]  # 获取11月的数据（假设是最后一条记录）

# 删除2024年11月和12月的数据
# df = df[~((df['year'] == 2024) & (df['month'].isin([11, 12])))]

#Add a new feature, which is the number of months since the last halving,一次比特幣減半發生在2016年7月9日，接下來是第三次減半，發生在2020年5月11日。 最後一次減半發生在2024年4月19日
halving_dates = ['2016-07-09', '2020-05-11', '2024-04-19']
halving_dates = pd.to_datetime(halving_dates)

def months_since_last_halving(timestamp):
    # Find the most recent halving date
    last_halving_date = halving_dates[halving_dates <= timestamp].max()
    # Calculate the difference in months
    months_diff = (timestamp.year - last_halving_date.year) * 12 + (timestamp.month - last_halving_date.month)
    return months_diff

# Apply the function to each row in the DataFrame
df['months_since_last_halving'] = df['timestamp'].apply(months_since_last_halving)




    timestamp     open     high      low    close        volume  month  year
0  2017-08-01   301.13   393.71   144.21   384.79  8.265256e+04      8  2017
1  2017-09-01   386.44   394.39   192.00   304.36  1.679371e+05      9  2017
2  2017-10-01   305.13   354.00   272.20   304.90  2.311378e+05     10  2017
3  2017-11-01   304.89   515.00   274.73   427.43  5.581401e+05     11  2017
4  2017-12-01   428.05   864.90   375.01   733.98  1.709681e+06     12  2017
..        ...      ...      ...      ...      ...           ...    ...   ...
84 2024-08-01  3232.74  3242.57  2111.00  2513.01  1.252791e+07      8  2024
85 2024-09-01  2513.00  2728.60  2150.55  2602.23  7.928516e+06      9  2024
86 2024-10-01  2602.24  2769.48  2310.00  2518.61  8.867194e+06     10  2024
87 2024-11-01  2518.61  3738.98  2357.59  3703.60  1.836524e+07     11  2024
88 2024-12-01  3703.59  3746.80  3659.20  3721.99  3.394719e+05     12  2024

[89 rows x 8 columns]


In [81]:
df

Unnamed: 0,timestamp,open,high,low,close,volume,month,year,monthly_return,max_drawdown,next_month_max_drawdown,months_since_last_halving
0,2017-08-01,301.13,393.71,144.21,384.79,8.265256e+04,8,2017,27.782021,27.782021,-21.240037,13
1,2017-09-01,386.44,394.39,192.00,304.36,1.679371e+05,9,2017,-21.240037,-21.240037,-0.075378,14
2,2017-10-01,305.13,354.00,272.20,304.90,2.311378e+05,10,2017,-0.075378,-0.075378,40.191544,15
3,2017-11-01,304.89,515.00,274.73,427.43,5.581401e+05,11,2017,40.191544,40.191544,71.470623,16
4,2017-12-01,428.05,864.90,375.01,733.98,1.709681e+06,12,2017,71.470623,71.470623,53.450840,17
...,...,...,...,...,...,...,...,...,...,...,...,...
83,2024-07-01,3438.16,3562.82,2810.00,3232.74,8.935241e+06,7,2024,-5.974707,-5.974707,-22.263776,3
84,2024-08-01,3232.74,3242.57,2111.00,2513.01,1.252791e+07,8,2024,-22.263776,-22.263776,3.550736,4
85,2024-09-01,2513.00,2728.60,2150.55,2602.23,7.928516e+06,9,2024,3.550736,3.550736,-3.213770,5
86,2024-10-01,2602.24,2769.48,2310.00,2518.61,8.867194e+06,10,2024,-3.213770,-3.213770,47.049365,6


In [70]:
#show length of df 
print(len(df))



88


In [71]:
november_data

timestamp                  2024-11-01 00:00:00
open                                   2518.61
high                                   3738.98
low                                    2357.59
close                                   3703.6
volume                           18365244.9426
month                                       11
year                                      2024
monthly_return                       47.049365
max_drawdown                         -6.393209
next_month_max_drawdown              -1.198567
Name: 87, dtype: object

In [82]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline


# Features and target
X = df[['months_since_last_halving','month', 'volume', 'monthly_return']]
y = df['next_month_max_drawdown']

# Apply Polynomial Feature Transformation (Optional)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Train-test split (You can modify if you have more data)
X_train = X_poly
y_train = y
X_test = X_poly
y_test = y

# 1. Linear Regression (with Ridge)
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
rmse_linear = mean_squared_error(y_test, y_pred_linear, squared=False)

# 2. Ridge Regression with Hyperparameter Tuning
ridge_model = Ridge()
ridge_params = {'alpha': [0.1, 1, 10, 100]}
ridge_grid_search = GridSearchCV(ridge_model, ridge_params, cv=5)
ridge_grid_search.fit(X_train, y_train)
best_ridge_model = ridge_grid_search.best_estimator_
y_pred_ridge = best_ridge_model.predict(X_test)
rmse_ridge = mean_squared_error(y_test, y_pred_ridge, squared=False)

# 3. Random Forest Regressor with Hyperparameter Tuning
rf_model = RandomForestRegressor(random_state=42)
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_grid_search = GridSearchCV(rf_model, rf_params, cv=5)
rf_grid_search.fit(X_train, y_train)
best_rf_model = rf_grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)

# 4. XGBoost with Hyperparameter Tuning
xg_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
xg_params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0],
    'alpha': [0, 0.1, 1]
}
xg_grid_search = GridSearchCV(xg_model, xg_params, cv=5)
xg_grid_search.fit(X_train, y_train)
best_xg_model = xg_grid_search.best_estimator_
y_pred_xg = best_xg_model.predict(X_test)
rmse_xg = mean_squared_error(y_test, y_pred_xg, squared=False)

# Print RMSE for each model
print(f"RMSE (Linear Regression): {rmse_linear}")
print(f"RMSE (Ridge Regression): {rmse_ridge}")
print(f"RMSE (Random Forest): {rmse_rf}")
print(f"RMSE (XGBoost): {rmse_xg}")



  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

RMSE (Linear Regression): 24.61066269268227
RMSE (Ridge Regression): 24.634199632759977
RMSE (Random Forest): 20.202928252206803
RMSE (XGBoost): 21.785661468373494




In [84]:
# Now, use the best model to predict December's max drawdown (based on November's data)
november_features = {
    'months_since_last_halving': 7,
    'month': 11,
    'volume': 18365244.9426,  # Replace with actual data if available
    'monthly_return': 47.049365  # Replace with actual data if available
    
}

november_df = pd.DataFrame([november_features])
november_poly = poly.transform(november_df)

# Predict using each model
prediction_linear = linear_model.predict(november_poly)
prediction_ridge = best_ridge_model.predict(november_poly)
prediction_rf = best_rf_model.predict(november_poly)
prediction_xg = best_xg_model.predict(november_poly)

# Print the predictions
print(f"Prediction (Linear Regression): {prediction_linear[0]}")
print(f"Prediction (Ridge Regression): {prediction_ridge[0]}")
print(f"Prediction (Random Forest): {prediction_rf[0]}")
print(f"Prediction (XGBoost): {prediction_xg[0]}")


Prediction (Linear Regression): 37.022839496084764
Prediction (Ridge Regression): 37.24538274844899
Prediction (Random Forest): 21.511876271467482
Prediction (XGBoost): 22.0854434967041


In [1]:
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Define features (X) and target (y)
X = df[['month', 'volume', 'monthly_return']]
y = df['next_month_max_drawdown']

# Time series split (cross-validation)
tscv = TimeSeriesSplit(n_splits=5)

# Hyperparameters for LightGBM
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 3,  # Smaller number of leaves for small data
    'learning_rate': 0.1,  # Higher learning rate
    'feature_fraction': 0.8,  # Keep this moderate
    'max_depth': 2,  # Very shallow trees
    'min_data_in_leaf': 10,  # More data per leaf to avoid overfitting
    'lambda_l1': 0.1,  # L1 regularization to prevent overfitting
    'lambda_l2': 0.1   # L2 regularization to prevent overfitting
}



# Training using time series cross-validation
rmse_scores = []

best_bst = None  # Store the best model

for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    train_data = lgb.Dataset(X_train, label=y_train)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
    
    # Train the model without early stopping
    bst = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[test_data])
    
    # Track the best model based on the validation RMSE
    if best_bst is None or bst.best_score['valid_0']['rmse'] < best_bst.best_score['valid_0']['rmse']:
        best_bst = bst
    
    # Predict and evaluate the current fold
    y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    rmse_scores.append(rmse)

# Average RMSE across folds
average_rmse = np.mean(rmse_scores)
print(f"Average RMSE (Cross-validation): {average_rmse}")


# Now, use the model to predict December's max drawdown (based on November's data)
november_features = {
    'month': 11,
    'volume': 18365244.9426,  # Replace with actual data if available
    'monthly_return': 47.049365  # Replace with actual data if available
}

november_features_df = pd.DataFrame([november_features])

# Predict December max drawdown
predicted_max_drawdown_december = bst.predict(november_features_df)
print(f"Predicted max drawdown for December 2024: {predicted_max_drawdown_december[0]}")


NameError: name 'df' is not defined

In [16]:
df


Unnamed: 0,timestamp,open,high,low,close,volume,month,year,monthly_return,max_drawdown,next_month_max_drawdown
0,2017-08-01,301.13,393.71,144.21,384.79,8.265256e+04,8,2017,0.277820,0.521104,0.503157
1,2017-09-01,386.44,394.39,192.00,304.36,1.679371e+05,9,2017,-0.212400,0.503157,0.107921
2,2017-10-01,305.13,354.00,272.20,304.90,2.311378e+05,10,2017,-0.000754,0.107921,0.098921
3,2017-11-01,304.89,515.00,274.73,427.43,5.581401e+05,11,2017,0.401915,0.098921,0.123911
4,2017-12-01,428.05,864.90,375.01,733.98,1.709681e+06,12,2017,0.714706,0.123911,0.022114
...,...,...,...,...,...,...,...,...,...,...,...
81,2024-05-01,3014.04,3977.00,2817.00,3762.29,1.159904e+07,5,2024,0.248255,0.065374,0.138822
82,2024-06-01,3762.29,3887.47,3240.00,3438.16,7.615637e+06,6,2024,-0.086152,0.138822,0.182702
83,2024-07-01,3438.16,3562.82,2810.00,3232.74,8.935241e+06,7,2024,-0.059747,0.182702,0.346994
84,2024-08-01,3232.74,3242.57,2111.00,2513.01,1.252791e+07,8,2024,-0.222638,0.346994,0.144230
