In [None]:
# --- Final Code Modification: 1-Year Data, 1-Day Prediction, Tuned XGBoost (Google Trends Feature Added) ---
import yfinance as yf
import pandas as pd
import numpy as np
import pandas_ta as ta
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pytrends.request import TrendReq

# === Step 1: Data Collection (Using yfinance, 1 Year) ===
# Downloads the last 365 days of BTC-USD bars with yfinance and normalizes the
# column labels to plain strings ('Open', 'High', 'Low', 'Close', 'Volume') so
# the later steps can address them by name. Any failure in this step is
# reported and aborts the run via SystemExit.
ticker = "BTC-USD"
end_date = datetime.now()
start_date = end_date - timedelta(days=365)  # 1 Year
start_date_str = start_date.strftime('%Y-%m-%d')
end_date_str = end_date.strftime('%Y-%m-%d')

try:
    btc_data = yf.download(ticker, start=start_date_str, end=end_date_str, progress=False)
    if btc_data.empty:
        raise ValueError(f"No data downloaded for {ticker}")

    # --- Column Name Handling ---
    if isinstance(btc_data.columns, pd.MultiIndex):
        # Flatten every column tuple to the first level that names a standard
        # price field. The lookup is keyed on the lower-cased label because
        # str.capitalize() turns 'Adj Close' into 'Adj close', which can never
        # equal 'Adj Close' and would leave the Adj Close handling below dead.
        standard_names = {
            name.lower(): name
            for name in ('Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close')
        }
        new_cols = []
        for col_tuple in btc_data.columns.values:
            standard_name = next(
                (
                    standard_names[level.lower()]
                    for level in col_tuple
                    if isinstance(level, str) and level.lower() in standard_names
                ),
                None,
            )
            if standard_name:
                new_cols.append(standard_name)
            else:
                # No recognizable field name: fall back to joining the levels.
                new_cols.append('_'.join(filter(None, map(str, col_tuple))).strip())
        btc_data.columns = new_cols
        # Keep exactly one closing-price column, preferring the raw 'Close'.
        if 'Adj Close' in btc_data.columns and 'Close' in btc_data.columns:
            btc_data.drop(columns=['Adj Close'], inplace=True, errors='ignore')
        elif 'Adj Close' in btc_data.columns and 'Close' not in btc_data.columns:
            btc_data.rename(columns={'Adj Close': 'Close'}, inplace=True)
    elif all(isinstance(c, str) for c in btc_data.columns):
        # Single-level string labels: capitalize them, then apply the same
        # "one closing-price column" rule. After capitalize() an adjusted-close
        # label reads 'Adj close' or 'Adj_close', hence the variations list.
        btc_data.columns = [col.capitalize() for col in btc_data.columns]
        adj_close_variations = ['Adj_close', 'Adj close']
        adj_close_col_found = next((var for var in adj_close_variations if var in btc_data.columns), None)
        if adj_close_col_found:
            if 'Close' not in btc_data.columns:
                btc_data.rename(columns={adj_close_col_found: 'Close'}, inplace=True)
            else:
                btc_data.drop(columns=[adj_close_col_found], inplace=True, errors='ignore')
    else:
        print("Warning: Unexpected column format detected.")  # Kept Warning

except Exception as e:
    print(f"Error during data download or initial column processing: {e}")  # Kept Error
    raise SystemExit(f"Stopping due to data error: {e}") from e


# === Step 1.5: Fetch Google Trends Data ===
# Best-effort enrichment: fetches search interest for "Bitcoin" over the same
# date window as the price data and merges it into btc_data as the
# 'Bitcoin_Trend' column. Any exception is reported and the script continues
# without that feature.
try:
    pytrends = TrendReq(hl='en-US', tz=360)  # Initialize
    kw_list = ["Bitcoin"]  # Keyword to track

    # Define timeframe matching the price data
    timeframe = f'{start_date_str} {end_date_str}'

    pytrends.build_payload(kw_list, cat=0, timeframe=timeframe, geo='', gprop='')
    trends_df = pytrends.interest_over_time()

    if trends_df.empty:
        print("Warning: Google Trends returned no data for the specified timeframe.")  # Kept Warning
    else:
        # Infer the index frequency once; a weekly (Sunday-anchored) or
        # unrecognized frequency is expanded to daily rows below.
        trends_freq = pd.infer_freq(trends_df.index)
        if trends_freq == 'W-SUN' or trends_freq is None:
            # Keep only the keyword column (this also drops 'isPartial'),
            # then resample to daily frequency and forward-fill.
            # NOTE(review): if each weekly value summarizes the whole week,
            # forward-filling from the week's first day exposes later-in-week
            # information to earlier days - confirm this is acceptable.
            trends_df = trends_df[[kw_list[0]]]
            trends_df = trends_df.resample('D').ffill()
        elif 'isPartial' in trends_df.columns:
            trends_df = trends_df.drop(columns=['isPartial'])  # Drop 'isPartial' if daily

        # Rename column for clarity
        trends_df = trends_df.rename(columns={kw_list[0]: "Bitcoin_Trend"})

        # Left merge on the date index so every price row is kept.
        btc_data = btc_data.merge(trends_df, left_index=True, right_index=True, how='left')
        if 'Bitcoin_Trend' in btc_data.columns:
            # Assign the filled Series back instead of calling an inplace
            # method on btc_data['Bitcoin_Trend']: the chained inplace form
            # acts on a temporary under pandas Copy-on-Write and would leave
            # the gaps unfilled. Leading NaNs (trend data starting after the
            # first price row) remain and are removed by the later dropna.
            btc_data['Bitcoin_Trend'] = btc_data['Bitcoin_Trend'].ffill()
        else:
            print("Warning: 'Bitcoin_Trend' column not found after processing trends data.")  # Kept Warning

except Exception as e:
    print(f"Error fetching or processing Google Trends data: {e}")  # Kept Error
    print("Proceeding without Google Trends feature.")  # Kept Status


# === Step 2: Calculate Financial Features ===
# Appends return, range, volatility and pandas_ta indicator columns to
# btc_data in place. Errors are printed and then re-raised.
required_cols_for_ta = ['Close', 'High', 'Low', 'Volume']
missing = [col for col in required_cols_for_ta if col not in btc_data.columns]
if missing:
    raise KeyError(f"Required columns {missing} not found.")

try:
    btc_data['Daily_Return'] = btc_data['Close'].pct_change() * 100
    btc_data['High_Low_Range'] = btc_data['High'] - btc_data['Low']

    # pandas_ta indicators, applied in this order; each call appends its
    # output column(s) to btc_data.
    indicator_calls = (
        ('rsi', {'close': 'Close', 'length': 14}),
        ('macd', {'close': 'Close', 'fast': 12, 'slow': 26, 'signal': 9}),
        ('bbands', {'close': 'Close', 'length': 20, 'std': 2}),
        ('stoch', {'high': 'High', 'low': 'Low', 'close': 'Close', 'k': 14, 'd': 3, 'smooth_k': 3}),
        ('obv', {'close': 'Close', 'volume': 'Volume'}),
        ('adx', {'high': 'High', 'low': 'Low', 'close': 'Close', 'length': 14}),
        ('ema', {'close': 'Close', 'length': 5}),
        ('ema', {'close': 'Close', 'length': 20}),
        ('atr', {'high': 'High', 'low': 'Low', 'close': 'Close', 'length': 14}),
    )
    for indicator_name, indicator_kwargs in indicator_calls:
        getattr(btc_data.ta, indicator_name)(append=True, **indicator_kwargs)

    # 30-row rolling std of log returns, scaled by sqrt(365) and expressed in percent.
    log_return = np.log(btc_data['Close'] / btc_data['Close'].shift(1))
    btc_data['Hist_Vol_30'] = log_return.rolling(window=30).std() * np.sqrt(365) * 100

    # Average per-row change of the RSI over the last three rows.
    rsi_col = next((col for col in btc_data.columns if 'RSI_14' in col), None)
    if rsi_col is None:
        raise KeyError("RSI column not found.")
    btc_data['RSI_gradient'] = btc_data[rsi_col].diff(3) / 3
except Exception as e:
    print(f"Error during feature calculation: {e}")  # Kept Error
    raise e


# === Step 3: Create Target Variable (1-Day) & Drop NaNs ===
# Builds btc_data_1d, a copy of btc_data with the next row's close and a
# binary Target (1 when the next close is strictly higher than the current
# one), then removes every row that contains a NaN in any column.
if 'Close' not in btc_data.columns:
    raise ValueError("'Close' column missing.")

btc_data_1d = btc_data.assign(
    Close_Next_1=lambda frame: frame['Close'].shift(-1),
    Target=lambda frame: frame['Close_Next_1'].gt(frame['Close']).astype(int),
)

initial_rows = len(btc_data_1d)
# The last row has no next close, so it is dropped here together with the
# warm-up rows of the rolling indicators.
btc_data_1d = btc_data_1d.dropna()
if btc_data_1d.empty:
    raise ValueError("DataFrame empty after dropping NaNs.")


# === Step 4: Define Features & Split Data ===
# Selects the feature columns that actually exist in btc_data_1d and splits
# X / y chronologically into train, validation and test partitions.
if 'Target' not in btc_data_1d.columns:
    raise ValueError("Target column missing.")
y = btc_data_1d['Target']

# Candidate feature names; entries absent from the DataFrame are skipped below.
all_potential_features = [
    'Daily_Return', 'High_Low_Range', 'RSI_14', 'MACD_12_26_9', 'MACDh_12_26_9',
    'MACDs_12_26_9', 'EMA_5', 'EMA_20', 'BBL_20_2.0', 'BBM_20_2.0', 'BBU_20_2.0',
    'BBB_20_2.0', 'BBP_20_2.0', 'STOCHk_14_3_3', 'STOCHd_14_3_3', 'OBV',
    'ADX_14', 'DMP_14', 'DMN_14', 'Hist_Vol_30', 'RSI_gradient',
    'ATRr_14',  # Placeholder
    'ROC_10', 'WILLR_14', 'CMF_20',  # Previously added features
    'Bitcoin_Trend',
]


def _find_indicator_column(columns, token, fallback):
    """Return the first column containing *token* (upper-cased match) and '14', else *fallback*."""
    return next((col for col in columns if token in col.upper() and '14' in col), fallback)


def _swap_feature_name(features, expected, actual):
    """Replace *expected* with *actual* inside *features* (in place) when they differ."""
    if expected in features and actual != expected:
        features[features.index(expected)] = actual


# Use whatever RSI / ATR column names are actually present in the frame.
rsi_col_actual = _find_indicator_column(btc_data_1d.columns, 'RSI', 'RSI_14')
_swap_feature_name(all_potential_features, 'RSI_14', rsi_col_actual)

atr_col_actual = _find_indicator_column(btc_data_1d.columns, 'ATR', 'ATRr_14')
_swap_feature_name(all_potential_features, 'ATRr_14', atr_col_actual)

X_cols = [col for col in all_potential_features if col in btc_data_1d.columns]

# Keep X_cols consistent with the presence of the Google Trends column.
trend_available = 'Bitcoin_Trend' in btc_data_1d.columns
if not trend_available:
    print("Warning: 'Bitcoin_Trend' feature not found, excluding it.")  # Kept Warning
    if 'Bitcoin_Trend' in X_cols:
        X_cols.remove('Bitcoin_Trend')
elif 'Bitcoin_Trend' not in X_cols:
    X_cols.append('Bitcoin_Trend')

X = btc_data_1d[X_cols]

# Split data (70/15/15) in row order; the test partition takes the remainder.
train_size_pct = 0.70
val_size_pct = 0.15
n_total = len(X)
n_train = int(n_total * train_size_pct)
n_val = int(n_total * val_size_pct)
n_test = n_total - n_train - n_val
if min(n_train, n_val, n_test) <= 0:
    raise ValueError(f"Invalid split sizes for {n_total} samples.")

val_end = n_train + n_val
X_train, X_val, X_test = X.iloc[:n_train], X.iloc[n_train:val_end], X.iloc[val_end:]
y_train, y_val, y_test = y.iloc[:n_train], y.iloc[n_train:val_end], y.iloc[val_end:]
# print(f"Data split into Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}") # Removed

# === Step 5: Apply Scaling ===
# The scaler is fitted on the training partition only and then applied to all
# three partitions.
scaler = StandardScaler().fit(X_train)


def _scaled_like(frame):
    """Return *frame* transformed by the fitted scaler, keeping its columns and index."""
    return pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)


X_train_scaled = _scaled_like(X_train)
X_val_scaled = _scaled_like(X_val)
X_test_scaled = _scaled_like(X_test)
# print("Scaling complete.") # Removed

# === Step 6: Train Tuned XGBoost Model ===
# Fits an XGBoost classifier with fixed, previously tuned hyper-parameters on
# the scaled training partition, then reports test-set metrics and feature
# importances. Errors raised during fit/evaluation are printed, not re-raised.
best_params_xgb = {
    'subsample': 0.6,
    'reg_lambda': 1,
    'reg_alpha': 0.01,
    'n_estimators': 200,
    'max_depth': 9,
    'learning_rate': 0.1,
    'gamma': 0,
    'colsample_bytree': 0.8,
}
# Keep only the keys that XGBClassifier exposes as parameters.
known_xgb_params = xgb.XGBClassifier().get_params()
xgb_specific_params = {
    name: value for name, value in best_params_xgb.items() if name in known_xgb_params
}
model_with_trend = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    random_state=42,
    **xgb_specific_params,
)

try:
    # Train the model
    model_with_trend.fit(X_train_scaled, y_train, verbose=False)

    # === Step 7: Evaluate on Test Set ===
    y_pred_test_trend = model_with_trend.predict(X_test_scaled)
    y_pred_proba_test_trend = model_with_trend.predict_proba(X_test_scaled)[:, 1]

    # --- Keep Final Output ---
    print("\n--- FINAL Model Evaluation on TEST SET (1-Day Prediction, Google Trend Included) ---")
    print(f"Accuracy: {accuracy_score(y_test, y_pred_test_trend):.4f}")
    label_based_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for metric_label, metric_fn in label_based_metrics:
        print(f"{metric_label}: {metric_fn(y_test, y_pred_test_trend, zero_division=0):.4f}")

    # ROC-AUC is only reported when the test labels contain more than one class.
    try:
        if len(np.unique(y_test)) <= 1:
            print("ROC-AUC Score: Not defined (only one class in test set)")
        else:
            roc_auc = roc_auc_score(y_test, y_pred_proba_test_trend)
            print(f"ROC-AUC Score: {roc_auc:.4f}")
    except ValueError as e:
        print(f"Could not calculate ROC-AUC: {e}")  # Kept Error

    # --- Feature Importance ---
    print("\nFeature Importances (1-Day Prediction Model, Google Trend Included):")
    importances_trend = model_with_trend.feature_importances_
    feature_names_trend = X_train_scaled.columns
    final_importances_trend = pd.Series(importances_trend, index=feature_names_trend)
    final_importances_trend = final_importances_trend.sort_values(ascending=False)
    print(final_importances_trend)  # Keep feature importance list

    # Plot
    plt.figure(figsize=(10, 9))
    try:
        sns.barplot(x=final_importances_trend, y=final_importances_trend.index)
        plt.title('Feature Importances (1-Day Prediction, Google Trend Included)')
        plt.xlabel('Importance Score')
        plt.ylabel('Features')
        plt.tight_layout()
        plt.show()
    except NameError:
        print("\n(Seaborn not imported, skipping plot)")  # Kept Warning

# Keep Error Handling Blocks
except Exception as e:
    # First matching type wins, mirroring an except-clause chain.
    error_prefixes = (
        (NameError, "Error: Required variables not found. Details: "),
        (KeyError, "Error: Column key error. Details: "),
        (ValueError, "Error: Value error (check data/split). Details: "),
    )
    prefix = next(
        (text for exc_type, text in error_prefixes if isinstance(e, exc_type)),
        "An error occurred: ",
    )
    print(f"{prefix}{e}")


Fetching Google Trends data for 'Bitcoin'...
Attempt 1 of 5 to fetch Google Trends data...
Attempt 1 failed: The request failed: Google returned a response with code 429
Retrying in 60 seconds...
