In [None]:
pip install ta



In [None]:
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from xgboost import XGBRegressor

In [None]:
# Fetch stock data using yfinance
def fetch_data(ticker, start_date, end_date):
    """Download historical price data for ``ticker`` with ``yf.download``.

    Args:
        ticker: Symbol forwarded unchanged to ``yf.download``.
        start_date: Forwarded as the ``start`` argument.
        end_date: Forwarded as the ``end`` argument.

    Returns:
        The DataFrame returned by ``yf.download``. If its columns are a
        MultiIndex whose first level is already unique, the columns are
        flattened to that first level so that ``data['Close']`` is a Series.
        (Presumably this is the single-ticker layout of recent yfinance
        releases — confirm against the installed version.) Frames whose
        first level is not unique are returned untouched.

    Raises:
        ValueError: If the download comes back ``None`` or empty, instead of
            letting an empty frame fail later with a less obvious error.
    """
    data = yf.download(ticker, start=start_date, end=end_date)

    if data is None or data.empty:
        raise ValueError(
            f"No data returned for ticker {ticker!r} between {start_date} and {end_date}"
        )

    if isinstance(data.columns, pd.MultiIndex):
        top_level = data.columns.get_level_values(0)
        # Only flatten when no information is lost (one column per field name).
        if top_level.is_unique:
            data.columns = top_level

    return data

# Create features manually without using the `ta` library
def create_features(data):
    """Build technical-indicator features from the ``Close`` column.

    Every feature on a given row is computed only from closes of *earlier*
    rows (each series is shifted by at least one row). This matters because
    ``prepare_data`` uses the same row's ``Close`` as the regression target:
    an unshifted indicator would contain the target itself (for example,
    unshifted momentum plus ``Close_Lag1`` equals ``Close`` exactly).

    The Bollinger bands are the 7-row rolling mean plus/minus two 7-row
    rolling standard deviations of the *price*, so band width is in price
    units like the mean it is added to.

    Args:
        data: DataFrame with a ``Close`` column. It is not modified.

    Returns:
        A new DataFrame containing the original columns plus ``MA5``,
        ``MA10``, ``MA20``, ``MA50``, ``Momentum``, ``Volatility``, ``EMA12``,
        ``EMA26``, ``MACD``, ``Close_Lag1``..``Close_Lag5``,
        ``Rolling_Mean_7``, ``Rolling_Max_7``, ``Upper_Bollinger`` and
        ``Lower_Bollinger``, with every row that contains a NaN dropped.
    """
    # Work on a copy so the caller's frame is left untouched.
    data = data.copy()

    close = data['Close']
    if isinstance(close, pd.DataFrame):
        # A one-column frame (e.g. MultiIndex columns) collapses to a Series;
        # a frame with several columns is left as-is.
        close = close.squeeze(axis="columns")

    # Moving averages of the previous `window` closes
    for window in (5, 10, 20, 50):
        data[f'MA{window}'] = close.rolling(window=window).mean().shift(1)

    # Momentum and volatility, both as of the previous row
    data['Momentum'] = close.diff().shift(1)
    data['Volatility'] = close.pct_change().rolling(window=20).std().shift(1)

    # MACD built from EMAs that stop at the previous row
    ema12 = close.ewm(span=12, adjust=False).mean().shift(1)
    ema26 = close.ewm(span=26, adjust=False).mean().shift(1)
    data['EMA12'] = ema12
    data['EMA26'] = ema26
    data['MACD'] = ema12 - ema26

    # Lagged closes
    for lag in range(1, 6):
        data[f'Close_Lag{lag}'] = close.shift(lag)

    # 7-row rolling statistics and Bollinger bands, as of the previous row
    rolling_7 = close.rolling(window=7)
    rolling_mean_7 = rolling_7.mean().shift(1)
    band_width = 2 * rolling_7.std().shift(1)
    data['Rolling_Mean_7'] = rolling_mean_7
    data['Rolling_Max_7'] = rolling_7.max().shift(1)
    data['Upper_Bollinger'] = rolling_mean_7 + band_width
    data['Lower_Bollinger'] = rolling_mean_7 - band_width

    return data.dropna()

# Prepare data for model
def prepare_data(data):
    """Assemble the model inputs from a feature-engineered frame.

    Args:
        data: DataFrame holding the indicator columns listed below, the
            ``Close_Lag1``..``Close_Lag5`` columns, and ``Close``.

    Returns:
        A 4-tuple ``(X_scaled, y, tscv, scaler)``: the standardized feature
        matrix, the flattened ``Close`` values as the target, a 5-split
        ``TimeSeriesSplit``, and the fitted ``StandardScaler``.
    """
    indicator_columns = [
        'MA5', 'MA10', 'MA20', 'MA50', 'Momentum', 'Volatility', 'MACD',
        'Rolling_Mean_7', 'Rolling_Max_7', 'Upper_Bollinger', 'Lower_Bollinger',
    ]
    lag_columns = [f'Close_Lag{lag}' for lag in range(1, 6)]
    feature_frame = data[indicator_columns + lag_columns]

    # ravel() guarantees a 1-D target even if the selection is 2-D.
    target = data['Close'].values.ravel()

    # NOTE(review): the scaler is fit on all rows here, before any CV split,
    # so statistics from later (test) rows influence the training features.
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(feature_frame)

    splitter = TimeSeriesSplit(n_splits=5)
    return scaled_features, target, splitter, scaler

# Model evaluation
def evaluate_model(X, y, model, tscv):
    """Return the mean RMSE of ``model`` across the folds of ``tscv``.

    Args:
        X: Feature array, indexed positionally with the fold index arrays.
        y: Target array, indexed the same way.
        model: Estimator exposing ``fit`` and ``predict``; it is refit on
            each fold's training rows (the passed object itself is fitted).
        tscv: Splitter whose ``split(X)`` yields ``(train, test)`` indices.

    Returns:
        The average of the per-fold root-mean-squared errors.
    """
    def fold_rmse(train_idx, test_idx):
        # Slice both folds before fitting, then score on the held-out rows.
        X_fit, X_holdout = X[train_idx], X[test_idx]
        y_fit, y_holdout = y[train_idx], y[test_idx]
        model.fit(X_fit, y_fit)
        predictions = model.predict(X_holdout)
        return np.sqrt(mean_squared_error(y_holdout, predictions))

    fold_scores = [fold_rmse(train_idx, test_idx) for train_idx, test_idx in tscv.split(X)]
    return np.mean(fold_scores)

# Hyperparameter tuning for Random Forest
def tune_random_forest(X, y, tscv):
    """Grid-search a ``RandomForestRegressor`` and return the best estimator.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target vector passed to ``GridSearchCV.fit``.
        tscv: Cross-validation splitter handed to ``GridSearchCV`` as ``cv``.

    Returns:
        ``best_estimator_`` of the fitted search. The winning parameter
        combination is printed as a side effect.
    """
    search_space = dict(
        n_estimators=[100, 200, 500],
        max_depth=[10, 20, 30, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        bootstrap=[True, False],
    )
    searcher = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=search_space,
        cv=tscv,
        n_jobs=-1,  # use every available core
        verbose=2,
    )
    searcher.fit(X, y)

    best_params = searcher.best_params_
    print(f"Best Random Forest Params: {best_params}")
    return searcher.best_estimator_

# Hyperparameter tuning for SVM
def tune_svm(X, y, tscv):
    """Grid-search an ``SVR`` and return the best estimator.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target vector passed to ``GridSearchCV.fit``.
        tscv: Cross-validation splitter handed to ``GridSearchCV`` as ``cv``.

    Returns:
        ``best_estimator_`` of the fitted search. The winning parameter
        combination is printed as a side effect.
    """
    search_space = dict(
        C=[0.1, 1, 10],
        gamma=['scale', 'auto'],
        kernel=['linear', 'rbf', 'poly'],
    )
    searcher = GridSearchCV(
        estimator=SVR(),
        param_grid=search_space,
        cv=tscv,
        n_jobs=-1,  # use every available core
        verbose=2,
    )
    searcher.fit(X, y)

    best_params = searcher.best_params_
    print(f"Best SVM Params: {best_params}")
    return searcher.best_estimator_

# Evaluate ARIMA model
def evaluate_arima_model(y, tscv):
    """Return the mean RMSE of an ARIMA(5, 1, 0) fit across the folds of ``tscv``.

    For each fold the model is fit on the training slice of ``y`` and asked
    to forecast as many steps as the test slice has rows.

    Args:
        y: Target array, indexed positionally with the fold index arrays.
        tscv: Splitter whose ``split(y)`` yields ``(train, test)`` indices.

    Returns:
        The average of the per-fold root-mean-squared errors.
    """
    fold_scores = []
    for fit_idx, holdout_idx in tscv.split(y):
        history = y[fit_idx]
        actual = y[holdout_idx]

        # Order is fixed at (5, 1, 0); adjust based on analysis.
        fitted = ARIMA(history, order=(5, 1, 0)).fit()

        forecast = fitted.forecast(steps=len(actual))
        fold_scores.append(np.sqrt(mean_squared_error(actual, forecast)))

    return np.mean(fold_scores)

# Main function to fetch data, generate features, and evaluate models
def main(ticker, start_date, end_date):
    """Fetch data, build features, and report cross-validated RMSE per model.

    Evaluates, in order: a Ridge baseline, a grid-searched random forest, an
    XGBoost regressor, a grid-searched SVR, a stacking ensemble of the last
    three, and an ARIMA model. Each score is printed as it becomes available.

    Args:
        ticker: Symbol passed to ``fetch_data``.
        start_date: Start of the download window, passed to ``fetch_data``.
        end_date: End of the download window, passed to ``fetch_data``.

    Returns:
        dict mapping each printed model label to its mean RMSE, in evaluation
        order, so the scores can be inspected or compared programmatically.
    """
    data = fetch_data(ticker, start_date, end_date)
    data = create_features(data)
    # The fitted scaler is returned by prepare_data but not needed here.
    X_scaled, y, tscv, _scaler = prepare_data(data)

    results = {}

    def report(label, model):
        """Evaluate ``model``, record its RMSE under ``label`` and print it."""
        rmse = evaluate_model(X_scaled, y, model, tscv)
        results[label] = rmse
        print(f"{label} RMSE: {rmse}")

    # Ridge regression baseline (reported under the label "Linear Regression")
    report("Linear Regression", Ridge())

    # Random Forest with hyperparameter tuning
    rf_model = tune_random_forest(X_scaled, y, tscv)
    report("Random Forest", rf_model)

    # XGBoost regressor
    xgb_model = XGBRegressor(n_estimators=200, learning_rate=0.05, random_state=42)
    report("XGBoost", xgb_model)

    # SVM with hyperparameter tuning
    svm_model = tune_svm(X_scaled, y, tscv)
    report("SVM", svm_model)

    # Stacking regressor over the three non-linear models
    stacking_model = StackingRegressor(
        estimators=[('rf', rf_model), ('xgb', xgb_model), ('svm', svm_model)],
        final_estimator=GradientBoostingRegressor()
    )
    report("Stacking Regressor", stacking_model)

    # ARIMA works on the target series alone, so it has its own evaluator
    arima_rmse = evaluate_arima_model(y, tscv)
    results["ARIMA"] = arima_rmse
    print(f"ARIMA RMSE: {arima_rmse}")

    return results

# Run the code
if __name__ == "__main__":
    # Symbol and download window for this run; kept as module-level names.
    ticker, start_date, end_date = "AAPL", "2018-01-01", "2021-12-31"
    main(ticker, start_date, end_date)


[*********************100%***********************]  1 of 1 completed

Linear Regression RMSE: 1.2339620038797294
Fitting 5 folds for each of 216 candidates, totalling 1080 fits





Best Random Forest Params: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Random Forest RMSE: 11.660958758374814
XGBoost RMSE: 12.352171478526191
Stacking Regressor RMSE: 10.615222904839735
ARIMA RMSE: 17.34316360947922
