In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

In [2]:
# Function to fetch data from yfinance
def fetch_data(ticker, start_date, end_date):
    data = yf.download(ticker, start=start_date, end=end_date)
    data.index = data.index.tz_localize(None)
    
    if data.empty:
        raise ValueError(
            "No data returned. Please check the ticker symbol and date range."
        )
    return data

In [3]:
# Function to create features similar to your posted code
def create_features(data):
    # Moving Averages with Lagging
    data["MA5"] = data["Close"].rolling(window=5).mean().shift(1)
    data["MA10"] = data["Close"].rolling(window=10).mean().shift(1)
    data["MA20"] = data["Close"].rolling(window=20).mean().shift(1)
    data["MA50"] = data["Close"].rolling(window=50).mean().shift(1)

    # Momentum and Volatility
    data["Momentum"] = data["Close"].diff().shift(1)
    data["Volatility"] = data["Close"].pct_change().rolling(window=20).std().shift(1)

    # MACD
    data["EMA12"] = data["Close"].ewm(span=12, adjust=False).mean().shift(1)
    data["EMA26"] = data["Close"].ewm(span=26, adjust=False).mean().shift(1)
    data["MACD"] = (data["EMA12"] - data["EMA26"]).shift(1)

    # Lagged values for Close
    for lag in range(1, 6):
        data[f"Close_Lag{lag}"] = data["Close"].shift(lag)

    # Bollinger Bands
    data["Rolling_Mean_7"] = data["Close"].rolling(window=7).mean().shift(1)
    data["Rolling_Std_7"] = data["Close"].rolling(window=7).std().shift(1)
    data["Upper_Bollinger"] = data["Rolling_Mean_7"] + 2 * data["Rolling_Std_7"]
    data["Lower_Bollinger"] = data["Rolling_Mean_7"] - 2 * data["Rolling_Std_7"]

    return data.dropna()

In [4]:
# SVM prediction function
def svm_prediction(ticker, date, start_date="2018-01-01", end_date="2020-01-01"):
    # Fetch historical data from yfinance
    df = fetch_data(ticker, start_date, end_date)
    df.index = pd.to_datetime(df.index)
    df.reset_index(inplace=True)
    df = df.rename(columns={"index": "Date"})

    # Create features
    df = create_features(df)

    # Filter to data before the prediction date for training
    train_data = df[df["Date"] < pd.to_datetime(date)]
    # train_data = df[df["Date"] < pd.to_datetime(date).tz_localize('UTC')]


    # Define features and target
    X = train_data.drop(columns=["Date", "Close"])  # Exclude non-feature columns
    y = train_data["Close"]

    # Train-Test split for SVM using TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)
    best_rmse = float("inf")
    best_model = None

    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Define pipeline with scaling and SVM
        pipeline = Pipeline(
            [
                ("scaler", StandardScaler()),
                ("svm", SVR(C=1.0, gamma="scale", kernel="linear")),
            ]
        )
        pipeline.fit(X_train, y_train)

        # Evaluate performance
        y_pred = pipeline.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = pipeline

    # Use the best model to predict the next day’s price
    latest_data = create_features(df[df["Date"] <= pd.to_datetime(date)].tail(1))
    if latest_data.empty:
        raise ValueError("Insufficient data for SVM prediction.")

    next_day_price = best_model.predict(latest_data.drop(columns=["Date", "Close"]))[0]
    return round(next_day_price, 2)

In [5]:
import datetime

ticker = "AAPL"
# date should be 2020-01-02
date = datetime.datetime(2020, 1, 2)
date

datetime.datetime(2020, 1, 2, 0, 0)

In [6]:
svm_forecast = svm_prediction(ticker, date)
svm_forecast

[*********************100%***********************]  1 of 1 completed
  X = train_data.drop(columns=["Date", "Close"])  # Exclude non-feature columns
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


ValueError: Insufficient data for SVM prediction.