In [24]:
import yfinance as yf

# Example: get 1 month of daily data for Apple
data = yf.download(tickers="AAPL", period="1mo", interval="1h")  # 1-hour interval
data.dropna(inplace=True)
print(data.head())


  data = yf.download(tickers="AAPL", period="1mo", interval="1h")  # 1-hour interval
[*********************100%***********************]  1 of 1 completed

Price                           Close        High         Low        Open  \
Ticker                           AAPL        AAPL        AAPL        AAPL   
Datetime                                                                    
2025-09-04 13:30:00+00:00  237.690002  239.893204  237.039993  238.449997   
2025-09-04 14:30:00+00:00  237.399994  238.100006  237.074997  237.679993   
2025-09-04 15:30:00+00:00  237.160004  237.674606  236.740005  237.389999   
2025-09-04 16:30:00+00:00  238.039993  238.089996  236.970001  237.149994   
2025-09-04 17:30:00+00:00  238.059998  238.389999  237.509995  238.040100   

Price                        Volume  
Ticker                         AAPL  
Datetime                             
2025-09-04 13:30:00+00:00  12016131  
2025-09-04 14:30:00+00:00   5223309  
2025-09-04 15:30:00+00:00   3678939  
2025-09-04 16:30:00+00:00   3071522  
2025-09-04 17:30:00+00:00   5093419  





In [25]:
import pandas as pd

df = data.copy()
# Price movement features
df['Price_Change'] = df['Close'] - df['Open']
df['High_Low'] = df['High'] - df['Low']
df['MA5'] = df['Close'].rolling(5).mean()
df['MA10'] = df['Close'].rolling(10).mean()
df['Volume_Change'] = df['Volume'].pct_change()

# Target variable: will price go UP in next period?
df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)

# Drop NaNs generated by rolling / shift
df.dropna(inplace=True)

print(df.head())


Price                           Close        High         Low        Open  \
Ticker                           AAPL        AAPL        AAPL        AAPL   
Datetime                                                                    
2025-09-05 15:30:00+00:00  238.770004  240.315002  238.520004  239.820007   
2025-09-05 16:30:00+00:00  239.625000  239.725006  238.490097  238.779999   
2025-09-05 17:30:00+00:00  239.119995  239.649994  238.729996  239.648499   
2025-09-05 18:30:00+00:00  239.925003  240.130005  239.100006  239.119995   
2025-09-05 19:30:00+00:00  239.669998  240.014999  239.059998  239.929993   

Price                       Volume Price_Change  High_Low         MA5  \
Ticker                        AAPL                                      
Datetime                                                                
2025-09-05 15:30:00+00:00  4219870    -1.050003  1.794998  239.254901   
2025-09-05 16:30:00+00:00  3993359     0.845001  1.234909  239.439001   
2025-09-05 17:30:0

In [33]:
print(df.columns.tolist())


['Close_AAPL', 'High_AAPL', 'Low_AAPL', 'Open_AAPL', 'Volume_AAPL', 'Price_Change_', 'High_Low_', 'MA5_', 'MA10_', 'Volume_Change_', 'Target_']


In [35]:
print(df.head())


                           Close_AAPL   High_AAPL    Low_AAPL   Open_AAPL  \
Datetime                                                                    
2025-09-05 15:30:00+00:00  238.770004  240.315002  238.520004  239.820007   
2025-09-05 16:30:00+00:00  239.625000  239.725006  238.490097  238.779999   
2025-09-05 17:30:00+00:00  239.119995  239.649994  238.729996  239.648499   
2025-09-05 18:30:00+00:00  239.925003  240.130005  239.100006  239.119995   
2025-09-05 19:30:00+00:00  239.669998  240.014999  239.059998  239.929993   

                           Volume_AAPL  Price_Change_  High_Low_        MA5_  \
Datetime                                                                       
2025-09-05 15:30:00+00:00      4219870      -1.050003   1.794998  239.254901   
2025-09-05 16:30:00+00:00      3993359       0.845001   1.234909  239.439001   
2025-09-05 17:30:00+00:00      2784543      -0.528503   0.919998  239.320999   
2025-09-05 18:30:00+00:00      5197093       0.805008   1.02

In [27]:
# Flatten multi-level columns if necessary
if isinstance(df.columns, pd.MultiIndex):
    df.columns = ['_'.join(col).strip() for col in df.columns]

# Now clean column names for LightGBM
df.columns = [
    col.replace(" ", "_")
       .replace("(", "")
       .replace(")", "")
       .replace("%", "pct")
       .replace("/", "_")
       .replace("-", "_")
    for col in df.columns
]


In [31]:
# Flatten multi-index if exists
if isinstance(df.columns, pd.MultiIndex):
    df.columns = ['_'.join(col).strip() for col in df.columns]

# Clean names
df.columns = [col.replace(" ", "_").replace("(", "").replace(")", "")
                  .replace("%", "pct").replace("/", "_").replace("-", "_")
              for col in df.columns]

print(df.columns.tolist())  # Verify that all feature names exist


['Close_AAPL', 'High_AAPL', 'Low_AAPL', 'Open_AAPL', 'Volume_AAPL', 'Price_Change_', 'High_Low_', 'MA5_', 'MA10_', 'Volume_Change_', 'Target_']


In [38]:
features = print(df.head())
features = ['Price_Change_', 'High_Low_', 'MA5_', 'MA10_', 'Volume_Change_']
X = df[features]
y = df['Target_']


                           Close_AAPL   High_AAPL    Low_AAPL   Open_AAPL  \
Datetime                                                                    
2025-09-05 15:30:00+00:00  238.770004  240.315002  238.520004  239.820007   
2025-09-05 16:30:00+00:00  239.625000  239.725006  238.490097  238.779999   
2025-09-05 17:30:00+00:00  239.119995  239.649994  238.729996  239.648499   
2025-09-05 18:30:00+00:00  239.925003  240.130005  239.100006  239.119995   
2025-09-05 19:30:00+00:00  239.669998  240.014999  239.059998  239.929993   

                           Volume_AAPL  Price_Change_  High_Low_        MA5_  \
Datetime                                                                       
2025-09-05 15:30:00+00:00      4219870      -1.050003   1.794998  239.254901   
2025-09-05 16:30:00+00:00      3993359       0.845001   1.234909  239.439001   
2025-09-05 17:30:00+00:00      2784543      -0.528503   0.919998  239.320999   
2025-09-05 18:30:00+00:00      5197093       0.805008   1.02

In [39]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

model = LGBMClassifier()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Model Accuracy:", acc)


LightGBMError: Do not support special JSON characters in feature name.

In [40]:
print(df.columns.tolist())


['Close_AAPL', 'High_AAPL', 'Low_AAPL', 'Open_AAPL', 'Volume_AAPL', 'Price_Change_', 'High_Low_', 'MA5_', 'MA10_', 'Volume_Change_', 'Target_']


In [41]:
features = ['Price_Change_', 'High_Low_', 'MA5_', 'MA10_', 'Volume_Change_']
X = df[features]
y = df['Target_']


In [42]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train LightGBM
model = LGBMClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


[LightGBM] [Info] Number of positive: 57, number of negative: 59
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 199
[LightGBM] [Info] Number of data points in the train set: 116, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.491379 -> initscore=-0.034486
[LightGBM] [Info] Start training from score -0.034486
Accuracy: 0.41379310344827586


In [43]:
# Predict for the latest available data point
latest_features = X.tail(1)
prediction = model.predict(latest_features)[0]
direction = "UP ⬆️" if prediction == 1 else "DOWN ⬇️"
print("Next movement prediction:", direction)


Next movement prediction: DOWN ⬇️
