In [12]:
# Install packages if needed
#!pip install yfinance ta pandas scikit-learn joblib
#!pip install vaderSentiment

# --- 1. Import Libraries ---
import dill
import joblib
import numpy as np
import pandas as pd
import requests
import ta
import yfinance as yf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# --- 2. Define Stock Pool ---
# Universe of symbols and the calendar window requested from the data provider.
TICKERS = ['AAPL', 'GOOGL', 'AMZN', 'MSFT', 'NVDA']
START_DATE = '2018-01-01'
END_DATE = '2025-04-30'

# --- 3. Download Historical Data ---
# group_by='ticker' is relied on by create_features, which iterates over the
# first level of the resulting column MultiIndex (one sub-frame per ticker).
data = yf.download(
    TICKERS,
    start=START_DATE,
    end=END_DATE,
    group_by='ticker',
    auto_adjust=True,
)

# --- 4. Feature Engineering Function ---
def create_features(data):
    """Build one long table of price-derived features for every ticker.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are a MultiIndex with the ticker on level 0 and a
        'Close' column available under each ticker.

    Returns
    -------
    pandas.DataFrame
        The per-ticker frames stacked on top of each other (in the order of
        ``data.columns.levels[0]``), each extended with the columns
        return_5d, return_20d, volatility_20d, rsi_14, macd, macd_signal,
        bollinger_h, bollinger_l and ticker. The original index is moved
        into a regular column and replaced by a fresh integer index.
    """
    per_ticker = []

    for symbol in data.columns.levels[0]:
        prices = data[symbol]
        close = prices['Close']
        close_1d = close.squeeze()  # the ta indicators are fed the squeezed series

        rsi_indicator = ta.momentum.RSIIndicator(close_1d, window=14)
        macd_indicator = ta.trend.MACD(close_1d)
        bands = ta.volatility.BollingerBands(close_1d)

        # assign() returns a new frame, so `data` itself is never modified.
        per_ticker.append(
            prices.assign(
                return_5d=close.pct_change(5),
                return_20d=close.pct_change(20),
                volatility_20d=close.pct_change().rolling(20).std(),
                rsi_14=rsi_indicator.rsi(),
                macd=macd_indicator.macd(),
                macd_signal=macd_indicator.macd_signal(),
                bollinger_h=bands.bollinger_hband(),
                bollinger_l=bands.bollinger_lband(),
                ticker=symbol,
            )
        )

    return pd.concat(per_ticker).reset_index()

# --- 5. Create Features ---
feature_data = create_features(data)

# --- 6. Create Label (future return over next 20 trading days) ---
# Shift within each ticker so a row never picks up another ticker's price;
# the last 20 rows of every ticker therefore get a NaN label.
close_in_20d = feature_data.groupby('ticker')['Close'].shift(-20)
feature_data['future_return_20d'] = close_in_20d / feature_data['Close'] - 1

# --- 7. Add Fundamentals ---

# --- Fundamental Data (using yfinance) ---
def get_fundamentals(ticker):
    """Look up the trailing P/E and price-to-book ratios for one ticker.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.Ticker``.

    Returns
    -------
    tuple
        ``(pe_ratio, pb_ratio)`` read from the ``'trailingPE'`` and
        ``'priceToBook'`` keys of the ticker's ``info`` mapping. A missing key
        yields ``None`` for that element; if the lookup fails altogether the
        result is ``(None, None)``.

    Notes
    -----
    The lookup stays best-effort (it never raises), but a failure now emits a
    ``RuntimeWarning`` instead of being swallowed silently, so that a failed
    request can be told apart from genuinely missing fundamentals.
    """
    try:
        # Construct the Ticker inside the try so that any failure, not only
        # one raised while reading `.info`, falls back to (None, None).
        info = yf.Ticker(ticker).info  # read once and reuse for both keys
        return info.get('trailingPE', None), info.get('priceToBook', None)
    except Exception as exc:
        # Imported locally so the function remains self-contained when it is
        # serialized and loaded elsewhere.
        import warnings

        warnings.warn(
            f"Could not fetch fundamentals for {ticker!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None, None


# --- Merge with your existing data ---
# One record per ticker; get_fundamentals is called once per symbol, in TICKERS order.
fundamental_features = [
    {'ticker': symbol, 'pe_ratio': pe_ratio, 'pb_ratio': pb_ratio}
    for symbol, (pe_ratio, pb_ratio) in zip(TICKERS, map(get_fundamentals, TICKERS))
]

# Create DataFrames & Merge with Feature Dataframe
# Each ticker has a single pe/pb pair, so the left join repeats the same two
# values on every row of that ticker.
fundamentals_df = pd.DataFrame(fundamental_features).set_index('ticker')
feature_data = pd.merge(feature_data, fundamentals_df, on='ticker', how='left')

# --- 8. Prepare Final Dataset ---
feature_cols = [
    'return_5d', 'return_20d', 'volatility_20d', 'rsi_14',
    'macd', 'macd_signal', 'bollinger_h', 'bollinger_l',
    'pe_ratio', 'pb_ratio'
]
# Rows missing any feature or the label are dropped. A ticker whose
# fundamentals lookup returned None loses ALL of its rows here.
feature_data = feature_data.dropna(subset=feature_cols + ['future_return_20d'])
if feature_data.empty:
    raise ValueError(
        "No rows left after dropping missing features/labels; "
        "check that the price download and the fundamentals lookup succeeded."
    )

# feature_data is stacked ticker by ticker, so an unshuffled split on it would
# hold out the last ticker(s) rather than the most recent period. Order the
# modelling rows by date first so the hold-out below is chronological.
# feature_data itself keeps its original row order.
model_data = feature_data.sort_values(['Date', 'ticker'])

X = model_data[feature_cols]
y = model_data['future_return_20d']

# --- 9. Split Dataset (chronological hold-out: last 20% of rows by date) ---
# NOTE(review): labels look 20 trading days ahead, so the final training rows
# still overlap the start of the validation window; add an embargo gap if a
# strictly leak-free evaluation is needed.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

# --- 10. Scale Features ---
# Fit the scaler on the training rows only, so validation statistics do not
# leak into training; then apply the same transform everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_scaled = scaler.transform(X)  # kept so code that expects X_scaled still works

# --- 11. Train Model ---
rf_model = RandomForestRegressor(n_estimators=300, max_depth=8, random_state=42)
rf_model.fit(X_train, y_train)

# --- 12. Save Model and Scaler ---
joblib.dump(rf_model, 'asset_selection_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

# --- 13. Save the fundamentals helper ---
# Requires the `dill` package (pip install dill); it is imported with the other
# libraries at the top of this cell so a missing install fails before training
# rather than after the model and scaler have already been written.
# dill is used here rather than joblib, presumably because the standard pickle
# machinery stores functions by reference and the loader would not have this
# notebook's definition available.
# SECURITY: these files are pickles; only load them from a trusted location.
with open('get_fundamentals.pkl', 'wb') as f:
    dill.dump(get_fundamentals, f)
print("✅ Model, scaler and function saved.")

[*********************100%***********************]  5 of 5 completed


✅ Model, scaler and function saved.


In [11]:
# Preview the last 30 rows of the feature table.
recent_rows = feature_data.tail(30)
display(recent_rows)

Unnamed: 0,Date,Open,High,Low,Close,Volume,return_5d,return_20d,volatility_20d,rsi_14,macd,macd_signal,bollinger_h,bollinger_l,ticker,future_return_20d,pe_ratio,pb_ratio,sentiment_score
9155,2025-02-18,141.257015,143.426814,137.91731,139.387177,219176600,0.043647,0.012272,0.051705,57.02436,0.132414,-1.656989,149.541707,112.762169,NVDA,-0.171875,38.945576,35.3286,0
9156,2025-02-19,139.497167,141.347003,137.207384,139.217194,167536000,0.048419,-0.011361,0.051477,56.821154,0.635994,-1.198393,149.306111,112.83778,NVDA,-0.155851,38.945576,35.3286,0
9157,2025-02-20,140.017128,140.647075,136.77742,140.097122,143903600,0.0684,-0.047324,0.050477,57.662221,1.093482,-0.740018,147.96318,113.484774,NVDA,-0.153944,38.945576,35.3286,0
9158,2025-02-21,140.027118,141.447001,134.017676,134.417633,228217600,-0.006357,-0.086877,0.051233,50.785994,0.986388,-0.394737,145.703393,114.465678,NVDA,-0.124371,38.945576,35.3286,0
9159,2025-02-24,136.547442,138.577254,130.068042,130.268021,251381100,-0.061721,-0.086524,0.051223,46.429512,0.560218,-0.203746,143.995882,114.939304,NVDA,-0.067998,38.945576,35.3286,0
9160,2025-02-25,129.968045,130.188026,124.428561,126.618355,271428700,-0.091607,0.06933,0.03383,42.940598,-0.071202,-0.177237,143.573415,116.182697,NVDA,-0.046821,38.945576,35.3286,0
9161,2025-02-26,129.978054,133.717701,128.478192,131.267929,322553800,-0.0571,0.017753,0.028467,48.273393,-0.194187,-0.180627,143.69418,116.290909,NVDA,-0.133376,38.945576,35.3286,0
9162,2025-02-27,134.987587,134.997581,119.998968,120.138954,443175800,-0.14246,-0.028698,0.033181,38.901615,-1.17611,-0.379724,143.924617,115.705505,NVDA,-0.072491,38.945576,35.3286,0
9163,2025-02-28,118.009141,125.078491,116.389295,124.908508,389091100,-0.070743,0.002166,0.034369,43.926001,-1.551544,-0.614088,143.918784,115.738336,NVDA,-0.121997,38.945576,35.3286,0
9164,2025-03-03,123.498643,123.68862,112.269672,114.049507,411381400,-0.124501,-0.050054,0.038793,36.555418,-2.69425,-1.03012,144.656866,114.399308,NVDA,-0.049711,38.945576,35.3286,0


In [5]:
# No visible cell creates a 'sentiment_score' column, so on a fresh kernel an
# unguarded lookup raises KeyError. Guard it so the notebook survives
# Restart & Run All and says what is missing instead.
if 'sentiment_score' in feature_data.columns:
    sentiment_total = sum(feature_data['sentiment_score'])
else:
    sentiment_total = None
    print("feature_data has no 'sentiment_score' column; run the sentiment step first.")
sentiment_total

0