In [12]:
# Install packages if needed
#!pip install yfinance ta pandas scikit-learn joblib
#!pip install vaderSentiment

# --- 1. Import Libraries ---
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error,r2_score
import yfinance as yf
import pandas as pd
import numpy as np
import ta
import joblib
import requests
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import mlflow
import dagshub
from mlflow.models import infer_signature


# --- 2. Define Stock Pool ---
TICKERS = ['AAPL', 'GOOGL', 'AMZN', 'MSFT', 'NVDA']
START_DATE = '2018-01-01'
END_DATE = '2025-04-30'

# --- 3. Download Historical Data ---
data = yf.download(TICKERS, start=START_DATE, end=END_DATE, group_by='ticker', auto_adjust=True)

# --- 4. Feature Engineering Function ---
def create_features(data):
    dfs = []

    for ticker in data.columns.levels[0]:
        df = data[ticker].copy()
        df['return_5d'] = df['Close'].pct_change(5)
        df['return_20d'] = df['Close'].pct_change(20)
        df['volatility_20d'] = df['Close'].pct_change().rolling(20).std()
        df['rsi_14'] = ta.momentum.RSIIndicator(df['Close'].squeeze(), window=14).rsi()
        macd = ta.trend.MACD(df['Close'].squeeze())
        df['macd'] = macd.macd()
        df['macd_signal'] = macd.macd_signal()
        bb = ta.volatility.BollingerBands(df['Close'].squeeze())
        df['bollinger_h'] = bb.bollinger_hband()
        df['bollinger_l'] = bb.bollinger_lband()
        df['ticker'] = ticker
        dfs.append(df)

    feature_df = pd.concat(dfs)
    feature_df.reset_index(inplace=True)
    return feature_df

# --- 5. Create Features ---
feature_data = create_features(data)

# --- 6. Create Label (future return over next 20 trading days) ---
feature_data['future_return_20d'] = feature_data.groupby('ticker')['Close'].shift(-20) / feature_data['Close'] - 1

# --- 7. Add Fundamentals ---

# --- Fundamental Data (using yfinance) ---
def get_fundamentals(ticker):
    import yfinance as yf
    stock = yf.Ticker(ticker)
    try:
        pe_ratio = stock.info.get('trailingPE', None)
        pb_ratio = stock.info.get('priceToBook', None)
        return pe_ratio, pb_ratio
    except Exception:
        return None, None


# --- Merge with your existing data ---
fundamental_features = []

for ticker in TICKERS:
    pe, pb = get_fundamentals(ticker)
    
    fundamental_features.append({'ticker': ticker, 'pe_ratio': pe, 'pb_ratio': pb})

# Create DataFrames & Merge with Feature Dataframe
fundamentals_df = pd.DataFrame(fundamental_features).set_index('ticker')
feature_data = feature_data.merge(fundamentals_df, on='ticker', how='left')

# --- 8. Prepare Final Dataset ---
feature_cols = [
    'return_5d', 'return_20d', 'volatility_20d', 'rsi_14', 
    'macd', 'macd_signal', 'bollinger_h', 'bollinger_l',
    'pe_ratio', 'pb_ratio'
]
feature_data = feature_data.dropna(subset=feature_cols + ['future_return_20d'])

X = feature_data[feature_cols]
y = feature_data['future_return_20d']

# --- 9. Scale Features ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 10. Split Dataset ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)

models = [
    (
        "Random Forest Regression", 
        {"n_estimators":300, "max_depth":8, "random_state":42},
        RandomForestRegressor(), 
    ),
        (
        "Elastic Net", 
        {"alpha":1, "l1_ratio":0.4,"random_state":42},
        ElasticNet(),
    ),
]
dagshub.init(repo_owner='iannyFARUE', repo_name='stock-predictions', mlflow=True)
for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    
    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(params)
        mlflow.log_param("n_tickers",len(TICKERS))
        
        mlflow.log_param("train_size", X_train.shape[0])
        mlflow.log_param("test_size", X_test.shape[0])
        mlflow.log_param("n_features", X_train.shape[1])
        
        model.set_params(**params)
        model.fit(X_train,y_train)
        
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        
        print(f"\n{model_name} Results:")
        print(f"Mean Squared Error: {mse:.6f}")
        print(f"Root Mean Squared Error: {rmse:.6f}")
        print(f"R² Score: {r2:.6f}")
        
        # Log metrics to MLflow
        # mlflow.log_metric(f"{model_name.lower().replace(' ', '_')}_mse", mse)
        # mlflow.log_metric(f"{model_name.lower().replace(' ', '_')}_rmse", rmse)
        # mlflow.log_metric(f"{model_name.lower().replace(' ', '_')}_r2", r2)
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        
        # Log model to MLflow
        signature = infer_signature(X_train, model.predict(X_train))
        if model_name in ["Random Forest Regression","Elastic Net"]:
            mlflow.sklearn.log_model(
                sk_model=model, 
                registered_model_name=f"{model_name.lower().replace(' ', '_')}",
                input_example=X_train,
                signature=signature,
                artifact_path="stock-models"
                )
        else:
            # mlflow.sklearn.log_model(model, "linear_regression_model")
            pass
        
# # --- 11. Train Model ---
# rf_model = RandomForestRegressor(n_estimators=300, max_depth=8, random_state=42)
# rf_model.fit(X_train, y_train)

# # --- 12. Save Model and Scaler ---
# joblib.dump(rf_model, 'asset_selection_model.pkl')
# joblib.dump(scaler, 'scaler.pkl')
# #joblib.dump(get_fundamentals, 'get_fundamentals.pk')
# #joblib.dump(get_sentiment, 'get_sentiment.pk')

# #! pip install dill

# # When saving
# import dill
# with open('get_fundamentals.pkl', 'wb') as f:
#     dill.dump(get_fundamentals, f)
# print("✅ Model, scaler and function saved.")   

[*********************100%***********************]  5 of 5 completed



Random Forest Regression Results:
Mean Squared Error: 0.023883
Root Mean Squared Error: 0.154540
R² Score: -0.146481


Successfully registered model 'random_forest_regression'.
2025/05/05 10:52:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest_regression, version 1
Created version '1' of model 'random_forest_regression'.


🏃 View run Random Forest Regression at: https://dagshub.com/iannyFARUE/stock-predictions.mlflow/#/experiments/0/runs/0e38c874050d4f7aa681e4bf6dbea177
🧪 View experiment at: https://dagshub.com/iannyFARUE/stock-predictions.mlflow/#/experiments/0

Elastic Net Results:
Mean Squared Error: 0.021490
Root Mean Squared Error: 0.146595
R² Score: -0.031628


Successfully registered model 'elastic_net'.
2025/05/05 10:52:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: elastic_net, version 1
Created version '1' of model 'elastic_net'.


🏃 View run Elastic Net at: https://dagshub.com/iannyFARUE/stock-predictions.mlflow/#/experiments/0/runs/474c8231ff044e36b6f71c1b48579c8b
🧪 View experiment at: https://dagshub.com/iannyFARUE/stock-predictions.mlflow/#/experiments/0


In [5]:
display(feature_data.tail(5))

Unnamed: 0,Date,Open,High,Low,Close,Volume,return_5d,return_20d,volatility_20d,rsi_14,macd,macd_signal,bollinger_h,bollinger_l,ticker,future_return_20d,pe_ratio,pb_ratio
9180,2025-03-25,120.550003,121.290001,118.919998,120.690002,167447200,0.045569,-0.046821,0.042561,49.678219,-2.188894,-3.355111,128.117841,106.521466,NVDA,-0.148977,38.945576,35.3286
9181,2025-03-26,118.730003,118.839996,112.709999,113.760002,293463300,-0.031995,-0.133376,0.043308,42.799081,-2.411668,-3.166422,125.228272,107.660242,NVDA,-0.064434,38.945576,35.3286
9182,2025-03-27,111.349998,114.449997,110.660004,111.43,236902100,-0.0599,-0.072491,0.039378,40.755634,-2.744592,-3.082056,124.880047,107.137572,NVDA,-0.003769,38.945576,35.3286
9183,2025-03-28,111.489998,112.870003,109.07,109.669998,229872500,-0.068224,-0.121997,0.038146,39.231897,-3.114551,-3.088555,123.5277,106.966068,NVDA,-0.008571,38.945576,35.3286
9184,2025-03-31,105.129997,110.959999,103.650002,108.379997,299212700,-0.107322,-0.049711,0.033099,38.107313,-3.471818,-3.165208,123.760827,106.16599,NVDA,0.005905,38.945576,35.3286


In [5]:
sum(feature_data['sentiment_score'])

0