In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.exceptions import ConvergenceWarning
import yfinance as yf
import sys
sys.path.append('/Users/kailiu/StockMarketPrediction-') 
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)


### single parameter model

In [22]:
df = yf.download(['AAPL', 'TSLA', 'GOOGL', 'AMZN', 'MSFT','NVDA','META'], start='2010-01-01', end='2023-12-31')[['Close']]


df.index.name = 'Date'

# 展平多层列
df.columns = ['_'.join(col).strip() for col in df.columns.values]

# 转换为长格式
df_reset = df.reset_index()
long_df = pd.melt(
    df_reset,
    id_vars=['Date'],
    var_name='Ticker_Price',
    value_name='Close'
)

# 如果需要分离出 Price 和 Ticker 信息
long_df[['Type', 'Ticker']] = long_df['Ticker_Price'].str.split('_', expand=True)
long_df = long_df.drop(columns=['Ticker_Price'])


long_df.drop(columns=['Type','Ticker'], inplace=True)
long_df.set_index('Date', inplace=True)
df = long_df.copy()
df_multiple = df.copy()
df.head()

[*********************100%***********************]  7 of 7 completed


Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2010-01-04,7.643214
2010-01-05,7.656429
2010-01-06,7.534643
2010-01-07,7.520714
2010-01-08,7.570714


In [20]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
# Assuming 'df' is your DataFrame with a 'Close' column
# Create lag features for the last 10 days
n_lags = 10
for lag in range(1, n_lags + 1):
    df[f'Lag_{lag}'] = df['Close'].shift(lag)

df['target'] = df['Close'].shift(-1)
# Drop rows with NaN values (due to lagging)
df.dropna(inplace=True)

# Prepare the dataset
X = df[[f'Lag_{lag}' for lag in range(1, n_lags + 1)]]
y = df['target']



# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')

# Predict the next day's value using the last 10 days of data
last_10_days = df['Close'].iloc[-n_lags:].values.reshape(1, -1)
next_day_prediction = model.predict(last_10_days)
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2:.2f}')
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae:.2f}')




Mean Squared Error: 30.29
R-squared: 1.00
Mean Absolute Error: 2.22




### multiple parameter model

In [23]:
# calculate macd
from utils.fianacialtools import calculate_macd, generate_signals




df_multiple['EMA'] = df_multiple['Close'].ewm(span=10, adjust=False).mean()

df_multiple['MACD'] = calculate_macd(df_multiple)

df_multiple = generate_signals(df_multiple)

df_multiple.head()



Unnamed: 0_level_0,Close,EMA,MACD,RSI,Buy_Signal,Sell_Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,7.643214,7.643214,0.0,,0,0
2010-01-05,7.656429,7.645617,0.001054,100.0,0,1
2010-01-06,7.534643,7.62544,-0.007847,9.788567,1,0
2010-01-07,7.520714,7.606399,-0.015843,8.873044,1,0
2010-01-08,7.570714,7.599911,-0.017938,31.777433,0,0


In [24]:
# set lookback days
lookback_days = 10

# create lag features
for feature in ['Close', 'EMA', 'MACD', 'RSI', 'Buy_Signal', 'Sell_Signal']:
    for lag in range(1, lookback_days + 1):
        df_multiple[f'{feature}_lag_{lag}'] = df_multiple[feature].shift(lag)


df_multiple['target'] = df_multiple['Close'].shift(-1)

df_multiple.dropna(inplace=True)

# Check for remaining NaN values
if df.isnull().values.any():
    print("There are still NaN values in the DataFrame.")
else:
    print("All NaN values have been removed.")


There are still NaN values in the DataFrame.


In [26]:

# prepare features and target
features = [col for col in df_multiple.columns if col not in ['Close', 'target', 'Date']]  # 确保不包含目标列和日期列

X = df_multiple.drop(columns=['target'])
print(X.columns)
y = df_multiple['target']

# split data into training and testing sets
from sklearn.model_selection import train_test_split
X_multiple_train, X_multiple_test, y_multiple_train, y_multiple_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# train model
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.01, random_state=42)
model.fit(X_multiple_train, y_multiple_train)

# predict
y_multiple_pred = model.predict(X_multiple_test)



Index(['Close', 'EMA', 'MACD', 'RSI', 'Buy_Signal', 'Sell_Signal',
       'Close_lag_1', 'Close_lag_2', 'Close_lag_3', 'Close_lag_4',
       'Close_lag_5', 'Close_lag_6', 'Close_lag_7', 'Close_lag_8',
       'Close_lag_9', 'Close_lag_10', 'EMA_lag_1', 'EMA_lag_2', 'EMA_lag_3',
       'EMA_lag_4', 'EMA_lag_5', 'EMA_lag_6', 'EMA_lag_7', 'EMA_lag_8',
       'EMA_lag_9', 'EMA_lag_10', 'MACD_lag_1', 'MACD_lag_2', 'MACD_lag_3',
       'MACD_lag_4', 'MACD_lag_5', 'MACD_lag_6', 'MACD_lag_7', 'MACD_lag_8',
       'MACD_lag_9', 'MACD_lag_10', 'RSI_lag_1', 'RSI_lag_2', 'RSI_lag_3',
       'RSI_lag_4', 'RSI_lag_5', 'RSI_lag_6', 'RSI_lag_7', 'RSI_lag_8',
       'RSI_lag_9', 'RSI_lag_10', 'Buy_Signal_lag_1', 'Buy_Signal_lag_2',
       'Buy_Signal_lag_3', 'Buy_Signal_lag_4', 'Buy_Signal_lag_5',
       'Buy_Signal_lag_6', 'Buy_Signal_lag_7', 'Buy_Signal_lag_8',
       'Buy_Signal_lag_9', 'Buy_Signal_lag_10', 'Sell_Signal_lag_1',
       'Sell_Signal_lag_2', 'Sell_Signal_lag_3', 'Sell_Signal_lag_4',
   

In [27]:
# evaluate model
multiple_mse = mean_squared_error(y_multiple_test, y_multiple_pred)
print(f'Mean Squared Error: {multiple_mse:.2f}')
multiple_mae = mean_absolute_error(y_multiple_test, y_multiple_pred)
print(f'Mean Absolute Error: {multiple_mae:.2f}')
multiple_r2 = r2_score(y_multiple_test, y_multiple_pred)
print(f'R-squared: {multiple_r2:.2f}')




Mean Squared Error: 15.55
Mean Absolute Error: 1.53
R-squared: 1.00


In [28]:
#save the model
import joblib
joblib.dump(model, '../models/AAPL_multiple_parameter_model.pkl')


['../models/AAPL_multiple_parameter_model.pkl']