In [13]:
import yfinance as yf

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import clone

import plotly.express as px
import joblib

from datetime import date, timedelta

In [14]:
date_yesterday = str(date.today() - timedelta(days=1))
df_apple_to_csv = yf.download("AAPL", start="2020-01-01", end=date_yesterday)
df_microsoft_to_csv = yf.download("MSFT", start="2020-01-01", end=date_yesterday)
df_amazon_to_csv = yf.download("AMZN", start="2020-01-01", end=date_yesterday)
df_nvidia_to_csv = yf.download("NVDA", start="2020-01-01", end=date_yesterday)


apple_path = 'datasets/apple.csv'
microsoft_path = 'datasets/microsoft.csv'
amazon_path = 'datasets/amazon.csv'
nvidia_path = 'datasets/nvidia.csv'

df_apple_to_csv.to_csv(apple_path)
df_microsoft_to_csv.to_csv(microsoft_path)
df_amazon_to_csv.to_csv(amazon_path)
df_nvidia_to_csv.to_csv(nvidia_path)

df_apple = pd.read_csv(apple_path)
df_microsoft = pd.read_csv(microsoft_path)
df_amazon = pd.read_csv(amazon_path)
df_nvidia = pd.read_csv(nvidia_path)

  df_apple_to_csv = yf.download("AAPL", start="2020-01-01", end=date_yesterday)
[*********************100%***********************]  1 of 1 completed
  df_microsoft_to_csv = yf.download("MSFT", start="2020-01-01", end=date_yesterday)
[*********************100%***********************]  1 of 1 completed
  df_amazon_to_csv = yf.download("AMZN", start="2020-01-01", end=date_yesterday)
[*********************100%***********************]  1 of 1 completed
  df_nvidia_to_csv = yf.download("NVDA", start="2020-01-01", end=date_yesterday)
[*********************100%***********************]  1 of 1 completed


In [15]:
df_apple['Date'] = df_apple['Price']
df_apple = df_apple.drop(index=[0,1], columns='Price')

df_microsoft['Date'] = df_microsoft['Price']
df_microsoft = df_microsoft.drop(index=[0,1], columns='Price')

df_amazon['Date'] = df_amazon['Price']
df_amazon = df_amazon.drop(index=[0,1], columns='Price')

df_nvidia['Date'] = df_nvidia['Price']
df_nvidia = df_nvidia.drop(index=[0,1], columns='Price')

In [16]:
# #INITIAL
dfs = [df_apple, df_amazon, df_microsoft, df_nvidia]
window = 14
short_period = 12
long_period = 26
signal_line_period = 9
stocks = ['AAPL', 'AMZN', 'MSFT', 'NVDA']
X_train, X_test, y_train, y_test = None, None, None, None

In [17]:
# #CONVERTING TO CORRECT FEATURE TYPES
def convert_type(df):
    df['Close'] = df['Close'].astype(float)
    df['High'] = df['High'].astype(float)
    df['Low'] = df['Low'].astype(float)
    df['Open'] = df['Open'].astype(float)
    df['Volume'] = df['Volume'].astype(int)
    df['Date'] = pd.to_datetime(df['Date'])

In [18]:
# #FEATURE ENGINEERING
def engineer_features(df):
    df['Price Change'] = df['Close'].diff().fillna(0)

    df['Gain'] = df.loc[df['Price Change'] > 0, 'Price Change']
    df['Loss'] = -df.loc[df['Price Change'] < 0, 'Price Change']

    df['Gain'] = df['Gain'].fillna(0)
    df['Loss'] = df['Loss'].fillna(0)

    df['Average Gain'] = df['Gain'].rolling(window=window).mean().fillna(0)
    df['Average Loss'] = df['Loss'].rolling(window=window).mean().fillna(0)

    df['RS'] = df['Average Gain'] / df['Average Loss']

    df['RSI'] = 100 - (100 / (1 + df['RS']))

    df['Short EMA'] = df['Close'].ewm(span=short_period,adjust=False).mean()
    df['Long EMA'] = df['Close'].ewm(span=long_period, adjust=False).mean()

    df['MACD'] = df['Short EMA'] - df['Long EMA']

    df['Signal Line'] = df['MACD'].ewm(span=signal_line_period, adjust=False).mean()
    
    df['MACD Histogram'] = df['MACD'] - df['Signal Line']
    
    return df

In [19]:
# #DROPPING NULL AND UNIMPORTANT COLUMNS/ROWS
def drop_features_and_na(df):
    df = df.drop(columns=['Gain', 'Loss'])
    df = df.dropna()
    return df

In [20]:
# #MODEL TRAINING AND EVALUATION
def split_data(df):
    X = df.drop(columns=['Date', 'Close'])
    y = df['Close']

    return train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
def create_pipeline():
    model = clone(Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LinearRegression())
        ]))
    return model

In [22]:
def evaluate(model):
    kfold = KFold(n_splits=6, random_state=30, shuffle=True)

    params = {'lr__n_jobs': [10,20,50]}
    cv = clone(GridSearchCV(estimator=model,param_grid=params,cv=kfold))

    cv.fit(X_train, y_train)
    
    return {"best_params": cv.best_params_, "best_score": cv.best_score_}, cv

In [23]:
# #MAIN OUTPUT
models = []

for i,df in enumerate(dfs):
    X_train, X_test, y_train, y_test = None, None, None, None
    
    convert_type(df)
    df = engineer_features(df)
    df = drop_features_and_na(df)
    
    X_train, X_test, y_train, y_test = split_data(df)
    
    model = create_pipeline()
    
    evaluation, cv = evaluate(model)
    df = df.drop(columns='Close')
    
    print("Best parameter:", evaluation['best_params']['lr__n_jobs'])
    print("Best score:", evaluation['best_score'])
    
    models.append({'model': cv, 'X_test': X_test,'y_test': y_test})
    
    print(f"{stocks[i]}:")
    print(df.head())
    print()

Best parameter: 10
Best score: 0.9996616101249348
AAPL:
         High        Low       Open     Volume       Date  Price Change  \
15  77.281820  76.634566  76.941285  101832400 2020-01-22      0.272903   
16  77.177986  76.233668  76.781908  104472000 2020-01-23      0.369530   
17  78.088485  76.685293  77.344627  146537600 2020-01-24     -0.222198   
18  75.296594  73.632571  74.883608  161940000 2020-01-27     -2.260559   
19  76.897837  75.398041  75.497062  162234000 2020-01-28      2.110832   

    Average Gain  Average Loss        RS        RSI  Short EMA   Long EMA  \
15      0.508210      0.208906  2.432717  70.868559  75.349466  74.270920   
16      0.534605      0.208906  2.559066  71.902738  75.618516  74.480354   
17      0.534605      0.174406  3.065296  75.401544  75.811989  74.657816   
18      0.493722      0.335874  1.469961  59.513531  75.627918  74.654684   
19      0.644496      0.311552  2.068660  67.412486  75.796910  74.808142   

        MACD  Signal Line  MAC

In [30]:
for i in range(len(models)):
    model_ = models[i]['model']
    X_test_ = models[i]['X_test']
    y_test_ = models[i]['y_test']
    
    y_pred_ = model_.predict(X_test_)
    
    print("MSE:", mean_squared_error(y_true=y_test_, y_pred=y_pred_))
    print("R2 Score:", r2_score(y_true=y_test_, y_pred=y_pred_))

    print(f"Predicted values for {stocks[i]}:")
    print(y_pred_[0:10])

    print(f"Correct values for {stocks[i]}:")
    print(y_test_[0:10])
    
    joblib.dump(model_,f"models/{stocks[i]}.pkl")
    
    if joblib.load(f"models/{stocks[i]}.pkl"):
        print(f"{stocks[i]} model saved!")

MSE: 0.5927515810193685
R2 Score: 0.9996969454528803
Predicted values for AAPL:
[163.62505989 174.85484445 123.93070935 112.0819664  114.16855871
 147.13058517 163.22410147  67.74977164 182.81841995  58.59665352]
Correct values for AAPL:
827     163.506836
931     174.589905
354     123.997078
207     111.901451
218     113.341736
793     147.547241
547     163.503967
82       67.440445
1008    184.081497
66       58.442101
Name: Close, dtype: float64
AAPL model saved!
MSE: 0.7354449399393389
R2 Score: 0.9994600127221722
Predicted values for AMZN:
[102.50968339 140.77282051 163.605142   159.44575732 157.0673248
  94.81967763 151.59360634 116.27112525 149.89633709  95.50291185]
Correct values for AMZN:
827     102.400002
931     141.229996
354     163.257996
207     160.220001
218     157.186996
793      95.820000
547     152.052505
82      115.704002
1008    149.929993
66       95.329498
Name: Close, dtype: float64
AMZN model saved!
MSE: 1.6727236244366988
R2 Score: 0.999804248060394
P