In [4]:
import json 
import pandas as pd 
import requests 
import yfinance as yf 
import os
import plotly.express as px 
from datetime import datetime
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt

In [46]:
ticker = ["AAPL"] 

start_date = "2021-05-28"
end_date = "2024-05-28"

data = yf.download(ticker, start=start_date, end=end_date)

df = pd.DataFrame(data).reset_index()

df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-05-28,125.570000,125.800003,124.550003,124.610001,122.516022,71311100
1,2021-06-01,125.080002,125.349998,123.940002,124.279999,122.191574,67637100
2,2021-06-02,124.279999,125.239998,124.050003,125.059998,122.958450,59278900
3,2021-06-03,124.680000,124.849998,123.129997,123.540001,121.463974,76229200
4,2021-06-04,124.070000,126.160004,123.849998,125.889999,123.774483,75169300
...,...,...,...,...,...,...,...
748,2024-05-20,189.330002,191.919998,189.009995,191.039993,191.039993,44361300
749,2024-05-21,191.089996,192.729996,190.919998,192.350006,192.350006,42309400
750,2024-05-22,192.270004,192.820007,190.270004,190.899994,190.899994,34648500
751,2024-05-23,190.979996,191.000000,186.630005,186.880005,186.880005,51005900


In [47]:
def add_bias_column(X):
    """
    Args:
        X (array): can be either 1-d or 2-d
    
    Returns:
        Xnew (array): the same array, but 2-d with a column of 1's in the first spot
    """
    
    # If the array is 1-d
    if len(X.shape) == 1:
        Xnew = np.column_stack([np.ones(X.shape[0]), X])
    
    # If the array is 2-d
    elif len(X.shape) == 2:
        bias_col = np.ones((X.shape[0], 1))
        Xnew = np.hstack([bias_col, X])
        
    else:
        raise ValueError("Input array must be either 1-d or 2-d")

    return Xnew

In [48]:
def line_of_best_fit(X, Y): 
    X_into_array = add_bias_column(X)
    XtXinv = np.linalg.inv(np.matmul(X_into_array.T, X_into_array))
    m = np.matmul(XtXinv, np.matmul(X_into_array.T, Y))
    return m

### Creating the line of best fit equation

In [89]:
X = np.array(df.index)
Y = np.array(df_reset['Close'])
equation = line_of_best_fit(X, Y)
equation

array([1.42741982e+02, 5.36644130e-02])

In [90]:
y_graph = []
for i in range(0, 753):
    y_graph.append(equation[0] + (equation[1] * i))
y_graph

[142.74198155891585,
 142.79564597189255,
 142.84931038486923,
 142.90297479784593,
 142.9566392108226,
 143.0103036237993,
 143.063968036776,
 143.11763244975268,
 143.17129686272938,
 143.22496127570608,
 143.27862568868275,
 143.33229010165945,
 143.38595451463613,
 143.43961892761283,
 143.49328334058953,
 143.5469477535662,
 143.6006121665429,
 143.65427657951957,
 143.70794099249628,
 143.76160540547298,
 143.81526981844965,
 143.86893423142635,
 143.92259864440302,
 143.97626305737973,
 144.02992747035643,
 144.0835918833331,
 144.1372562963098,
 144.1909207092865,
 144.24458512226317,
 144.29824953523988,
 144.35191394821655,
 144.40557836119325,
 144.45924277416995,
 144.51290718714662,
 144.56657160012332,
 144.6202360131,
 144.6739004260767,
 144.7275648390534,
 144.78122925203007,
 144.83489366500677,
 144.88855807798348,
 144.94222249096015,
 144.99588690393685,
 145.04955131691352,
 145.10321572989022,
 145.15688014286692,
 145.2105445558436,
 145.2642089688203,
 145.3178

In [55]:
df['y'] = y_graph 
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,y
0,2021-05-28,125.570000,125.800003,124.550003,124.610001,122.516022,71311100,142.741982
1,2021-06-01,125.080002,125.349998,123.940002,124.279999,122.191574,67637100,142.795646
2,2021-06-02,124.279999,125.239998,124.050003,125.059998,122.958450,59278900,142.849310
3,2021-06-03,124.680000,124.849998,123.129997,123.540001,121.463974,76229200,142.902975
4,2021-06-04,124.070000,126.160004,123.849998,125.889999,123.774483,75169300,142.956639
...,...,...,...,...,...,...,...,...
748,2024-05-20,189.330002,191.919998,189.009995,191.039993,191.039993,44361300,182.882962
749,2024-05-21,191.089996,192.729996,190.919998,192.350006,192.350006,42309400,182.936627
750,2024-05-22,192.270004,192.820007,190.270004,190.899994,190.899994,34648500,182.990291
751,2024-05-23,190.979996,191.000000,186.630005,186.880005,186.880005,51005900,183.043956


In [88]:
candlestick = go.Candlestick(x=df["Date"], open=df["Open"], high=df["High"], low = df["Low"], close = df["Close"], increasing_line_color = "green", decreasing_line_color = "red")
line_graph = go.Scatter(x=df["Date"], y = df["y"], mode = 'lines', name = 'Line Chart')

figure = go.Figure(data=[candlestick, line_graph])
politican_bought = "2023-07-20"
figure.update_layout(title = f"{ticker} stock price", yaxis_title = "Price", xaxis_rangeslider_visible=False, shapes = [dict(
        x0=f"{politican_bought}", x1=f"{politican_bought}", y0=0, y1=1, xref='x', yref='paper',
        line_width=2)],
    annotations=[dict(
        x=f'{start_date}', y=0.05, xref='x', yref='paper',
        showarrow=False, xanchor='left')])
figure.show()

In [106]:
from sklearn.metrics import r2_score

def linreg_predict(Xnew, ynew, m):
    """
    Predicts the linear regression and gives it a score on how well the variance is explained
    Args: 
        Xnew (array): either 1-d or 2-d, includes all p predictor features
        ynew (array): 1-d array, includes all correspondign response values to Xnew
        m (array):
        
    Returns:
        dictionary (dict): four key-value pairs (ypreds, resids, mse, r2)
    
    """
    ypreds = m[0] + (m[1] * Xnew)
    resids = ynew - ypreds
    mse = (resids**2).sum()/resids.size
    r2 = r2_score(ynew, ypreds)
    
    dictionary = {'ypreds': ypreds,
                  'resids':resids,
                  'mse':mse,
                  'r2':r2
                 }
    return dictionary

In [103]:
linreg_predict(X ,Y,equation)

IndexError: index 2 is out of bounds for axis 0 with size 2

In [99]:
df['y_standardized'] = (df['y'] - df['y'].mean()) / df['y'].std()
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,y,y_standardized
0,2021-05-28,125.570000,125.800003,124.550003,124.610001,122.516022,71311100,142.741982,-1.728603
1,2021-06-01,125.080002,125.349998,123.940002,124.279999,122.191574,67637100,142.795646,-1.724006
2,2021-06-02,124.279999,125.239998,124.050003,125.059998,122.958450,59278900,142.849310,-1.719408
3,2021-06-03,124.680000,124.849998,123.129997,123.540001,121.463974,76229200,142.902975,-1.714811
4,2021-06-04,124.070000,126.160004,123.849998,125.889999,123.774483,75169300,142.956639,-1.710214
...,...,...,...,...,...,...,...,...,...
748,2024-05-20,189.330002,191.919998,189.009995,191.039993,191.039993,44361300,182.882962,1.710214
749,2024-05-21,191.089996,192.729996,190.919998,192.350006,192.350006,42309400,182.936627,1.714811
750,2024-05-22,192.270004,192.820007,190.270004,190.899994,190.899994,34648500,182.990291,1.719408
751,2024-05-23,190.979996,191.000000,186.630005,186.880005,186.880005,51005900,183.043956,1.724006


In [105]:
X = pd.DataFrame(np.array([df.index, df['Adj Close']]))

Y = np.array(df_reset['Close'])
#equation = line_of_best_fit(X, Y)

data = pd.DataFrame(add_bias_column(X.T))

m = line_of_best_fit(X.T, Y)

linreg_predict(X, Y, line_of_best_fit(X.T, Y))

ValueError: Found input variables with inconsistent numbers of samples: [753, 2]