In [1]:
# install some packages
!pip install -q yfinance
!pip install -q yahoofinancials

In [2]:
# import libraries
import pandas as pd
import yfinance as yf
from yahoofinancials import YahooFinancials
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
# load data
data = yf.download('GOOG', 
                      start='2021-01-01', 
                      end='2023-05-29', 
                      progress=False,
)
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-04,87.876999,88.032501,85.392502,86.412003,86.412003,38038000
2021-01-05,86.250000,87.383499,85.900749,87.045998,87.045998,22906000
2021-01-06,85.131500,87.400002,84.949997,86.764503,86.764503,52042000
2021-01-07,87.002998,89.419998,86.852501,89.362503,89.362503,45300000
2021-01-08,89.399002,90.491997,88.676750,90.360497,90.360497,41012000
...,...,...,...,...,...,...
2023-05-22,123.510002,127.050003,123.449997,125.870003,125.870003,29760200
2023-05-23,124.930000,125.419998,123.050003,123.290001,123.290001,24477900
2023-05-24,121.879997,122.750000,120.750000,121.639999,121.639999,23087900
2023-05-25,125.209999,125.980003,122.900002,124.349998,124.349998,33812700


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 604 entries, 2021-01-04 to 2023-05-26
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       604 non-null    float64
 1   High       604 non-null    float64
 2   Low        604 non-null    float64
 3   Close      604 non-null    float64
 4   Adj Close  604 non-null    float64
 5   Volume     604 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 33.0 KB


In [15]:
# data['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], Close, 5, 0.2
def prepare_data(df,forecast_col,forecast_out,test_size):
    # set nan last 10 rows values in Close column
    label = df[forecast_col].shift(-forecast_out) #creating new column called label with the last 5 rows are nan
    # initionlize Close column array to X
    X = np.array(df[[forecast_col]]) #creating the feature array
    # set X values -1 o 1
    X = preprocessing.scale(X) #processing the feature array
    # select last 10 rows, separate from X and assign to X_lately
    X_lately = X[-forecast_out:] #creating the column i want to use later in the predicting method
    # remove last 10 rows from X
    X = X[:-forecast_out] # X that will contain the training and testing
    # drop last 10 rows from Close column
    label.dropna(inplace=True) #dropping na values
    # assign rest labels to y
    y = np.array(label)  # assigning Y
    print(X, y)
    print(X.shape, y.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_size, random_state=0) #cross validation

    response = [X_train,X_test , Y_train, Y_test , X_lately]
    return response

In [16]:
forecast_col = 'Close'
forecast_out = 10
test_size = 0.2

# Split dataset into training and testing
X_train, X_test, Y_train, Y_test , X_lately =prepare_data(data,forecast_col,forecast_out,test_size); #calling the method were the cross validation and data preperation is in

[[-1.70366057]
 [-1.66874002]
 [-1.68424477]
 [-1.54114653]
 [-1.48617689]
 [-1.59768656]
 [-1.65323486]
 [-1.6316158 ]
 [-1.67077771]
 [-1.68176626]
 [-1.53120477]
 [-1.26670973]
 [-1.25472986]
 [-1.2277407 ]
 [-1.23228463]
 [-1.18315322]
 [-1.42123733]
 [-1.33222752]
 [-1.40760471]
 [-1.22691453]
 [-1.15486933]
 [-0.76225804]
 [-0.78346402]
 [-0.68533861]
 [-0.69935659]
 [-0.72524433]
 [-0.69255435]
 [-0.69114953]
 [-0.66851183]
 [-0.61951784]
 [-0.60186489]
 [-0.63246171]
 [-0.67669115]
 [-0.77655126]
 [-0.76008251]
 [-0.69313259]
 [-0.86886575]
 [-0.85371862]
 [-0.73075226]
 [-0.74636752]
 [-0.88167179]
 [-0.82003732]
 [-0.65631134]
 [-0.88866691]
 [-0.81009514]
 [-0.80367826]
 [-0.63915384]
 [-0.78470327]
 [-0.77211743]
 [-0.70043069]
 [-0.70439638]
 [-0.85548148]
 [-0.83625814]
 [-0.84895449]
 [-0.80937907]
 [-0.83113597]
 [-0.83306356]
 [-0.8573267 ]
 [-0.8011447 ]
 [-0.80227385]
 [-0.76622373]
 [-0.57586704]
 [-0.33406517]
 [-0.33626801]
 [-0.26761069]
 [-0.22420741]
 [-0.16791

In [13]:
learner = LinearRegression() #initializing linear regression model

learner.fit(X_train,Y_train) #training the linear regression model

In [14]:
score=learner.score(X_test,Y_test)#testing the linear regression model
forecast= learner.predict(X_lately) #set that will contain the forecasted data
response={}#creting json object
response['test_score']=score
response['forecast_set']=forecast

print(response)

{'test_score': 0.8633954992157329, 'forecast_set': array([117.4253245 , 120.31134602, 121.59300535, 123.47398608,
       123.22503502, 125.64081488, 123.26191796, 121.74053008,
       124.23929127, 125.23510957])}
