In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import yfinance as yf

from sqlalchemy import create_engine, Table, Column, Integer,Float, String, MetaData 


In [2]:
# Download stock price data
def download_stock_data(ticker, start_date, end_date):
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data

# Preprocess data and create input sequences
def preprocess_data(data, sequence_length):
    sequences = []
    for i in range(len(data) - sequence_length):
        sequence = data[i:i+sequence_length].values
        target = data[i+sequence_length]
        sequences.append((sequence, target))
    return sequences

# Split data into training and testing sets
def split_data(data, test_size=0.2):
    train_data, test_data = train_test_split(data, test_size=test_size, shuffle=False)
    return train_data, test_data

# Train linear regression model
def train_model(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model
 
# Evaluate model on test data
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    return mse

In [3]:
# Create a list to append the tickers dataframes 
# Usar en caso de trabajar con mas de un ticker
dfs = []

In [7]:
# Download stock data
ticker = 'goog'

start_date = '2024-08-05'
end_date = '2024-08-06'
stock_data = download_stock_data(ticker, start_date, end_date)


[*********************100%%**********************]  1 of 1 completed


In [8]:
df = (
    pd.DataFrame(stock_data)
    .reset_index()
    .rename(
        columns={
            "Date": "date",
            "Open": "open",
            "High": "high",
            "Low": "low",
            "Close": "close",
            "Adj Close": "adj_close",
            "Volume": "volume",
        }
    )
)
df["symbol"] = ticker
df = df[["date", "symbol", "open", "high", "low", "close", "adj_close", "volume"]]
df = df.set_index("date")

In [9]:
# Display the first few rows of the DataFrame
df.sort_values(by=['date'], inplace=True, ascending=False)
df.head()

Unnamed: 0_level_0,symbol,open,high,low,close,adj_close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-08-05,goog,157.369995,165.940002,156.600006,160.639999,160.639999,34907800


In [16]:
# Usar en caso de trabajar con mas de un ticker
dfs.append(df)
dfs

[           symbol        open        high         low       close   adj_close  \
 date                                                                            
 2020-01-02   goog   67.077499   68.406998   67.077499   68.368500   68.290787   
 2020-01-03   goog   67.392998   68.625000   67.277199   68.032997   67.955666   
 2020-01-06   goog   67.500000   69.824997   67.500000   69.710503   69.631264   
 2020-01-07   goog   69.897003   70.149498   69.518997   69.667000   69.587814   
 2020-01-08   goog   69.603996   70.579002   69.542000   70.216003   70.136192   
 ...           ...         ...         ...         ...         ...         ...   
 2024-07-24   goog  175.389999  177.949997  173.570007  174.369995  174.369995   
 2024-07-25   goog  174.250000  175.199997  169.050003  169.160004  169.160004   
 2024-07-26   goog  168.770004  169.839996  165.865005  168.679993  168.679993   
 2024-07-29   goog  170.500000  172.160004  169.720001  171.130005  171.130005   
 2024-07-30   go

In [17]:
# Usar en caso de trabajar con mas de un ticker
df = pd.concat(dfs, axis=0)
df.head()

Unnamed: 0_level_0,symbol,open,high,low,close,adj_close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-02,goog,67.077499,68.406998,67.077499,68.3685,68.290787,28132000
2020-01-03,goog,67.392998,68.625,67.277199,68.032997,67.955666,23728000
2020-01-06,goog,67.5,69.824997,67.5,69.710503,69.631264,34646000
2020-01-07,goog,69.897003,70.149498,69.518997,69.667,69.587814,30054000
2020-01-08,goog,69.603996,70.579002,69.542,70.216003,70.136192,30560000


In [12]:
# Save historical data to csv 
df.to_csv('stocks_goog_20200101_to_20240731.csv')

In [5]:
# Create connection to postgres instance
db_uri = 'postgresql://postgres:postgres@postgres:5432/stocks'
engine = create_engine(db_uri)

In [6]:
# Insert historical data into postgres table 'stock_ohlc'
df.to_sql('stock_ohlc', con=engine, if_exists='append', index=True) 

152

In [40]:
stock_data.reset_index(inplace=True)
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2024-08-01,171.979996,175.679993,170.509995,172.449997,172.449997,17177800


In [41]:
stock_data['Date'] = stock_data['Date'].astype(str)
stock_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2024-08-01,171.979996,175.679993,170.509995,172.449997,172.449997,17177800


In [49]:
stock_data_json = stock_data.to_json(orient='records')
stock_data_json

'[{"Date":"2024-08-01","Open":171.9799957275,"High":175.6799926758,"Low":170.5099945068,"Close":172.4499969482,"Adj Close":172.4499969482,"Volume":17177800}]'

In [57]:
df = pd.read_json(stock_data_json).T
df.head()

  df = pd.read_json(stock_data_json).T


Unnamed: 0,0
Date,2024-08-01 00:00:00
Open,171.979996
High,175.679993
Low,170.509995
Close,172.449997


In [17]:
data = stock_data.to_json()
data

'{"Date":{"0":1722816000000},"Open":{"0":154.2100067139},"High":{"0":162.9600067139},"Low":{"0":151.6100006104},"Close":{"0":161.0200042725},"Adj Close":{"0":161.0200042725},"Volume":{"0":82961700}}'

In [8]:
stock_data.describe()

count    1043.000000
mean      109.024967
std        25.535614
min        52.646080
25%        88.299908
50%       111.823158
75%       132.402542
max       153.334015
Name: Adj Close, dtype: float64

In [17]:
# Preprocess data
sequence_length = 10
data_sequences = preprocess_data(stock_data, sequence_length)

In [18]:
data_sequences

[]

In [16]:
# Split data into training and testing sets
train_data, test_data = split_data(data_sequences)

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
# Prepare training data
X_train = np.array([item[0] for item in train_data])
y_train = np.array([item[1] for item in train_data])

In [17]:
X_train

array([[ 68.35554504,  67.99795532,  69.81037903, ...,  71.91896057,
         71.44750214,  71.87750244],
       [ 67.99795532,  69.81037903,  69.67553711, ...,  71.44750214,
         71.87750244,  72.42488098],
       [ 69.81037903,  69.67553711,  70.17146301, ...,  71.87750244,
         72.42488098,  73.8911972 ],
       ...,
       [105.22922516, 104.52004242, 107.30684662, ..., 105.16929626,
        105.28916168, 105.84851837],
       [104.52004242, 107.30684662, 108.74519348, ..., 105.28916168,
        105.84851837, 103.7309494 ],
       [107.30684662, 108.74519348, 105.84851837, ..., 105.84851837,
        103.7309494 , 103.59111023]])

In [18]:
y_train

array([ 72.42488098,  73.8911972 ,  74.02754211,  74.10844421,
        74.1493988 ,  73.22446442,  71.50444031,  72.44186401,
        72.75150299,  72.62914276,  71.55687714,  74.04501343,
        72.18765259,  72.21961212,  73.71389771,  73.87071991,
        75.3465271 ,  75.41644287,  75.84445953,  75.58275604,
        75.84944916,  75.88491058,  76.15609741,  75.76255035,
        74.08796692,  70.91161346,  69.23654175,  69.44380188,
        65.67212677,  66.88573456,  69.23654175,  66.80932617,
        69.00080872,  65.6626358 ,  64.71273041,  60.7198143 ,
        63.68540955,  60.47558975,  55.51378632,  60.64390182,
        53.58849716,  55.83891296,  54.49695587,  55.51977921,
        53.34926987,  52.64608002,  56.43572998,  55.0178566 ,
        58.07934189,  55.44935989,  57.24979401,  58.03089905,
        55.04182816,  55.78747559,  54.57236481,  59.09168243,
        59.060215  ,  60.28081512,  60.25933838,  60.45112228,
        63.1889801 ,  62.79293442,  62.79942322,  63.87

In [None]:
# Train linear regression model
model = train_model(X_train, y_train)

In [None]:
# Prepare test data
X_test = np.array([item[0] for item in test_data])
y_test = np.array([item[1] for item in test_data])

In [26]:
# Last test sequence
X_test[-1]

array([145.74273682, 148.82919312, 147.36087036, 144.97361755,
       145.77270508, 142.6063385 , 140.35891724, 140.95822144,
       142.38658142, 143.92481995])

In [None]:
last_sequence = X_test[-1].reshape(1,-1)
last_sequence

In [6]:
predicted_price = model.predict(last_sequence)[0]
predicted_price

NameError: name 'model' is not defined

In [7]:
# Evaluate model using Mean Squared Error
mse = evaluate_model(model, X_test, y_test)

# Evaluate model using R-squared
r2 = r2_score(y_test, model.predict(X_test))

NameError: name 'model' is not defined

In [35]:
# Calculate Adjusted R-squared (adjusts for number of features)
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features

print(f"Number of samples: {n}")
print(f"Number of features: {p}")

Number of samples: 207
Number of features: 10


In [36]:
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

# Print the R-squared and adjusted R-squared
print(f'\nR-squared: {r2:.4f}')
print(f'Adjusted R-squared: {adjusted_r2:.2f}')
print(f'Mean Squared Error on Test Data: {mse: 0.2f}')


R-squared: 0.9463
Adjusted R-squared: 0.94
Mean Squared Error on Test Data:  5.03
