# Stock Price Prediction

In [15]:
import pandas as pd
import yfinance as yf
import numpy as np
from datetime import datetime
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.tsa.stattools import adfuller

In [2]:
# function to download and save the raw data
def get_data(ticker, start, end):
    """Download price history for ``ticker`` and save it as ``raw.csv``.

    Parameters
    ----------
    ticker : str
        Symbol handed to ``yf.download``; also used as the folder name
        under ``../datasets``.
    start, end
        Bounds of the download window, forwarded to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``yf.download``, unchanged.

    Raises
    ------
    ValueError
        If the download comes back empty. Nothing is written in that case,
        so an existing ``raw.csv`` is not overwritten with an empty file.
    """
    # auto_adjust is spelled out because recent yfinance releases warn when it
    # is left implicit; True matches the adjusted OHLCV columns (no separate
    # "Adj Close") that this notebook works with.
    raw_data = yf.download(ticker, start=start, end=end, auto_adjust=True)
    if raw_data.empty:
        raise ValueError(
            f"No data returned for {ticker!r} between {start} and {end}"
        )

    out_dir = f"../datasets/{ticker}"
    os.makedirs(out_dir, exist_ok=True)
    raw_data.to_csv(f"{out_dir}/raw.csv")
    return raw_data

In [3]:
ticker = "GOOG"
LOOKBACK_YEARS = 10  # length of the price-history window, in years


def years_before(moment, years):
    """Return midnight on the calendar day ``years`` years before ``moment``.

    ``datetime(year - years, month, day)`` raises ``ValueError`` when
    ``moment`` falls on Feb 29 and the target year is not a leap year; in
    that case the result is clamped to Feb 28 of the target year.
    """
    try:
        return datetime(moment.year - years, moment.month, moment.day)
    except ValueError:
        if (moment.month, moment.day) != (2, 29):
            raise  # some other problem (e.g. year out of range): do not mask it
        return datetime(moment.year - years, 2, 28)


end = datetime.now()
start = years_before(end, LOOKBACK_YEARS)
print(start)

2015-09-09 00:00:00


In [4]:
# fetch the price history for the configured ticker and date window
raw_data = get_data(ticker=ticker, start=start, end=end)
raw_data.head(5)

  raw_data = yf.download(ticker, start, end)
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,GOOG,GOOG,GOOG,GOOG,GOOG
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2015-09-09,30.427776,31.113087,30.272836,30.849889,34042000
2015-09-10,30.85634,30.995886,30.363713,30.446646,38106000
2015-09-11,31.075836,31.076333,30.661174,30.776881,27470000
2015-09-14,30.950201,31.080309,30.760996,31.072364,34046000
2015-09-15,31.541155,31.717945,30.977015,31.122022,41688000


In [5]:
# quick clean up
def clean_up(data):
    """Flatten the downloaded frame's column index and save it as ``clean.csv``.

    Works on a copy of ``data``: keeps only level 0 of the column index,
    clears the column index name, and writes the result to
    ``../datasets/{ticker}/clean.csv``. ``ticker`` is the notebook-level
    variable, not a parameter of this function. Returns nothing.
    """
    clean_data = data.copy()
    # keep only level 0 of the column index as the column names
    clean_data.columns = clean_data.columns.get_level_values(0)
    clean_data.columns.name = None

    # create the folder here as well (as get_data does), so this step does not
    # fail when the directory was not created earlier in the session
    out_dir = f"../datasets/{ticker}"
    os.makedirs(out_dir, exist_ok=True)
    clean_data.to_csv(f"{out_dir}/clean.csv")

# writes clean.csv for the current `ticker`; clean_up returns nothing, so this cell shows no output
clean_up(raw_data)

In [6]:
# reload the cleaned prices from disk and preview the first ten rows
clean_path = f"../datasets/{ticker}/clean.csv"
data = pd.read_csv(clean_path)
data.head(n=10)

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2015-09-09,30.427776,31.113087,30.272836,30.849889,34042000
1,2015-09-10,30.85634,30.995886,30.363713,30.446646,38106000
2,2015-09-11,31.075836,31.076333,30.661174,30.776881,27470000
3,2015-09-14,30.950201,31.080309,30.760996,31.072364,34046000
4,2015-09-15,31.541155,31.717945,30.977015,31.122022,41688000
5,2015-09-16,31.582869,31.680698,31.401112,31.557543,25730000
6,2015-09-17,31.926519,32.323798,31.535196,31.672755,45494000
7,2015-09-18,31.248657,31.782504,31.137915,31.623096,102668000
8,2015-09-21,31.556051,31.608194,31.084281,31.504405,35770000
9,2015-09-22,30.922886,31.164234,30.562352,31.136921,51258000


In [7]:
# (rows, columns) of the frame loaded from clean.csv
data.shape

(2514, 6)

In [8]:
# missing values per column
data.isna().sum()

Date      0
Close     0
High      0
Low       0
Open      0
Volume    0
dtype: int64

In [9]:
# number of rows that exactly repeat an earlier row across all columns
data.duplicated().sum()

np.int64(0)

In [10]:
# dtypes and non-null counts; "Date" is still an object (string) column at this point
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2514 entries, 0 to 2513
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2514 non-null   object 
 1   Close   2514 non-null   float64
 2   High    2514 non-null   float64
 3   Low     2514 non-null   float64
 4   Open    2514 non-null   float64
 5   Volume  2514 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 118.0+ KB


In [11]:
# most recent ten rows, to check where the download window ends
data.tail(n=10)

Unnamed: 0,Date,Close,High,Low,Open,Volume
2504,2025-08-25,208.973221,210.90149,205.766081,206.815146,22788800
2505,2025-08-26,207.764297,208.273852,206.265637,208.183921,23051000
2506,2025-08-27,208.024078,209.382865,206.205696,206.275641,15249000
2507,2025-08-28,212.180344,212.709869,207.414614,207.65439,20915700
2508,2025-08-29,213.33931,215.147691,210.781598,211.081333,24682200
2509,2025-09-02,211.800705,212.179364,206.775198,208.803384,28900100
2510,2025-09-03,230.893631,231.593003,224.91896,226.277746,72250300
2511,2025-09-04,232.45224,232.562142,226.557499,229.949472,32196000
2512,2025-09-05,234.959991,235.919141,232.17249,232.452238,26127700
2513,2025-09-08,234.160004,238.399994,233.729996,235.789993,23104200


In [12]:
# summary statistics for the numeric columns (prices and volume)
data.describe()

Unnamed: 0,Close,High,Low,Open,Volume
count,2514.0,2514.0,2514.0,2514.0,2514.0
mean,92.621215,93.580157,91.62077,92.557377,29565750.0
std,48.223139,48.760893,47.67124,48.186284,14171690.0
min,29.542334,30.044397,29.268707,29.661021,6809800.0
25%,52.058874,52.671923,51.346372,52.094503,20651250.0
50%,75.55595,77.492695,75.00423,75.737956,26139000.0
75%,133.368328,134.884792,131.97933,133.317182,34093000.0
max,234.959991,238.399994,233.729996,235.789993,133078000.0


In [13]:
# turn the "Date" strings read from the CSV into datetimes for plotting
data["Date"] = data["Date"].pipe(pd.to_datetime)
data.head(n=10)

Unnamed: 0,Date,Close,High,Low,Open,Volume
0,2015-09-09,30.427776,31.113087,30.272836,30.849889,34042000
1,2015-09-10,30.85634,30.995886,30.363713,30.446646,38106000
2,2015-09-11,31.075836,31.076333,30.661174,30.776881,27470000
3,2015-09-14,30.950201,31.080309,30.760996,31.072364,34046000
4,2015-09-15,31.541155,31.717945,30.977015,31.122022,41688000
5,2015-09-16,31.582869,31.680698,31.401112,31.557543,25730000
6,2015-09-17,31.926519,32.323798,31.535196,31.672755,45494000
7,2015-09-18,31.248657,31.782504,31.137915,31.623096,102668000
8,2015-09-21,31.556051,31.608194,31.084281,31.504405,35770000
9,2015-09-22,30.922886,31.164234,30.562352,31.136921,51258000


In [14]:
# two stacked panels on a shared date axis: price on top (70%), volume below (30%)
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    row_heights=[0.7, 0.3],
    vertical_spacing=0.2,
)

# top panel: OHLC candlesticks
price_trace = go.Candlestick(
    x=data["Date"],
    open=data["Open"],
    high=data["High"],
    low=data["Low"],
    close=data["Close"],
    name="Price",
)
fig.add_trace(price_trace, row=1, col=1)

# bottom panel: traded volume as bars
volume_trace = go.Bar(
    x=data["Date"],
    y=data["Volume"],
    name="volume",
    marker_color="#c1440e",
    opacity=1.0,
)
fig.add_trace(volume_trace, row=2, col=1)

# titles, theme and size; the range slider under the candlesticks is switched off
layout_options = dict(
    title="Candlesticks Chart With Volume",
    xaxis_rangeslider_visible=False,
    xaxis_title="Date",
    yaxis_title="Price",
    yaxis2_title="Volume",
    template="plotly_dark",
    showlegend=False,
    height=900,
)
fig.update_layout(**layout_options)

fig.show()

In [16]:
# daily log return ln(close_t / close_{t-1}); the first row has no previous close, so it is NaN
previous_close = data["Close"].shift(1)
data["Log_Returns"] = np.log(data["Close"].div(previous_close))
data.head(n=10)

Unnamed: 0,Date,Close,High,Low,Open,Volume,Log_Returns
0,2015-09-09,30.427776,31.113087,30.272836,30.849889,34042000,
1,2015-09-10,30.85634,30.995886,30.363713,30.446646,38106000,0.013986
2,2015-09-11,31.075836,31.076333,30.661174,30.776881,27470000,0.007088
3,2015-09-14,30.950201,31.080309,30.760996,31.072364,34046000,-0.004051
4,2015-09-15,31.541155,31.717945,30.977015,31.122022,41688000,0.018914
5,2015-09-16,31.582869,31.680698,31.401112,31.557543,25730000,0.001322
6,2015-09-17,31.926519,32.323798,31.535196,31.672755,45494000,0.010822
7,2015-09-18,31.248657,31.782504,31.137915,31.623096,102668000,-0.021461
8,2015-09-21,31.556051,31.608194,31.084281,31.504405,35770000,0.009789
9,2015-09-22,30.922886,31.164234,30.562352,31.136921,51258000,-0.020269


In [17]:
# only the first row should be missing a log return
data["Log_Returns"].isna().sum()

np.int64(1)

In [21]:
# NOTE(review): this cell was executed out of order (In [21]) — the (2513, 7) shown
# as its output is the shape after the dropna further down; in a top-to-bottom run the
# frame still has 2514 rows here. The same check is repeated after the dropna cell.
data.shape

(2513, 7)

In [18]:
# spot-check ten random rows; random_state pins the draw so a re-run shows the same rows
data.sample(10, random_state=42)

Unnamed: 0,Date,Close,High,Low,Open,Volume,Log_Returns
505,2017-09-11,46.13826,46.6001,46.030996,46.395006,25340000,0.002781
350,2017-01-30,39.843342,40.514746,39.718199,40.456149,64932000,-0.025825
2226,2024-07-16,184.448868,189.261438,184.071016,187.889268,12760100,-0.014397
1652,2022-03-31,138.700348,141.674987,138.670053,141.48032,29516000,-0.02122
84,2016-01-08,35.480694,36.412318,35.407696,36.323922,49018000,-0.016546
2106,2024-01-23,147.669449,147.848233,146.189586,146.715982,14113600,0.006545
1296,2020-10-30,80.499626,83.7767,79.677751,83.037261,86582000,0.033733
1798,2022-10-28,95.923569,96.201664,91.695499,91.901093,35696900,0.042083
982,2019-08-05,57.224396,58.362607,56.619534,58.104372,51950000,-0.035523
504,2017-09-08,46.010143,46.531076,45.929693,46.506249,20230000,-0.010148


In [19]:
# drop every row that contains a missing value (here: the first row, which has no log return)
data = data.dropna(axis=0, how="any")
data.head(n=10)

Unnamed: 0,Date,Close,High,Low,Open,Volume,Log_Returns
1,2015-09-10,30.85634,30.995886,30.363713,30.446646,38106000,0.013986
2,2015-09-11,31.075836,31.076333,30.661174,30.776881,27470000,0.007088
3,2015-09-14,30.950201,31.080309,30.760996,31.072364,34046000,-0.004051
4,2015-09-15,31.541155,31.717945,30.977015,31.122022,41688000,0.018914
5,2015-09-16,31.582869,31.680698,31.401112,31.557543,25730000,0.001322
6,2015-09-17,31.926519,32.323798,31.535196,31.672755,45494000,0.010822
7,2015-09-18,31.248657,31.782504,31.137915,31.623096,102668000,-0.021461
8,2015-09-21,31.556051,31.608194,31.084281,31.504405,35770000,0.009789
9,2015-09-22,30.922886,31.164234,30.562352,31.136921,51258000,-0.020269
10,2015-09-23,30.9065,31.232768,30.789302,30.891106,29418000,-0.00053


In [20]:
# confirm the dropna removed exactly one row (2514 -> 2513) and kept all 7 columns
data.shape

(2513, 7)