#### Import libraries

In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
# Capture the run date once, already formatted as a YYYY-MM-DD string.
today = datetime.datetime.today().strftime('%Y-%m-%d')

print("Today is " + today)

Today is 2025-02-10


#### Import data
Source: https://www.nasdaq.com/market-activity/stocks

In [5]:
# Load the raw price history export; the "Date" column is parsed to datetime on read.
data = pd.read_csv(filepath_or_buffer="pho.csv", parse_dates=["Date"])
data.head(n=5)

Unnamed: 0,Date,Close/Last,Volume,Open,High,Low
0,2024-12-12,69.92,48284,70.31,70.3362,69.92
1,2024-12-11,70.54,35156,70.92,71.1891,70.45
2,2024-12-10,70.52,70930,71.0,71.0,69.955
3,2024-12-09,71.48,51351,71.54,71.68,71.36
4,2024-12-06,71.22,66262,71.67,71.71,71.09


#### Transform Data

In [7]:
# Price columns that must end up as floats
cols = ['Close/Last', 'Open', 'High', 'Low']


def dollar_to_float(x):
    """Return ``x`` as a float after removing any "$" characters.

    The value is converted to ``str`` first, so both "$70.31"-style strings
    and values that are already numeric are accepted.
    """
    return float(str(x).replace("$", ""))

In [8]:
# Convert every listed price column element-wise with dollar_to_float
for price_col in cols:
    data[price_col] = data[price_col].map(dollar_to_float)

In [9]:
# Round the daily low to 2 decimal places.
# NOTE(review): only "Low" is rounded here; the other price columns keep their
# original precision - confirm this is intentional.
data["Low"] = data["Low"].round(decimals=2)

In [10]:
# Promote the parsed "Date" column to the index.
# Rebinding (instead of inplace=True) together with the column check keeps this
# cell safe to re-run: a second set_index("Date") would otherwise raise KeyError
# because the column has already been moved into the index.
if "Date" in data.columns:
    data = data.set_index("Date")

In [11]:
# Shorten the "Close/Last" header to plain "Close".
data = data.rename(columns={"Close/Last": "Close"})

In [12]:
# Sanity check: print the index range, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2024-12-12 to 2019-12-13
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   1258 non-null   float64
 1   Volume  1258 non-null   int64  
 2   Open    1258 non-null   float64
 3   High    1258 non-null   float64
 4   Low     1258 non-null   float64
dtypes: float64(4), int64(1)
memory usage: 59.0 KB


#### Data Cleaning

In [14]:
# Build a continuous daily calendar spanning the first to the last date in the data.
# Index.min()/max() are vectorized and skip NaT, unlike the builtin min()/max(),
# which compare element by element in Python.
date_range = pd.date_range(start=data.index.min(), end=data.index.max(), freq="D")

# Reindex onto that calendar: days absent from the source (e.g. weekends) become
# all-NaN rows. NaN is reindex's default fill value, so none is passed explicitly.
data_filled = data.reindex(date_range)

In [15]:
# Fill the inserted calendar days by carrying the most recent earlier row forward
cols = ["Close", "Volume", "Open", "High", "Low"]

data_filled[cols] = data_filled[cols].ffill()

data_filled.head(10)

Unnamed: 0,Close,Volume,Open,High,Low
2019-12-13,38.0143,32234.0,38.11,38.26,37.96
2019-12-14,38.0143,32234.0,38.11,38.26,37.96
2019-12-15,38.0143,32234.0,38.11,38.26,37.96
2019-12-16,38.23,424985.0,38.24,38.35,38.16
2019-12-17,38.33,234159.0,38.3,38.35,38.13
2019-12-18,38.18,79375.0,38.37,38.37,37.95
2019-12-19,38.2938,51152.0,38.2,38.35,38.16
2019-12-20,38.6,72218.0,38.38,38.61,38.38
2019-12-21,38.6,72218.0,38.38,38.61,38.38
2019-12-22,38.6,72218.0,38.38,38.61,38.38


#### Export Finalized Data

In [17]:
# Write the gap-filled daily series to disk.
# DataFrame.to_csv returns None when given a path, so the result is not bound to
# a name (the previous `processed_data` variable was always None).
data_filled.to_csv("processed_data.csv")