In [2]:
import os

print("src path:", os.path.join(os.getcwd(), "src"))
print("Files in src:", os.listdir(os.path.join(os.getcwd(), "src")))

src path: /Users/hitakshikulhare/bootcamp_hitakshi_kulhare/src
Files in src: ['cleaning.ipynb', 'config.py', '.gitkeep', '.ipynb_checkpoints']


In [4]:
import sys
import os
import pandas as pd

# Automatically detect the current working directory of the notebook
notebook_dir = os.getcwd()

# Build the full path to the 'src' directory inside the current folder
src_path = os.path.join(notebook_dir, 'src')

# Add it to the system path (if not already there)

src_path = os.path.join(os.getcwd(), 'src')
if src_path not in sys.path:
    sys.path.append(src_path)


from cleaning import fill_missing_median, drop_missing, normalize_data

In [6]:
df = pd.read_csv('homework/homework5/data/raw/sample_20250819-202307.csv')
df.head()

Unnamed: 0,date,ticker,price
0,2025-05-09,AAPL,100.094117
1,2025-05-10,AAPL,98.836087
2,2025-05-11,AAPL,99.278867
3,2025-05-12,AAPL,99.008237
4,2025-05-13,AAPL,98.355203


In [12]:
# Feature Engineering
df['price_change'] = df['price'].diff()
df['pct_return'] = df['price'].pct_change()
df['rolling_mean_7'] = df['price'].rolling(window=7).mean()
df['rolling_std_7'] = df['price'].rolling(window=7).std()
df['price_vs_mean'] = df['price'] - df['rolling_mean_7']

df.replace([float('inf'), float('-inf')], float('nan'), inplace=True)

df = fill_missing_median(df, ['price_change', 'pct_return', 'rolling_mean_7', 'rolling_std_7', 'price_vs_mean'])

df = normalize_data(df, ['price', 'price_change', 'pct_return', 'rolling_mean_7', 'rolling_std_7', 'price_vs_mean'])

print("Normalized Price Column:")
print(df[['date', 'price']].head(10))

print("\nPrice Change:")
print(df[['date', 'price_change']].head(10))

print("\nPercentage Return:")
print(df[['date', 'pct_return']].head(10))

print("\n7-day Rolling Mean:")
print(df[['date', 'rolling_mean_7']].head(10))

print("\n7-day Rolling Std Dev (Volatility):")
print(df[['date', 'rolling_std_7']].head(10))

print("\nPrice vs Rolling Mean:")
print(df[['date', 'price_vs_mean']].head(10))

Normalized Price Column:
         date     price
0  2025-05-09  1.000000
1  2025-05-10  0.796900
2  2025-05-11  0.868384
3  2025-05-12  0.824692
4  2025-05-13  0.719265
5  2025-05-14  0.830729
6  2025-05-15  0.936978
7  2025-05-16  0.971127
8  2025-05-17  0.991386
9  2025-05-18  0.609865

Price Change:
         date  price_change
0  2025-05-09      0.567855
1  2025-05-10      0.289752
2  2025-05-11      0.735670
3  2025-05-12      0.548628
4  2025-05-13      0.448369
5  2025-05-14      0.800598
6  2025-05-15      0.792127
7  2025-05-16      0.675039
8  2025-05-17      0.652482
9  2025-05-18      0.000000

Percentage Return:
         date  pct_return
0  2025-05-09    0.528376
1  2025-05-10    0.462237
2  2025-05-11    0.632075
3  2025-05-12    0.550860
4  2025-05-13    0.505892
5  2025-05-14    0.669934
6  2025-05-15    0.654230
7  2025-05-16    0.601184
8  2025-05-17    0.592144
9  2025-05-18    0.356822

7-day Rolling Mean:
         date  rolling_mean_7
0  2025-05-09        0.707342
1

In [13]:
# Step 1: Fill missing prices with median
df = fill_missing_median(df, ['price'])

# Step 2: Drop any column with more than 50% missing values (optional here, but safe)
df = drop_missing(df, threshold=0.5)

# Step 3: Normalize only the 'price' column
df = normalize_data(df, ['price'])

In [14]:
df.isnull().sum()

date              0
ticker            0
price             0
price_change      0
pct_return        0
rolling_mean_7    0
rolling_std_7     0
price_vs_mean     0
dtype: int64

In [21]:
save_path = '/Users/hitakshikulhare/bootcamp_hitakshi_kulhare/data/processed'
os.makedirs(save_path, exist_ok=True)

# Save the file
csv_path = os.path.join(save_path, 'sample_data_cleaned.csv')
df.to_csv(csv_path, index=False)

print("Saved to:", csv_path)
print("Contents of data folder:", os.listdir(save_path))

Saved to: /Users/hitakshikulhare/bootcamp_hitakshi_kulhare/data/processed/sample_data_cleaned.csv
Contents of data folder: ['summary.json', 'summary.csv', 'sample_data_cleaned.csv', 'summary_plot.png']


In [20]:
print("File saved to:", os.path.abspath(os.path.join(save_path, 'sample_data_cleaned.csv')))

File saved to: /Users/hitakshikulhare/bootcamp_hitakshi_kulhare/data/sample_data_cleaned.csv
