In [3]:
from pathlib import Path
import sys, pandas as pd

# Let the notebook import modules from <repo>/src, where <repo> is taken to be
# the parent of the current working directory.
# The membership check keeps this cell idempotent: re-running it does not keep
# appending the same entry to sys.path.
src_dir = str(Path.cwd().parent / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import the outlier-removal helper from the project's outliers_project module
from outliers_project import remove_outliers

# File locations, all resolved relative to the parent of the working directory
repo = Path.cwd().parent                 # .../project
proc_dir = repo.joinpath("data", "processed")
in_path = proc_dir.joinpath("IYR_processed_project.csv")    # made in Stage 6
out_path = proc_dir.joinpath("IYR_cleaned_project.csv")     # new file this notebook writes

# Report whether the input is present and where the result will be written
print("Input exists? ", in_path.exists(), " → ", in_path)
print("Will write to: ", out_path)

Input exists?  True  →  /Users/ivysingal/bootcamp_ivy_singal/project/data/processed/IYR_processed_project.csv
Will write to:  /Users/ivysingal/bootcamp_ivy_singal/project/data/processed/IYR_cleaned_project.csv


In [4]:
# Load the processed CSV and show its shape and column names.
df = pd.read_csv(in_path)
print(df.shape)
print(df.columns.tolist())   # look for 'daily_return'

# Fail here with a clear message instead of relying on a visual check of the
# printed column list; the outlier step needs this column.
if "daily_return" not in df.columns:
    raise KeyError(f"'daily_return' column not found in {in_path}")

df.head()

(502, 7)
['Date', 'Close', 'High', 'Low', 'Open', 'Volume', 'daily_return']


Unnamed: 0,Date,Close,High,Low,Open,Volume,daily_return
0,2022-01-03,105.180428,106.54012397203245,103.64734454667442,106.31198666159982,13194000,
1,2022-01-04,104.979668,105.9834704567302,104.84278388218968,105.47244509923952,9363000,-0.001909
2,2022-01-05,101.867874,104.8610416475952,101.79487549400208,104.71503042018672,12110000,-0.029642
3,2022-01-06,101.849625,102.32414719106455,100.84582182871512,101.96825353255834,7920600,-0.000179
4,2022-01-07,101.165215,101.86787442636896,100.88232648743846,101.5667384528561,7883300,-0.00672


In [5]:
# Keep only rows that have a daily_return so the z-score step is not handed missing values
df2 = df.loc[df["daily_return"].notna()].copy()

# Apply the z-score rule with a cutoff of 3
# (method="iqr" with threshold=1.5 is the alternative to try later)
clean_df = remove_outliers(
    df2,
    "daily_return",
    method="zscore",
    threshold=3.0,
)

# Row counts before and after filtering, then summary statistics of what remains
print("Rows before:", len(df2), " → after:", len(clean_df))
clean_df["daily_return"].describe()

Rows before: 501  → after: 497


count    497.000000
mean      -0.000314
std        0.013258
min       -0.042372
25%       -0.008389
50%       -0.001033
75%        0.008016
max        0.039843
Name: daily_return, dtype: float64

In [6]:
# Create the destination folder if needed, then write the filtered frame
# to CSV without the index column
out_path.parent.mkdir(exist_ok=True, parents=True)
clean_df.to_csv(path_or_buf=out_path, index=False)

# Confirm where the file was written and that it is now on disk
print("Saved:", out_path)
print("Exists now? ", out_path.exists())

Saved: /Users/ivysingal/bootcamp_ivy_singal/project/data/processed/IYR_cleaned_project.csv
Exists now?  True
