In [9]:
import sys
from pathlib import Path

# 假设notebook目录和src1目录同级
project_root = Path.cwd().parent
sys.path.append(str(project_root))

import pandas as pd
from src1.data_io import load_raw_data, save_processed_data
from src1.cleaning import fill_missing_median, drop_missing, normalize_data, remove_outliers_iqr

# 加载原始数据（上一步保存好的）
df_raw = load_raw_data("TSLA_data.csv")
df = df_raw.copy()
print("Raw data shape:", df.shape)
df.head()


[load_raw_data] Loaded from C:\Users\go199\Desktop\PY HW\bootcamp_Wu_Chuyu\project\notebook\data\raw\TSLA_data.csv
Raw data shape: (40, 6)


Unnamed: 0,Date,Close,High,Low,Open,Volume
0,,TSLA,TSLA,TSLA,TSLA,TSLA
1,2023-01-03,108.0999984741211,118.80000305175781,104.63999938964844,118.47000122070312,231402800
2,2023-01-04,113.63999938964844,114.58999633789062,107.5199966430664,109.11000061035156,180389000
3,2023-01-05,110.33999633789062,111.75,107.16000366210938,110.51000213623047,157986300
4,2023-01-06,113.05999755859375,114.38999938964844,101.80999755859375,103.0,220911100


In [None]:
# --- CLEANING AND STORAGE ---
df = df[df['Date'] != 'NaN'].copy()   # 删除第一行错误
df.reset_index(drop=True, inplace=True)

# --- 强制转换数值列为 float ---
numeric_cols = ["Open", "High", "Low", "Close", "Volume"]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# (2) 删除缺失率大于50%的列（如果有）
df = drop_missing(df, threshold=0.5)

# (3) 填充缺失值（Open 和 Close）
df = fill_missing_median(df, ["Open", "Close"])

# (4) 去除成交量的异常值
df = remove_outliers_iqr(df, ["Volume"], k=1.5)

# (5) 归一化收盘价
df = normalize_data(df, ["Close"])

print("Data shape (after clean):", df.shape)
print(df.head())

# (6) 保存清洗后的数据
save_processed_data(df, "TSLA_cleaned.csv")

[drop_missing] Dropped columns: []
[fill_missing_median] Filled Open with median=173.89
[fill_missing_median] Filled Close with median=177.90
[remove_outliers_iqr] Volume: removed 2 rows
[normalize_data] Normalized Close
Data shape (after clean): (38, 6)
         Date     Close        High         Low        Open       Volume
1  2023-01-03  0.000000  118.800003  104.639999  118.470001  231402800.0
2  2023-01-04  0.052195  114.589996  107.519997  109.110001  180389000.0
3  2023-01-05  0.021104  111.750000  107.160004  110.510002  157986300.0
4  2023-01-06  0.046731  114.389999  101.809998  103.000000  220911100.0
5  2023-01-09  0.109949  123.519997  117.110001  118.959999  190284000.0
[save_processed_data] Saved to C:\Users\go199\Desktop\PY HW\bootcamp_Wu_Chuyu\project\notebook\data\processed\TSLA_cleaned.csv
