# 데이터 드리프트 전처리/학습/검증 파이프라인 구축

- JPark

## 작업 폴더 설정 

In [3]:
import os

# Move the working directory one level up so that the relative paths used by
# the later cells ("data/...", "models/...") resolve from the parent folder
# (presumably the project root -- confirm against the repository layout).
# NOTE: this is not idempotent; re-running the cell climbs one more level.
os.chdir(os.pardir)

## data.csv

In [8]:
import pandas as pd

# Read the raw dataset from disk. The bare name on the last line makes the
# notebook render the DataFrame as the cell's output.
data = pd.read_csv(filepath_or_buffer="data/raw/data.csv")
data

Unnamed: 0,feature1,feature2,feature3,label
0,5.623620,2.603363,1.844675,0
1,7.352143,3.562709,2.392952,1
2,6.695982,3.581270,1.885052,1
3,6.295975,3.774860,1.443513,1
4,4.968056,3.952183,1.603179,0
...,...,...,...,...
295,6.066730,3.447923,2.024921,1
296,6.809981,3.835115,2.465739,1
297,5.147463,2.844640,5.481679,0
298,6.368671,2.884578,1.065010,1


## preprocess.py

In [9]:

import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Preprocessing step: load the raw dataset, remove the stray index column,
# split it into train/test partitions and write both to data/processed/.

# Load the raw data.
data = pd.read_csv("data/raw/data.csv")

# The raw CSV carries a leftover row-index column ("Unnamed: 0", holding the
# row numbers 0, 1, 2, ...). It is not a feature; if it stays, it is written to
# train.csv/test.csv and ends up as a model input, because the later steps
# only drop "label". errors="ignore" makes this a no-op when the column is
# absent.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")

# Split the data (80% train, 20% test). The fixed random_state keeps the
# split reproducible between runs.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Save both partitions. to_csv does not create missing parent directories,
# so make sure the output folder exists first.
os.makedirs("data/processed", exist_ok=True)
train.to_csv("data/processed/train.csv", index=False)
test.to_csv("data/processed/test.csv", index=False)

print("✅ Data preprocessing complete! Train and test sets saved.")

✅ Data preprocessing complete! Train and test sets saved.


## train.py

In [10]:
import os
import pickle

import pandas as pd
from sklearn.linear_model import LogisticRegression

# Training step: fit a logistic-regression classifier on the processed
# training split and pickle it to models/model.pkl.

# Load the training data.
train = pd.read_csv("data/processed/train.csv")

# Separate inputs (X) from the target (y): "label" is the target, every other
# column in train.csv is used as a feature.
X_train = train.drop(columns=["label"])
y_train = train["label"]

# Fit the model with scikit-learn's default hyperparameters.
model = LogisticRegression()
model.fit(X_train, y_train)

# Save the model. open(..., "wb") raises FileNotFoundError when the target
# directory is missing, so create it first.
os.makedirs("models", exist_ok=True)
with open("models/model.pkl", "wb") as f:
    pickle.dump(model, f)

print("✅ Model training complete! Model saved as models/model.pkl")

✅ Model training complete! Model saved as models/model.pkl


## evaluate.py

In [11]:
import pandas as pd
import pickle
import json
from sklearn.metrics import accuracy_score

# Evaluation step: score the pickled model on the held-out test split and
# record the resulting accuracy in metrics.json.

# Restore the trained model.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# model files produced by this pipeline.
with open("models/model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Read the test split; "label" is the target, the remaining columns are the
# model inputs.
test = pd.read_csv("data/processed/test.csv")
y_test = test["label"]
X_test = test.drop(columns=["label"])

# Predict on the test inputs and compare against the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Persist the metric as JSON in the current working directory.
metrics = dict(accuracy=accuracy)
with open("metrics.json", "w") as metrics_file:
    json.dump(metrics, metrics_file)

print(f"✅ Model evaluation complete! Accuracy: {accuracy:.4f}")

✅ Model evaluation complete! Accuracy: 0.9833
