<a href="https://colab.research.google.com/github/kimjoengmin/TEAM2-PYTHON/blob/main/6_shooping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# ===== Cell 1: Imports =====
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [12]:
# ===== Cell 2: Helper Functions =====

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize column names and tidy string values.

    Column names are stripped, lower-cased, have spaces replaced by
    underscores, and lose every character outside ``[0-9a-zA-Z_]``.
    Values of object/category columns are converted to ``str``, stripped and
    have spaces replaced by underscores; their case is left unchanged.

    Missing values in those columns stay missing instead of being turned into
    the literal strings ``'nan'`` / ``'None'``, so that later missing-value
    handling can still see them.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A cleaned copy of ``df``.
    """
    df = df.copy()
    df.columns = (
        df.columns
          .str.strip()
          .str.lower()
          .str.replace(' ', '_', regex=False)
          .str.replace(r'[^0-9a-zA-Z_]', '', regex=True)
    )
    for col in df.select_dtypes(include=['object', 'category']):
        # Remember which cells were present before the str conversion,
        # because astype(str) would otherwise stringify the missing markers.
        present = df[col].notna()
        cleaned = (
            df[col].astype(str)
                   .str.strip()
                   .str.replace(' ', '_', regex=False)
        )
        # where() keeps cleaned text for present cells and NaN elsewhere.
        df[col] = cleaned.where(present)
    return df

In [13]:
# ===== Cell 3: Helper Functions =====
def drop_id_unnamed_and_missing(df: pd.DataFrame, missing_thresh: float = 0.5) -> pd.DataFrame:
    """Drop identifier-like columns and columns that are mostly missing.

    A column is treated as identifier-like when its name starts with
    "unnamed" or ends with "id" (case-insensitive). Of the remaining columns,
    those whose fraction of null values is strictly greater than
    ``missing_thresh`` are removed as well.

    Args:
        df: Input frame; it is not modified.
        missing_thresh: Maximum tolerated fraction of null values per column.

    Returns:
        A new frame without the dropped columns.
    """
    frame = df.copy()

    # Step 1: remove columns whose name matches the id/unnamed pattern.
    is_id_like = frame.columns.str.contains(r'^(?:unnamed)|id$', case=False, regex=True)
    frame = frame.drop(columns=frame.columns[is_id_like])

    # Step 2: remove columns whose null ratio exceeds the threshold.
    null_ratio = frame.isnull().mean()
    too_sparse = null_ratio[null_ratio > missing_thresh].index
    return frame.drop(columns=too_sparse)

In [14]:
# ===== Cell 4: Helper Functions =====
def impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values: median for numeric columns, mode for the rest.

    Numeric columns are imputed with ``SimpleImputer(strategy='median')``;
    object/category/bool columns with
    ``SimpleImputer(strategy='most_frequent')``. Columns that contain no
    observed value at all are left untouched, because there is nothing to
    derive a fill value from.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` with the imputed columns replaced.
    """
    df = df.copy()
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    # With default settings SimpleImputer discards columns that were entirely
    # missing at fit time, which would make the output narrower than the
    # column list and break the assignment below. Skip such columns up front.
    num_cols = [c for c in num_cols if df[c].notna().any()]
    cat_cols = [c for c in cat_cols if df[c].notna().any()]

    if num_cols:
        df[num_cols] = pd.DataFrame(
            SimpleImputer(strategy='median').fit_transform(df[num_cols]),
            columns=num_cols,
            # Reuse the caller's index: column assignment aligns on index, so
            # a default RangeIndex would inject NaN for any other index.
            index=df.index,
        )
    if cat_cols:
        df[cat_cols] = pd.DataFrame(
            # Cast to object so the imputer always receives one uniform dtype,
            # whether the selection holds strings, categories, booleans or a mix.
            SimpleImputer(strategy='most_frequent').fit_transform(
                df[cat_cols].astype(object)
            ),
            columns=cat_cols,
            index=df.index,
        )
    return df

In [15]:
# ===== Cell 5: Helper Functions =====
def remove_outliers(df: pd.DataFrame, max_removal: float = 0.2) -> pd.DataFrame:
    """Filter out IQR outliers column by column, within a removal budget.

    For every numeric column, rows outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
    are candidates for removal. The filter for a column is applied only if
    the rows that would remain still number strictly more than
    ``(1 - max_removal)`` times the ORIGINAL row count; otherwise that column
    is skipped. Columns are processed in order on the progressively filtered
    frame.

    Args:
        df: Input frame; it is not modified.
        max_removal: Fraction of the original rows that may be removed.

    Returns:
        The filtered frame (original index labels are kept).
    """
    kept = df.copy()
    # Threshold is fixed against the starting row count, not the running one.
    min_rows_required = (1 - max_removal) * len(kept)

    numeric_names = list(kept.select_dtypes(include=np.number))
    for name in numeric_names:
        series = kept[name]
        lower_q, upper_q = series.quantile([0.25, 0.75])
        spread = upper_q - lower_q
        inside_fence = series.between(lower_q - 1.5 * spread, upper_q + 1.5 * spread)
        if inside_fence.sum() <= min_rows_required:
            # Applying this column's filter would remove too many rows.
            continue
        kept = kept[inside_fence]
    return kept

In [16]:
# ===== Cell 6: Helper Functions =====
def drop_highly_correlated(df: pd.DataFrame, threshold: float = 0.95) -> pd.DataFrame:
    """Drop numeric columns that are highly correlated with an earlier column.

    The absolute correlation matrix of the numeric columns is computed and
    only its strict upper triangle is inspected, so each pair is considered
    once and the later column of the pair is the one removed. A column is
    dropped when any of those correlations is strictly greater than
    ``threshold``.

    Args:
        df: Input frame; it is not modified.
        threshold: Absolute-correlation cut-off.

    Returns:
        A new frame without the redundant columns.
    """
    numeric_part = df.select_dtypes(include=np.number)
    abs_corr = numeric_part.corr().abs()

    # Boolean mask selecting entries strictly above the diagonal.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_only = abs_corr.where(above_diagonal)

    redundant = []
    for name in upper_only.columns:
        if (upper_only[name] > threshold).any():
            redundant.append(name)
    return df.copy().drop(columns=redundant)

In [17]:
# ===== Cell 7: 인코딩 및 스케일링 =====
def encode_and_normalize(X: pd.DataFrame, max_onehot: int = 10) -> pd.DataFrame:
    """Encode categorical columns, then standardize every numeric column.

    For each object/category/bool column, by number of distinct values:
      * at most 2           -> label-encoded in place;
      * at most max_onehot  -> replaced by one-hot columns named
                               ``"<col>_<category>"``;
      * otherwise           -> dropped.
    Afterwards all numeric columns (including the freshly encoded ones) are
    scaled with ``StandardScaler``.

    Args:
        X: Feature frame; it is not modified.
        max_onehot: Largest cardinality that is still one-hot encoded.

    Returns:
        The encoded and scaled copy of ``X``.
    """
    X = X.copy()
    for col in X.select_dtypes(include=['object', 'category', 'bool']):
        nun = X[col].nunique()
        if nun <= 2:
            X[col] = LabelEncoder().fit_transform(X[col])
        elif nun <= max_onehot:
            # The dense-output switch was renamed from `sparse` to
            # `sparse_output` in scikit-learn; try the current name first and
            # fall back to the old one so both old and new releases work.
            try:
                ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            except TypeError:
                ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
            arr = ohe.fit_transform(X[[col]])
            cols = [f"{col}_{cat}" for cat in ohe.categories_[0]]
            X[cols] = arr
            X = X.drop(columns=[col])
        else:
            X = X.drop(columns=[col])
    num_cols = X.select_dtypes(include=np.number).columns
    # StandardScaler rejects input with no features or no samples; in that
    # case there is nothing to scale, so return the frame as is.
    if len(num_cols) > 0 and len(X) > 0:
        X[num_cols] = StandardScaler().fit_transform(X[num_cols])
    return X

In [22]:
# ===== Cell 8: 함수 프로세싱 =====
def aggregate_customer_level(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse order-line rows into one row per ``Customer ID``.

    Reads the columns ``Quantity``, ``Price``, ``Invoice``, ``InvoiceDate``,
    ``Country`` and ``Customer ID`` and produces, per customer:

      * ``Total_Orders``       - number of distinct invoices
      * ``Total_Quantity``     - sum of quantities
      * ``Total_Spent``        - sum of ``Quantity * Price``
      * ``Last_Purchase_Date`` - maximum ``InvoiceDate`` value
      * ``Country``            - first country seen for the customer

    Args:
        df: Order-line level frame; it is not modified.

    Returns:
        A frame with ``Customer ID`` as a regular column followed by the
        aggregated columns listed above.
    """
    # assign() works on a copy, so the caller's frame never gains TotalPrice.
    with_totals = df.assign(TotalPrice=df['Quantity'] * df['Price'])

    per_customer = with_totals.groupby('Customer ID').agg(
        Total_Orders=('Invoice', 'nunique'),
        Total_Quantity=('Quantity', 'sum'),
        Total_Spent=('TotalPrice', 'sum'),
        Last_Purchase_Date=('InvoiceDate', 'max'),
        Country=('Country', 'first'),
    )
    return per_customer.reset_index()
def preprocess(input_file: str, target_col: str = None,
               missing_thresh: float = 0.5,
               max_removal: float = 0.2,
               corr_thresh: float = 0.95,
               max_onehot: int = 10) -> str:
    """Run the full pipeline: load CSV, aggregate, clean, encode, save.

    Steps: read ``input_file``, aggregate to customer level, optionally split
    off ``target_col``, run the feature-cleaning helpers in order, re-attach
    the target (restricted to the rows that survived cleaning) and write the
    result to ``processed_<input basename>.csv`` in the working directory.

    Args:
        input_file: Path of the raw CSV.
        target_col: Name of the target column in the aggregated frame. Any
            falsy value (``None`` or ``''``) means there is no target.
        missing_thresh: Passed to ``drop_id_unnamed_and_missing``.
        max_removal: Passed to ``remove_outliers``.
        corr_thresh: Passed to ``drop_highly_correlated``.
        max_onehot: Passed to ``encode_and_normalize``.

    Returns:
        The name of the CSV file that was written.
    """
    # Order-line rows are first collapsed to one row per customer.
    customers = aggregate_customer_level(pd.read_csv(input_file))

    # Split off the target only when a target column was requested.
    if not target_col:
        target = None
        features = customers.copy()
    else:
        customers = customers.dropna(subset=[target_col]).reset_index(drop=True)
        target = customers[target_col]
        features = customers.drop(columns=[target_col])

    # Feature-cleaning stages, applied strictly in this order.
    stages = (
        (standardize_columns, ()),
        (drop_id_unnamed_and_missing, (missing_thresh,)),
        (impute_missing, ()),
        (remove_outliers, (max_removal,)),
        (drop_highly_correlated, (corr_thresh,)),
        (encode_and_normalize, (max_onehot,)),
    )
    for stage, extra_args in stages:
        features = stage(features, *extra_args)

    # Keep only target rows whose features survived, then align positionally.
    if target is None:
        result = features.reset_index(drop=True)
    else:
        target = target.loc[features.index].reset_index(drop=True)
        result = pd.concat([features.reset_index(drop=True), target], axis=1)

    base, _ext = os.path.splitext(os.path.basename(input_file))
    out_file = f"processed_{base}.csv"
    result.to_csv(out_file, index=False)

    print(f"[INFO] Processed data saved to: {out_file}")
    return out_file

In [23]:
# ===== Cell 9: 메인 실행 코드 =====
# Location of the raw order-level CSV.
input_file = '/content/6_shopping.csv'

# An empty target column name makes preprocess() skip the target split.
processed_path = preprocess(input_file, target_col='')


[INFO] Processed data saved to: processed_6_shopping.csv
