In [92]:
import pandas as pd

# Sample order records, listed column by column (one list entry per order).
data = dict(
    order_id=[1, 2, 3, 4, 5],
    user_id=[101, 102, 101, 103, 102],
    order_date=[
        '2023-01-15 10:30',
        '2023-01-16 14:00',
        '2023-02-20 11:00',
        '2023-03-10 20:45',
        '2023-03-12 13:20',
    ],
    price=[100, 250, 80, 120, 300],
    quantity=[2, 1, 3, 5, 2],
)

# Build the frame and parse the timestamp strings into real datetimes in one chain.
# Assigning to an existing column keeps its position in the column order.
df = pd.DataFrame(data).assign(
    order_date=lambda frame: pd.to_datetime(frame['order_date'])
)

In [93]:
# Derived feature: total purchase amount per order (unit price x quantity).
df['total_price'] = df['price'].mul(df['quantity'])

print("--- Derived from numeric operations ---")
print(df.loc[:, ['price', 'quantity', 'total_price']])

--- Derived from numeric operations ---
   price  quantity  total_price
0    100         2          200
1    250         1          250
2     80         3          240
3    120         5          600
4    300         2          600


In [94]:
# Derived features: calendar components pulled out of the order timestamp.
# dayofweek is numbered Monday=0 ... Sunday=6.
for part in ('year', 'month', 'dayofweek', 'hour'):
    df[f'order_{part}'] = getattr(df['order_date'].dt, part)

print("\n--- Derived from datetime ---")
print(df.loc[:, ['order_date', 'order_year', 'order_month', 'order_dayofweek', 'order_hour']])


--- Derived from datetime ---
           order_date  order_year  order_month  order_dayofweek  order_hour
0 2023-01-15 10:30:00        2023            1                6          10
1 2023-01-16 14:00:00        2023            1                0          14
2 2023-02-20 11:00:00        2023            2                0          11
3 2023-03-10 20:45:00        2023            3                4          20
4 2023-03-12 13:20:00        2023            3                6          13


In [95]:
# Group the orders by user once and reuse the grouping for every statistic.
orders_by_user = df.groupby('user_id')
spend_by_user = orders_by_user['total_price']
order_ids_by_user = orders_by_user['order_id']

# Derived features: each user's total and mean purchase amount.
# transform() broadcasts the per-group result back onto every row of that group.
df['user_total_spent'] = spend_by_user.transform('sum')
df['user_avg_spent'] = spend_by_user.transform('mean')

# Derived feature: number of orders placed by each user.
df['user_order_count'] = order_ids_by_user.transform('count')

print("\n--- Derived from group statistics ---")
print(df.loc[:, ['user_id', 'total_price', 'user_total_spent', 'user_avg_spent', 'user_order_count']])


--- Derived from group statistics ---
   user_id  total_price  user_total_spent  user_avg_spent  user_order_count
0      101          200               440             220                 2
1      102          250               850             425                 2
2      101          240               440             220                 2
3      103          600               600             600                 1
4      102          600               850             425                 2


In [90]:
import pandas as pd

# Toy product table; 'target' is an example target variable.
df = pd.DataFrame(dict(
    item=['TV', 'Phone', 'Laptop', 'TV', 'Phone'],
    grade=['A', 'C', 'B', 'B', 'A'],
    status=['new', 'used', 'new', 'refurbished', 'used'],
    target=[100, 200, 150, 120, 180],
))

# Target (mean) encoding: compute the mean target per status value, then
# look up each row's status in that table.
mean_map = df['target'].groupby(df['status']).mean()
df['status_target_enc'] = df['status'].map(mean_map)

print(df.loc[:, ['status', 'status_target_enc']])

        status  status_target_enc
0          new                125
1         used                190
2          new                125
3  refurbished                120
4         used                190


In [91]:
# Encode each status value as an integer label.
# NOTE(review): LabelEncoder is not imported in any cell visible here; presumably an
# earlier cell ran `from sklearn.preprocessing import LabelEncoder` - confirm.
le = LabelEncoder()
df['status_int'] = le.fit_transform(df['status'])

# Binary encoding: emit one 0/1 column per bit of the integer label
# (status_bin_0 is the least-significant bit).
# bit_length() is a method of Python's built-in int; Series.max() may hand back a
# NumPy scalar, which is not guaranteed to provide it, so coerce to int first.
max_bits = int(df['status_int'].max()).bit_length()
for i in range(max_bits):
    # Shift bit i down to position 0 and mask everything else off.
    df[f'status_bin_{i}'] = df['status_int'].apply(lambda x: (x >> i) & 1)

print(df[['status', 'status_int'] + [f'status_bin_{i}' for i in range(max_bits)]])

        status  status_int  status_bin_0  status_bin_1
0          new           0             0             0
1         used           2             0             1
2          new           0             0             0
3  refurbished           1             1             0
4         used           2             0             1
