## 1. Setup <a id='1-setup'></a>

In [1]:
# Data manipulation
import pandas as pd
import numpy as np
import json

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Settings
pd.set_option('display.max_columns', None)
plt.style.use('ggplot')

print("‚úì Import th∆∞ vi·ªán th√†nh c√¥ng")

‚úì Import th∆∞ vi·ªán th√†nh c√¥ng


## 2. Load D·ªØ Li·ªáu <a id='2-load-du-lieu'></a>

In [2]:
# ƒê·ªçc d·ªØ li·ªáu
DATA_PATH = '../data/'

portfolio = pd.read_json(DATA_PATH + 'portfolio.json', orient='records', lines=True)
profile = pd.read_json(DATA_PATH + 'profile.json', orient='records', lines=True)
transcript = pd.read_json(DATA_PATH + 'transcript.json', orient='records', lines=True)

print("K√≠ch th∆∞·ªõc d·ªØ li·ªáu ban ƒë·∫ßu:")
print(f"Portfolio: {portfolio.shape}")
print(f"Profile: {profile.shape}")
print(f"Transcript: {transcript.shape}")

K√≠ch th∆∞·ªõc d·ªØ li·ªáu ban ƒë·∫ßu:
Portfolio: (10, 6)
Profile: (17000, 5)
Transcript: (306534, 4)


## 3. X·ª≠ L√Ω Missing Values <a id='3-xu-ly-missing'></a>

**Chi·∫øn l∆∞·ª£c:**
- Profile c√≥ missing values ‚Üí Drop c√°c rows c√≥ NA (v√¨ l√† demographic data quan tr·ªçng)
- Portfolio v√† Transcript kh√¥ng c√≥ missing values

In [3]:
# Ki·ªÉm tra missing values
print("Missing values trong Profile:")
print(profile.isnull().sum())
print(f"\nT·ª∑ l·ªá missing: {profile.isnull().any(axis=1).sum() / len(profile) * 100:.2f}%")

Missing values trong Profile:
gender              2175
age                    0
id                     0
became_member_on       0
income              2175
dtype: int64

T·ª∑ l·ªá missing: 12.79%


In [4]:
# Drop missing values trong profile
profile_clean = profile.dropna()

print(f"Profile tr∆∞·ªõc khi drop: {len(profile):,} rows")
print(f"Profile sau khi drop: {len(profile_clean):,} rows")
print(f"D·ªØ li·ªáu gi·ªØ l·∫°i: {len(profile_clean)/len(profile)*100:.2f}%")

Profile tr∆∞·ªõc khi drop: 17,000 rows
Profile sau khi drop: 14,825 rows
D·ªØ li·ªáu gi·ªØ l·∫°i: 87.21%


## 4. Merge Dataframes <a id='4-merge-dataframes'></a>

**Quy tr√¨nh merge:**
1. Encode Customer IDs trong transcript v√† profile
2. Sort v√† align d·ªØ li·ªáu theo customer_id
3. Duplicate profile rows theo frequency c·ªßa customer trong transcript
4. Merge transcript + profile
5. Add th√¥ng tin portfolio (offers)

### 4.1. Encode Customer IDs

In [5]:
# Ki·ªÉm tra unique customers
unique_transcript = transcript['person'].nunique()
unique_profile = profile_clean['id'].nunique()

print(f"Unique customers trong transcript: {unique_transcript:,}")
print(f"Unique customers trong profile_clean: {unique_profile:,}")
print(f"‚úì Kh·ªõp: {unique_transcript == unique_profile}")

Unique customers trong transcript: 17,000
Unique customers trong profile_clean: 14,825
‚úì Kh·ªõp: False


In [6]:
# Encode customer IDs t·ª´ string sang integer
customer_ids = pd.unique(transcript['person'])
customer_ids_dict = {cid: idx for idx, cid in enumerate(customer_ids)}

# Map v√†o dataframes
transcript['person'] = transcript['person'].map(customer_ids_dict)
profile_clean['id'] = profile_clean['id'].map(customer_ids_dict)

print(f"‚úì ƒê√£ encode {len(customer_ids_dict):,} customer IDs")
print(f"V√≠ d·ª•: {list(customer_ids_dict.items())[:3]}")

‚úì ƒê√£ encode 17,000 customer IDs
V√≠ d·ª•: [('78afa995795e4d85b5d9ceeca43f5fef', 0), ('a03223e636434f42ac4c3df47e8bac43', 1), ('e2127556f4f64592b11af22de27a7932', 2)]


### 4.2. Sort v√† Align Data

In [7]:
# Sort theo customer ID
sorted_transcript = transcript.sort_values('person').reset_index(drop=True)
sorted_profile = profile_clean.sort_values('id').reset_index(drop=True)

print("‚úì ƒê√£ sort d·ªØ li·ªáu theo customer ID")
print(f"Sorted transcript shape: {sorted_transcript.shape}")
print(f"Sorted profile shape: {sorted_profile.shape}")

‚úì ƒê√£ sort d·ªØ li·ªáu theo customer ID
Sorted transcript shape: (306534, 4)
Sorted profile shape: (14825, 5)


### 4.3. Duplicate Profile Rows

In [8]:
# T√≠nh frequency c·ªßa m·ªói customer trong transcript
customer_frequency = sorted_transcript['person'].value_counts(sort=False)

# Map frequency v√†o sorted_profile
sorted_profile['frequency'] = sorted_profile['id'].map(customer_frequency)

print("Customer frequency statistics:")
print(sorted_profile['frequency'].describe())
print(f"\nMissing frequency: {sorted_profile['frequency'].isnull().sum()}")

Customer frequency statistics:
count    14825.000000
mean        18.398786
std          6.925566
min          2.000000
25%         13.000000
50%         18.000000
75%         23.000000
max         51.000000
Name: frequency, dtype: float64

Missing frequency: 0


In [9]:
# Duplicate rows d·ª±a tr√™n frequency
profile_duplicated = sorted_profile.reindex(
    sorted_profile.index.repeat(sorted_profile['frequency'])
).reset_index(drop=True)

# Drop column frequency
profile_duplicated = profile_duplicated.drop(['frequency'], axis=1)

print(f"‚úì Profile sau khi duplicate: {profile_duplicated.shape}")
print(f"‚úì Transcript shape: {sorted_transcript.shape}")
print(f"‚úì Match: {len(profile_duplicated) == len(sorted_transcript)}")

‚úì Profile sau khi duplicate: (272762, 5)
‚úì Transcript shape: (306534, 4)
‚úì Match: False


### 4.4. Merge Transcript + Profile

In [10]:
# Concatenate transcript v√† profile
data = pd.concat([sorted_transcript, profile_duplicated], axis=1)

# Verify alignment
print(f"‚úì Data shape sau merge: {data.shape}")
print(f"‚úì Customer IDs align: {(data['person'] == data['id']).all()}")

# Drop duplicate id column
data = data.drop(['person'], axis=1)

print(f"\nFinal columns: {list(data.columns)}")
data.head()

‚úì Data shape sau merge: (306534, 9)
‚úì Customer IDs align: False

Final columns: ['event', 'value', 'time', 'gender', 'age', 'id', 'became_member_on', 'income']


Unnamed: 0,event,value,time,gender,age,id,became_member_on,income
0,transaction,{'amount': 17.78},144,F,75.0,0.0,20170509.0,100000.0
1,transaction,{'amount': 23.93},378,F,75.0,0.0,20170509.0,100000.0
2,offer completed,{'offer_id': '9b98b8c7a33c4b65b9aebfe6a799e6d9...,132,F,75.0,0.0,20170509.0,100000.0
3,offer viewed,{'offer id': '5a8bc65990b245e5a138643cd4eb9837'},216,F,75.0,0.0,20170509.0,100000.0
4,transaction,{'amount': 19.67},222,F,75.0,0.0,20170509.0,100000.0


### 4.5. Add Portfolio Information

In [11]:
# Encode offer IDs trong portfolio
offer_ids = portfolio['id'].unique()
offer_ids_dict = {oid: idx for idx, oid in enumerate(offer_ids)}
portfolio['id'] = portfolio['id'].map(offer_ids_dict)

print(f"‚úì ƒê√£ encode {len(offer_ids_dict)} offer IDs")
portfolio.head()

‚úì ƒê√£ encode 10 offer IDs


Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,0
1,10,"[web, email, mobile, social]",10,5,bogo,1
2,0,"[web, email, mobile]",0,4,informational,2
3,5,"[web, email, mobile]",5,7,bogo,3
4,5,"[web, email]",20,10,discount,4


In [12]:
# Extract offer_id t·ª´ column 'value' trong transcript
def get_dict_value(x):
    """Extract value t·ª´ dictionary trong column 'value'"""
    if isinstance(x, dict):
        key = list(x.keys())[0]
        return x[key]
    return x

# Apply function
offer_id_series = data['value'].apply(get_dict_value)

print("Sample offer_id_series:")
print(offer_id_series.head(10))

Sample offer_id_series:
0                               17.78
1                               23.93
2    9b98b8c7a33c4b65b9aebfe6a799e6d9
3    5a8bc65990b245e5a138643cd4eb9837
4                               19.67
5                               26.56
6                               29.72
7                               19.89
8    5a8bc65990b245e5a138643cd4eb9837
9    9b98b8c7a33c4b65b9aebfe6a799e6d9
Name: value, dtype: object


In [13]:
# Encode offer_ids
def encode_offer_id(x):
    """Encode offer ID. N·∫øu l√† s·ªë (amount), return 10"""
    if isinstance(x, str):
        return offer_ids_dict.get(x, 10)
    else:
        return 10  # Transaction without offer

data['offer_id'] = offer_id_series.apply(encode_offer_id)

print("Offer ID distribution:")
print(data['offer_id'].value_counts().sort_index())

Offer ID distribution:
offer_id
0      18062
1      18222
2      11761
3      16202
4      13751
5      20139
6      20241
7      14305
8      19131
9      15767
10    138953
Name: count, dtype: int64


In [14]:
# Add portfolio features: reward, difficulty, duration
portfolio_dict = portfolio.set_index('id')[['reward', 'difficulty', 'duration']].to_dict('index')

data['reward'] = data['offer_id'].map(lambda x: portfolio_dict.get(x, {}).get('reward', 0))
data['difficulty'] = data['offer_id'].map(lambda x: portfolio_dict.get(x, {}).get('difficulty', 0))
data['duration'] = data['offer_id'].map(lambda x: portfolio_dict.get(x, {}).get('duration', 0))

print("‚úì ƒê√£ th√™m portfolio features")
data.head()

‚úì ƒê√£ th√™m portfolio features


Unnamed: 0,event,value,time,gender,age,id,became_member_on,income,offer_id,reward,difficulty,duration
0,transaction,{'amount': 17.78},144,F,75.0,0.0,20170509.0,100000.0,10,0,0,0
1,transaction,{'amount': 23.93},378,F,75.0,0.0,20170509.0,100000.0,10,0,0,0
2,offer completed,{'offer_id': '9b98b8c7a33c4b65b9aebfe6a799e6d9...,132,F,75.0,0.0,20170509.0,100000.0,3,5,5,7
3,offer viewed,{'offer id': '5a8bc65990b245e5a138643cd4eb9837'},216,F,75.0,0.0,20170509.0,100000.0,7,0,0,3
4,transaction,{'amount': 19.67},222,F,75.0,0.0,20170509.0,100000.0,10,0,0,0


## 5. Feature Engineering <a id='5-feature-engineering'></a>

### 5.1. Extract Registration Month

In [15]:
# Extract month t·ª´ became_member_on (format: YYYYMMDD)
def extract_month(date_int):
    """Extract month t·ª´ format YYYYMMDD"""
    date_str = str(date_int)
    if len(date_str) == 8:
        return int(date_str[4:6])
    return 0

data['reg_month'] = data['became_member_on'].apply(extract_month)

print("Registration month distribution:")
print(data['reg_month'].value_counts().sort_index())

Registration month distribution:
reg_month
0    306534
Name: count, dtype: int64


### 5.2. Create Target Variable (event_id)

In [16]:
# Encode event types
event_mapping = {
    'offer received': 0,
    'offer viewed': 1,
    'transaction': 2,
    'offer completed': 3,
    'green flag': 4  # Will be created later
}

data['event_id'] = data['event'].map(event_mapping)

print("Event ID distribution:")
print(data['event_id'].value_counts().sort_index())
print(f"\nMissing event_id: {data['event_id'].isnull().sum()}")

Event ID distribution:
event_id
0     76277
1     57725
2    138953
3     33579
Name: count, dtype: int64

Missing event_id: 0


### 5.3. Drop Unnecessary Columns

In [17]:
# Columns c·∫ßn drop
columns_to_drop = ['event', 'value', 'time', 'became_member_on']

data = data.drop(columns=columns_to_drop)

print(f"‚úì ƒê√£ drop {len(columns_to_drop)} columns")
print(f"\nColumns c√≤n l·∫°i: {list(data.columns)}")
print(f"Data shape: {data.shape}")

‚úì ƒê√£ drop 4 columns

Columns c√≤n l·∫°i: ['gender', 'age', 'id', 'income', 'offer_id', 'reward', 'difficulty', 'duration', 'reg_month', 'event_id']
Data shape: (306534, 10)


### 5.4. Data Summary

In [18]:
print("="*80)
print("DATA SUMMARY SAU FEATURE ENGINEERING")
print("="*80)
print(f"\nShape: {data.shape}")
print(f"\nColumns: {list(data.columns)}")
print("\nData types:")
print(data.dtypes)
print("\nMissing values:")
print(data.isnull().sum())
print("\nFirst 5 rows:")
data.head()

DATA SUMMARY SAU FEATURE ENGINEERING

Shape: (306534, 10)

Columns: ['gender', 'age', 'id', 'income', 'offer_id', 'reward', 'difficulty', 'duration', 'reg_month', 'event_id']

Data types:
gender         object
age           float64
id            float64
income        float64
offer_id        int64
reward          int64
difficulty      int64
duration        int64
reg_month       int64
event_id        int64
dtype: object

Missing values:
gender        33772
age           33772
id            33772
income        33772
offer_id          0
reward            0
difficulty        0
duration          0
reg_month         0
event_id          0
dtype: int64

First 5 rows:


Unnamed: 0,gender,age,id,income,offer_id,reward,difficulty,duration,reg_month,event_id
0,F,75.0,0.0,100000.0,10,0,0,0,0,2
1,F,75.0,0.0,100000.0,10,0,0,0,0,2
2,F,75.0,0.0,100000.0,3,5,5,7,0,3
3,F,75.0,0.0,100000.0,7,0,0,3,0,1
4,F,75.0,0.0,100000.0,10,0,0,0,0,2


## 6. Feature Encoding <a id='6-feature-encoding'></a>

**Categorical features c·∫ßn encode:**
- `gender`: F, M, O ‚Üí 0, 1, 2
- `offer_id`: ƒë√£ encode (0-10)

In [19]:
# Encode gender
gender_mapping = {'F': 0, 'M': 1, 'O': 2}
data['gender'] = data['gender'].map(gender_mapping)

print("Gender encoding:")
print(data['gender'].value_counts().sort_index())
print("\n‚úì ƒê√£ encode gender")

Gender encoding:
gender
0.0    113101
1.0    155690
2.0      3971
Name: count, dtype: int64

‚úì ƒê√£ encode gender


In [20]:
# Verify t·∫•t c·∫£ columns ƒë√£ l√† numeric
print("Data types sau encoding:")
print(data.dtypes)
print(f"\n‚úì All numeric: {data.select_dtypes(include=[np.number]).shape[1] == data.shape[1]}")

Data types sau encoding:
gender        float64
age           float64
id            float64
income        float64
offer_id        int64
reward          int64
difficulty      int64
duration        int64
reg_month       int64
event_id        int64
dtype: object

‚úì All numeric: True


## 7. Feature Scaling <a id='7-feature-scaling'></a>

**Chi·∫øn l∆∞·ª£c scaling:**
- **StandardScaler** (z-score normalization): `age`, `income` (c√≥ outliers v√† skewed)
- **MinMaxScaler** (0-1 range): `reward`, `difficulty`, `reg_month` (bounded ranges)
- **No scaling**: `gender`, `offer_id`, `event_id` (categorical ƒë√£ encode)

### 7.1. Separate Features and Target

In [21]:
# Separate features v√† target
X = data.drop(['id', 'event_id'], axis=1)
y = data['event_id']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {list(X.columns)}")

Features shape: (306534, 8)
Target shape: (306534,)

Feature columns: ['gender', 'age', 'income', 'offer_id', 'reward', 'difficulty', 'duration', 'reg_month']


### 7.2. Train/Test Split (BEFORE Scaling)

In [22]:
# Split TR∆Ø·ªöC KHI scaling ƒë·ªÉ tr√°nh data leakage
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.25, 
    random_state=42,
    stratify=y  # Gi·ªØ t·ª∑ l·ªá classes
)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nTarget distribution in train:")
print(y_train.value_counts().sort_index())
print(f"\nTarget distribution in test:")
print(y_test.value_counts().sort_index())

Train set: (229900, 8)
Test set: (76634, 8)

Target distribution in train:


event_id
0     57208
1     43294
2    104214
3     25184
Name: count, dtype: int64

Target distribution in test:
event_id
0    19069
1    14431
2    34739
3     8395
Name: count, dtype: int64


### 7.3. Apply Scaling

In [23]:
# Reset index ƒë·ªÉ tr√°nh l·ªói khi scaling
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

print("‚úì Reset index ho√†n t·∫•t")

‚úì Reset index ho√†n t·∫•t


In [24]:
# StandardScaler cho age v√† income
std_scaler = StandardScaler()

X_train.loc[:, 'age'] = std_scaler.fit_transform(X_train[['age']])
X_test.loc[:, 'age'] = std_scaler.transform(X_test[['age']])

X_train.loc[:, 'income'] = std_scaler.fit_transform(X_train[['income']])
X_test.loc[:, 'income'] = std_scaler.transform(X_test[['income']])

print("‚úì StandardScaler applied to: age, income")

‚úì StandardScaler applied to: age, income


In [25]:
# MinMaxScaler cho reward, difficulty, reg_month
minmax_scaler = MinMaxScaler()

X_train.loc[:, 'reward'] = minmax_scaler.fit_transform(X_train[['reward']])
X_test.loc[:, 'reward'] = minmax_scaler.transform(X_test[['reward']])

X_train.loc[:, 'difficulty'] = minmax_scaler.fit_transform(X_train[['difficulty']])
X_test.loc[:, 'difficulty'] = minmax_scaler.transform(X_test[['difficulty']])

X_train.loc[:, 'reg_month'] = minmax_scaler.fit_transform(X_train[['reg_month']])
X_test.loc[:, 'reg_month'] = minmax_scaler.transform(X_test[['reg_month']])

print("‚úì MinMaxScaler applied to: reward, difficulty, reg_month")

‚úì MinMaxScaler applied to: reward, difficulty, reg_month


In [26]:
# Verify scaling
print("Scaled data statistics (Train):")
X_train.describe()

Scaled data statistics (Train):


Unnamed: 0,gender,age,income,offer_id,reward,difficulty,duration,reg_month
count,204500.0,204500.0,204500.0,229900.0,229900.0,229900.0,229900.0,229900.0
mean,0.600474,8.192957000000001e-17,9.214905e-17,7.023863,0.241565,0.214721,3.61619,0.0
std,0.518352,1.000002,1.000002,3.449524,0.332478,0.270173,3.652348,0.0
min,0.0,-2.041259,-1.616375,0.0,0.0,0.0,0.0,0.0
25%,0.0,-0.7320303,-0.7685317,4.0,0.0,0.0,0.0,0.0
50%,1.0,0.06489135,-0.1090979,9.0,0.0,0.0,4.0,0.0
75%,1.0,0.691044,0.6445407,10.0,0.5,0.5,7.0,0.0
max,2.0,2.683348,2.622842,10.0,1.0,1.0,10.0,0.0


## 8. Final Data Check <a id='8-train-test-split'></a>

In [27]:
print("="*80)
print("FINAL DATA SUMMARY")
print("="*80)
print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

print("\nFeatures:")
print(list(X_train.columns))

print("\nClass distribution (y_train):")
print(y_train.value_counts().sort_index())
print("\nClass distribution (y_test):")
print(y_test.value_counts().sort_index())

print("\n‚úì Data preprocessing ho√†n t·∫•t!")

FINAL DATA SUMMARY

X_train shape: (229900, 8)
X_test shape: (76634, 8)
y_train shape: (229900,)
y_test shape: (76634,)

Features:
['gender', 'age', 'income', 'offer_id', 'reward', 'difficulty', 'duration', 'reg_month']

Class distribution (y_train):
event_id
0     57208
1     43294
2    104214
3     25184
Name: count, dtype: int64

Class distribution (y_test):
event_id
0    19069
1    14431
2    34739
3     8395
Name: count, dtype: int64

‚úì Data preprocessing ho√†n t·∫•t!


## 9. L∆∞u D·ªØ Li·ªáu ƒê√£ X·ª≠ L√Ω <a id='9-luu-du-lieu'></a>

In [28]:
# T·∫°o th∆∞ m·ª•c processed n·∫øu ch∆∞a c√≥
import os
os.makedirs('../data/processed', exist_ok=True)

# L∆∞u c√°c datasets
X_train.to_csv('../data/processed/X_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)

print("‚úì ƒê√£ l∆∞u datasets:")
print("  - X_train.csv")
print("  - X_test.csv")
print("  - y_train.csv")
print("  - y_test.csv")
print("\nV√†o th∆∞ m·ª•c: data/processed/")

‚úì ƒê√£ l∆∞u datasets:
  - X_train.csv
  - X_test.csv
  - y_train.csv
  - y_test.csv

V√†o th∆∞ m·ª•c: data/processed/


In [29]:
# L∆∞u feature names v√† mappings ƒë·ªÉ reference sau n√†y
import pickle

metadata = {
    'feature_names': list(X_train.columns),
    'target_mapping': event_mapping,
    'gender_mapping': gender_mapping,
    'train_shape': X_train.shape,
    'test_shape': X_test.shape,
    'scaled_features': {
        'standard_scaler': ['age', 'income'],
        'minmax_scaler': ['reward', 'difficulty', 'reg_month']
    }
}

with open('../data/processed/metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print("‚úì ƒê√£ l∆∞u metadata.pkl")
print("\nMetadata bao g·ªìm:")
for key in metadata.keys():
    print(f"  - {key}")

‚úì ƒê√£ l∆∞u metadata.pkl

Metadata bao g·ªìm:
  - feature_names
  - target_mapping
  - gender_mapping
  - train_shape
  - test_shape
  - scaled_features


---

## K·∫øt Lu·∫≠n

### ƒê√£ ho√†n th√†nh:

‚úì **Data Cleaning:**
- X·ª≠ l√Ω missing values trong profile (drop NA)
- Lo·∫°i b·ªè c√°c columns kh√¥ng c·∫ßn thi·∫øt

‚úì **Data Integration:**
- Merge 3 dataframes: transcript, profile, portfolio
- Align d·ªØ li·ªáu theo customer_id
- Duplicate profile rows theo frequency

‚úì **Feature Engineering:**
- Extract registration month t·ª´ became_member_on
- Encode offer_id t·ª´ value column
- Add portfolio features (reward, difficulty, duration)
- Create target variable (event_id)

‚úì **Feature Encoding:**
- Encode gender: F/M/O ‚Üí 0/1/2
- Encode offer_id: string ‚Üí 0-10
- Encode event_id: event names ‚Üí 0-4

‚úì **Feature Scaling:**
- StandardScaler: age, income
- MinMaxScaler: reward, difficulty, reg_month

‚úì **Data Splitting:**
- Train/Test split: 75/25
- Stratified sampling ƒë·ªÉ gi·ªØ t·ª∑ l·ªá classes

‚úì **Data Export:**
- L∆∞u X_train, X_test, y_train, y_test
- L∆∞u metadata cho reference

### B∆∞·ªõc ti·∫øp theo:

**Notebook 03 - Model Training:**
- Load processed data
- Handle imbalanced dataset (SMOTE, class weights)
- Train models: DNN, XGBoost, Random Forest
- Hyperparameter tuning
- Save trained models

---

**üìù Ghi ch√∫:** Dataset ƒë√£ s·∫µn s√†ng cho modeling. T·∫•t c·∫£ features ƒë√£ ƒë∆∞·ª£c scaled v√† encoded ƒë√∫ng c√°ch.