# MovieLens AutoInt – Training Notebook

이 노트북은 아래 구조를 기준으로 학습하고 **아티팩트**를 저장합니다.

```
autoint_project/
├─ autoint.py                # 모델 정의 (이미 생성됨)
├─ show_st.py                # Streamlit 앱 (이미 생성됨)
├─ data/
│  ├─ field_dims.npy         # ← 이 노트북이 저장
│  ├─ label_encoders.pkl     # ← 이 노트북이 저장
│  └─ ml-1m/
│     ├─ movies_prepro.csv   # ← 미리 위치
│     ├─ ratings_prepro.csv  # ← 미리 위치
│     └─ users_prepro.csv    # ← 미리 위치
└─ model/
   └─ autoInt_model.weights.h5  # ← 이 노트북이 저장
```
학습이 끝나면 `show_st.py`를 통해 Streamlit에서 바로 로드해 결과를 확인할 수 있습니다.

In [2]:
import os, sys, json, math, joblib
from pathlib import Path
import numpy as np
import pandas as pd

# Path setup: use this file's folder when run as a script, otherwise the
# current working directory (the usual case inside a notebook kernel).
if '__file__' in globals():
    BASE_DIR = Path(__file__).resolve().parent
else:
    BASE_DIR = Path.cwd()
PROJ_DIR = BASE_DIR  # assumes the notebook lives inside the autoint_project folder
DATA_DIR = PROJ_DIR / 'data'
ML_DIR = DATA_DIR / 'ml-1m'
MODEL_DIR = PROJ_DIR / 'model'

# Make sure every folder the notebook reads from or writes to exists.
for _folder in (DATA_DIR, ML_DIR, MODEL_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

for _label, _folder in (('PROJECT:', PROJ_DIR), ('DATA:', DATA_DIR), ('MODEL:', MODEL_DIR)):
    print(_label, _folder)

# Put the project folder on sys.path so that autoint.py can be imported.
_proj_path = str(PROJ_DIR)
if _proj_path not in sys.path:
    sys.path.append(_proj_path)

from autoint import AutoIntModel  # only the model class is needed here (predict_model is used by the serving side)


PROJECT: c:\Users\Administrator\Desktop\recommendation_system
DATA: c:\Users\Administrator\Desktop\recommendation_system\data
MODEL: c:\Users\Administrator\Desktop\recommendation_system\model


In [3]:
# Load the three preprocessed MovieLens tables from the ml-1m folder.
movies = pd.read_csv(ML_DIR / 'movies_prepro.csv')
ratings = pd.read_csv(ML_DIR / 'ratings_prepro.csv')
users = pd.read_csv(ML_DIR / 'users_prepro.csv')

# Preview the first two rows of each table.
for _frame in (movies, ratings, users):
    display(_frame.head(2))


Unnamed: 0,movie_id,title,movie_year,movie_decade,genre1,genre2,genre3,genre4,genre5,genre6
0,1,Toy Story,1995,1990s,Animation,Children's,Comedy,,,
1,2,Jumanji,1995,1990s,Adventure,Children's,Fantasy,,,


Unnamed: 0,user_id,movie_id,rating,timestamp,rating_year,rating_month,rating_decade
0,1,1193,5,2001-01-01,2001,1,2000s
1,1,661,3,2001-01-01,2001,1,2000s


Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072


In [4]:
# Feature columns fed to the model (keep the same order as show_st.py).
FEATURE_COLS = ['user_id', 'movie_id', 'movie_decade', 'movie_year', 'rating_year',
                'rating_month', 'rating_decade', 'genre1', 'genre2', 'genre3',
                'gender', 'age', 'occupation', 'zip']


def _decade_label(years: pd.Series) -> pd.Series:
    """Map years to decade labels such as '1990s'; missing years stay missing."""
    numeric = pd.to_numeric(years, errors='coerce')
    # A left merge turns the year column into float as soon as one row is
    # unmatched; going through nullable Int64 keeps the label '1990s'
    # instead of '1990.0s'.
    decades = (numeric // 10 * 10).astype('Int64')
    labels = decades.astype(str) + 's'
    # Leave missing years as NaN rather than inventing a 'nans' category.
    return labels.where(numeric.notna())


def ensure_features(movies_df: pd.DataFrame, ratings_df: pd.DataFrame, users_df: pd.DataFrame) -> pd.DataFrame:
    """Join ratings with movie and user metadata and guarantee all feature columns.

    Ratings are left-joined to movies on ``movie_id`` and to users on
    ``user_id``. Any column of ``FEATURE_COLS`` that is still absent after the
    joins is created with a minimal fallback rule.

    Args:
        movies_df: Movie table; must contain ``movie_id``.
        ratings_df: Rating table; must contain ``user_id`` and ``movie_id``.
        users_df: User table; must contain ``user_id``.

    Returns:
        A frame with the columns of ``FEATURE_COLS`` in that order, followed
        by ``rating`` when the merged data has it.
    """
    df = ratings_df.merge(movies_df, on='movie_id', how='left').merge(users_df, on='user_id', how='left')

    # movie_decade: derive from movie_year when possible, else a constant.
    if 'movie_decade' not in df.columns:
        if 'movie_year' in df.columns:
            df['movie_decade'] = _decade_label(df['movie_year'])
        else:
            df['movie_decade'] = '1990s'

    # rating_year / rating_month fall back to constants; rating_decade is
    # derived from rating_year, so it must be handled after rating_year.
    for col, fallback in (('rating_year', 2000), ('rating_month', 1)):
        if col not in df.columns:
            df[col] = fallback
    if 'rating_decade' not in df.columns:
        df['rating_decade'] = _decade_label(df['rating_year'])

    # Missing genre columns are filled with 'no'.
    for col in ('genre1', 'genre2', 'genre3'):
        if col not in df.columns:
            df[col] = 'no'

    # Missing user metadata columns are filled with 'unknown'.
    for col in ('gender', 'age', 'occupation', 'zip'):
        if col not in df.columns:
            df[col] = 'unknown'

    # Any feature column still absent (e.g. movie_year) is filled with 'no'.
    for col in FEATURE_COLS:
        if col not in df.columns:
            df[col] = 'no'

    keep = FEATURE_COLS + (['rating'] if 'rating' in df.columns else [])
    return df[keep]

# Build the merged training frame, print its column order, and render the
# first three rows as the cell output.
train_df = ensure_features(movies, ratings, users)
print('train_df cols:', train_df.columns.tolist())
train_df.head(3)


train_df cols: ['user_id', 'movie_id', 'movie_decade', 'movie_year', 'rating_year', 'rating_month', 'rating_decade', 'genre1', 'genre2', 'genre3', 'gender', 'age', 'occupation', 'zip', 'rating']


Unnamed: 0,user_id,movie_id,movie_decade,movie_year,rating_year,rating_month,rating_decade,genre1,genre2,genre3,gender,age,occupation,zip,rating
0,1,1193,1970s,1975,2001,1,2000s,Drama,,,F,1,10,48067,5
1,1,661,1990s,1996,2001,1,2000s,Animation,Children's,Musical,F,1,10,48067,3
2,1,914,1960s,1964,2001,1,2000s,Musical,Romance,,F,1,10,48067,3


In [5]:
# Encode every categorical feature as an integer with a LabelEncoder.
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per feature column. Missing values are replaced by the
# string 'no' and everything is cast to str so each column has a single,
# consistent label type.
label_encoders = {
    feature: LabelEncoder().fit(train_df[feature].fillna('no').astype(str))
    for feature in FEATURE_COLS
}

# Apply the fitted encoders to a feature frame.
def transform_features(df):
    """Return a copy of ``df`` with every encoded column replaced by integer codes.

    Each column that has an entry in the module-level ``label_encoders`` is
    cleaned the same way as at fit time (missing -> 'no', cast to str) and
    then passed through that encoder's ``transform``. Other columns are
    left untouched, and the input frame is not modified.
    """
    result = df.copy()
    for name in label_encoders:
        cleaned = result[name].fillna('no').astype(str)
        result[name] = label_encoders[name].transform(cleaned)
    return result

# Encode the training frame and render its first three rows as the cell output.
encoded = transform_features(train_df)
encoded.head(3)


Unnamed: 0,user_id,movie_id,movie_decade,movie_year,rating_year,rating_month,rating_decade,genre1,genre2,genre3,gender,age,occupation,zip,rating
0,0,189,6,55,1,0,0,7,17,15,0,0,2,1588,5
1,0,3374,8,76,1,0,0,2,2,8,0,0,2,1588,3
2,0,3615,5,44,1,0,0,11,12,15,0,0,2,1588,3


In [6]:
# Training target: binary label from the rating (>= 4 -> 1, otherwise 0).
# The feature matrix is the same either way, so build it once.
X = encoded[FEATURE_COLS].astype('int32').values
if 'rating' in encoded.columns:
    y = (encoded['rating'] >= 4).astype('int32').values
else:
    # No 'rating' column: label everything 1 as a demo-only placeholder.
    y = np.ones(len(X), dtype='int32')

print('X shape:', X.shape)
print('y mean:', y.mean() if len(y) else None)


X shape: (1000209, 14)
y mean: 0.5751607913945985


In [7]:
# Compute field_dims (number of distinct classes per feature, in FEATURE_COLS
# order) and persist it together with the fitted encoders.
field_dims_path = DATA_DIR / 'field_dims.npy'
encoders_path = DATA_DIR / 'label_encoders.pkl'

class_counts = [len(label_encoders[name].classes_) for name in FEATURE_COLS]
field_dims = np.array(class_counts, dtype='int32')

np.save(field_dims_path, field_dims)
joblib.dump(label_encoders, encoders_path)
print('saved:', field_dims_path.as_posix(), encoders_path.as_posix())


saved: c:/Users/Administrator/Desktop/recommendation_system/data/field_dims.npy c:/Users/Administrator/Desktop/recommendation_system/data/label_encoders.pkl


In [8]:
# Train/validation split and model training.
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Hold out 10% for validation; stratify on y so both splits keep the same
# positive/negative ratio, and fix random_state so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# AutoIntModel is defined in autoint.py (not shown here); the keyword names
# suggest embedding size, attention depth/heads, residual attention and the
# DNN tower shape/dropout -- NOTE(review): confirm against autoint.py.
model = AutoIntModel(field_dims, embed_dim=16, att_layer_num=3, att_head_num=2, att_res=True,
                     dnn_hidden_units=[64,32], dnn_dropout=0.4)

# Build the model by passing a single sample through it once.
_ = model(tf.convert_to_tensor(X_train[:1]))

# Binary cross-entropy objective, tracking AUC and accuracy under the short
# names 'auc' and 'acc' (these become the keys in history.history).
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.AUC(name='auc'), tf.keras.metrics.BinaryAccuracy(name='acc')])

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=2048,
    verbose=1
)
# Print the last-epoch value of every tracked metric as plain floats.
print({k: float(v[-1]) for k, v in history.history.items()})


Epoch 1/5
[1m440/440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 209ms/step - acc: 0.6995 - auc: 0.7541 - loss: 0.5785 - val_acc: 0.7275 - val_auc: 0.7926 - val_loss: 0.5429
Epoch 2/5
[1m440/440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 217ms/step - acc: 0.7296 - auc: 0.7927 - loss: 0.5412 - val_acc: 0.7305 - val_auc: 0.7984 - val_loss: 0.5337
Epoch 3/5
[1m440/440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 214ms/step - acc: 0.7354 - auc: 0.8020 - loss: 0.5303 - val_acc: 0.7344 - val_auc: 0.8025 - val_loss: 0.5290
Epoch 4/5
[1m440/440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 214ms/step - acc: 0.7396 - auc: 0.8089 - loss: 0.5222 - val_acc: 0.7321 - val_auc: 0.8042 - val_loss: 0.5304
Epoch 5/5
[1m440/440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 216ms/step - acc: 0.7435 - auc: 0.8146 - loss: 0.5155 - val_acc: 0.7344 - val_auc: 0.8050 - val_loss: 0.5270
{'acc': 0.7435435652732849, 'auc': 0.8145784735679626, 'loss': 0.5154

In [10]:
# Save the trained weights (show_st.py loads them from this path).
weights_path = MODEL_DIR / 'autoInt_model.weights.h5'
model.save_weights(weights_path)
print('Saved:', weights_path.as_posix())


Saved: c:/Users/Administrator/Desktop/recommendation_system/model/autoInt_model.weights.h5


In [11]:
# Quick inference smoke test: run the first 10 validation rows through the
# model, flatten the output to 1-D, and show the first five predictions.
# NOTE(review): the model is called without an explicit training flag --
# confirm that dropout is disabled on this path in autoint.py.
pred = model(X_val[:10]).numpy().reshape(-1)
list(pred[:5])

[np.float32(0.31823024),
 np.float32(0.38736546),
 np.float32(0.6259775),
 np.float32(0.797723),
 np.float32(0.51909596)]