# DeepFM

## Constantes

In [3]:
# Single random seed reused below for the user sampling, the per-user card
# shuffling and the validation/test split.
SEED = 42

## Importaciones

In [None]:
!pip install deepctr-torch torch pandas numpy scikit-learn recommenders

In [None]:
from recommenders.datasets import movielens
from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

## Preparación de datos

In [86]:
# CSV with the player/card table. The loading cell reads it with pd.read_csv and
# expects a "tag" column identifying each player plus one column per card.
path = "player_cards.csv"

In [None]:
# Number of distinct players to sample.
# NOTE(review): 12624 + 36 presumably mirrors the kept + discarded user counts
# printed by the train/test split cell -- confirm before changing it.
n_users = 12624 + 36
random_state = SEED

ratings = pd.read_csv(path)

# Draw a reproducible sample of player tags.
sampled_users = (
    ratings["tag"]
    .drop_duplicates()
    .sample(n=n_users, random_state=random_state)
)

# Every column other than the player tag is treated as a card.
card_cols = [column for column in ratings.columns if column != "tag"]

# Keep the sampled players and reshape wide -> long: one row per (player, card).
ratings = (
    ratings.loc[ratings["tag"].isin(sampled_users)]
    .melt(id_vars="tag", value_vars=card_cols, var_name="card", value_name="rating")
    .rename(columns={"tag": "player"})
)

### Datos implícitos

In [88]:
# Keep only rows with rating >= 1 and mark rating == 2 as the positive class
# (label 1); every other kept row gets label 0.
df = (
    ratings.loc[ratings["rating"] >= 1]
    .assign(label=lambda kept: (kept["rating"] == 2).astype(int))
)

In [89]:
# Switch to the userID / itemID column names used by the rest of the notebook.
df = df.rename({"player": "userID", "card": "itemID"}, axis="columns")

### Train y Test

In [90]:
def _split_user_cards(group, min_deck_size=8, n_train_pos=4):
    """Split one user's rows into (train, test) frames, or return None.

    Positives (label == 1) are shuffled; the first ``n_train_pos`` go to train
    and the remainder to test. Negatives (label == 0) are shuffled and cut in
    half (the extra row, if any, goes to test). Users with fewer than
    ``min_deck_size`` positives yield None.
    """
    in_deck = group[group["label"] == 1]
    if len(in_deck) < min_deck_size:
        return None

    not_in_deck = group[group["label"] == 0]

    # Same seed for every user keeps the split reproducible.
    in_deck = in_deck.sample(frac=1, random_state=SEED)
    not_in_deck = not_in_deck.sample(frac=1, random_state=SEED)
    half = len(not_in_deck) // 2

    train_part = pd.concat([in_deck.iloc[:n_train_pos], not_in_deck.iloc[:half]])
    test_part = pd.concat([in_deck.iloc[n_train_pos:], not_in_deck.iloc[half:]])
    return train_part, test_part


train_rows, test_rows = [], []
skipped = 0

for uid, group in df.groupby("userID"):
    user_split = _split_user_cards(group)
    if user_split is None:
        # Not enough positive rows for this user: leave them out of both sets.
        skipped += 1
        continue

    user_train, user_test = user_split
    train_rows.append(user_train)
    test_rows.append(user_test)

df_train = pd.concat(train_rows).reset_index(drop=True)
df_test = pd.concat(test_rows).reset_index(drop=True)

print(f"Train: {df_train.shape}, Test: {df_test.shape}")
print(f"Usuarios descartados por mazo incompleto: {skipped}")
print(f"Usuarios finales: {len(df_train['userID'].unique())}")

Train: (666775, 4), Test: (674480, 4)
Usuarios descartados por mazo incompleto: 36
Usuarios finales: 12624


In [91]:
combined = pd.concat([df_train, df_test], ignore_index=True)

# Contiguous 0-based indices, assigned in sorted order of the raw identifiers
# seen in either split.
sorted_user_ids = sorted(combined["userID"].unique())
sorted_item_ids = sorted(combined["itemID"].unique())
uid2idx = dict(zip(sorted_user_ids, range(len(sorted_user_ids))))
iid2idx = dict(zip(sorted_item_ids, range(len(sorted_item_ids))))

# Add the integer index columns to both splits (train first, then test).
for split_frame in (df_train, df_test):
    split_frame["user_id"] = split_frame["userID"].map(uid2idx).astype(int)
    split_frame["item_id"] = split_frame["itemID"].map(iid2idx).astype(int)

# Vocabulary sizes for the embedding tables.
n_users, n_items = len(uid2idx), len(iid2idx)

In [92]:
# One sparse categorical feature per id column, each with a 16-dimensional
# embedding sized by its vocabulary.
user_feat, item_feat = (
    SparseFeat(column, vocabulary_size=vocab_size, embedding_dim=16)
    for column, vocab_size in (("user_id", n_users), ("item_id", n_items))
)

In [93]:
# The linear and DNN parts of the model receive the same two features.
linear_feature_columns = [user_feat, item_feat]
dnn_feature_columns = list(linear_feature_columns)

feature_names = get_feature_names(
    linear_feature_columns + dnn_feature_columns
)

In [94]:
def build_X(df_, names=None):
    """Build the model input dict from a DataFrame.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame holding one column per model feature.
    names : iterable of str, optional
        Feature names to extract. Defaults to the notebook-level
        ``feature_names`` so existing ``build_X(df)`` calls behave as before.

    Returns
    -------
    dict
        Maps each requested name that exists as a column of ``df_`` to that
        column's values. Names without a matching column are skipped.
    """
    if names is None:
        # Resolved at call time so the notebook-level list is picked up.
        names = feature_names
    return {name: df_[name].values for name in names if name in df_.columns}

In [95]:
# Training inputs (dict of arrays keyed by feature name) and binary targets.
X_train = build_X(df_train)
y_train = df_train["label"].values

In [96]:
# Split the held-out rows 50/50 into validation and test, stratified by label.
# NOTE(review): this rebinds df_test to half of itself, so re-running this cell
# without re-running the split cell shrinks the test set again.
df_val, df_test = train_test_split(
    df_test,
    test_size=0.5,
    random_state=SEED,
    stratify=df_test["label"],
)

X_val = build_X(df_val)
y_val = df_val["label"].values

X_test = build_X(df_test)
y_test = df_test["label"].values

## Entrenamiento del modelo

In [97]:
# DeepFM binary classifier over the user/item features: hidden layers of 128
# and 64 units, dropout 0.2, running on CPU.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task="binary",
    dnn_hidden_units=(128, 64),
    dnn_dropout=0.2,
    device="cpu",
)

In [98]:
# Adam optimizer with binary cross-entropy loss; AUC is the tracked metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["auc"],
)

In [99]:
# Train for 10 epochs, evaluating on the validation split after each epoch.
model.fit(
    X_train,
    y_train,
    batch_size=2048,
    epochs=10,
    verbose=2,
    validation_data=(X_val, y_val),
)

cpu
Train on 666775 samples, validate on 337240 samples, 326 steps per epoch
Epoch 1/10
16s - loss:  0.2592 - auc:  0.7748 - val_auc:  0.6551
Epoch 2/10
16s - loss:  0.2075 - auc:  0.8384 - val_auc:  0.6576
Epoch 3/10
15s - loss:  0.2062 - auc:  0.8393 - val_auc:  0.6569
Epoch 4/10
15s - loss:  0.2045 - auc:  0.8419 - val_auc:  0.6626
Epoch 5/10
14s - loss:  0.2013 - auc:  0.8474 - val_auc:  0.6654
Epoch 6/10
14s - loss:  0.1978 - auc:  0.8532 - val_auc:  0.6706
Epoch 7/10
16s - loss:  0.1954 - auc:  0.8574 - val_auc:  0.6684
Epoch 8/10
15s - loss:  0.1935 - auc:  0.8603 - val_auc:  0.6742
Epoch 9/10
14s - loss:  0.1917 - auc:  0.8634 - val_auc:  0.6695
Epoch 10/10
15s - loss:  0.1896 - auc:  0.8665 - val_auc:  0.6699


<tensorflow.python.keras.callbacks.History at 0x7f19f5440170>

## Evaluación del modelo

In [100]:
def precision_recall_at_k(df_user, k):
    """Compute precision@k and recall@k for one user's scored items.

    Parameters
    ----------
    df_user : pandas.DataFrame
        Rows for a single user. Must contain a ``score`` column (higher scores
        rank first) and a ``label`` column whose sum is the number of relevant
        items.
    k : int
        Ranking cut-off; must be positive.

    Returns
    -------
    tuple
        ``(precision, recall, total_relevant)``. Precision always divides by
        ``k``, even when the user has fewer than ``k`` rows. Recall is
        ``np.nan`` when the user has no relevant items.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        # k == 0 would divide by zero and a negative k would make head() drop
        # rows from the tail instead of taking a top-k.
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Stable sort: items with tied scores keep their input order, so the
    # metric is deterministic.
    ranked = df_user.sort_values('score', ascending=False, kind='mergesort')
    hits_k = int(ranked.head(k)['label'].sum())
    total_rel = int(ranked['label'].sum())

    prec = hits_k / k
    rec = hits_k / total_rel if total_rel > 0 else np.nan

    return prec, rec, total_rel

def macro_precision_recall_at_k(df, k):
    """Macro-average precision@k and recall@k over users.

    Parameters
    ----------
    df : pandas.DataFrame
        Scored rows with ``user_id``, ``score`` and ``label`` columns.
    k : int
        Ranking cut-off passed to ``precision_recall_at_k``.

    Returns
    -------
    tuple of float
        ``(mean_precision, mean_recall)``. Users whose recall is undefined
        (no relevant items) are left out of the recall average. Either value is
        ``np.nan`` when there is nothing to average.
    """
    precs, recs = [], []
    for _, df_u in df.groupby('user_id'):
        p, r, _ = precision_recall_at_k(df_u, k)
        precs.append(p)
        if not np.isnan(r):
            recs.append(r)

    # Treat both lists the same way: an empty list yields NaN explicitly
    # instead of relying on np.mean([]) (which warns).
    mean_prec = float(np.mean(precs)) if precs else np.nan
    mean_rec = float(np.mean(recs)) if recs else np.nan
    return mean_prec, mean_rec

In [102]:
# Score every test row and attach the prediction next to its ids and label.
y_scores = model.predict(X_test, batch_size=4096).reshape(-1)
test_eval = df_test[["user_id", "item_id", "label"]].assign(score=y_scores)

In [None]:
# Sanity check: model inputs, evaluation frame and predictions must have the
# same number of rows. Fail loudly on a mismatch instead of only printing False.
n_inputs, n_rows, n_scores = len(X_test["user_id"]), len(df_test), len(y_scores)
assert n_inputs == n_rows == n_scores, (
    f"Length mismatch: X_test={n_inputs}, df_test={n_rows}, y_scores={n_scores}"
)
print(n_inputs == n_rows == n_scores)

In [103]:
# Cut-offs at which the macro-averaged ranking metrics are reported.
K_list = [2, 4, 6]
metric_columns = ('K', 'Precision@K (macro)', 'Recall@K (macro)')

# One row per cut-off: (K, macro precision, macro recall).
rows = [
    dict(zip(metric_columns, (K, *macro_precision_recall_at_k(test_eval, K))))
    for K in K_list
]

metrics_df = pd.DataFrame(rows)
metrics_df

Unnamed: 0,K,Precision@K (macro),Recall@K (macro)
0,2,0.140328,0.145185
1,4,0.13506,0.278429
2,6,0.132618,0.406843
