In [18]:
import numpy as np

embeddings = np.load("./res/embeddings/embeddings.npy")
labels = np.load("./res/embeddings/labels.npy")

n_samples, n_dims = embeddings.shape
print("num_samples:", n_samples)
print("n_dims:     ", n_dims)

num_samples: 3955
n_dims:      128


In [19]:
from sklearn.model_selection import train_test_split

player_ids = np.unique(labels)
player_ids_train, player_ids_excluded = train_test_split(player_ids, test_size=0.1, random_state=410)

print("num_players:        ", len(player_ids))
print("num_players [train]:", len(player_ids_train))
print("num_players [excl.]:", len(player_ids_excluded))

num_players:         84
num_players [train]: 75
num_players [excl.]: 9


In [24]:
embeddings_train = embeddings[np.isin(labels, player_ids_train)]
labels_train = labels[np.isin(labels, player_ids_train)]
print("embeddings [train]:", embeddings_train.shape)

embeddings_excluded = embeddings[np.isin(labels, player_ids_excluded)]
labels_excluded = labels[np.isin(labels, player_ids_excluded)]
print("embeddings [excl.]:", embeddings_excluded.shape)

embeddings [train]: (3538, 128)
embeddings [excl.]: (417, 128)


In [56]:
def generate_pair_samples(source_embeddings: np.ndarray, source_player_ids: np.ndarray, n: int, random_seed: int = 410):
    x = []
    y = []
    rng = np.random.default_rng(seed=410)
    for _ in range(n):
        if rng.random() < 0.5:
            # positive class
            player_id = rng.choice(source_player_ids)
            left, right = rng.choice(source_embeddings[source_player_ids == player_id], size=2, replace=False)
            y.append(1)
        else:
            # negative class
            player_id_left, player_id_right = rng.choice(source_player_ids, size=2, replace=False)
            left = rng.choice(source_embeddings[source_player_ids == player_id_left])
            right = rng.choice(source_embeddings[source_player_ids == player_id_right])
            y.append(0)
        x.append(np.concatenate((left, right), axis=0))
    x = np.array(x)
    y = np.array(y)
    return x, y

In [61]:
# sample pairwise data to train classifier on!
x_train, y_train = generate_pair_samples(embeddings_train, labels_train, 256_000)
print(x_train.shape, y_train.shape)

x_test, y_test = generate_pair_samples(embeddings_excluded, labels_excluded, 64_000)
print(x_test.shape, y_test.shape)

(256000, 256) (256000,)
(64000, 256) (64000,)


In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

clf = RandomForestClassifier(n_estimators=10)

clf.fit(x_train, y_train)

print(classification_report(y_train, clf.predict(x_train)))
print(classification_report(y_test, clf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99    127774
           1       0.99      0.98      0.99    128226

    accuracy                           0.99    256000
   macro avg       0.99      0.99      0.99    256000
weighted avg       0.99      0.99      0.99    256000

              precision    recall  f1-score   support

           0       0.50      0.50      0.50     31941
           1       0.50      0.50      0.50     32059

    accuracy                           0.50     64000
   macro avg       0.50      0.50      0.50     64000
weighted avg       0.50      0.50      0.50     64000

