In [34]:
import pandas as pd

embeddings = pd.read_pickle('train_mles_embeddings.pickle')
test_embeddings = pd.read_pickle('test_mles_embeddings.pickle')

In [56]:
gowalla_all_ids = pd.read_pickle('gowalla_all_ids.pkl')

In [57]:
import torch
import numpy as np

In [58]:
from torch import nn
num_classes = 186

In [61]:
class DownstreamModel(nn.Module):
  def __init__(self):
    super(DownstreamModel, self).__init__()
    self.fc1 = nn.Linear(128, num_classes = 186)
    self.fc2 = nn.Linear(500, 500)
    self.fc3 = nn.Linear(500, 3883)
    self.fc4 = nn.Linear(186, 186)
    self.dropout = nn.Dropout(p=0.3)
    self.relu = nn.ReLU()

  def forward(self, x):
    x = self.fc1(x)
    return x

In [9]:
user_target = pd.read_csv('user_activity_target.csv')

In [50]:
user_activity = pd.read_csv('gowalla_user_activity.csv')

In [11]:
from collections import Counter
import json

test_probs_common = []
for i in range(user_activity.shape[0]):
    location_list = user_activity.loc[i, 'location_id_bin']
    location_list = json.loads(location_list)
    counter = Counter(location_list)
    test_probs_common.append(counter.most_common(1)[0][0])
    

In [68]:
test_probs_common = torch.tensor(test_probs_common, dtype = torch.int64)

In [None]:
true_pred  = [gowalla_all_ids[1][x] for x in target]

In [None]:
train_target = true_pred[:14869]
test_target = true_pred[14869:]

In [None]:
counter = 0
for item in test_target:
    if item not in train_target:
        counter += 1
print(counter)       

In [18]:
user_scores = np.array([0.3, 0.2, 0.4, 0.1])
print((-1 * user_scores).argsort())
print((-1 * user_scores).argsort().argsort())
print((-1 * user_scores).argsort().argsort()[0])

[2 0 1 3]
[1 2 0 3]
1


In [62]:
def apk(actual, predicted, k=10):

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(y_prob, y, k=10):
    predicted = [np.argsort(p_)[-k:][::-1] for p_ in y_prob]
    actual = [[y_] for y_ in y]
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])



def hits_k(y_prob, y, k=10):
    acc = []
    for p_, y_ in zip(y_prob, y):
        top_k = p_.argsort()[-k:][::-1]
        acc += [1. if y_ in top_k else 0.]
    return sum(acc) / len(acc)

In [63]:
from torch.utils.data import Dataset, DataLoader

In [89]:
class UserDataset():

    def __init__(self):
        embeddings = pd.read_pickle(r'train_mles_embeddings.pickle')
        embeddings_tensor = torch.tensor(embeddings.values, requires_grad = False)
        self.x = embeddings_tensor

        user_target = pd.read_csv('user_activity_target.csv')
        target = list(user_target['target_train'])
        true_pred  = [gowalla_all_ids[1][x] for x in target]
        self.y = true_pred


    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)

In [15]:
class RandomDataset():
    def __init__(self):
        embeddings_tensor = torch.Tensor(size=[20001, 128])
        self.x = embeddings_tensor.random_(-1, 1)
        user_target = pd.read_csv('user_activity_target.csv')
        target = list(user_target['target_train'])
        true_pred  = [gowalla_all_ids[1][x] for x in target]
        self.y = true_pred


    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)
    

In [90]:
class UserTestDataset():
    def __init__(self):
        embeddings = pd.read_pickle(r'test_mles_embeddings.pickle')
        embeddings_tensor = torch.tensor(embeddings.values,requires_grad = False)
        self.x = embeddings_tensor

        user_target = pd.read_csv('user_activity_target.csv')
        target = list(user_target['target_test'])
        true_pred  = [gowalla_all_ids[1][x] for x in target]
        self.y = true_pred


    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)

In [91]:
train_dataset = UserDataset()
print(train_dataset.__len__())

20001


In [92]:
test_dataset = UserTestDataset()
print(test_dataset.__len__())

20001


In [93]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)

In [94]:
test_dataloader = DataLoader(test_dataset, batch_size = 64, shuffle = False)

In [95]:
from sklearn.metrics import top_k_accuracy_score
from sklearn.metrics import ndcg_score

In [96]:
labels = np.array(gowalla_all_ids[1].values())

In [97]:
labels = np.arange(0, num_classes)

In [98]:
def get_metrics_(probs, labels_batch, test_one_hot):
    hits1 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=1, labels = labels)
    hits5 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=5, labels = labels)
    hits10 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=10, labels = labels)
    hits20 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=20, labels = labels)
    hits50= top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=50, labels = labels)
    hits100 = top_k_accuracy_score(labels_batch, probs.cpu().detach().numpy(), k=100, labels = labels)

    map1 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=1)
    map5 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=5)
    map10 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=10)
    map20 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=20)
    map50 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=50)
    map100 = mapk(y_prob=probs.cpu().detach().numpy(), y = labels_batch, k=100)

    ndcg1 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=1)
    ndcg5 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=5)
    ndcg10 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=10)
    ndcg20 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=20)
    ndcg50 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=50)
    ndcg100 = ndcg_score(test_one_hot, probs.cpu().detach().numpy(), k=100)
    return hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100

In [76]:
def random_neq(l, r, s):
    t = np.random.randint(l, r)
    while t in s:
        t = np.random.randint(l, r)
    return t

In [77]:
def hits_k(yscores, y, k=10):
        acc, ndcg = 0., 0.
        for user, user_scores in zip(y, yscores):
            rank_actual = (-1 * user_scores).argsort().argsort()[0]
            if rank_actual < k:
                acc += 1.
                ndcg += 1. / np.log2(rank_actual + 2)

        return acc/float(len(y))

In [78]:
test_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7febc9fe0220>

In [69]:
user_target = pd.read_csv('user_activity_target.csv')
target = list(user_target['target_test'])
true_pred  = [gowalla_all_ids[1][x] for x in target]

test_probs_common = []
for i in range(user_activity.shape[0]):
    location_list = user_activity.loc[i, 'location_id_bin']
    location_list = json.loads(location_list)
    counter = Counter(location_list)
    test_probs_common.append(counter.most_common(1)[0][0])
test_probs_common = torch.tensor(test_probs_common, dtype = torch.int64)
test_probs = torch.zeros(20001, 3883)
test_probs[torch.arange(len(test_probs)), test_probs_common] = 1
test_one_hot = torch.zeros(20001, 3883)
test_one_hot[torch.arange(20001), true_pred] = 1

hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100 = get_metrics_(test_probs, true_pred)
print_metrics(hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100)

hits@1: 0.076696, hits@5: 0.076696, hits@10: 0.076696, hits@20: 0.076846
hits@50: 0.080296, hits@100: 0.085696
map@1: 0.076696, map@5: 0.076825, map@10: 0.076853, map@20: 0.077173
map@50: 0.077848, map@100: 0.077981


In [116]:
common_actions_set = []
for locations in user_activity.location_id_bin:
    locations = json.loads(locations)
    common_actions_set.extend(set(locations))

In [117]:
len(common_actions_set)

84625

In [108]:
counter = Counter(common_actions_set)
counter.most_common(1)[0][0]

80

In [99]:
def test(model, test_dataloader, loss_fn):
    metrics_val = []
    model.eval()

    for test_emb_batch, test_labels_batch in test_dataloader:

        test_probs = model(test_emb_batch)
        
        test_pred = torch.randint(0, num_classes, (len(test_emb_batch),))
        test_one_hot = torch.zeros(len(test_emb_batch), num_classes)
        test_one_hot[torch.arange(len(test_one_hot)), test_labels_batch] = 1
        loss = loss_fn(test_probs, test_one_hot)

        hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100 \
        = get_metrics_(test_probs, test_labels_batch, test_one_hot)
        metrics_val.append([hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100])
        

    mean = torch.Tensor(metrics_val).mean(axis=0)
    test_hits1, test_hits5, test_hits10, test_hits20, test_hits50, test_hits100, test_map1, test_map5, test_map10, test_map20, test_map50, test_map100, \
    test_ndcg1, test_ndcg5, test_ndcg10, test_ndcg20, test_ndcg50, test_ndcg100 = mean
    return test_hits1, test_hits5, test_hits10, test_hits20, test_hits50, test_hits100, test_map1, test_map5, test_map10, test_map20, test_map50, test_map100, \
    test_ndcg1, test_ndcg5, test_ndcg10, test_ndcg20, test_ndcg50, test_ndcg100

In [100]:
def print_metrics(hits1, hits5, hits10, hits20, hits50, hits100, map1, map5, map10, map20, map50, map100, ndcg1, ndcg5, ndcg10, ndcg20, ndcg50, ndcg100):
    print(f'hits@1: {hits1:.6f}, hits@5: {hits5:.6f}, hits@10: {hits10:.6f}, hits@20: {hits20:.6f}')
    print(f'hits@50: {hits50:.6f}, hits@100: {hits100:.6f}')
    print(f'map@1: {map1:.6f}, map@5: {map5:.6f}, map@10: {map10:.6f}, map@20: {map20:.6f}')
    print(f'map@50: {map50:.6f}, map@100: {map100:.6f}')
    print(f'ndcg@1: {ndcg1:.6f}, ndcg@5: {ndcg5:.6f}, ndcg@10: {ndcg10:.6f}, ndcg@20: {ndcg20:.6f}')
    print(f'ndcg@50: {ndcg50:.6f}, ndcg@100: {ndcg100:.6f}')

In [101]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [102]:
loss_fn = nn.CrossEntropyLoss()
model  = DownstreamModel()

model.train()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

for i in range(100):
    for emb_batch, labels_batch in train_dataloader:
        optimizer.zero_grad()
        probs = model(emb_batch)
        pred = torch.argmax(probs, axis = 1)
        one_hot = torch.zeros(len(emb_batch), 186)
        one_hot[torch.arange(len(one_hot)), labels_batch] = 1
        loss = loss_fn(probs, one_hot)
        loss.backward()
        optimizer.step()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [103]:
test_hits1, test_hits5, test_hits10, test_hits20, test_hits50, test_hits100, test_map1, test_map5, test_map10, test_map20, test_map50, test_map100, test_ndcg1, test_ndcg5, test_ndcg10, test_ndcg20, test_ndcg50, test_ndcg100 = test(model, test_dataloader, loss_fn)
 
print_metrics(test_hits1, test_hits5, test_hits10, test_hits20, test_hits50, test_hits100, test_map1, test_map5, test_map10, test_map20, test_map50, test_map100, test_ndcg1, test_ndcg5, test_ndcg10, test_ndcg20, test_ndcg50, test_ndcg100)

hits@1: 0.038882, hits@5: 0.174852, hits@10: 0.313812, hits@20: 0.504027
hits@50: 0.794544, hits@100: 0.974990
map@1: 0.038882, map@5: 0.083155, map@10: 0.101334, map@20: 0.114455
map@50: 0.123708, map@100: 0.126403
ndcg@1: 0.038882, ndcg@5: 0.105640, ndcg@10: 0.150209, ndcg@20: 0.198200
ndcg@50: 0.255846, ndcg@100: 0.285428


In [None]:
!python3 -m pip3 install catboost

In [77]:
from sklearn.ensemble import GradientBoostingClassifier
user_target = pd.read_csv('user_activity_target.csv')
target_train = list(user_target['target_train'])
target_test = list(user_target['target_test'])
true_pred_train  = [gowalla_all_ids[1][x] for x in target_train]
true_pred_test = [gowalla_all_ids[1][x] for x in target_test]

In [78]:
embeddings = pd.read_pickle('train_nsp_embeddings.pickle')
test_embeddings = pd.read_pickle('test_nsp_embeddings.pickle')

X_train, X_test = embeddings[:500], test_embeddings[:500]
y_train, y_test = true_pred_train[:500], true_pred_test[:500]

In [11]:
from catboost import CatBoostClassifier

In [79]:
clf = GradientBoostingClassifier(n_estimators = 50, learning_rate=0.01, max_depth=5, random_state=0, verbose = 1)
clf.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           2.7816            1.87m
         2           2.6549            2.53m
         3           2.5576            2.69m
         4           2.4767            2.75m
         5           2.4066            2.74m
         6           2.3455            2.71m
         7           2.2902            2.68m
         8           2.2400            2.63m
         9           2.1932            2.58m
        10           2.1506            2.53m
        20           1.8208            1.92m
        30           1.5910            1.28m
        40           1.4122           38.48s
        50           1.2664            0.00s


In [80]:
clf.score(X_test, y_test)

0.198

In [73]:
from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=100)
neigh.fit(embeddings, true_pred)

In [74]:
neigh.score(X_test, y_test)

0.008333333333333333

In [76]:
neigh.classes_.shape

(18137,)