In [76]:
import pandas as pd
import requests
import urllib
import json
from urllib.parse import urlencode
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import datetime
from dateutil.parser import parse
import pickle
import torch
import numpy as np
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Subset
import random

### Loading Data

In [11]:
class Dataset():
  """Check-in table plus friendship edge list, downloaded from Yandex.Disk.

  Attributes set by ``__init__``:
    data: raw check-ins read from a tab-separated file without a header row.
    friends: raw friendship edges read from a tab-separated file without a header row.

  Attributes set by ``preprocess``:
    sectors: one row per spatial sector with the list of (latitude, longitude)
      pairs of every check-in that fell into it.
    users: one row per kept user with lists of sector ids and Unix timestamps.
    user_id: dict mapping original user ids to new consecutive ids.
  """

  def __init__(self, link, filename, graph_link, graph_filename):
    """Download both files; each is saved locally and parsed into a DataFrame."""
    self.data = self.dataset_txt_file_download(link, filename)
    self.friends = self.dataset_txt_file_download(graph_link, graph_filename)

  def dataset_txt_file_download(self, link, file_name):
    """Download a public Yandex.Disk file to ``file_name`` and load it.

    Args:
      link: public share link, passed to the API as ``public_key``.
      file_name: local path the downloaded bytes are written to.

    Returns:
      DataFrame parsed from the tab-separated file (no header row).

    Raises:
      requests.HTTPError: if either HTTP request returns an error status.
    """
    base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?'
    final_url = base_url + urlencode(dict(public_key=link))
    response = requests.get(final_url)
    # Fail loudly here instead of with a KeyError on 'href' further down.
    response.raise_for_status()
    download_url = response.json()['href']
    download_response = requests.get(download_url)
    # Never write an HTTP error page to disk as if it were the dataset.
    download_response.raise_for_status()

    with open(file_name, 'wb') as f:
      f.write(download_response.content)

    return pd.read_csv(file_name, sep="\t", header=None)

  def preprocess(self, num_quantiles, num_users):
    """Bin coordinates into sectors, build per-user histories and re-index users.

    Args:
      num_quantiles: number of quantile bins used for latitude and for longitude.
      num_users: largest re-indexed user id to keep. The comparison is inclusive,
        so ids ``0..num_users`` (up to ``num_users + 1`` users) survive.
    """
    self.data.columns = ['user', 'check-in time', 'latitude', 'longitude', 'location_id']
    # labels=False returns the integer bin index directly, so this works for any
    # num_quantiles (the previous fixed list of 15 labels only worked for q=15).
    self.data['longitude_bin'] = pd.qcut(self.data.longitude, q=num_quantiles, labels=False).astype('int64')
    self.data['latitude_bin'] = pd.qcut(self.data.latitude, q=num_quantiles, labels=False).astype('int64')
    self.data['location_id_bin'] = list(zip(self.data.latitude_bin, self.data.longitude_bin))
    self.data['place'] = list(zip(self.data.latitude, self.data.longitude))
    self.data['location_id_bin'] = self.data['location_id_bin'].astype('str')

    self.friends.columns = ['1st friend', '2nd friend']

    # Encode each (latitude_bin, longitude_bin) string as one integer sector id.
    le = LabelEncoder()
    self.data.location_id_bin = le.fit_transform(self.data.location_id_bin.values)
    self.sectors = self.data.groupby('location_id_bin').agg({'place': list}).reset_index()

    # Sort on the raw time column first, then convert it to integer Unix seconds.
    self.data = self.data.sort_values('check-in time')
    self.data['check-in time'] = pd.to_datetime(self.data['check-in time'])
    self.data["check-in time"] = self.data['check-in time'].map(lambda x: int(x.timestamp()))

    self.users = self.data.groupby('user').agg({'location_id_bin': list, 'check-in time': list}).reset_index()
    # Keep only users with more than three check-ins.
    self.users = self.users[self.users.location_id_bin.map(len) > 3]

    # Drop edges that touch a user removed above.
    kept_users = self.users.user.unique()
    both_known = self.friends['1st friend'].isin(kept_users) & self.friends['2nd friend'].isin(kept_users)
    self.friends = self.friends[both_known].reset_index(drop=True)

    self.reset_id()
    self.reset_edges()
    self.reset_users()

    self.users = self.users[self.users['user'] <= num_users]

    self.friends = self.friends[(self.friends['1st friend'] <= num_users) & (self.friends['2nd friend'] <= num_users)]

  def reset_id(self):
    """Build ``user_id`` (original id -> consecutive id) and renumber the users index."""
    self.user_id = {user: new_id for new_id, user in enumerate(self.users.user)}
    # reset_index() keeps the previous index as an 'index' column, as before.
    self.users = self.users.reset_index()

  def reset_edges(self):
    """Replace both endpoint columns of ``friends`` with the consecutive ids."""
    # Vectorised lookup; does not depend on the frame having a 0..n-1 index.
    self.friends['1st friend'] = self.friends['1st friend'].map(self.user_id)
    self.friends['2nd friend'] = self.friends['2nd friend'].map(self.user_id)

  def reset_users(self):
    """Replace the ``user`` column of ``users`` with the consecutive ids."""
    self.users['user'] = self.users['user'].map(self.user_id)

### Converting to JSON

In [12]:
def gather_user_history(act_list, time_list, n_users, n_bins, n_context):
    """Build, per user and time bin, a window of ``n_context`` action ids.

    For bin ``t`` the window is made of the positions whose bin equals ``t``,
    back-filled with positions from earlier bins while fewer than ``n_context``
    are available. Windows shorter than ``n_context`` are left-padded with 0.

    Args:
        act_list: per-user lists of action ids.
        time_list: per-user lists of bin indices, aligned with ``act_list``.
        n_users: number of users to process (first ``n_users`` entries).
        n_bins: number of time bins.
        n_context: window length.

    Returns:
        int32 array of shape ``(n_users, n_bins, n_context)``.
    """
    history = np.zeros((n_users, n_bins, n_context), dtype=np.int32)

    def positions_in_bin(bins, wanted):
        # Positions of every entry that falls into the requested bin.
        return [pos for pos, value in enumerate(bins) if value == wanted]

    for user_idx in range(n_users):
        actions = act_list[user_idx]
        bins = time_list[user_idx]

        for bin_idx in range(n_bins):
            positions = positions_in_bin(bins, bin_idx)
            earlier = bin_idx - 1

            if earlier >= 0:
                while len(positions) < n_context:
                    positions = positions_in_bin(bins, earlier) + positions
                    earlier -= 1
                    # NOTE(review): this stops once the next candidate is bin 0, so
                    # bin 0 is only used as back-fill when bin_idx == 1 - confirm
                    # that this is intended.
                    if earlier < 1:
                        break

            if not positions:
                # Nothing found: fall back to position 0 repeated n_context times.
                positions = [0] * n_context

            window = positions[-n_context:]
            recent = actions[window[0]: window[-1] + 1]

            missing = n_context - len(recent)
            if missing > 0:
                recent = [0] * missing + recent

            history[user_idx, bin_idx, :] = recent

    return history

In [16]:
def load_dataset_timestamp(users, n_context, seq_len, num_bins=12):
    """Turn the per-user frame into example dicts plus a history tensor.

    Args:
        users: DataFrame with columns ``'user'``, ``'location_id_bin'`` and
            ``'check-in time'``; the last two hold equally long per-user lists.
        n_context: window length forwarded to ``gather_user_history``.
        seq_len: each user's sequences are truncated to their first ``seq_len`` entries.
        num_bins: number of edges passed to ``np.linspace`` when discretising
            timestamps, and the bin count forwarded to ``gather_user_history``.
            Because ``num_bins`` edges delimit ``num_bins - 1`` intervals, the
            produced bin indices fall in ``0..num_bins - 2``.

    Returns:
        ``(all_examples, user_history, num_bins)`` where ``all_examples`` is a list
        of dicts with keys ``'items'``, ``'timestamps'`` (bin indices), ``'seq_len'``
        and ``'user'``.
    """
    act_list = []
    time_list = []
    user_list = []
    max_timestamp = -1.0
    min_timestamp = float('inf')

    # Iterate the columns positionally: this does not require the frame to carry
    # a 0..n-1 index and avoids one .loc lookup per check-in.
    for user, items, stamps in zip(users['user'], users['location_id_bin'], users['check-in time']):
        item_seq = [int(item) for item in items]
        time_seq = [int(stamps[j]) for j in range(len(item_seq))]

        # The global range is taken over ALL check-ins, not just the truncated ones.
        if time_seq:
            min_timestamp = min(min_timestamp, min(time_seq))
            max_timestamp = max(max_timestamp, max(time_seq))

        act_list.append(item_seq[:seq_len])
        time_list.append(time_seq[:seq_len])
        user_list.append(user)

    print('act list', len(act_list))
    print('time list', len(time_list))
    print('user list', len(user_list))

    print(max_timestamp, min_timestamp)

    if min_timestamp == float('inf'):
        # No check-ins at all: use a finite range so the bin edges stay valid.
        min_timestamp, max_timestamp = 0, 0

    # The right edge is max + 1 so the latest timestamp falls inside the last interval.
    times_bins = np.linspace(min_timestamp, max_timestamp + 1, num=num_bins, dtype=np.int32)
    new_time_list = [
        (np.digitize(np.asarray(a_time_list), times_bins) - 1).tolist()
        for a_time_list in time_list
    ]
    print(len(time_list), len(new_time_list))

    n_users = users.shape[0]
    user_history = gather_user_history(act_list, new_time_list, n_users, num_bins, n_context)

    all_examples = [
        {
            'items': act_seq,
            'timestamps': time_seq,
            'seq_len': len(act_seq),
            'user': user,
        }
        for act_seq, time_seq, user in zip(act_list, new_time_list, user_list)
    ]

    return all_examples, user_history, num_bins

In [14]:
# Download the check-in table and the friendship graph, then preprocess them.
dataset = Dataset(
    link='https://disk.yandex.ru/d/KVm7gIom6YooWA',
    filename='Gowalla_totalCheckins.txt',
    graph_link='https://disk.yandex.ru/d/Sr3FpAa7WG4GdA',
    graph_filename='Gowalla_edges.txt',
)
dataset.preprocess(num_quantiles=15, num_users=20001)

In [17]:
# Build one example dict per user plus the per-bin history tensor.
data_examples, user_history, num_bins = load_dataset_timestamp(
    dataset.users,
    n_context=128,
    seq_len=100,
)

act list 20002
time list 20002
user list 20002
1287776788 1233724658
20002 20002


In [18]:
# Inspect the sector-id sequence stored for the second example.
data_examples[1]['items']

[24, 35, 35, 35, 173, 173, 173, 173, 173, 185, 185, 35]

In [19]:
class UserDataset():
  """Index-addressable wrapper that zero-pads per-user sequences to ``max_len``.

  Each element of ``data`` is a dict providing ``'user'``, ``'seq_len'``, the four
  sequences ``'train_act_seq'``, ``'train_time_seq'``, ``'test_act_seq'``,
  ``'test_time_seq'`` and the four labels ``'train_act_label'``,
  ``'train_time_label'``, ``'test_act_label'``, ``'test_time_label'``.
  """

  def __init__(self, data, max_len):
    self.max_len = max_len
    self.data = data

  def __len__(self):
    return len(self.data)

  def _padded(self, values, seq_len):
    """Return an int32 vector of length ``max_len`` whose first ``seq_len`` slots hold ``values``."""
    padded = np.zeros(self.max_len, dtype=np.int32)
    padded[:seq_len] = values
    # Transposing a 1-D array is a no-op; kept so the result matches the original.
    return padded.T

  def __getitem__(self, idx):
    """Return the 10-tuple (user, train seqs, train labels, test seqs, test labels, seq_len)."""
    record = self.data[idx]
    length = record['seq_len']

    train_acts = self._padded(np.array(record['train_act_seq']), length)
    train_times = self._padded(record['train_time_seq'], length)
    test_acts = self._padded(record['test_act_seq'], length)
    test_times = self._padded(record['test_time_seq'], length)

    return (
      record['user'],
      train_acts,
      train_times,
      record['train_act_label'],
      record['train_time_label'],
      test_acts,
      test_times,
      record['test_act_label'],
      record['test_time_label'],
      record['seq_len'],
    )

In [78]:
class RecModel(nn.Module):
  """RNN over embedded items followed by a linear layer producing class scores.

  Args:
    num_classes: size of the output layer (default 186).
    emb_dim: size of each input embedding vector (default 128).
    hidden_dim: RNN hidden size (default 128).

  The default sizes and the attribute names ``fc1``, ``rnn`` and ``norm`` are
  unchanged, so existing state dicts keep loading.
  """

  def __init__(self, num_classes=186, emb_dim=128, hidden_dim=128):
    super(RecModel, self).__init__()
    self.fc1 = nn.Linear(hidden_dim, num_classes)
    self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first = True)
    # Not used in forward(); kept so saved state dicts still match.
    self.norm = nn.BatchNorm1d(hidden_dim)

  def forward(self, x, seq_len):
    """Score the classes from the RNN output at each sequence's last real step.

    Args:
      x: float tensor of shape ``(batch, time, emb_dim)``.
      seq_len: an int (or one-element tensor) shared by the whole batch, or a
        sequence/tensor holding one length per batch element.

    Returns:
      Tensor of shape ``(batch, num_classes)``.

    Raises:
      ValueError: if ``seq_len`` holds neither one nor ``batch`` lengths.
    """
    outputs, _ = self.rnn(x)
    batch_size = outputs.shape[0]

    last_step = torch.as_tensor(seq_len, dtype=torch.long, device=outputs.device).reshape(-1) - 1
    if last_step.numel() == 1:
      last_step = last_step.expand(batch_size)
    elif last_step.numel() != batch_size:
      raise ValueError(
        'seq_len must hold 1 or %d lengths, got %d' % (batch_size, last_step.numel())
      )

    # Advanced indexing replaces the per-row Python loop and keeps the result on
    # the same device and dtype as the RNN output.
    rows = torch.arange(batch_size, device=outputs.device)
    hx = outputs[rows, last_step]

    return self.fc1(hx)

### Main Recommender

In [71]:
class Recommender():
  """Predict a user's next sector and return places sampled from that sector.

  Args:
    num_classes: number of rows in the item embedding table.
    sectors: optional DataFrame with a ``'place'`` column holding, per sector id,
      the list of (latitude, longitude) pairs. When omitted, the notebook-level
      ``dataset.sectors`` is used, as before.
    model_path: path of the saved ``RecModel`` state dict.
  """

  def __init__(self, num_classes, sectors=None, model_path='rec_model.pth'):
    # NOTE(review): the embedding table is freshly random-initialised here and is
    # not restored from the checkpoint - confirm where the trained embeddings
    # live, otherwise predictions differ between Recommender instances.
    self.item_emb  = nn.init.xavier_uniform_(torch.empty(num_classes, 128))
    self.sectors = sectors
    self.model_path = model_path
    self._model = None

  def _load_model(self):
    """Load the checkpoint once and cache the model in inference mode."""
    if self._model is None:
      model = RecModel()
      # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
      model.load_state_dict(torch.load(self.model_path, map_location='cpu'))
      model.eval()
      self._model = model
    return self._model

  def get_rec(self, user_json):
    """Return recommendations for one example dict.

    Args:
      user_json: dict with keys ``'items'`` (sequence of item ids), ``'seq_len'``
        and ``'user'``. ``'timestamps'`` is not used by the prediction.

    Returns:
      Dict produced by ``rec_to_json``.
    """
    item_ids = torch.as_tensor(np.asarray(user_json['items']), dtype=torch.long)
    # Add a leading batch dimension of size 1.
    input_emb = self.item_emb[item_ids].unsqueeze(0)

    rec_model = self._load_model()
    with torch.no_grad():
      scores = rec_model(input_emb, user_json['seq_len'])
    prediction = torch.argmax(scores, dim=1).item()
    places = self.get_places(prediction)

    return self.rec_to_json(places, user_json['user'])

  def rec_to_json(self, places, user_id):
    """Pack ``places`` (pairs of latitude, longitude) into the response dict."""
    return {
      'user': user_id,
      'recommendations': [
        {'latitude': place[0], 'longitude': place[1]} for place in places
      ],
    }

  def get_places(self, prediction, k=10):
    """Sample ``k`` places, with replacement, from the predicted sector."""
    sectors = self.sectors if self.sectors is not None else dataset.sectors
    sector = sectors.loc[prediction, 'place']
    # Sampling is with replacement, so the same place can be returned twice.
    return random.choices(sector, k=k)

In [79]:
# 186 is passed as the embedding-table size; presumably it equals the number of
# sector ids produced by preprocessing - TODO confirm against dataset.sectors.
recommender = Recommender(num_classes=186)
recommendation = recommender.get_rec(user_json=data_examples[1])

Ответ модели на запрос в формате JSON - рекомендуется 10 пар координат (мест)

In [80]:
# Show the response dict: the user id plus the recommended coordinate pairs.
print(recommendation)

{'user': 1, 'recommendations': [{'latitude': 47.6136782429, 'longitude': -122.3166704178}, {'latitude': 48.46635608, 'longitude': -123.30949773}, {'latitude': 47.6133123, 'longitude': -122.34547317}, {'latitude': 47.6086585858, 'longitude': -122.3407244682}, {'latitude': 48.38932338, 'longitude': -122.498170007}, {'latitude': 48.42887055, 'longitude': -123.360541}, {'latitude': 47.6110780628, 'longitude': -122.337256372}, {'latitude': 49.002005803, 'longitude': -122.7564239502}, {'latitude': 47.61769525, 'longitude': -122.3457130667}, {'latitude': 47.6191235592, 'longitude': -122.3487303387}]}
