In [1]:
import numpy as np
import logging
import argparse
import multiprocessing as mp
from time import time
from tensorflow.keras.layers import Dense, Flatten, Input, Embedding, Multiply, Concatenate
from tensorflow.keras import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import tensorflow as tf

In [9]:
import numpy as np
import pandas as pd
import os
import pickle

In [3]:

def get_neumf_model(num_users, num_items, layers=[128, 64, 32, 16], reg=["l1"]):
    user_input = Input(shape=(1,), dtype="int32", name="user_input")
    item_input = Input(shape=(1,), dtype="int32", name="item_input")

    gmf_emb_user = Embedding(input_dim=num_users, output_dim=emb_dim,
                             name="gmf_user_emb", input_length=1)
    gmf_emb_item = Embedding(input_dim=num_items, output_dim=emb_dim,
                             name="gmf_item_emb", input_length=1)
    gmf_user_emb = Flatten()(gmf_emb_user(user_input))
    gmf_item_emb = Flatten()(gmf_emb_item(item_input))
    gmb_vector_layer = Multiply()([gmf_user_emb, gmf_item_emb])

    mlp_emb_user = Embedding(input_dim=num_users, output_dim=int(layers[0] / 2),
                             name="mlp_user_emb", input_length=1)
    mlp_emb_item = Embedding(input_dim=num_items, output_dim=int(layers[0] / 2),
                             name="mlp_item_emb", input_length=1)
    mlp_user_emb = Flatten()(mlp_emb_user(user_input))
    mlp_item_emb = Flatten()(mlp_emb_item(item_input))
    mlp_vector_layer = Concatenate()([mlp_user_emb, mlp_item_emb])

    for idx in range(1, len(layers)):
        mlp_vector_layer = Dense(layers[idx], activation='relu', kernel_initializer="lecun_uniform",
                                 name="mlp_layer_" + str(idx))(mlp_vector_layer)

    predict_vector = Concatenate()([gmb_vector_layer, mlp_vector_layer])
    prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name='prediction')(predict_vector)
    model = Model(inputs=[user_input, item_input],
                  outputs=prediction)
    return gmf_emb_user, gmf_emb_item, model


In [4]:
num_users = 31568
num_items = 9065
emb_dim = 64
user_emb, item_emb, model = get_neumf_model(num_users, num_items)

In [5]:
model.load_weights("/Users/wizardholy/project/recsys_learning/Pretrain/info_gmf_64_[64,32,16,8]_1585834028.h5")

In [6]:
print(model.get_layer("mlp_user_emb"))

<tensorflow.python.keras.layers.embeddings.Embedding object at 0x13ad66cc0>


In [7]:
embs = model.get_layer("mlp_user_emb").embeddings.numpy()

In [10]:
embs

array([[ 0.09206976,  0.09069787,  0.08962332, ..., -0.10332184,
        -0.0477012 ,  0.03304849],
       [ 0.11864064,  0.1795334 ,  0.14786634, ..., -0.14277941,
         0.04744938,  0.01004081],
       [ 0.0029698 ,  0.04394848,  0.1321151 , ..., -0.13439359,
         0.01134982,  0.04418117],
       ...,
       [-0.12807289,  0.00266701, -0.03689078, ..., -0.10288961,
         0.07840523,  0.08569863],
       [ 0.12411491,  0.07306439,  0.11994545, ..., -0.14892985,
        -0.11625966,  0.00325219],
       [ 0.1462021 ,  0.11518549,  0.16165425, ..., -0.0591514 ,
        -0.08591385,  0.02107427]], dtype=float32)

In [12]:
with open('/Users/wizardholy/project/recsys_learning/emb.pickle', 'wb') as f:
    pickle.dump(embs, f)

In [89]:
with open('/Users/wizardholy/project/recsys_learning/emb.pickle', 'rb') as f:
    dt = pickle.load(f)

In [90]:
dt.shape

(31568, 64)

In [91]:
def load_emb(file):
    ret = list()
    with open(file, encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if len(line) > 0:
                ret.append([float(item) for item in line.split("\t")[1].split(",")])
    return ret

In [92]:
df = load_emb("/Users/wizardholy/project/recsys_learning/emb.txt")

In [98]:
df = np.array(df).astype('float32')

In [99]:
df.shape

(65235, 64)

In [100]:
import faiss

In [101]:
nlist = 100
m = 8
k = 4
d = 64
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)

In [102]:
index.train(df)

In [103]:
index.add(df)

In [105]:
D, I = index.search(df, 50)      # 测试


In [106]:
D

array([[0.1020731 , 0.37405637, 0.3976307 , ..., 0.5006512 , 0.5006513 ,
        0.5024627 ],
       [0.13507631, 0.44116896, 0.4582958 , ..., 0.5585234 , 0.5593968 ,
        0.55991834],
       [0.10411835, 0.32663786, 0.33381146, ..., 0.45915604, 0.46012306,
        0.46013343],
       ...,
       [0.09046736, 0.3732167 , 0.3913887 , ..., 0.5122074 , 0.51235354,
        0.5125258 ],
       [0.11361209, 0.33789164, 0.35315442, ..., 0.48606217, 0.4866038 ,
        0.4869817 ],
       [0.11381584, 0.3970833 , 0.43579224, ..., 0.53934324, 0.541063  ,
        0.5428909 ]], dtype=float32)

In [109]:
sim = I.tolist()

In [111]:
len(sim[0])

50

In [120]:
sim = I.tolist()
weights = D.tolist()

In [115]:
def load_uid_map(file):
    ret = dict()
    with open(file, encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if len(line) > 0:
                ps = line.split("\t")
                ret[int(ps[1])] = ps[0]
    return ret

In [116]:
uid_map = load_uid_map("/Users/wizardholy/project/recsys_learning/datas/info/info.uid.map.txt")


In [121]:
with open("/Users/wizardholy/project/recsys_learning/sim_user.txt", mode="w") as f:
    for i in range(10):
        uid = uid_map[i]
        outs = []
        for j in range(1, len(sim[i])):
            suid = uid_map[sim[i][j]]
            weight = weights[i][j]
            outs.append("" + suid + "#" + str(weight))
        f.write(uid+" "+(",".join(outs))+"\n")
    

KeyError: 61852