In [1]:
import argparse
import numpy as np
from utils.utils import to_tensor
from utils.dataset import Dataset,DataIterator,get_DataLoader
from tqdm import tqdm
def parse_args(name):   
    parser = argparse.ArgumentParser(description="Run .")  
    parser.add_argument('--model', nargs='?', default='FPMC')
    parser.add_argument('--dataset', nargs='?', default=name,
                        help='Choose a dataset.')
    parser.add_argument('--batch_size', type=int, default=1024,
                        help='Batch size.')
    parser.add_argument('--teacher_dims', type=int, default=100,
                        help='Number of hidden factors for teacher.')
    parser.add_argument('--topN', type=int, default=50,
                        help='Learning rate.') 
    return parser.parse_args(args=[])

In [3]:
args = parse_args('ML')
data = Dataset(args)
train_data =  get_DataLoader(data.train,args.batch_size, seq_len=10)
valid_data =  get_DataLoader(data.valid,args.batch_size, seq_len=10,train_flag=0)
test_data =  get_DataLoader(data.test,args.batch_size, seq_len=10,train_flag=0)


loading data: meta_data

loading data: interaction_data

split data

Using time span 128
total session: 7393
Using time span 128
total session: 2465
Using time span 128
total session: 2465


In [5]:
from utils.utils import calculate_session_embs
from utils.log import LOG, load_model, save_model

if args.model =='GRU4REC':
    from GRU4REC import GRU4Rec
    model = GRU4Rec(data.n_item,args.hidden_factor,args.batch_size)
if 'FPMC' in args.model:
    from base_FPMC import FPMC
    BASE = FPMC

    
    best_model_path_teacher = './best_model/'+'%s+FPMC_T'%args.dataset
    best_model_path_student = './best_model/'+'%s+FPMC_S'%args.dataset
#load Teacher model    
teacher_model = BASE(data.n_item,args.teacher_dims,args.batch_size)    
load_model(teacher_model, best_model_path_teacher)
teacher_model = teacher_model.cuda()
teacher_model.eval()   



model loaded from ./best_model/ML+FPMC_T


FPMC(
  (LI_emb): Embedding(8475, 100, padding_idx=0)
  (item_emb): Embedding(8475, 100)
  (loss_func): BPRLoss()
)

In [6]:
train_data_flag_zero =  get_DataLoader(data.train,args.batch_size, seq_len=10,train_flag=0)
for iter, (targets, items, mask, session_id) in tqdm(enumerate(train_data_flag_zero)):
    #模型预测性
    targets_cuda = to_tensor(targets,'cuda')
    items_cuda = to_tensor(items,'cuda')
    mask_cuda = to_tensor(mask,'cuda')
    session_id_cuda = to_tensor(session_id,'cuda')
    teacher_model(items_cuda,mask_cuda)[0].cpu().detach().numpy()
    

Using time span 128
total session: 7393


8it [00:00, 14.59it/s]


In [8]:
import pandas as pd
df = pd.read_pickle('./datasets/%s_prediction.pkl'%args.dataset)

In [9]:
df

Unnamed: 0,session_id,target,feature,teacher_position,student_position,teacher_top_item,student_top_item_,loss_user
0,0,1240,"[265, 2015, 1313, 528]",0,1,"[1240, 2325, 4326, 1322, 1800, 6085, 933, 4711...","[1322, 1240, 4509, 4711, 878, 2500, 4502, 6401...",0.496731
1,1,5158,"[7434, 4139, 1438, 7182, 1206, 2731, 5455, 659...",0,213,"[5158, 2264, 7336, 7649, 1714, 2585, 6634, 999...","[386, 5505, 3433, 2586, 1083, 1518, 322, 3157,...",0.525201
2,2,217,"[5677, 2477, 267, 94, 929, 916]",0,7,"[217, 2082, 4270, 2058, 918, 282, 5629, 540, 4...","[715, 4270, 2107, 2058, 3393, 2082, 935, 217, ...",0.602745
3,3,4957,"[5895, 7153, 2826, 4399, 4856]",0,5,"[4957, 2119, 5003, 2908, 2974, 3053, 20, 953, ...","[2974, 844, 1777, 1511, 5906, 4957, 5890, 2932...",0.530305
4,4,5890,"[4904, 1993, 1045, 3889, 1099, 815, 2133, 2592]",0,2,"[5890, 6254, 3972, 753, 1302, 5023, 7378, 7532...","[3186, 6559, 5890, 2991, 4302, 2893, 3213, 215...",0.504147
...,...,...,...,...,...,...,...,...
7388,7388,6396,"[7167, 7002, 7009, 7148, 4966]",0,46,"[6396, 6250, 5840, 7434, 7235, 6646, 5971, 659...","[1294, 4289, 6990, 4506, 6213, 6122, 5490, 639...",0.507500
7389,7389,3545,"[6646, 1198, 1962, 2124]",0,20,"[3545, 3145, 1264, 974, 633, 2974, 1020, 3425,...","[4302, 1355, 633, 3972, 5890, 2401, 2559, 4280...",0.534481
7390,7390,1930,"[7401, 878, 884, 7055, 1800]",0,9,"[1930, 6627, 7274, 2638, 904, 4139, 929, 4325,...","[3953, 942, 3134, 335, 4897, 7274, 5901, 715, ...",0.492367
7391,7391,3148,"[7145, 976, 7492, 4713, 2023, 5607, 7317, 249,...",0,61,"[3148, 2805, 987, 7354, 9, 3170, 929, 7326, 65...","[7517, 3892, 1452, 267, 7530, 7434, 1358, 7342...",0.539825


In [17]:
class Prompt_class(object):
    def __init__(self,train_df,data):
        self.train_df = train_df
        self.data = data
    def ids2names(self,ids):
        r = '{'
        for item in ids:
            r = r + '<ID_%d:'%item+str(self.data.id2title[item])+'>, '
        r = r[:-2] + '}'
        return r
    def generate_prompt(self,idxes):

        p1 = "You are an AI assistant, please STRICTLY summarize the main logic of the recommendation results provided by MY OWN recommender system based on users' behaviors."
        p2 = "Specifically, the users has interacted with several %s (i.e., users' behaviors). Thee recommender system provides Top-20 items (i.e., recommendation results) based on users behaviors, where item in the top position indicates the higher recommendation intent.\n"%args.dataset
        p3 = "We provide %d cases for you to summarize the relationship between users' behaviors and recommendation results:\n"%(len(idxes))
        p4 = ""

        for i,idx in enumerate(idxes):
            df_idx = self.train_df[self.train_df['session_id']==idx].iloc[0]
            candidate = list(df_idx['teacher_top_item'])
            if df_idx['target'] not in candidate:
                candidate = candidate[:-1]
                candidate.append(df_idx['target'])
            feature = set(df_idx['feature'] )  
            p4 = p4 + "For case %d, the user's interactions are {%s}. The recommendation results are {%s}\n"%(i,self.ids2names(feature),self.ids2names(candidate))
        
        p5 = "Do not output the detail analysis of specific items, and make high-level summarization JUST WITH PROVIDED CASES, e.g., 1. Ignore diversity in genres or series. For example, Although the user already owns this one, there would still be interest in same one, e.g., its the special version."
        Prompt = p1 + p2 + p3 + p4 + p5
        # print(Prompt)
        # print('\n\n\n')
        # print('Groud Truth:%s'%self.ids2names([df_idx['target']]))
        return Prompt,df_idx['target']

In [18]:
promp_generator = Prompt_class(df,data)

In [19]:
p,gt = promp_generator.generate_prompt(np.random.randint(0,7000,20))


In [20]:
print(p)

You are an AI assistant, please STRICTLY summarize the main logic of the recommendation results provided by MY OWN recommender system based on users' behaviors.Specifically, the users has interacted with several ML (i.e., users' behaviors). Thee recommender system provides Top-20 items (i.e., recommendation results) based on users behaviors, where item in the top position indicates the higher recommendation intent.
We provide 20 cases for you to summarize the relationship between users' behaviors and recommendation results:
For case 0, the user's interactions are {{<ID_2243:American Beauty>, <ID_310:The Mask>, <ID_6680:Control Room>, <ID_345:Airheads>, <ID_1178:Austin Powers: International Man of Mystery>, <ID_315:Reality Bites>}}. The recommendation results are {{<ID_319:The River Wild>, <ID_473:The Nightmare Before Christmas>, <ID_140:Hackers>, <ID_2100:Austin Powers: The Spy Who Shagged Me>, <ID_3588:The Return of the Living Dead>, <ID_3937:The Fall Guy>, <ID_495:Home Alone>, <ID_50

In [34]:
gt

17388