In [None]:
!nvidia-smi

In [None]:
# !kill -9 3239

In [None]:
import os
import pandas as pd
from pandas import Timestamp
import numpy as np
import random
from random import shuffle

import openai
import os, sys
import json
import re
from tqdm import tqdm

# Restrict this kernel to a single GPU. These variables are set before torch is
# imported in the following cells.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "7",
    }
)

# Llama3 Inference

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face Hub id of the base checkpoint used by the loading cells below.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so the secret never has to be pasted into (and saved with) the notebook; when
# the variable is not set this falls back to the previous empty-string value.
hf_token = os.environ.get("HF_TOKEN", "")

In [None]:
# #### Without fine-tuning ####

# # model_id = "yjseo/test"

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_token)

# # Configuration for loading the model with quantization
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=False,
#     load_in_4bit=True,
#     llm_int8_threshold=6.0,
#     llm_int8_skip_modules=None,
#     llm_int8_enable_fp32_cpu_offload=False,
#     llm_int8_has_fp16_weight=False,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=False,
#     bnb_4bit_compute_dtype="float16",
# )

# # Load the pretrained model with the specified quantization configuration
# basemodel = AutoModelForCausalLM.from_pretrained(
#     model_id, 
#     quantization_config=bnb_config,
#     device_map="auto",  # Automatically map the model to the available devices (e.g., GPU if available)
#     token=hf_token
# )
# basemodel.bfloat16()
# basemodel.eval()

# model = basemodel

In [None]:
#### Fine-tuned ####

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# Local directory of the fine-tuned PEFT adapter that this notebook evaluates.
peft_model_id = "./models/NYC/Gen-MMR-Bin"

# Adapter configuration. It is loaded here but not referenced again in this cell.
config = PeftConfig.from_pretrained(peft_model_id)

# Quantization settings for the base model: 4-bit NF4 weights, no double
# quantization, float16 compute dtype. The 8-bit path is switched off
# (load_in_8bit=False); the llm_int8_* values presumably only matter for that
# path -- TODO confirm.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="float16",
)

# Load the quantized base checkpoint; model_id and hf_token are defined in an
# earlier cell.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    token=hf_token,  
    quantization_config=bnb_config
)
# Wrap the base model with the adapter stored under peft_model_id.
model = PeftModel.from_pretrained(model, peft_model_id)
# NOTE(review): the tokenizer is loaded without token=hf_token, unlike the model
# above -- confirm this is intended for a gated checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# NOTE(review): the model is cast to bfloat16 here although
# bnb_4bit_compute_dtype above is "float16" -- confirm the dtype mix is intended.
model.bfloat16()
# Put the model in evaluation mode for inference.
model.eval()

In [None]:
# def llama3_inference(input_text):
#     # Tokenize the input text
#     # input_text.to('cuda')
#     inputs = tokenizer(input_text, return_tensors="pt")
    
#     # Move input tensor to the same device as the model
#     inputs = {key: tensor.to(basemodel.device) for key, tensor in inputs.items()}
    
#     # Generate text using the model
#     with torch.no_grad():
#         outputs = basemodel.generate(
#             **inputs,
#             max_new_tokens=512
#         )

#     # Decode the generated text
#     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

#     assistant_marker = "assistant"
#     assistant_position = generated_text.rfind(assistant_marker)
#     if assistant_position != -1:
#         generated_text = generated_text[assistant_position + len(assistant_marker):].strip()

#     return generated_text

In [None]:
def llama3_inference_with_probabilities(input_text, top_k=20):
    """Return the most likely next tokens for ``input_text`` with their probabilities.

    The prompt is run through the global ``model`` in one forward pass and the
    softmax distribution at the last prompt position (the distribution of the
    next token) is ranked.

    Args:
        input_text: Fully formatted prompt string.
        top_k: Number of entries to return. Defaults to 20, the value that was
            previously hard-coded.

    Returns:
        A list of ``[decoded_token, probability]`` pairs sorted by descending
        probability, at most ``top_k`` entries long.
    """
    # Tokenize once and move every tensor to the device the model lives on.
    inputs = tokenizer(input_text, return_tensors="pt")
    inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}

    # One forward pass is all that is needed. Earlier revisions also called
    # model.generate(max_new_tokens=512) and tokenized the prompt a second
    # time, but the generated text was never used.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Only the final position is ranked, so softmax just that row instead of
    # the full (sequence x vocabulary) tensor.
    next_token_probs = torch.nn.functional.softmax(logits[0, -1, :], dim=-1)

    # The mapping is keyed by decoded text, so token ids that decode to the
    # same string collapse into a single entry (the highest id wins).
    probabilities = {
        tokenizer.decode([token_id]): prob
        for token_id, prob in enumerate(next_token_probs.tolist())
    }

    ranked = sorted(probabilities.items(), key=lambda item: item[1], reverse=True)[:top_k]
    return [[word, prob] for word, prob in ranked]


In [None]:
def make_user_log(data, user_data, uid, poi_data, cnt):
    """Build the visit-history table of one user.

    Args:
        data: Dataset name: "Yelp", "NYC" or "Tokyo".
        user_data: Frame with a ``hash_uid`` column, a ``visited_pids`` column
            and ``visited_dates`` (Yelp) or ``visited_times`` (NYC/Tokyo). Each
            of those cells holds the text of a Python list.
        uid: ``hash_uid`` value of the user.
        poi_data: POI table with ``pid``, ``cat`` and ``addr`` columns (plus
            ``placename`` for Yelp).
        cnt: Index label of the user's row in ``user_data``.

    Returns:
        A DataFrame with one row per visit. Columns are ``date``,
        ``place_name``, ``place_category``, ``place_address`` for Yelp and
        ``date``, ``time``, ``place_category``, ``place_address`` otherwise.
        Commas are stripped from the addresses.

    Raises:
        ValueError: If ``data`` is not one of the supported dataset names.
    """
    if data == "Yelp":
        source_time_column, log_time_column = 'visited_dates', 'date'
    elif data == "NYC" or data == "Tokyo":
        source_time_column, log_time_column = 'visited_times', 'date time'
    else:
        # Previously this fell through to an UnboundLocalError on `user_log`.
        raise ValueError(f"Unsupported dataset {data!r}; expected 'Yelp', 'NYC' or 'Tokyo'")

    is_yelp = data == "Yelp"
    user_rows = user_data[user_data['hash_uid'] == uid]

    # SECURITY: eval() executes whatever text the CSV cell contains, so only use
    # trusted files. NOTE(review): ast.literal_eval is presumably not an option
    # because the NYC/Tokyo cells hold Timestamp(...) expressions -- confirm.
    user_log = pd.DataFrame()
    user_log[log_time_column] = eval(user_rows[source_time_column][cnt])
    user_log['pid'] = eval(user_rows['visited_pids'][cnt])

    place_name, place_category, place_address = [], [], []
    for pid in user_log['pid']:
        # Select the POI row once and read every attribute from it.
        poi_row = poi_data[poi_data['pid'] == pid]
        if is_yelp:
            place_name.append(poi_row['placename'].fillna('').values[0])
        place_category.append(poi_row['cat'].fillna('').values[0])
        place_address.append(poi_row['addr'].fillna('').values[0].replace(",", ""))

    if is_yelp:
        user_log['place_name'] = place_name
    user_log['place_category'] = place_category
    user_log['place_address'] = place_address
    user_log = user_log.drop(['pid'], axis=1)

    if not is_yelp:
        # Split the combined timestamp into separate date and time columns.
        user_log['date'] = user_log['date time'].apply(lambda x: x.date())
        user_log['time'] = user_log['date time'].apply(lambda x: x.time())
        user_log = user_log.drop(['date time'], axis=1)
        user_log = user_log[['date', 'time', 'place_category', 'place_address']]

    return user_log

def make_candidates_list(data, user_data, uid, poi_data, cnt):
    """Build the candidate table shown to the recommender for one user.

    The first nine ids of the user's ``candidates`` list are kept and the
    ground-truth ``test_pid`` is appended, then exact duplicate rows are removed.

    Args:
        data: Dataset name: "Yelp", "NYC" or "Tokyo".
        user_data: Frame with ``hash_uid``, ``candidates`` (text of a Python
            list of POI ids) and ``test_pid`` columns.
        uid: ``hash_uid`` value of the user.
        poi_data: POI table with ``pid``, ``cat`` and ``addr`` columns (plus
            ``placename`` for Yelp).
        cnt: Index label of the user's row in ``user_data``.

    Returns:
        ``(candidates_list, target_idx)``. ``candidates_list`` has the place
        feature columns, ``pid`` and a zero-initialised ``final_score``, with a
        contiguous 0..n-1 index. ``target_idx`` is the row of the target POI.

    Raises:
        ValueError: If ``data`` is not one of the supported dataset names.
    """
    # Output column -> column of poi_data it is read from.
    if data == "Yelp":
        source_columns = {'place_name': 'placename', 'place_category': 'cat', 'place_address': 'addr'}
    elif data == "NYC" or data == "Tokyo":
        source_columns = {'place_category': 'cat', 'place_address': 'addr'}
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(f"Unsupported dataset {data!r}; expected 'Yelp', 'NYC' or 'Tokyo'")

    user_rows = user_data[user_data['hash_uid'] == uid]

    # SECURITY: eval() executes the cell text; only use trusted CSV files.
    candidates_pid = eval(user_rows['candidates'][cnt])
    candidates_pid = candidates_pid[:9]
    target_pid = user_rows['test_pid'][cnt]
    candidates_pid.append(target_pid)

    features = {column: [] for column in source_columns}
    for pid in candidates_pid:
        # Select the POI row once and read every attribute from it.
        poi_row = poi_data[poi_data['pid'] == pid]
        for column, source in source_columns.items():
            features[column].append(poi_row[source].fillna('').values[0])
    features['place_address'] = [address.replace(",", "") for address in features['place_address']]

    candidates_list = pd.DataFrame(features)
    candidates_list['pid'] = candidates_pid

    # Re-number the rows after dropping duplicates. The caller compares
    # target_idx with row positions, so a gap left by a dropped duplicate would
    # make the target impossible to find.
    candidates_list = candidates_list.drop_duplicates().reset_index(drop=True)
    target_idx = candidates_list.index[candidates_list["pid"] == target_pid][0]
    candidates_list['final_score'] = [0] * len(candidates_list)

    return candidates_list, target_idx

def tab_separated_format_user_log(data, user_log):
    """Render a user log as tab-separated text, in full and for its last five rows.

    Each frame is printed with ``DataFrame.to_string`` (fixed column widths for
    NYC/Tokyo), every run of two or more spaces is turned into one tab, and the
    word ``index`` is put in front of the text.

    Args:
        data: Dataset name: "Yelp", "NYC" or "Tokyo".
        user_log: Visit-history frame to render.

    Returns:
        ``(full_log_text, last_five_rows_text)``.
    """
    recent_rows = user_log[-5:]

    if data == "Yelp":
        full_text = user_log.to_string()
        recent_text = recent_rows.to_string()
    elif data in ("NYC", "Tokyo"):
        widths = [20, 20, 30, 30]
        full_text = user_log.to_string(col_space=widths)
        recent_text = recent_rows.to_string(col_space=widths)

    tabbed = []
    for text in (full_text, recent_text):
        with_tabs = 'index' + re.sub('  +', '\t', text)
        # Also split a literal "date hour" pair into two tab-separated fields.
        tabbed.append(re.sub('date hour', 'date\thour', with_tabs))

    return tabbed[0], tabbed[1]

def tab_separated_format_candidates_list(data, candidates_list):
    """Render the candidate places as tab-separated text for the prompt.

    Only the descriptive columns are kept. The frame is printed with
    ``DataFrame.to_string`` (fixed column widths for NYC/Tokyo), every run of
    two or more spaces becomes one tab, and the word ``index`` is prepended.

    Args:
        data: Dataset name: "Yelp", "NYC" or "Tokyo".
        candidates_list: Candidate frame containing at least the place columns.

    Returns:
        The formatted table as a single string.
    """
    if data == "Yelp":
        shown = candidates_list[['place_name', 'place_category', 'place_address']]
        rendered = shown.to_string()
    elif data in ("NYC", "Tokyo"):
        shown = candidates_list[['place_category', 'place_address']]
        rendered = shown.to_string(col_space=[30, 30])

    return 'index' + re.sub('  +', '\t', rendered)
        

def _report_hit_rates(k_list, wrong_cnt, cutoffs=(1, 2, 3, 5)):
    """Print the failure count, the collected target ranks and hit rate@k; return the ranks as an array."""
    ranks = np.array(k_list)
    print(wrong_cnt)
    print(ranks)

    for k in cutoffs:
        print('hit rate@', k)
        # Ranks are 0-based, so the target is a hit@k when its rank is below k.
        # An empty run prints nan instead of raising ZeroDivisionError.
        print(sum(ranks < k) / len(ranks) if len(ranks) else float('nan'))

    return ranks


def zeropoirec(data, user_data, poi_data,
               preference_path="./trainset/preference_nyc.csv", preference_offset=22):
    """Run the recommendation loop over every user and report hit rate@k.

    For each user the visit log and candidate table are built, the stored
    preference text is looked up, the shuffled candidates are ranked through
    ``recommender_api`` and the 0-based rank of the target place is recorded.
    The ranks are published through the module-level ``k_list_arr`` so that
    later notebook cells can read them.

    Args:
        data: Dataset name: "Yelp", "NYC" or "Tokyo".
        user_data: Per-user table; row labels must be 0..n-1 because the running
            counter is used as the row label.
        poi_data: POI table with ``pid``, ``cat`` and ``addr`` columns.
        preference_path: CSV with a ``preference`` column. The default is the
            NYC file that used to be hard-coded.
        preference_offset: Row label in the preference CSV that corresponds to
            the first row of ``user_data``. The default 22 equals the number of
            leading rows ``main`` drops from the NYC user table; pass matching
            values when evaluating another dataset.

    Returns:
        None. Progress and metrics are printed and ``k_list_arr`` is updated.
    """
    print('Start recommendation using ZeroPOIRec!')

    global k_list_arr

    uid_list = user_data['hash_uid']

    cnt = 0
    # Never incremented in the current code; kept because it is still reported.
    wrong_cnt = 0
    k_list = []

    preference_df = pd.read_csv(preference_path)

    for uid in tqdm(uid_list[:]):
        # Make the user's log and candidate list.
        user_log = make_user_log(data, user_data, uid, poi_data, cnt)
        candidates_list, target_idx = make_candidates_list(data, user_data, uid, poi_data, cnt)

        # Data formatting.
        user_log_tab, user_log_latest_tab = tab_separated_format_user_log(data, user_log)

        # The preference text is read from the CSV instead of being generated.
        preference_output = preference_df.loc[cnt + preference_offset, "preference"]

        # Ensembling for consistency: each round shows the candidates in a new
        # random order. Only one round is run at the moment.
        # NOTE(review): the shuffle uses the unseeded global NumPy RNG, so runs
        # are not reproducible.
        for r in range(1):
            candidates_list_r = candidates_list.sample(frac=1).reset_index(drop=True)
            candidates_list_tab = tab_separated_format_candidates_list(data, candidates_list_r)

            # res['index'] holds the row numbers of the shuffled table that the
            # model named; res['rank'] is their position in the model's output.
            res = recommender_api(data, preference_output, user_log_latest_tab, candidates_list_tab)
            res[f'r{r}_score'] = len(candidates_list) + 1 - res['rank']
            res = res.rename(columns={'index': f'r{r}_idx'})

            # Map shuffled positions back to the original rows. Candidates that
            # the model did not rank get a score of 0.
            candidates_list_r_idx = candidates_list_r.reset_index().rename(columns={'index': f'r{r}_idx'})
            candidates_list = pd.merge(candidates_list, candidates_list_r_idx, how='left')
            candidates_list = pd.merge(candidates_list, res[[f'r{r}_idx', f'r{r}_score']], on=f'r{r}_idx', how='left').fillna(0)
            candidates_list['final_score'] += candidates_list[f'r{r}_score']

            # Save the position of the target place in the final ordering.
            candidates_list_sorted = candidates_list.sort_values(by='final_score', ascending=False).reset_index()
            forecast_target = candidates_list_sorted[candidates_list_sorted['index'] == target_idx].index[0]
            k_list.append(forecast_target)

            print(forecast_target)

        # Intermediate report every 100 users.
        if cnt % 100 == 0:
            k_list_arr = _report_hit_rates(k_list, wrong_cnt)
        cnt += 1

    print('end recommendation!')

    k_list_arr = _report_hit_rates(k_list, wrong_cnt)
            

In [None]:
def _llama3_generate(prompt, max_new_tokens=512):
    """Generate a completion for ``prompt`` with the global model and return only the new text."""
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode only the tokens that follow the prompt, so the reply does not have
    # to be cut out of the echoed prompt by searching for a marker word.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


def profiler_api(data, user_log, model= "gpt-3.5-turbo-0613", temperature = 0, verbose=False):
    """Ask the language model to describe a user's preferences from the visit log.

    Args:
        data: Dataset name: "Yelp", "NYC" or "Tokyo".
        user_log: Tab-separated visit log text inserted into the user prompt.
        model: Unused; kept for compatibility with the former OpenAI-based call.
        temperature: Unused; kept for the same reason.
        verbose: When true, print the user message before sending it.

    Returns:
        The text generated by the model.

    Raises:
        ValueError: If ``data`` is not one of the supported dataset names.
    """
    if data == "Yelp":
        num_features = 5
        features_list = ['index', 'date', 'place_name', 'place_category', 'place_address']
    elif data == "NYC" or data == "Tokyo":
        num_features = 5
        features_list = ['index', 'date', 'time', 'place_category', 'place_address']
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(f"Unsupported dataset {data!r}; expected 'Yelp', 'NYC' or 'Tokyo'")

    with open('./api/profiler_system_prompt.txt', 'r') as file:
        system_prompt = file.read()
    system_prompt = system_prompt.format(num_features=num_features, features_list=features_list)

    with open('./api/profiler_user_prompt.txt', 'r') as file:
        user_prompt = file.read()

    user_content = f"""{user_prompt}

    ```
    {user_log}
    ```

    """

    if verbose:
        print(user_content)

    # Chat prompt assembled by hand from the header/eot marker tokens.
    message = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>" + system_prompt + "<|eot_id|><|start_header_id|>user<|end_header_id|>" + user_content + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

    # This used to call llama3_inference(), which only exists as commented-out
    # code in this notebook, so the function raised NameError when called.
    return _llama3_generate(message)

In [None]:
def recommender_api(data, user_preference, user_recent_log, poi_list, model="gpt-3.5-turbo-0613", temperature = 0,  verbose=False):
    """Rank the candidate places by the model's next-token probabilities.

    The prompt ends with "The index number of the selected place: ". The most
    likely next tokens are fetched, and every single ASCII digit among them is
    read as a candidate index. Its rank is its 1-based position in that token
    list, so non-digit tokens leave gaps in the ranks.

    Args:
        data: Dataset name: "Yelp", "NYC" or "Tokyo".
        user_preference: Preference text of the user.
        user_recent_log: Tab-separated text of the user's latest visits.
        poi_list: Tab-separated text of the candidate places.
        model: Unused; kept for compatibility with the former OpenAI-based call.
        temperature: Unused; kept for the same reason.
        verbose: When true, print the user message before sending it.

    Returns:
        A DataFrame with integer columns ``rank`` and ``index``. It is empty,
        but still has both columns, when no digit token was found.

    Raises:
        ValueError: If ``data`` is not one of the supported dataset names.
    """
    if data == "Yelp":
        num_features = 4
        features_list = ['index', 'place_name', 'place_category', 'place_address']
    elif data == "NYC" or data == "Tokyo":
        num_features = 3
        features_list = ['index', 'place_category', 'place_address']
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(f"Unsupported dataset {data!r}; expected 'Yelp', 'NYC' or 'Tokyo'")

    with open('./api/recommender_system_prompt.txt', 'r') as file:
        system_prompt = file.read()
    system_prompt = system_prompt.format(num_features=num_features, features_list=features_list)

    with open('./api/recommender_user_prompt.txt', 'r') as file:
        user_prompt = file.read()

    user_content = f"""{user_prompt}

    ``` User Preference
    {user_preference}
    ```

    *** User's recently visited places
    {user_recent_log}
    ***

    ### List of 10 places
    {poi_list}
    ###

    """

    if verbose:
        print(user_content)

    # The assistant turn is pre-filled so that the very next token is the answer.
    message = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>" + system_prompt + "<|eot_id|><|start_header_id|>user<|end_header_id|>" + user_content + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe index number of the selected place: "

    response = llama3_inference_with_probabilities(message)
    print(response)

    records = []
    seen_indices = set()
    for position, entry in enumerate(response):
        token = entry[0]
        # Accept exactly one ASCII digit. str.isdigit() also accepts characters
        # such as '²', which int() rejects, and other non-ASCII digits that
        # would not survive JSON parsing.
        if len(token) == 1 and token in "0123456789" and token not in seen_indices:
            seen_indices.add(token)
            records.append({"rank": position + 1, "index": int(token)})
        if len(records) == 10:
            break

    # Same JSON-lines text as before, kept for the notebook log.
    result_str = "".join(json.dumps(record) + "\n" for record in records)
    print(result_str)

    # Always return both columns so the caller's res['rank'] lookup works even
    # when the model produced no usable digit.
    return pd.DataFrame(records, columns=["rank", "index"]).astype("int64")

In [None]:
# def main(data):
#     print("------------------------------------------------------------------------")
#     print("This code is for running ZeroPOIRec.")
#     # print("You need your api key from OPEN AI.")
#     print("------------------------------------------------------------------------")

# #     if len(sys.argv) != 2:
# #         print("Run Cmd: python main.py data")
# #         print("\nParamters -----------------------------------------------------------")
# #         print("1. data: {Yelp, NYC, Tokyo}")
        
# #         sys.exit(-1)

# #     # For user parameters
# #     data = sys.argv[1]
    
#     if data == "Yelp":
#         user_data = pd.read_csv('./datasets/preprocessed_yelp_100.csv')
#         # user_data = pd.read_csv('./datasets/preprocessed_yelp_hard.csv')
#         poi_data = pd.read_csv('./datasets/business_info_yelp.csv')
        
#         # For testing
#         print(user_data.shape)
#         user_data = user_data.head(100)
        
#         poi_data['addr'] = (poi_data['address'].fillna('') + ' ' +\
#                   poi_data['city'].fillna('') + ' ' +\
#                   poi_data['state'].fillna('')).replace(',', '')
#         poi_data['cat'] = poi_data['categories'].fillna('').apply(lambda x: x.split(',')[0])
#         poi_data.rename(columns = {'business_id' : 'pid', 'name' : 'placename'}, inplace=True)
        
#     elif data == "NYC":
#         user_data = pd.read_csv('./datasets/preprocessed_nyc_100.csv')
#         poi_data = pd.read_csv('./datasets/business_info_nyc.csv')
#         poi_data.rename(columns = {'venueId' : 'pid', 'venueCategory' : 'cat', 'address' : 'addr'}, inplace=True)
        
#     elif data == "Tokyo":
#         user_data = pd.read_csv('./datasets/preprocessed_tky_100.csv')
#         poi_data = pd.read_csv('./datasets/business_info_tky.csv')
#         poi_data.rename(columns = {'venueId' : 'pid', 'venueCategory' : 'cat', 'address' : 'addr'}, inplace=True)

#     # Parameters for GPT API
#     # model = "gpt-3.5-turbo-0613"
#     # temperature = 0
    
#     # start recommend
#     zeropoirec(data, user_data, poi_data)

In [None]:
# main("Yelp")

In [None]:
# print(k_list_arr)

# for i in [1,2,3,5]:
#     print('hit rate@',i)
#     print(sum(k_list_arr < i) / len(k_list_arr))

In [None]:
def main(data):
    """Load the tables of one dataset and run the recommendation loop on them.

    Args:
        data: Dataset name: "Yelp", "NYC" or "Tokyo".

    Raises:
        ValueError: If ``data`` is not one of the supported dataset names.
    """
    print("------------------------------------------------------------------------")
    print("This code is for running ZeroPOIRec.")
    print("------------------------------------------------------------------------")

    if data == "Yelp":
        user_data = pd.read_csv('./datasets/preprocessed_yelp_100_for_test.csv')
        poi_data = pd.read_csv('./datasets/business_info_yelp.csv')

        # For testing
        print(user_data.shape)
        # Keep only the rows from position 268 onwards and renumber them from 0,
        # because zeropoirec() uses a running counter as the row label.
        user_data = user_data.iloc[268:].reset_index(drop=True)

        # Build one address string per place. .str.replace removes commas inside
        # the text; the previous Series.replace(',', '') only matched cells that
        # were exactly ',' and so never stripped anything.
        poi_data['addr'] = (poi_data['address'].fillna('') + ' ' +
                            poi_data['city'].fillna('') + ' ' +
                            poi_data['state'].fillna('')).str.replace(',', '', regex=False)
        # The first entry of the comma-separated categories is used as the category.
        poi_data['cat'] = poi_data['categories'].fillna('').apply(lambda x: x.split(',')[0])
        poi_data = poi_data.rename(columns={'business_id': 'pid', 'name': 'placename'})

    elif data == "NYC":
        user_data = pd.read_csv('./datasets/preprocessed_nyc_100_for_test.csv')
        poi_data = pd.read_csv('./datasets/business_info_nyc.csv')
        # Keep only the rows from position 22 onwards and renumber them from 0.
        user_data = user_data.iloc[22:].reset_index(drop=True)

        poi_data = poi_data.rename(columns={'venueId': 'pid', 'venueCategory': 'cat', 'address': 'addr'})

    elif data == "Tokyo":
        user_data = pd.read_csv('./datasets/preprocessed_tky_100.csv')
        poi_data = pd.read_csv('./datasets/business_info_tky.csv')
        poi_data = poi_data.rename(columns={'venueId': 'pid', 'venueCategory': 'cat', 'address': 'addr'})

    else:
        # Previously this fell through to an UnboundLocalError on `user_data`.
        raise ValueError(f"Unsupported dataset {data!r}; expected 'Yelp', 'NYC' or 'Tokyo'")

    # start recommend
    zeropoirec(data, user_data, poi_data)

In [None]:
# Evaluation run on the NYC dataset. zeropoirec() stores the per-user target
# ranks in the global `k_list_arr`, which the following cells print and save.
main("NYC")

In [None]:
# Target ranks collected by the preceding run, followed by hit rate@k: the
# share of users whose 0-based target rank is below k.
print(k_list_arr)

for cutoff in (1, 2, 3, 5):
    print('hit rate@', cutoff)
    print(sum(k_list_arr < cutoff) / len(k_list_arr))

In [None]:
# Output path for the ranks of the first run. This used to point at
# "_iter2.txt", which the next save cell overwrites, while the summary cell at
# the end reads "_iter1.txt".
save_path = './results/NYC/Gen-MMR-Bin_iter1.txt'

# Create the output directory in case it does not exist yet.
import os
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Save the array as a text file, one value per line.
np.savetxt(save_path, k_list_arr, fmt='%d')

In [None]:
# Second evaluation run on the NYC dataset; it overwrites the global
# `k_list_arr` with the ranks of this run.
main("NYC")

In [None]:
import os

# Ranks of the preceding run and the hit rate at each cutoff.
print(k_list_arr)

for cutoff in (1, 2, 3, 5):
    print('hit rate@', cutoff)
    print(sum(k_list_arr < cutoff) / len(k_list_arr))

# Output path for this run.
save_path = './results/NYC/Gen-MMR-Bin_iter2.txt'

# Create the output directory in case it does not exist yet, then write the
# array as text, one value per line.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
np.savetxt(save_path, k_list_arr, fmt='%d')

In [None]:
# Third evaluation run on the NYC dataset; it overwrites the global
# `k_list_arr` with the ranks of this run.
main("NYC")

In [None]:
import os

# Ranks of the preceding run and the hit rate at each cutoff.
print(k_list_arr)

for cutoff in (1, 2, 3, 5):
    print('hit rate@', cutoff)
    print(sum(k_list_arr < cutoff) / len(k_list_arr))

# Output path for this run.
save_path = './results/NYC/Gen-MMR-Bin_iter3.txt'

# Create the output directory in case it does not exist yet, then write the
# array as text, one value per line.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
np.savetxt(save_path, k_list_arr, fmt='%d')

In [None]:
import numpy as np

# Re-read the rank file of each run and print its hit rates.
for iter_num in (1, 2, 3):
    file_path = f'./results/NYC/Gen-MMR-Bin_iter{iter_num}.txt'

    # The file holds whitespace-separated integers, one per line as written by
    # np.savetxt above.
    with open(file_path, 'r') as f:
        k_list_arr = np.array(list(map(int, f.read().split())))

    print(f'=== Iteration {iter_num} ===')

    for k in (1, 2, 3, 5):
        hit_rate = (k_list_arr < k).mean()
        print(f'hit rate@{k}: {hit_rate:.4f}')
    print()

In [None]:
import os
import signal

# Send SIGTERM to the current process, i.e. the notebook kernel itself.
# Nothing after this cell can run in the same session.
# NOTE(review): presumably done to release the GPU memory held by the loaded
# model -- confirm.
os.kill(os.getpid(), signal.SIGTERM)