### 1. sentiment clf 용 데이터 전처리
- output: `senti_{test}_data_processed.jsonl`
- 동일 발화자의 발화 병합, role 구분

In [37]:
from tqdm import tqdm
import json

def preprocess(data_file, out_file, dpath):
    with open(dpath + data_file, 'r', encoding='utf-8') as fin, open(dpath + out_file, 'w', encoding='utf-8') as fout:
        for line in tqdm(fin):
            conversation = json.loads(line)
            if len(conversation['messages']) == 0:
                continue
            
            utt_id = 0
            processed_data = {
                "conv_id": conversation["conversationId"],
                "dialog": [],
                "movieMentions": conversation["movieMentions"],
                "seekerPolarity": conversation["initiatorQuestions"]
            }
            
            previous_sender_id = None
            current_message = None
            
            for message in conversation["messages"]:
                # Role 결정
                if message["senderWorkerId"] == conversation["initiatorWorkerId"]:
                    role = "Seeker"
                else:
                    role = "Recommender"
                    
                if message["senderWorkerId"] == previous_sender_id:
                    # 동일 발화자면 이전 내용과 결합
                    current_message["text"] += " " + message["text"]
                    current_message["movies"] += message["movie"]
                    current_message["entity"] += message["entity"]
                else:
                    if current_message:
                        processed_data["dialog"].append(current_message)
                        utt_id += 1

                    current_message = {
                        "utt_id": utt_id,
                        "role": role,
                        "text": message["text"],
                        "movies": message["movie"],
                        "entity": message["entity"]
                    }

                previous_sender_id = message["senderWorkerId"]

            # 마지막 메시지 추가
            if current_message:
                processed_data["dialog"].append(current_message)


            fout.write(json.dumps(processed_data, ensure_ascii=False) + '\n')


In [None]:
f = open(os.path.join(dpath, 'test_data_dbpedia.jsonl'), 'r', encoding='utf-8')
test_data = [json.loads(line) for line in f]

In [39]:
dpath = "/home/hyuns6100/[4]newCRS/data/redial/"

preprocess('train_data_dbpedia.jsonl', 'senti_train_data_processed.jsonl', dpath)
preprocess('valid_data_dbpedia.jsonl', 'senti_valid_data_processed.jsonl', dpath)
preprocess('test_data_dbpedia.jsonl', 'senti_test_data_processed.jsonl', dpath)

# with open(args.dataset + 'item_ids.json', 'w', encoding='utf-8') as f:
#     json.dump(list(item_set), f, ensure_ascii=False)
# print(f'#item: {len(item_set)}')

9006it [00:01, 8789.51it/s]
1000it [00:00, 9769.12it/s]
1342it [00:00, 10029.48it/s]


In [36]:
# 예시
exm = {"conv_id": "20001", "dialog": [{"utt_id": 0, "role": "Seeker", "text": "Hi I am looking for a movie like @111776", "movies": ["<http://dbpedia.org/resource/Super_Troopers>"], "entity": []}, {"utt_id": 1, "role": "Recommender", "text": "You should watch @151656", "movies": ["<http://dbpedia.org/resource/Police_Academy_(film)>"], "entity": ["<http://dbpedia.org/resource/Israel>"]}, {"utt_id": 2, "role": "Seeker", "text": "Is that a great one? I have never seen it. I have seen @192131 I mean @134643", "movies": ["<http://dbpedia.org/resource/American_Pie_(film)>", "<http://dbpedia.org/resource/American_Pie_(film)>"], "entity": []}, {"utt_id": 3, "role": "Recommender", "text": "Yes @151656 is very funny and so is @94688", "movies": ["<http://dbpedia.org/resource/Police_Academy_(film)>", "<http://dbpedia.org/resource/Police_Academy_2:_Their_First_Assignment>"], "entity": ["<http://dbpedia.org/resource/Humour>"]}, {"utt_id": 4, "role": "Seeker", "text": "It sounds like I need to check them out", "movies": [], "entity": []}, {"utt_id": 5, "role": "Recommender", "text": "yes you will enjoy them", "movies": [], "entity": []}, {"utt_id": 6, "role": "Seeker", "text": "I appreciate your time. I will need to check those out. Are there any others you would recommend?", "movies": [], "entity": []}, {"utt_id": 7, "role": "Recommender", "text": "yes @101794", "movies": ["<http://dbpedia.org/resource/Lethal_Weapon>"], "entity": []}, {"utt_id": 8, "role": "Seeker", "text": "Thank you i will watch that too", "movies": [], "entity": []}, {"utt_id": 9, "role": "Recommender", "text": "and also @91481", "movies": ["<http://dbpedia.org/resource/Beverly_Hills_Cop>"], "entity": []}, {"utt_id": 10, "role": "Seeker", "text": "Thanks for the suggestions.", "movies": [], "entity": []}, {"utt_id": 11, "role": "Recommender", "text": "you are welcome and also @124771", "movies": ["<http://dbpedia.org/resource/48_Hrs.>"], "entity": []}, {"utt_id": 12, "role": "Seeker", "text": "thanks goodbye", "movies": [], "entity": []}]}

exm

{'conv_id': '20001',
 'dialog': [{'utt_id': 0,
   'role': 'Seeker',
   'text': 'Hi I am looking for a movie like @111776',
   'movies': ['<http://dbpedia.org/resource/Super_Troopers>'],
   'entity': []},
  {'utt_id': 1,
   'role': 'Recommender',
   'text': 'You should watch @151656',
   'movies': ['<http://dbpedia.org/resource/Police_Academy_(film)>'],
   'entity': ['<http://dbpedia.org/resource/Israel>']},
  {'utt_id': 2,
   'role': 'Seeker',
   'text': 'Is that a great one? I have never seen it. I have seen @192131 I mean @134643',
   'movies': ['<http://dbpedia.org/resource/American_Pie_(film)>',
    '<http://dbpedia.org/resource/American_Pie_(film)>'],
   'entity': []},
  {'utt_id': 3,
   'role': 'Recommender',
   'text': 'Yes @151656 is very funny and so is @94688',
   'movies': ['<http://dbpedia.org/resource/Police_Academy_(film)>',
    '<http://dbpedia.org/resource/Police_Academy_2:_Their_First_Assignment>'],
   'entity': ['<http://dbpedia.org/resource/Humour>']},
  {'utt_id': 4

### 2. data_process 
- output: `senti_redial.py`
- UniCRS > src/data/redial/process.py 참고
- speaker 표시 ()

In [54]:
import json
import os


def reformat_data(input_file, output_file, dpath):
    with open(dpath + input_file, 'r', encoding='utf-8') as fin, open(dpath + output_file, 'w', encoding='utf-8') as fout:
        unique_conv_id = 0

        for line in tqdm(fin):
            conversation = json.loads(line)
            contexts = []
            entities = []

            for message in conversation["dialog"]:
                role_prefix = "User: " if message["role"] == "Seeker" else "System: "
                formatted_text = role_prefix + message["text"]

                # Update context and entities
                contexts.append(formatted_text)
                entity_ids = [entity2id[entity] for entity in message["entity"] if entity in entity2id]
                entities.extend(entity_ids)
                
                if message["role"] == "Seeker":
                    # Write the current context and entities
                    fout.write(json.dumps({
                        "conv_id": conversation["conv_id"],
                        "unique_conv_id": str(unique_conv_id),
                        "contexts": contexts,
                        "entities": list(entities)
                    }, ensure_ascii=False) + '\n')
                    
                    # Prepare for the next context
                    unique_conv_id += 1
                    #context = [formatted_text]  # Start new context with current Seeker's utterance
                    entities = [] # Reset entities for the new Seeker


In [55]:
dpath = "tmp_data/redial/"

entity2id = json.load(
            open(os.path.join(dpath, 'entity2id.json'), 'r', encoding='utf-8'))

reformat_data('senti_train_data_processed.jsonl', 'reformatted_senti_train_data_processed.jsonl', dpath)
reformat_data('senti_valid_data_processed.jsonl', 'reformatted_senti_valid_data_processed.jsonl', dpath)
reformat_data('senti_test_data_processed.jsonl', 'reformatted_senti_test_data_processed.jsonl', dpath)


9006it [00:01, 8339.31it/s]
1000it [00:00, 11477.09it/s]
1342it [00:00, 12972.47it/s]


In [1]:
ex1 = {"conv_id": "20675", "unique_conv_id": "608", "context": ["System: Hello, how are you? What type of movies do you like?", "User: Hi! Good I like horror or thriller movies. Any recommendations?"], "entity": [16988, 22458, 19559]}
ex2 = {"conv_id": "20675", "unique_conv_id": "609", "context": ["System: Hello, how are you? What type of movies do you like?", "User: Hi! Good I like horror or thriller movies. Any recommendations?", "System: I watched a good horror movies the other day. Have you seen @103175 ?", "User: No, but I have seen @77161 and @130591 . I will have to see that."], "entity": [16988]}
ex3 = {"conv_id": "20675", "unique_conv_id": "610", "context": ["System: Hello, how are you? What type of movies do you like?", "User: Hi! Good I like horror or thriller movies. Any recommendations?", "System: I watched a good horror movies the other day. Have you seen @103175 ?", "User: No, but I have seen @77161 and @130591 . I will have to see that.", "System: Have you seen @100070 ? It's one of my favorite thriller movies.", "User: No, I have never heard of that! Sounds cool. Any other Thriller movies?"], "entity": [22458, 19559, 2959, 22458, 19559]}
ex4 = {"conv_id": "20675", "unique_conv_id": "611", "context": ["System: Hello, how are you? What type of movies do you like?", "User: Hi! Good I like horror or thriller movies. Any recommendations?", "System: I watched a good horror movies the other day. Have you seen @103175 ?", "User: No, but I have seen @77161 and @130591 . I will have to see that.", "System: Have you seen @100070 ? It's one of my favorite thriller movies.", "User: No, I have never heard of that! Sounds cool. Any other Thriller movies?", "System: Perhaps my all time favorite is @187310 . Have you seen that?", "User: Yes. I would say that's more adventure than thriller though haha. Thanks and have a good one!"], "entity": [22458]}


In [6]:
ex1, '   ', ex2, '   ', ex3, '   ', ex4

({'conv_id': '20675',
  'unique_conv_id': '608',
  'context': ['System: Hello, how are you? What type of movies do you like?',
   'User: Hi! Good I like horror or thriller movies. Any recommendations?'],
  'entity': [16988, 22458, 19559]},
 '   ',
 {'conv_id': '20675',
  'unique_conv_id': '609',
  'context': ['System: Hello, how are you? What type of movies do you like?',
   'User: Hi! Good I like horror or thriller movies. Any recommendations?',
   'System: I watched a good horror movies the other day. Have you seen @103175 ?',
   'User: No, but I have seen @77161 and @130591 . I will have to see that.'],
  'entity': [16988]},
 '   ',
 {'conv_id': '20675',
  'unique_conv_id': '610',
  'context': ['System: Hello, how are you? What type of movies do you like?',
   'User: Hi! Good I like horror or thriller movies. Any recommendations?',
   'System: I watched a good horror movies the other day. Have you seen @103175 ?',
   'User: No, but I have seen @77161 and @130591 . I will have to see

### 3. PLM tokenizing & convert to id
- `dataset_senti.py`
- UniCRS > dataset_pre/rec.py 참고
- What To Do
  - PLM tokenizer 불러와서 토큰화 및 id 변환
  - Dataset 및 DataCollator 생성 => DataLoader 생성

In [None]:
import json
import os
from collections import defaultdict

import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer

from utils import padded_tensor

class CRSSentiDataset(Dataset): 
    def __init__(
        self, dataset, split, tokenizer, debug=False,
        context_max_length=None
    ):
        super(CRSSentiDataset, self).__init__()
        #self.debug = debug
        self.tokenizer = tokenizer
        self.prompt_tokenizer = prompt_tokenizer
        self.use_resp = use_resp

        self.context_max_length = context_max_length
        if self.context_max_length is None:
            self.context_max_length = self.tokenizer.model_max_length

        dataset_dir = os.path.join('data', dataset)
        data_file = os.path.join(dataset_dir, f'reformat_senti_{split}_data_processed.jsonl')
        self.data = []
        self.prepare_data(data_file)
        
        def prepare_data(self, data_file):
            with open(data_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
                if self.debug:
                    lines = lines[:1024]

                for line in tqdm(lines):
                    dialog = json.loads(line)

In [None]:
reformated_test = []
data_file = os.path.join(dpath, 'new_reformatted_senti_test_data_processed.jsonl')

# with open(data_file, 'r', encoding='utf-8') as f:
#         lines = f.readlines()
#         reformated_test.append(lines)

In [None]:
reformated_test = []
data_file = os.path.join(dpath, 'new_reformatted_senti_test_data_processed.jsonl')

result_data = []
def prepare_data(data_file):
    with open(data_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        # if self.debug:
        #     lines = lines[:1024]

        context = ''
        
        for line in tqdm(lines):
            dialog = json.loads(line)
            
            for i, utt in enumerate(dialog['contexts']):
                context += utt
                context += "[EOS]" #self.tokenizer.eos_token
            
            context_ids = context.split() #self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(context))
            #context_ids = context_ids[-self.max_length:]
            
            data = {
                "conv_id": dialog["conv_id"],
                "uni_conv_id": dialog["unique_conv_id"],
                "context": context_ids,
                "entities": dialog["entities"],
                # "senti": list()
            }
            result_data.append(data)
            
            if dialog['conv_id'] == '20041':
                break
        
# def __getitem__(self, ind):
#         return self.data[ind]

# def __len__(self):
#         return len(self.data)

- dialog 예시



  dialog 1: {'**conv_id**': '20001', '**unique_conv_id**': '0', '**contexts**': ['User: Hi I am looking for a movie like @111776'], '**entities**': []}

  
  dialog 2: {"**conv_id**": "20001", "**unique_conv_id**": "1", "**contexts**": ["User: Hi I am looking for a movie like @111776", "**System**: You should watch @151656", "**User**: Is that a great one? I have never seen it. I have seen @192131 I mean @134643"], "**entities**": [24047]}


## Aspect Sentiment Classification 을 위한 데이터 전처리

### 0. redial_movie_id2_entityid
- 데이터의 "@숫자"의 영화 id를 entity2id 와 매핑하기

In [340]:
import json

def load_all_data():
    all_data = dict()
    for data_split in ['train', 'valid', 'test']:
        json_data = load_json("/home/hyuns6100/[4]newCRS/data/redial/", data_split)
        all_data[data_split] = json_data
    
    return all_data
    
def load_json(path, split):
    data = []
    with open(path + f"filled_{split}_data_dbpedia_raw.jsonl", 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

all_data = load_all_data()


In [341]:
all_data = all_data['train'] + all_data['valid'] + all_data['test']
len(all_data)

11348

In [368]:
import re 

############################################ 
# redial 의 movie id 를 entity2id (dbpeida link를 이용하여) 와 매핑
# 데이터 손수 확인 필요..
############################################

entity2id = json.load(
            open(os.path.join(dpath, 'entity2id.json'), 'r', encoding='utf-8'))

redial_movie_id2_entityid = {}

for conv in all_data: 
    for message in conv["messages"]:
        # message: "text", "movie", "movie_name"
        
        # 메시지에서 영화 ID 추출
        movie_ids = re.findall(r'@(\d+)', message["text"])
        
        if len(movie_ids) > 0 and len(message["movie"]) == 0:
            continue
        
        for movie_id in movie_ids: # @숫자 형식의 영화 id
            movie_title = conv["movieMentions"][movie_id]
            
            # 연속 공백 => 하나의 공백 / 맨 뒤 공백 제거
            movie_title = re.sub(r'\s+', ' ', movie_title).strip()
            
            try:
                mv_idx = message["movie_name"].index(movie_title)
                db_url = message["movie"][mv_idx]
                mv_entity_id = entity2id[db_url]
            
                if not movie_id in redial_movie_id2_entityid:        
                    redial_movie_id2_entityid[movie_id] = mv_entity_id
            except:
                continue
            
            # total_movie = len(message["movie"])
            
            # for mv_idx in range(total_movie): # 0, 1, ...
            #     ori_movie_id = 
                
            
            # if len(message['movie']) == 0:
            #     continue
            
            # # "movies" 리스트 길이가 1이면 이름 매칭 없이 entity2id 사용
            # if len(message["movie"]) == 1:
            #     dbpedia_url = message["movie"][0]
            #     dbpedia_id = entity2id[dbpedia_url] #dbpedia_id = entity2id.get(dbpedia_url)
            #     redial_movie_id2_entityid[movie_id] = dbpedia_id
                
            # else:
            #     # "movies" 리스트 길이가 1보다 크면 이름 매칭 수행
            #     movie_title = conv["movieMentions"].get(movie_id)
            #     if movie_title:
            #         # 영화 제목을 단어별로 분리
            #         title_words = set(movie_title.split())
                    
            #         # 영화 DBpedia URL과 비교
            #         best_match = None
            #         best_match_count = 0
                    
            #         for dbpedia_url in message["movie"]: # "<http://dbpedia.org/resource/Ghostbusters>"
            #             dbpedia_id = entity2id[dbpedia_url]
                        
            #             #dbpedia_words = set(dbpedia_url.split('/')[-1].replace('_', ' ').split())
            #             #dbpedia_words = set(dbpedia_url.split('/')[-1].replace('>', '').replace('_', ' ').split())
            #             dbpedia_words = set(re.sub(r'[.,><_]', ' ', dbpedia_url.split('/')[-1]).split())
                        
            #             # 교집합 단어 개수 카운트
            #             match_count = len(title_words.intersection(dbpedia_words))
            #             if match_count > best_match_count:
            #                 best_match = dbpedia_id
            #                 best_match_count = match_count
                        
            #         # 매칭 후에
            #         redial_movie_id2_entityid[movie_id] = best_match
            #             # # 교집합을 사용하여 일치하는 영화 찾기
            #             # if title_words.intersection(dbpedia_words):
            #             #     dbpedia_id = entity2id[dbpedia_url]
            #             #     #dbpedia_id = entity2id.get(dbpedia_url)
            #             #     redial_movie_id2_entityid[movie_id] = dbpedia_id
            #             #     break

In [371]:
with open("/home/hyuns6100/[4]newCRS/data/redial/new_redial_movie_id2_entityid.jsonl", "w") as f:
    json.dump(redial_movie_id2_entityid, f)

In [373]:
entityid_redial_movie_id = {v:k for k, v in redial_movie_id2_entityid.items()}

with open("/home/hyuns6100/[4]newCRS/data/redial/new_entityid_redial_movie_id.jsonl", "w") as f:
    json.dump(entityid_redial_movie_id, f)

### 1. conv_id와 redial의 user의 sentiment polarity 통합

- redial 데이터의 initialQuestions 를 바탕으로 conv_id로 매핑하여 영화에 대한 user의 sentiment polarity 통합

In [384]:
import json
import os
import re
import html
from collections import defaultdict

label_map = {0:"dislike", 1: "like", 2: "unknown"}

movie_pattern = re.compile(r'@\d+')

def process_utt(utt, movieid2name, replace_movieId):
    def convert(match):
        movieid = match.group(0)[1:]
        if movieid in movieid2name:
            movie_name = movieid2name[movieid]
            movie_name = ' '.join(movie_name.split())
            return movie_name
        else:
            return match.group(0)

    if replace_movieId:
        utt = re.sub(movie_pattern, convert, utt)
    utt = ' '.join(utt.split())
    utt = html.unescape(utt)

    return utt

check_dict = defaultdict(dict)

def asc_process_data(input_file, output_file, dpath):
    global check_dict
    
    with open(dpath + input_file, 'r', encoding='utf-8') as fin, open(dpath + output_file, 'w', encoding='utf-8') as fout:
        unique_conv_id = 0
        cnt = 0

        for line in tqdm(fin):
            conversation = json.loads(line)
            contexts = []
            aspects = defaultdict(dict)
            polarity = defaultdict(dict)
            dialogs = conversation["dialog"]
            movieid2name = conversation["movieMentions"]
            seekerPolarity = conversation["seekerPolarity"]

            for i, message in enumerate(dialogs):
                # @movie_id => movie_name
                utt = process_utt(message["text"], movieid2name, replace_movieId=True)
                
                role_prefix = "User: " if message["role"] == "Seeker" else "System: "
                formatted_text = role_prefix + utt

                # Update context and entities
                contexts.append(formatted_text)
                # contexts.extend(["</s>"])
                
                # if len(message["movies"]) != 0:
                for movie in message["movies"]:
                    if movie in entity2id:
                        entity_mv_id = entity2id[movie]
                        # redial_mv_id = new_entityid_redial_movie_id[str(entity_mv_id)]
                        try:
                            redial_mv_id = new_entityid_redial_movie_id[str(entity_mv_id)]
                            movie_title = movieid2name[redial_mv_id]
                        except:
                            # # redial_mv_id = re.findall(r'@(\d+)', message["text"])
                            # print(conversation["conv_id"], message["utt_id"], "entityid:", entity_mv_id, "redialid:", redial_mv_id)
                            for k, v in new_redial_movie_id2_entityid.items():
                                if v == entity_mv_id:
                                    try: # 동일 entity id가 여러 key (original redial movie id)에 매핑되었을 수도 있음
                                        movie_title = movieid2name[k]
                                        redial_mv_id = k
                                    except:
                                        continue
                                    
                        aspects[entity_mv_id] = movie_title
                        polarity[entity_mv_id] = seekerPolarity[redial_mv_id]['liked']
                
                if message["role"] == "Seeker" and len(aspects) != 0:
                    try:
                        assert len(set(aspects)) == len(set(polarity))
                    except:
                        print("길이 불일치: ", conversation["conv_id"], message["utt_id"])
                    
                    # Write the current context and entities
                    fout.write(json.dumps({
                        "conv_id": conversation["conv_id"],
                        "uni_conv_id": str(unique_conv_id),
                        "contexts": contexts,
                        "aspects": aspects,
                        "polarity": polarity
                    }, ensure_ascii=False) + '\n')
                    
                    # Prepare for the next context
                    unique_conv_id += 1
                    contexts = [] # 누적 X
                    aspects = defaultdict(dict) # Reset entities for the new Seeker
                    polarity = defaultdict(dict)
                    
                    
                    
                    # entity_movie_ids = [entity2id[movie] for movie in message["movies"] if movie in entity2id]
                    # redial_movie_ids = [new_entityid_redial_movie_id[entity_mv_id] for entity_mv_id in entity_movie_ids]
                    
                    # for i, redial_mv_id in enumerate(redial_movie_ids):
                    #     movie_title = movieid2name[redial_mv_id]
                    #     aspects[entity_movie_ids]
                    
                    
                    
                    # aspects.extend(movie_ids) # ex) "aspects": [30385, 5354, 14780]
                    
                    # # "text"에서 영화 ID 추출
                    # ori_movie_ids = re.findall(r'@(\d+)', message["text"])
                    
                    # for ori_movie_id in set(ori_movie_ids):
                    #     if ori_movie_id in seekerPolarity:
                    #         liked = seekerPolarity[ori_movie_id]["liked"]
                            
                    #         try:
                    #             movieid2entityid = redial_movie_id2_entityid[ori_movie_id]
                    #             if int(movieid2entityid) in aspects:
                    #                 polarity[int(movieid2entityid)] = liked
                    #         except:
                    #             if movieid2entityid == None:
                    #                     check_dict[cnt] = {"conv_id": conversation["conv_id"],
                    #                                         "utt_id":  message["utt_id"],
                    #                                         "ori_movie_id": ori_movie_id}
                                        
                    #                     cnt += 1
                    #                     print("movieid2entityid = None >>>> ", conversation["conv_id"], message["utt_id"], ori_movie_id)
                    #             else:
                    #                 print(conversation["conv_id"], message["utt_id"], ori_movie_id)
                                
                    #     else:
                    #         print("Can't save polarity >> Conv_id: ", conversation["conv_id"])
                
                
                # if message["role"] == "Seeker" and len(aspects) != 0:
                #     try:
                #         assert len(set(aspects)) == len(set(polarity))
                #     except:
                #         print("길이 불일치: ", conversation["conv_id"], message["utt_id"])
                    
                #     # Write the current context and entities
                #     fout.write(json.dumps({
                #         "conv_id": conversation["conv_id"],
                #         "uni_conv_id": str(unique_conv_id),
                #         "contexts": contexts,
                #         "aspects": list(set(aspects)),
                #         "polarity": polarity
                #     }, ensure_ascii=False) + '\n')
                    
                #     # Prepare for the next context
                #     unique_conv_id += 1
                #     contexts = [] # 누적 X
                #     aspects = [] # Reset entities for the new Seeker
                #     polarity = defaultdict(dict)

In [383]:
for k, v in new_redial_movie_id2_entityid.items():
    if v == 29959:
        print("Ss")
        break

Ss


In [386]:
dpath = "/home/hyuns6100/[4]newCRS/data/redial/"

entity2id = json.load(
            open(os.path.join(dpath, 'entity2id.json'), 'r', encoding='utf-8'))

new_redial_movie_id2_entityid = json.load(
            open(os.path.join(dpath, 'new_redial_movie_id2_entityid.jsonl'), 'r', encoding='utf-8'))

new_entityid_redial_movie_id = json.load(
            open(os.path.join(dpath, 'new_entityid_redial_movie_id.jsonl'), 'r', encoding='utf-8'))


asc_process_data('senti_train_data_processed.jsonl', 'tmp_asc_train_data_processed.jsonl', dpath)
asc_process_data('senti_valid_data_processed.jsonl', 'tmp_asc_valid_data_processed.jsonl', dpath)
asc_process_data('senti_test_data_processed.jsonl', 'tmp_asc_test_data_processed.jsonl', dpath)



## 아래 결과 해석:
## 1. [길이 불일치: ~] 만 데이터로 확인해보면 됨
## 2.이외 출력된 데이터들은 대화에 나타난 redial movie는 존재하나, unicrs 에서 전처리한 entity linking에서 인식되지 못해 개수가 안맞는 것
## => 그러나 어차피 entity linking이 된 movie 들만 임베딩이 되는 것이므로 상관 x (aspect 와 그에 대한 감정 label인 polarity 개수만 맞으면 됨)

9006it [00:01, 6099.06it/s]
1000it [00:00, 6480.94it/s]
1342it [00:00, 5814.64it/s]


In [267]:
check_dict

defaultdict(dict,
            {0: {'conv_id': '20454', 'utt_id': 6, 'ori_movie_id': '152496'},
             1: {'conv_id': '21009', 'utt_id': 10, 'ori_movie_id': '153346'},
             2: {'conv_id': '21061', 'utt_id': 2, 'ori_movie_id': '163708'},
             3: {'conv_id': '21528', 'utt_id': 10, 'ori_movie_id': '163708'},
             4: {'conv_id': '21651', 'utt_id': 5, 'ori_movie_id': '153346'},
             5: {'conv_id': '21985', 'utt_id': 3, 'ori_movie_id': '153346'},
             6: {'conv_id': '21985', 'utt_id': 4, 'ori_movie_id': '153346'},
             7: {'conv_id': '21989', 'utt_id': 4, 'ori_movie_id': '153346'},
             8: {'conv_id': '21989', 'utt_id': 5, 'ori_movie_id': '153346'},
             9: {'conv_id': '21992', 'utt_id': 3, 'ori_movie_id': '153346'},
             10: {'conv_id': '21992', 'utt_id': 4, 'ori_movie_id': '153346'},
             11: {'conv_id': '22168', 'utt_id': 2, 'ori_movie_id': '153346'},
             12: {'conv_id': '22817', 'utt_id': 4, 'or

In [273]:
with open(os.path.join(dpath, 'asc_senti_test_data_processed.jsonl'), 'r', encoding='utf-8') as f:
    for line in f:
        print(json.loads(line))
        break

{'conv_id': '20001', 'uni_conv_id': '0', 'contexts': ['User: Hi I am looking for a movie like Super Troopers (2001)'], 'aspects': [18292], 'polarity': {'18292': 1}}


In [284]:
data = []

## 문장의 aspect와 그에 대한 감정 label 개수 일치하는지 확인
with open(dpath + "asc_senti_test_data_processed.jsonl", 'r') as f:
    for line in f:
        lines = json.loads(line)
        # print(lines)

        for k, check in check_dict.items():
            if lines['conv_id'] == check['conv_id']:
                if len(lines['aspects']) != len(lines['polarity']):
                    print(check)

In [199]:
## 하나의 conversation에서 코드가 어떻게 돌아가는지 확인

conversation = {"conv_id": "20001", "dialog": [{"utt_id": 0, "role": "Seeker", "text": "Hi I am looking for a movie like @111776", "movies": ["<http://dbpedia.org/resource/Super_Troopers>"], "entity": []}, {"utt_id": 1, "role": "Recommender", "text": "You should watch @151656", "movies": ["<http://dbpedia.org/resource/Police_Academy_(film)>"], "entity": ["<http://dbpedia.org/resource/Israel>"]}, {"utt_id": 2, "role": "Seeker", "text": "Is that a great one? I have never seen it. I have seen @192131 I mean @134643", "movies": ["<http://dbpedia.org/resource/American_Pie_(film)>", "<http://dbpedia.org/resource/American_Pie_(film)>"], "entity": []}, {"utt_id": 3, "role": "Recommender", "text": "Yes @151656 is very funny and so is @94688", "movies": ["<http://dbpedia.org/resource/Police_Academy_(film)>", "<http://dbpedia.org/resource/Police_Academy_2:_Their_First_Assignment>"], "entity": []}, {"utt_id": 4, "role": "Seeker", "text": "It sounds like I need to check them out", "movies": [], "entity": []}, {"utt_id": 5, "role": "Recommender", "text": "yes you will enjoy them", "movies": [], "entity": []}, {"utt_id": 6, "role": "Seeker", "text": "I appreciate your time. I will need to check those out. Are there any others you would recommend?", "movies": [], "entity": []}, {"utt_id": 7, "role": "Recommender", "text": "yes @101794", "movies": ["<http://dbpedia.org/resource/Lethal_Weapon>"], "entity": []}, {"utt_id": 8, "role": "Seeker", "text": "Thank you i will watch that too", "movies": [], "entity": []}, {"utt_id": 9, "role": "Recommender", "text": "and also @91481", "movies": ["<http://dbpedia.org/resource/Beverly_Hills_Cop>"], "entity": []}, {"utt_id": 10, "role": "Seeker", "text": "Thanks for the suggestions.", "movies": [], "entity": []}, {"utt_id": 11, "role": "Recommender", "text": "you are welcome and also @124771", "movies": ["<http://dbpedia.org/resource/48_Hrs.>"], "entity": []}, {"utt_id": 12, "role": "Seeker", "text": "thanks goodbye", "movies": [], "entity": []}], "movieMentions": {"111776": "Super Troopers (2001)", "91481": "Beverly Hills Cop (1984)", "151656": "Police Academy  (1984)", "134643": "American Pie  (1999)", "192131": "American Pie ", "124771": "48 Hrs. (1982)", "94688": "Police Academy 2: Their First Assignment (1985)", "101794": "Lethal Weapon (1987)"}, "seekerPolarity": {"111776": {"suggested": 0, "seen": 1, "liked": 1}, "91481": {"suggested": 1, "seen": 2, "liked": 2}, "151656": {"suggested": 1, "seen": 0, "liked": 1}, "134643": {"suggested": 0, "seen": 1, "liked": 1}, "192131": {"suggested": 0, "seen": 1, "liked": 1}, "124771": {"suggested": 1, "seen": 2, "liked": 2}, "94688": {"suggested": 1, "seen": 0, "liked": 1}, "101794": {"suggested": 0, "seen": 2, "liked": 2}}}
contexts = []
aspects = []
polarity = dict()
dialogs = conversation["dialog"]
movieid2name = conversation["movieMentions"]
seekerPolarity = conversation["seekerPolarity"]

print(seekerPolarity)

for i, message in enumerate(dialogs):
    # @movie_id => movie_name
    utt = process_utt(message["text"], movieid2name, replace_movieId=True)

    role_prefix = "User: " if message["role"] == "Seeker" else "System: "
    formatted_text = role_prefix + utt

    if len(message["movies"]) != 0:
        movie_ids = [entity2id[movie] for movie in message["movies"] if movie in entity2id]
        aspects.extend(movie_ids)
        
        # "text"에서 영화 ID 추출
        ori_movie_ids = re.findall(r'@(\d+)', message["text"])
        
        for ori_movie_id in set(ori_movie_ids):
            if ori_movie_id in seekerPolarity:
                liked = seekerPolarity[ori_movie_id]["liked"]
                
                movieid2entityid = redial_movie_id2_entityid[ori_movie_id]    
                polarity[movieid2entityid] = liked
            else:
                print("Can't save polarity >> Conv_id: ", conversation["conv_id"])
    
    if message["role"] == "Seeker" and len(aspects) != 0:
        
        assert len(set(aspects)) == len(set(polarity))
        # Write the current context and entities
        result ={
            "conv_id": conversation["conv_id"],
            #"unique_conv_id": str(unique_conv_id),
            "contexts": contexts,
            "aspects": list(set(aspects)),
            "polarity": polarity
        }
        
        print(result)
        # Prepare for the next context
        #unique_conv_id += 1
        contexts = [] # 누적 X
        aspects = [] # Reset entities for the new Seeker
        polarity = dict()

{'111776': {'suggested': 0, 'seen': 1, 'liked': 1}, '91481': {'suggested': 1, 'seen': 2, 'liked': 2}, '151656': {'suggested': 1, 'seen': 0, 'liked': 1}, '134643': {'suggested': 0, 'seen': 1, 'liked': 1}, '192131': {'suggested': 0, 'seen': 1, 'liked': 1}, '124771': {'suggested': 1, 'seen': 2, 'liked': 2}, '94688': {'suggested': 1, 'seen': 0, 'liked': 1}, '101794': {'suggested': 0, 'seen': 2, 'liked': 2}}
{'conv_id': '20001', 'contexts': [], 'aspects': [18292], 'polarity': {18292: 1}}
{'conv_id': '20001', 'contexts': [], 'aspects': [25170, 14261], 'polarity': {25170: 1, 14261: 1}}
{'conv_id': '20001', 'contexts': [], 'aspects': [25170, 7277], 'polarity': {14261: 1, 25170: 1}}
{'conv_id': '20001', 'contexts': [], 'aspects': [9385], 'polarity': {9385: 2}}
{'conv_id': '20001', 'contexts': [], 'aspects': [17426], 'polarity': {17426: 2}}
{'conv_id': '20001', 'contexts': [], 'aspects': [29410], 'polarity': {29410: 2}}


In [None]:
## aspects에는 영화 이름이 아닌, 해당 영화의 entity2id 가 들어있음
## aspects를 aspect_id 로 바꾸고, 기존 aspects 에 "contexts"에 들어있는 실제 영화 이름을 넣어야 됨

## 이때 aspect_id에 들어있는 영화-entityid 순서에 맞게 aspects의 이름을 넣어주자
# => 그래야 Dataset 만들때 순서에 맞게 polarity (label) 저장 가능 (aspects_id를 통해 매핑되기 때문)

## + 밑의 전처리 코드는 나중에 기존 전처리 함수와 통일시키기

## senti_{split}_data_processed.jsonl, asc_{split}_data_processed.jsonl



In [338]:
import json
import os
import re
import html
from collections import defaultdict

label_map = {0:"dislike", 1: "like", 2: "unknown"}

movie_pattern = re.compile(r'@\d+')

def process_utt(utt, movieid2name, replace_movieId):
    def convert(match):
        movieid = match.group(0)[1:] # 순서대로 추출하잖아
        if movieid in movieid2name:
            movie_name = movieid2name[movieid]
            movie_name = ' '.join(movie_name.split())
            return movie_name
        else:
            return match.group(0)

    if replace_movieId:
        utt = re.sub(movie_pattern, convert, utt)
    utt = ' '.join(utt.split())
    utt = html.unescape(utt)

    return utt


def re_asc_process_data(input_file, output_file, dpath, convid_moviementions):
    
    with open(dpath + input_file, 'r', encoding='utf-8') as fin, open(dpath + output_file, 'w', encoding='utf-8') as fout:
        for line in tqdm(fin):
            conversation = json.loads(line)
            conv_id = conversation["conv_id"]
            uni_conv_id = conversation["uni_conv_id"]
            contexts = conversation["contexts"]
            polarity = conversation["polarity"]
            #aspects_entityId = deepcopy(conversation["aspects"])
            
            aspects_dict = defaultdict(dict)

            # entityid_redial_movie_id2
            for aspect_id in conversation["aspects"]:
                movieid = entityid_redial_movie_id2[str(aspect_id)]
                try:
                    movie_name = convid_moviementions[conv_id][movieid]
                except:
                    print(conv_id, uni_conv_id, "aspect: ", aspect_id, "movieid: ", movieid)
                    break
                aspects_dict[aspect_id] = movie_name
                  
            # Write the current context and entities
            fout.write(json.dumps({
                "conv_id": conv_id,
                "uni_conv_id": uni_conv_id,
                "contexts": contexts,
                "aspects": aspects_dict,
                "polarity": polarity
            }, ensure_ascii=False) + '\n')
         

In [336]:
# split = "test"
# senti_test_data = []

# with open(dpath + f"senti_{split}_data_processed.jsonl", 'r') as f:
#     for line in f:
#         senti_test_data.append(json.loads(line))
        
# convid_moviementions = defaultdict(dict)
# for data in senti_test_data:
#     convid_moviementions[data["conv_id"]] = data["movieMentions"]

In [339]:
dpath = "/home/hyuns6100/[4]newCRS/data/redial/"

entity2id = json.load(
            open(os.path.join(dpath, 'entity2id.json'), 'r', encoding='utf-8'))

redial_movie_id2_entityid = json.load(
            open(os.path.join(dpath, 'redial_movie_id2_entityid.jsonl'), 'r', encoding='utf-8'))


entityid_redial_movie_id2 = json.load(
            open(os.path.join(dpath, 'entityid_redial_movie_id2.jsonl'), 'r', encoding='utf-8'))

#asc_process_data('senti_train_data_processed.jsonl', 'asc_train_data_processed.jsonl', dpath)
# asc_process_data('senti_valid_data_processed.jsonl', 'asc_valid_data_processed.jsonl', dpath)
re_asc_process_data('asc_test_data_processed.jsonl', 'check_asc_test_data_processed.jsonl', dpath, convid_moviementions)


0it [00:00, ?it/s]

4729it [00:00, 54732.42it/s]

20041 7 aspect:  12052 movieid:  165484
20047 13 aspect:  20223 movieid:  90413
20048 16 aspect:  20223 movieid:  90413
20048 17 aspect:  20223 movieid:  90413
20052 22 aspect:  20223 movieid:  90413
20052 23 aspect:  18709 movieid:  148743
20052 24 aspect:  29959 movieid:  76481
20053 25 aspect:  18709 movieid:  148743
20053 26 aspect:  29959 movieid:  76481
20054 28 aspect:  17223 movieid:  109349
20055 32 aspect:  17223 movieid:  109349
20055 33 aspect:  29959 movieid:  76481
20057 37 aspect:  20223 movieid:  90413
20058 38 aspect:  17223 movieid:  109349
20059 43 aspect:  18709 movieid:  148743
20059 44 aspect:  17223 movieid:  109349
20083 46 aspect:  20223 movieid:  90413
20095 50 aspect:  27254 movieid:  88653
20098 51 aspect:  17223 movieid:  109349
20121 64 aspect:  18709 movieid:  148743
20121 65 aspect:  17223 movieid:  109349
20121 66 aspect:  17223 movieid:  109349
20133 71 aspect:  18709 movieid:  148743
20133 75 aspect:  14842 movieid:  204963
20137 78 aspect:  17223 mov




In [354]:
convid_moviementions['20041']

{'204334': 'Happy Death Day  (2017)',
 '125431': 'Annabelle  (2014)',
 '118338': 'The Forest  (2016)',
 '205430': 'A Quiet Place (2018)',
 '132562': 'The Last House on the Left  (1972)',
 '157765': 'The Heat  (2013)',
 '111776': 'Super Troopers (2001)',
 '187061': 'Identity Thief (2013)',
 '175203': 'Hot Tub Time Machine',
 '77161': 'A Nightmare on Elm Street (1984)'}

In [357]:
d = entityid_redial_movie_id2.get("12052")
d

'165484'

In [33]:
## v1 형식 + entity도 추가 

import json
import os
import re
import html
from collections import defaultdict
from tqdm.auto import tqdm

movie_pattern = re.compile(r'@\d+')

def process_utt(utt, movieid2name, replace_movieId):
    def convert(match):
        movieid = match.group(0)[1:]
        if movieid in movieid2name:
            movie_name = movieid2name[movieid]
            movie_name = ' '.join(movie_name.split())
            return movie_name
        else:
            return match.group(0)

    if replace_movieId:
        utt = re.sub(movie_pattern, convert, utt)
    utt = ' '.join(utt.split())
    utt = html.unescape(utt)

    return utt


def re_asc_process_data(input_file, output_file, dpath):
    with open(dpath + input_file, 'r', encoding='utf-8') as fin, open(dpath + output_file, 'w', encoding='utf-8') as fout:
        unique_conv_id = 0

        for line in tqdm(fin):
            conversation = json.loads(line)
            contexts = []
            entities = defaultdict(dict) 
            aspects = defaultdict(dict)
            polarity = defaultdict(dict)
            dialogs = conversation["dialog"]
            movieid2name = conversation["movieMentions"]
            seekerPolarity = conversation["seekerPolarity"]

            for i, message in enumerate(dialogs):
                # @movie_id => movie_name
                utt = process_utt(message["text"], movieid2name, replace_movieId=True)
                
                role_prefix = "User: " if message["role"] == "Seeker" else "System: "
                formatted_text = role_prefix + utt

                # Update context and entities
                contexts.append(formatted_text)
                # contexts.extend(["</s>"])
                
                for i, entity in enumerate(message["entities"]):
                    if entity in entity2id:
                        entity_id = entity2id[entity]
                        entities[entity_id] = message["entity_names"][i]
                
                for movie in message["movies"]:
                    if movie in entity2id:
                        entity_mv_id = entity2id[movie]
                        # redial_mv_id = new_entityid_redial_movie_id[str(entity_mv_id)]
                        try:
                            redial_mv_id = new_entityid_redial_movie_id[str(entity_mv_id)]
                            movie_title = movieid2name[redial_mv_id]
                        except:
                            # # redial_mv_id = re.findall(r'@(\d+)', message["text"])
                            # print(conversation["conv_id"], message["utt_id"], "entityid:", entity_mv_id, "redialid:", redial_mv_id)
                            for k, v in new_redial_movie_id2_entityid.items():
                                if v == entity_mv_id:
                                    try: # 동일 entity id가 여러 key (original redial movie id)에 매핑되었을 수도 있음
                                        movie_title = movieid2name[k]
                                        redial_mv_id = k
                                    except:
                                        continue
                                    
                        aspects[entity_mv_id] = movie_title
                        polarity[entity_mv_id] = seekerPolarity[redial_mv_id]['liked']
                
                if message["role"] == "Seeker" and len(aspects) != 0:
                    try:
                        assert len(set(aspects)) == len(set(polarity))
                    except:
                        print("길이 불일치: ", conversation["conv_id"], message["utt_id"])
                    
                    # Write the current context and entities
                    fout.write(json.dumps({
                        "conv_id": conversation["conv_id"],
                        "uni_conv_id": str(unique_conv_id),
                        "contexts": contexts,
                        "aspects": aspects,
                        "entity": entities,
                        "polarity": polarity
                    }, ensure_ascii=False) + '\n')
                    
                    # Prepare for the next context
                    unique_conv_id += 1
                    contexts = [] # 누적 X
                    aspects = defaultdict(dict) # Reset entities for the new Seeker
                    entities = defaultdict(dict)
                    polarity = defaultdict(dict)
                    

In [36]:
dpath = "/home/hyuns6100/[4]newCRS/data/redial/"

entity2id = json.load(
            open(os.path.join(dpath, 'entity2id.json'), 'r', encoding='utf-8'))

new_redial_movie_id2_entityid = json.load(
            open(os.path.join(dpath, 'new_redial_movie_id2_entityid.jsonl'), 'r', encoding='utf-8'))


new_entityid_redial_movie_id2 = json.load(
            open(os.path.join(dpath, 'new_entityid_redial_movie_id.jsonl'), 'r', encoding='utf-8'))

re_asc_process_data('senti_train_data_processed.jsonl', 'asc_train_data_processed.jsonl', dpath)
re_asc_process_data('senti_valid_data_processed.jsonl', 'asc_valid_data_processed.jsonl', dpath)
re_asc_process_data('senti_test_data_processed.jsonl', 'asc_test_data_processed.jsonl', dpath)


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [42]:
movie_ids = re.findall(r'@(\d+)', message["text"])
movie_ids_with_at = ['@' + id for id in movie_ids]
movie_ids_with_at

['@111776']

In [81]:
## v2 형식

import json
import os
import re
import html
from collections import defaultdict
from tqdm.auto import tqdm

movie_pattern = re.compile(r'@\d+')

def re_asc_process_data(input_file, output_file, dpath):
    with open(dpath + input_file, 'r', encoding='utf-8') as fin, open(dpath + output_file, 'w', encoding='utf-8') as fout:
        unique_conv_id = 0

        for line in tqdm(fin):
            conversation = json.loads(line)
            contexts = []
            entities = defaultdict(dict) 
            aspects = defaultdict(dict)
            polarity = defaultdict(dict)
            dialogs = conversation["dialog"]
            movieid2name = conversation["movieMentions"]
            seekerPolarity = conversation["seekerPolarity"]

            for i, message in enumerate(dialogs):
                # @movie_id => movie_name
                utt = message["text"]
                
                role_prefix = "User: " if message["role"] == "Seeker" else "System: "
                formatted_text = role_prefix + utt

                # Update context and entities
                contexts.append(formatted_text)
                # contexts.extend(["</s>"])
                
                for i, entity in enumerate(message["entities"]):
                    if entity in entity2id:
                        entity_id = entity2id[entity]
                        entities[entity_id] = message["entity_names"][i]
                
                for movie in message["movies"]:                    
                    if movie in entity2id:
                        entity_mv_id = entity2id[movie]

                        # movie_ids = re.findall(r'@(\d+)', message["text"])
                        # movie_ids_with_at = ['@' + id for id in movie_ids]
                        
                        try:
                            redial_mv_id = new_entityid_redial_movie_id[str(entity_mv_id)]
                            movie_title = movieid2name[redial_mv_id]
                        except:
                            # # redial_mv_id = re.findall(r'@(\d+)', message["text"])
                            # print(conversation["conv_id"], message["utt_id"], "entityid:", entity_mv_id, "redialid:", redial_mv_id)
                            for k, v in new_redial_movie_id2_entityid.items():
                                if v == entity_mv_id:
                                    try: # 동일 entity id가 여러 key (original redial movie id)에 매핑되었을 수도 있음
                                        movie_title = movieid2name[k]
                                        redial_mv_id = k
                                    except:
                                        continue
                                    
                        #real_mv_name = movies_with_mentions[movies_with_mentions["movieId"] == int(redial_mv_id)]["movieName"]
                        
                        # if real_mv_name.iloc[0] == movie_title:
                        aspects[entity_mv_id] = "@"+redial_mv_id
                        polarity[entity_mv_id] = seekerPolarity[redial_mv_id]['liked']
                
                if message["role"] == "Seeker" and len(aspects) != 0:
                    try:
                        assert len(set(aspects)) == len(set(polarity))
                    except:
                        print("길이 불일치: ", conversation["conv_id"], message["utt_id"])
                    
                    # Write the current context and entities
                    fout.write(json.dumps({
                        "conv_id": conversation["conv_id"],
                        "uni_conv_id": str(unique_conv_id),
                        "contexts": contexts,
                        "aspects": aspects,
                        "entity": entities,
                        "polarity": polarity
                    }, ensure_ascii=False) + '\n')
                    
                    # Prepare for the next context
                    unique_conv_id += 1
                    contexts = [] # 누적 X
                    aspects = defaultdict(dict) # Reset entities for the new Seeker
                    entities = defaultdict(dict)
                    polarity = defaultdict(dict)
                    

In [69]:
import pandas as pd
movies_with_mentions = pd.read_csv("/home/hyuns6100/data/redial_dataset/movies_with_mentions.csv")
name_ = movies_with_mentions[movies_with_mentions["movieId"] == 173191]["movieName"]
name_.iloc[0]

'House at the End of the Street (2012)'

In [82]:
dpath = "/home/hyuns6100/[4]newCRS/data/redial/"

entity2id = json.load(
            open(os.path.join(dpath, 'entity2id.json'), 'r', encoding='utf-8'))

new_redial_movie_id2_entityid = json.load(
            open(os.path.join(dpath, 'new_redial_movie_id2_entityid.jsonl'), 'r', encoding='utf-8'))


new_entityid_redial_movie_id2 = json.load(
            open(os.path.join(dpath, 'new_entityid_redial_movie_id.jsonl'), 'r', encoding='utf-8'))

re_asc_process_data('senti_train_data_processed.jsonl', 'v2_asc_train_data_processed.jsonl', dpath)
re_asc_process_data('senti_valid_data_processed.jsonl', 'v2_asc_valid_data_processed.jsonl', dpath)
re_asc_process_data('senti_test_data_processed.jsonl', 'v2_asc_test_data_processed.jsonl', dpath)


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [89]:
## v3: v1 형식 + [User: aspect] 
## v4: v2 형식 + [User: aspect] 

import json
import os
import re
import html
from collections import defaultdict
from tqdm.auto import tqdm

movie_pattern = re.compile(r'@\d+')

def re_asc_process_data(input_file, output_file, dpath):
    with open(dpath + input_file, 'r', encoding='utf-8') as fin, open(dpath + output_file, 'w', encoding='utf-8') as fout:
        unique_conv_id = 0

        for line in tqdm(fin):
            conversation = json.loads(line)
            contexts = []
            entities = defaultdict(dict) 
            aspects = defaultdict(dict)
            polarity = defaultdict(dict)
            dialogs = conversation["dialog"]
            movieid2name = conversation["movieMentions"]
            seekerPolarity = conversation["seekerPolarity"]

            for i, message in enumerate(dialogs):
                # @movie_id => movie_name
                utt = message["text"]
                
                role_prefix = "User: " if message["role"] == "Seeker" else "System: "
                formatted_text = role_prefix + utt

                # Update context and entities
                contexts.append(formatted_text)
                # contexts.extend(["</s>"])
                
                for i, entity in enumerate(message["entities"]):
                    if entity in entity2id:
                        entity_id = entity2id[entity]
                        entities[entity_id] = message["entity_names"][i]
                
                for movie in message["movies"]:                    
                    if movie in entity2id:
                        entity_mv_id = entity2id[movie]

                        # movie_ids = re.findall(r'@(\d+)', message["text"])
                        # movie_ids_with_at = ['@' + id for id in movie_ids]
                        
                        try:
                            redial_mv_id = new_entityid_redial_movie_id[str(entity_mv_id)]
                            movie_title = movieid2name[redial_mv_id]
                        except:
                            # # redial_mv_id = re.findall(r'@(\d+)', message["text"])
                            # print(conversation["conv_id"], message["utt_id"], "entityid:", entity_mv_id, "redialid:", redial_mv_id)
                            for k, v in new_redial_movie_id2_entityid.items():
                                if v == entity_mv_id:
                                    try: # 동일 entity id가 여러 key (original redial movie id)에 매핑되었을 수도 있음
                                        movie_title = movieid2name[k]
                                        redial_mv_id = k
                                    except:
                                        continue
                                    
                        #real_mv_name = movies_with_mentions[movies_with_mentions["movieId"] == int(redial_mv_id)]["movieName"]
                        
                        # if real_mv_name.iloc[0] == movie_title:
                        aspects[entity_mv_id] = "User: @"+redial_mv_id
                        polarity[entity_mv_id] = seekerPolarity[redial_mv_id]['liked']
                
                if message["role"] == "Seeker" and len(aspects) != 0:
                    try:
                        assert len(set(aspects)) == len(set(polarity))
                    except:
                        print("길이 불일치: ", conversation["conv_id"], message["utt_id"])
                    
                    # Write the current context and entities
                    fout.write(json.dumps({
                        "conv_id": conversation["conv_id"],
                        "uni_conv_id": str(unique_conv_id),
                        "contexts": contexts,
                        "aspects": aspects,
                        "entity": entities,
                        "polarity": polarity
                    }, ensure_ascii=False) + '\n')
                    
                    # Prepare for the next context
                    unique_conv_id += 1
                    contexts = [] # 누적 X
                    aspects = defaultdict(dict) # Reset entities for the new Seeker
                    entities = defaultdict(dict)
                    polarity = defaultdict(dict)
                    

In [91]:
dpath = "/home/hyuns6100/[4]newCRS/data/redial/"

entity2id = json.load(
            open(os.path.join(dpath, 'entity2id.json'), 'r', encoding='utf-8'))

new_redial_movie_id2_entityid = json.load(
            open(os.path.join(dpath, 'new_redial_movie_id2_entityid.jsonl'), 'r', encoding='utf-8'))


new_entityid_redial_movie_id2 = json.load(
            open(os.path.join(dpath, 'new_entityid_redial_movie_id.jsonl'), 'r', encoding='utf-8'))

re_asc_process_data('senti_train_data_processed.jsonl', 'v4_asc_train_data_processed.jsonl', dpath)
re_asc_process_data('senti_valid_data_processed.jsonl', 'v4_asc_valid_data_processed.jsonl', dpath)
re_asc_process_data('senti_test_data_processed.jsonl', 'v4_asc_test_data_processed.jsonl', dpath)


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
## v5 형식: Recommender 발화 통합 (미완))

## v3: v1 형식 + [User: aspect] 
## v4: v2 형식 + [User: aspect] 

import json
import os
import re
import html
from collections import defaultdict
from tqdm.auto import tqdm

movie_pattern = re.compile(r'@\d+')

def re_asc_process_data(input_file, output_file, dpath):
    with open(dpath + input_file, 'r', encoding='utf-8') as fin, open(dpath + output_file, 'w', encoding='utf-8') as fout:
        unique_conv_id = 0

        for line in tqdm(fin):
            conversation = json.loads(line)
            contexts = []
            entities = defaultdict(dict) 
            aspects = defaultdict(dict)
            polarity = defaultdict(dict)
            dialogs = conversation["dialog"]
            movieid2name = conversation["movieMentions"]
            seekerPolarity = conversation["seekerPolarity"]

            for i, message in enumerate(dialogs):
                # @movie_id => movie_name
                utt = message["text"]
                
                role_prefix = "User: " if message["role"] == "Seeker" else "System: "
                formatted_text = role_prefix + utt

                # Update context and entities
                contexts.append(formatted_text)
                # contexts.extend(["</s>"])
                
                for i, entity in enumerate(message["entities"]):
                    if entity in entity2id:
                        entity_id = entity2id[entity]
                        entities[entity_id] = message["entity_names"][i]
                
                for movie in message["movies"]:                    
                    if movie in entity2id:
                        entity_mv_id = entity2id[movie]

                        # movie_ids = re.findall(r'@(\d+)', message["text"])
                        # movie_ids_with_at = ['@' + id for id in movie_ids]
                        
                        try:
                            redial_mv_id = new_entityid_redial_movie_id[str(entity_mv_id)]
                            movie_title = movieid2name[redial_mv_id]
                        except:
                            # # redial_mv_id = re.findall(r'@(\d+)', message["text"])
                            # print(conversation["conv_id"], message["utt_id"], "entityid:", entity_mv_id, "redialid:", redial_mv_id)
                            for k, v in new_redial_movie_id2_entityid.items():
                                if v == entity_mv_id:
                                    try: # 동일 entity id가 여러 key (original redial movie id)에 매핑되었을 수도 있음
                                        movie_title = movieid2name[k]
                                        redial_mv_id = k
                                    except:
                                        continue
                                    
                        #real_mv_name = movies_with_mentions[movies_with_mentions["movieId"] == int(redial_mv_id)]["movieName"]
                        
                        # if real_mv_name.iloc[0] == movie_title:
                        aspects[entity_mv_id] = "User: @"+redial_mv_id
                        polarity[entity_mv_id] = seekerPolarity[redial_mv_id]['liked']
                
                if message["role"] == "Seeker" and len(aspects) != 0:
                    try:
                        assert len(set(aspects)) == len(set(polarity))
                    except:
                        print("길이 불일치: ", conversation["conv_id"], message["utt_id"])
                    
                    # Write the current context and entities
                    fout.write(json.dumps({
                        "conv_id": conversation["conv_id"],
                        "uni_conv_id": str(unique_conv_id),
                        "contexts": contexts,
                        "aspects": aspects,
                        "entity": entities,
                        "polarity": polarity
                    }, ensure_ascii=False) + '\n')
                    
                    # Prepare for the next context
                    unique_conv_id += 1
                    contexts = [] # 누적 X
                    aspects = defaultdict(dict) # Reset entities for the new Seeker
                    entities = defaultdict(dict)
                    polarity = defaultdict(dict)
                    

### 2. ASC task 를 위한 전처리

- 데이터 format
    - "conv_id"
    - "uni_conv_id"
    - "contexts"
    - "aspects"
    - "polarities"

In [76]:
import json
import os
from collections import defaultdict
import random

from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AutoTokenizer
from transformers import BertTokenizer
from tqdm.auto import tqdm

from utils import padded_tensor


class CRSASCDataset(Dataset): 
    def __init__(
        self, dpath, split, tokenizer, context_max_length=None, 
    ):
        super(CRSASCDataset, self).__init__()
        self.tokenizer = tokenizer
        self.context_max_length = context_max_length

        data_file = os.path.join(dpath, f'v2_asc_{split}_data_processed.jsonl')
        self.data = []
        self.prepare_data(data_file)
        
    def prepare_data(self, data_file):
        with open(data_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    
            for line in tqdm(lines):
                dialog = json.loads(line)
                #contexts = self.tokenizer.cls_token
                contexts = ''
                
                for i, utt in enumerate(dialog["contexts"]):
                    contexts += utt
                    contexts += self.tokenizer.eos_token # 각 role의 발화를 구분하기 위함 #self.tokenizer.eos_token # </s>
                
                contexts += self.tokenizer.sep_token
                contexts_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(contexts))
                # sentences_ids = sentences_ids[-self.context_max_length:]
                
                context_segments = [0] * len(contexts_ids)
                
                for i, mv_set in enumerate(dialog["aspects"].items()):                    
                    mv_entityid = mv_set[0]
                    mv_aspect = mv_set[1]

                    mv_aspect += self.tokenizer.sep_token
                    mv_aspect_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(mv_aspect))
                    
                    asp_segments = [1] * len(mv_aspect_ids)
                    segments = context_segments + asp_segments
                    
                    sentences_ids = contexts_ids + mv_aspect_ids
                    sentences_ids = sentences_ids[-(self.context_max_length - 1):] # max_length에 맞춰 뒷부분을 유지
                    segments = segments[-(self.context_max_length - 1):]
                    
                    sentences_ids = [self.tokenizer.cls_token_id] + sentences_ids
                    segments = [0] + segments # for [CLS] token
                    
                    assert len(sentences_ids) == len(segments)
                    
                    # mv_set[0]: mv_id
                    # mv_set[1]: mv_aspect
                    #  self.tokenizer.encode_plus(text=sentences, text_pair=mv_aspect, add_special_tokens=True,
                    #         padding='max_length', max_length=self.context_max_length, pad_to_max_length=True,
                    #         return_token_type_ids=True, return_tensors='pt')

                    data = {
                        "conv_id": dialog["conv_id"],
                        "sentence": sentences_ids,
                        "segment": segments,
                        "polarity": dialog["polarity"][mv_entityid]
                    }
                    
                    self.data.append(data)
                    
        
    def __getitem__(self, ind):
        return self.data[ind]

    def __len__(self):
        return len(self.data)

In [94]:
from torch.nn.utils.rnn import pad_sequence

class CRSASCDataCollator:
    def __init__(
        self, tokenizer, context_max_length, device
    ):
        self.tokenizer = tokenizer
        self.context_max_length = context_max_length
        self.device = device
        
    # def pad_contexts(self, contexts):     
    #     inputs = [torch.tensor(i, dtype=torch.long) for i in contexts["input_ids"]]
    #     segments = [torch.tensor(s, dtype=torch.long) for s in contexts["token_type_ids"]]
        
    #     # pad_sequence 적용
    #     inputs = pad_sequence(inputs, batch_first=True)
    #     segments = pad_sequence(segments, batch_first=True)
        
    #     # attention masks 생성
    #     attn_masks = torch.zeros(inputs.shape, dtype=torch.long)
    #     attn_masks = attn_masks.masked_fill(inputs != 0, 1)
    
    #     return inputs, segments, attn_masks
        
    def __call__(self, data_batch):
        conv_id_batch = []
        #uni_conv_id_batch = []
        context_batch = defaultdict(list)
        label_batch = []
        
        for data in data_batch: # "conv_id", "sentence", "segment", "polarity"
            conv_id_batch.append(data["conv_id"])
            #uni_conv_id_batch.append(data['uni_conv_id'])
            input_ids = data["sentence"]
            token_type_ids = data["segment"]  
            context_batch["input_ids"].append(input_ids)
            context_batch["token_type_ids"].append(token_type_ids)
            label_batch.append(data["polarity"])
            
        context_batch['labels'] = label_batch
        
        input_batch = {}
     
        #context_batch["input_ids"], context_batch["token_type_ids"], context_batch["attention_mask"] = self.pad_contexts(context_batch)
        
        context_batch = self.tokenizer.pad(
            context_batch, padding=True, max_length=self.context_max_length)
        
        for k, v in context_batch.items():
            context_batch[k] = torch.as_tensor(v, device=self.device)
           
        # 모델링 시
        # 1. **batch["context"] 로 model 인자 주고 
        # 2. 모델 forward 에서 sentiment logits 값 구하면 
        # 3. if senti_labels is not None: 코드에서 해당 logits 값 저장 (필요시 3개의 label [pos/neg/neu] 에 대한 각각의 값을 딕셔너리로 저장해야할수도~
        
        input_batch["conv_id"] = conv_id_batch
        input_batch["context"] = context_batch
        input_batch["label"] = label_batch

        return input_batch

In [75]:
import torch  
from transformers import BertTokenizer,BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
model = BertModel.from_pretrained("bert-base-uncased")

tokenizer.add_special_tokens({'eos_token': '[EOS]'})

device = 'cuda:1' if torch.cuda.is_available() else 'cpu'

print(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


cuda:1


In [95]:
dpath = "/home/hyuns6100/[4]newCRS/data/redial/"

dataset = CRSASCDataset(
        dpath=dpath, split='test', 
        tokenizer=tokenizer, context_max_length=256,
    )

data_collator = CRSASCDataCollator(
        tokenizer=tokenizer, context_max_length=256,
        device=device
    )

  0%|          | 0/4729 [00:00<?, ?it/s]

In [96]:
dataloader = DataLoader(
        dataset,
        batch_size=16,
        collate_fn=data_collator,
        shuffle=False
    )

for batch in dataloader:
    print("!")
    break

!




In [109]:
train_polarity = {"like": 0, "dislike": 0, "unknown": 0}

for batch in dataloader:
    labels = batch["context"]["labels"]
    train_polarity["dislike"] += torch.sum(labels == 0).item()
    train_polarity["like"] += torch.sum(labels == 1).item()  
    train_polarity["unknown"] += torch.sum(labels == 2).item()




In [111]:
total_samples = sum(train_polarity.values())
# 역빈도 가중치
weights = [total_samples / train_polarity["dislike"], total_samples / train_polarity["like"], total_samples / train_polarity["unknown"]]
weights

[22.270588235294117, 1.2296200064956155, 7.050279329608938]

In [113]:
weights_2 = [1 - train_polarity["dislike"] /total_samples, 1 - train_polarity["like"] /total_samples , 1 - train_polarity["unknown"] /total_samples]
weights_2

[0.9550977284733227, 0.18674062334918118, 0.8581616481774961]

In [116]:
weights_3 = [1 / train_polarity["dislike"], 1/ train_polarity["like"], 1 / train_polarity["unknown"]]
weights_3

[0.0029411764705882353, 0.00016239038648911984, 0.000931098696461825]

In [12]:
batch["context"]['input_ids'].shape, batch["context"]['token_type_ids'].shape, batch["context"]['attention_mask'].shape,

(torch.Size([16, 117]), torch.Size([16, 117]), torch.Size([16, 117]))

In [573]:
print(tokenizer.decode(batch["context"]["input_ids"][0]))
print(tokenizer.decode(batch["context"]["input_ids"][1]))

[CLS] system : hello [EOS] user : hey! [EOS] system : what genre of movies do you like? [EOS] user : i like funny movies. how about you [EOS] system : i like action movies but also like comedys, so let me recomend you a movie have you seen borat ( 2006 )? [EOS] user : thats a good one! have you seen wedding crashers ( 2005 ) [EOS] [SEP] borat ( 2006 ) [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] system : hello [EOS] user : hey! [EOS] system : what genre of movies do you like? [EOS] user : i like funny movies. how about you [EOS] system : i like action movies but also like comedys, so let me recomend you a movie have you seen borat ( 2006 )? [EOS] user : thats a good one! have you seen wedding crashers ( 2005 ) [EOS] [SEP] wedding crashers ( 2005 ) [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [577]:
print(batch["context"]["input_ids"][0])
print(batch["context"]["input_ids"][1])

tensor([  101,  2291,  1024,  7592, 30522,  5310,  1024,  4931,   999, 30522,
         2291,  1024,  2054,  6907,  1997,  5691,  2079,  2017,  2066,  1029,
        30522,  5310,  1024,  1045,  2066,  6057,  5691,  1012,  2129,  2055,
         2017, 30522,  2291,  1024,  1045,  2066,  2895,  5691,  2021,  2036,
         2066,  4038,  2015,  1010,  2061,  2292,  2033, 28667,  8462,  4859,
         2017,  1037,  3185,  2031,  2017,  2464,  8945,  8609,  1006,  2294,
         1007,  1029, 30522,  5310,  1024,  2008,  2015,  1037,  2204,  2028,
          999,  2031,  2017,  2464,  5030,  5823,  2545,  1006,  2384,  1007,
        30522,   102,  8945,  8609,  1006,  2294,  1007,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0],
       device='cuda:1')
tensor([  101,  2291,  1024,  7592, 30522,  5310,  10

In [578]:
print(batch["context"]["attention_mask"][0])
print(batch["context"]["attention_mask"][1])

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:1')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       device='cuda:1')


In [14]:
logits = torch.rand([10, 3])

_, predicted_labels = torch.max(logits, 1)

predicted_labels

tensor([1, 2, 0, 1, 1, 2, 2, 0, 1, 0])

In [15]:
torch.argmax(logits, dim=1)

tensor([1, 2, 0, 1, 1, 2, 2, 0, 1, 0])

In [22]:
asc_train = []

with open(dpath + "asc_train_data_processed.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        lines = json.loads(line)
        asc_train.append(lines)

asc_valid = []

with open(dpath + "asc_valid_data_processed.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        lines = json.loads(line)
        asc_valid.append(lines)
        
asc_test = []

with open(dpath + "asc_test_data_processed.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        lines = json.loads(line)
        asc_test.append(lines)

In [21]:
train_polarity = {"like": 0, "dislike": 0, "unknown": 0}
label_map = {0: "dislike", 1: "like", 2: "unknown"}

for data in asc_train:
    for pol in data["polarity"].values():
        train_polarity[label_map[pol]] += 1      
        
print(train_polarity)      

{'like': 41020, 'dislike': 2464, 'unknown': 6768}


In [23]:
valid_polarity = {'like': 0, 'dislike': 0, 'unknown': 0}
label_map = {0: "dislike", 1: "like", 2: "unknown"}

for data in asc_valid:
    for pol in data["polarity"].values():
        valid_polarity[label_map[pol]] += 1      
        
print(valid_polarity)

{'like': 4499, 'dislike': 264, 'unknown': 726}


In [24]:
test_polarity = {'like': 0, 'dislike': 0, 'unknown': 0}
label_map = {0: "dislike", 1: "like", 2: "unknown"}

for data in asc_test:
    for pol in data["polarity"].values():
        test_polarity[label_map[pol]] += 1      
        
print(test_polarity)      

{'like': 6158, 'dislike': 340, 'unknown': 1074}
