In [1]:
import os
import logging
import argparse
from tqdm import tqdm, trange
from datetime import datetime
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import AutoModelForTokenClassification
import predict
from utils import init_logger, load_tokenizer, get_labels
import easydict
import re
WEEKDAY = {0:"월요일",1:"화요일",2:"수요일",3:"목요일",4:"금요일",5:"토요일",6:"일요일"}

In [2]:
def findFirstSecond(arr):
    second = first = -float('inf')
    second_i = first_i = 0
    for i,n in enumerate(arr):
        if n > first:
            second = first
            first = n
            second_i = first_i
            first_i = i
        elif second < n < first:
            second = n
            second_i = i
    return first_i,second_i

In [3]:
pred_config = easydict.EasyDict({
    "input_file":"tel6.txt",
    "output_file":"tel6_out.txt",
    "model_dir":"./model",
    "batch_size":32,
    "no_cuda":"store_true"
})

In [4]:
pred_config

{'input_file': 'tel6.txt',
 'output_file': 'tel6_out.txt',
 'model_dir': './model',
 'batch_size': 32,
 'no_cuda': 'store_true'}

## predict

In [5]:
args = predict.get_args(pred_config)
device = predict.get_device(pred_config)
model = predict.load_model(pred_config, args, device)
label_lst = get_labels(args)

In [6]:
pad_token_label_id = torch.nn.CrossEntropyLoss().ignore_index

In [7]:
tokenizer = load_tokenizer(args)

In [8]:
args

Namespace(adam_epsilon=1e-08, data_dir='./data', do_eval=True, do_train=True, eval_batch_size=64, gradient_accumulation_steps=1, label_file='label.txt', learning_rate=5e-05, logging_steps=1000, max_grad_norm=1.0, max_seq_len=50, max_steps=-1, model_dir='./model', model_name_or_path='monologg/kobert', model_type='kobert', no_cuda=False, num_train_epochs=20.0, pred_dir='./preds', save_steps=1000, seed=42, task='naver-ner', test_file='test.tsv', train_batch_size=32, train_file='train.tsv', warmup_steps=0, weight_decay=0.0, write_pred=False)

In [9]:
pred_config["input_file"] = "tel5.txt"

In [10]:
lines = predict.read_input_file(pred_config)

In [11]:
dataset = predict.convert_input_file_to_tensor_dataset(lines, pred_config, args, tokenizer, pad_token_label_id)

In [12]:
sampler = SequentialSampler(dataset)
data_loader = DataLoader(dataset, sampler=sampler, batch_size=pred_config.batch_size)
all_slot_label_mask = None
preds = None

In [13]:
for batch in tqdm(data_loader, desc="Predicting"):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "labels": None}
            if args.model_type != "distilkobert":
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            logits = outputs[0]

            if preds is None:
                preds = logits.detach().cpu().numpy()
                all_slot_label_mask = batch[3].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                all_slot_label_mask = np.append(all_slot_label_mask, batch[3].detach().cpu().numpy(), axis=0)
        

Predicting: 100%|██████████| 1/1 [00:00<00:00,  2.92it/s]


In [14]:
first_pred = []
second_pred = []
for i in range(preds.shape[0]):
    first_pred.append([])
    second_pred.append([])
    for j in range(preds.shape[1]):
        first,second = findFirstSecond(preds[i][j])
        first_pred[i].append(first)
        second_pred[i].append(second)
first_pred = np.array(first_pred)
second_pred = np.array(second_pred)

In [15]:
#최대만 찾기

# slot_label_map = {i: label for i, label in enumerate(label_lst)}
# preds_list = [[] for _ in range(first_pred.shape[0])]

# for i in range(first_pred.shape[0]):
#     for j in range(first_pred.shape[1]):
#         if all_slot_label_mask[i, j] != pad_token_label_id:
#             preds_list[i].append(slot_label_map[first_pred[i][j]])

In [16]:
# 두번째만 찾기

# slot_label_map = {i: label for i, label in enumerate(label_lst)}
# preds_list = [[] for _ in range(second_pred.shape[0])]

# for i in range(second_pred.shape[0]):
#     for j in range(second_pred.shape[1]):
#         if all_slot_label_mask[i, j] != pad_token_label_id:
#             preds_list[i].append(slot_label_map[second_pred[i][j]])

In [17]:
#1,2번째 큰 확률 다 반영

slot_label_map = {i: label for i, label in enumerate(label_lst)}
preds_list = [[] for _ in range(first_pred.shape[0])]
for_loc_list = [[] for _ in range(first_pred.shape[0])]

for i in range(first_pred.shape[0]):
    for j in range(first_pred.shape[1]):
        if all_slot_label_mask[i, j] != pad_token_label_id:
            if first_pred[i][j] not in [16,17] and second_pred[i][j] in [16,17]:
                preds_list[i].append(slot_label_map[second_pred[i][j]])
            else:
                preds_list[i].append(slot_label_map[first_pred[i][j]])
                
            if first_pred[i][j] not in [8,9,10,11] and second_pred[i][j] in [8,9,10,11]:
                for_loc_list[i].append(slot_label_map[second_pred[i][j]])
            else:
                for_loc_list[i].append(slot_label_map[first_pred[i][j]])

In [18]:
preds_list

[['O', 'O', 'ORG-B', 'ORG-B', 'O', 'PER-B', 'CVL-B', 'O'],
 ['O', 'O', 'DAT-B', 'O', 'NUM-B', 'NUM-I', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'DAT-B',
  'TIM-B',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'NUM-B',
  'NUM-I',
  'O',
  'O'],
 [],
 ['O', 'DAT-B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 [],
 ['O', 'O'],
 [],
 ['DAT-B', 'O', 'O', 'O'],
 [],
 ['O',
  'DAT-B',
  'TIM-B',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'CVL-B',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O'],
 ['O', 'O'],
 [],
 ['NUM-B', 'NUM-I', 'NUM-B', 'NUM-B', 'NUM-I', 'O', 'O', 'O'],
 [],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
 [],
 ['O', 'O'],
 [],
 ['O', 'O']]

In [19]:
date = []
time = []
date_time = {}
loc = []
second_loc = []
for i,wp in enumerate(zip(lines, preds_list)):
            date_time[i] = []
            for j,(word, p) in enumerate(zip(wp[0], wp[1])):
                #B-I를 같은 리스트에 담아서 연결성 up    ex) 내일 오전 어떠세요? 10시 좋아요
                if p == 'DAT-B':
                    date.append(word)
                    date_time[i].append(word)
                elif p == 'DAT-I':
                    date.append(word)
                    date_time[i].append(word)
                elif p == 'TIM-B':
                    time.append(word)
                    date_time[i].append(word)
                elif p == 'TIM-I':
                    time.append(word)
                    date_time[i].append(word)
                elif p == 'LOC-B':
                    date_time[i].append(word)
                    if preds_list[i][j-1] in ['ORG-B','LOC-B'] and loc != []:
                        loc[-1] = loc[-1] + " "+ word
                    else:
                        loc.append(word)
                elif p == 'LOC-I':
                    date_time[i].append(word)
                    loc[-1] = loc[-1] + " "+ word
                elif p == 'ORG-B':
                    date_time[i].append(word)
                    if preds_list[i][j-1] in ['ORG-B','LOC-B'] and loc != []:
                        loc[-1] = loc[-1] + " "+ word
                    else:
                        loc.append(word)
                elif p == 'ORG-I':
                    date_time[i].append(word)
                    loc[-1] = loc[-1] + " "+ word
                elif (p == 'NUM-B'or p == 'NUM-I')and "시" in word:
                    time.append(word)
                    date_time[i].append(word)
                elif "반" in word and preds_list[i][j-1] in ['TIM-B','TIM-I']:
                    date_time[i].append(word)
                    time.append(word)
                

In [21]:
now = datetime.now()
year = now.year
month = now.month
day = now.day
date_fix = 0
hour = now.hour
hour_sub = 0   # 없으면 0 오전 1 오후 2
hour_back = 0
hour_flag = 0
minute = 0
minute_sub = ""
weekday = now.weekday()
next_week = 0
next_day = 0
isWeekday= 0

In [22]:
for t in time:
    if "오전" in t:
        hour_sub = 1
    elif "오후" in t:
        hour_sub = 2
    elif "뒤" in t:
        hour_back = re.sub(r'[^0-9]', '', t)
        minute = now.minute
    elif "후에" in t:
        hour_back = re.sub(r'[^0-9]', '', t)
        minute = now.minute
    elif "시" in t:
        if re.search('\d',t):
            hour = re.sub(r'[^0-9]', '', t)  
        elif "한시" in t:
            hour = 1
            hour_flag = 1
        elif "두시" in t :
            hour = 2
            hour_flag = 1
        elif "세시" in t :
            hour = 3
            hour_flag = 1
        elif "네시" in t:
            hour = 4
            hour_flag = 1
        elif "다섯시" in t :
            hour = 5
            hour_flag = 1
        elif "여섯시" in t:
            hour = 6
            hour_flag = 1
        elif "일곱시" in t :
            hour = 7
            hour_flag = 1
        elif "여덜시" in t :
            hour = 8
        elif "아홉시" in t:
            hour = 9
        elif "열시" in t:
            hour = 10
        elif "열한시" in t:
            hour = 11
        elif "열두시" in t:
            hour = 12
    elif "분" in t:
        minute = re.sub(r'[^0-9]', '', t)
    elif "반" in t:
        minute = 30

if hour_sub == 2 and hour_flag == 1:
    hour += 12
if hour_back != 0:
    hour += hour_back


In [23]:
# 토큰화해서 조사인지 확인하고 조사라면 떼어내기
# 근데 이게 경기도 같은거에서 도가 빠지는지 확인.....
# for i in range(20):
#     print(tokenizer.tokenize(lines[51][i]))

In [24]:
for d in date:
    if "다음주" in d or "다음 주" in d:
        next_week = 1
    elif "이번주" in d:
        next_week = 0
    elif "내일" in d and next_day == 0:
        day += 1
        weekday += 1
        next_day = 1
    elif "오늘" in d:
        day = now.day
        weekday = now.weekday()
    elif "월요일" in d:
        promise_week = 0
        isWeekday = 1
    elif "화요일" in d:
        promise_week = 1
        isWeekday = 1
    elif "수요일" in d :
        promise_week = 2
        isWeekday = 1
    elif "목요일" in d: 
        promise_week = 3
        isWeekday = 1
    elif "금요일" in d:
        promise_week = 4
        isWeekday = 1
    elif "토요일" in d:
        promise_week = 5
        isWeekday = 1
    elif "일요일" in d:
        promise_week = 5
        isWeekday = 1
    elif "일" in d and re.search('\d',d):
        day = re.sub(r'[^0-9]', '', d)
        date_fix = 1
    elif "월" in d and re.search('\d',d):
        month = re.sub(r'[^0-9]', '', d)
    elif "년" in d and re.search('\d',d):
        year = re.sub(r'[^0-9]', '', d)
        if year < 2023:
            year = 2023

#대화에 요일이 있을 때
if isWeekday==1:
    if weekday > promise_week or next_week == 1:
        day = int(now.day) + promise_week - weekday + 7
    else:
        day = int(now.day) + promise_week - weekday
    weekday = promise_week

In [25]:
#first확률에서 약속장소가 없을경우 두번때 확률까지 탐색
# second_loc = []
# if loc == []:
#     for i,wp in enumerate(zip(lines, for_loc_list)):
#                 date_time[i] = []
#                 for j,(word, p) in enumerate(zip(wp[0], wp[1])):
#                     #B-I를 같은 리스트에 담아서 연결성 up    ex) 내일 오전 어떠세요? 10시 좋아요
#                     if p == 'DAT-B':
#                         date.append(word)
#                         date_time[i].append(word)
#                     elif p == 'DAT-I':
#                         date.append(word)
#                         date_time[i].append(word)
#                     elif p == 'TIM-B':
#                         time.append(word)
#                         date_time[i].append(word)
#                     elif p == 'TIM-I':
#                         time.append(word)
#                         date_time[i].append(word)
#                     elif p == 'LOC-B':
#                         if preds_list[i][j-1] in ['ORG-B','LOC-B'] and loc != []:
#                             second_loc[-1] = second_loc[-1] + " "+ word
#                         else:
#                             second_loc.append(word)
#                     elif p == 'LOC-I':
#                         second_loc[-1] = second_loc[-1] + " "+ word
#                     elif p == 'ORG-B':
#                         if preds_list[i][j-1] in ['ORG-B','LOC-B'] and loc != []:
#                             second_loc[-1] = second_loc[-1] + " "+ word
#                         else:
#                             second_loc.append(word)
#                     elif p == 'ORG-I':
#                         second_loc[-1] = second_loc[-1] + " "+ word
#                     elif (p == 'NUM-B'or p == 'NUM-I')and "시" in word:
#                         time.append(word)
#                         date_time[i].append(word)
#                     elif "반" in word and preds_list[i][j-1] in ['TIM-B','TIM-I']:
#                         time.append(word)
#     loc = second_loc

In [26]:
if loc == []:
    location = "미정"
else:
    location = loc[-1]

if day == now.day:
    if int(hour) < now.hour:
        hour = int(hour) + 12
if next_day == 1 :
    if 0 < int(hour) < 6:
        hour = int(hour) + 12
weekday = weekday % 7

In [44]:
max_i = -1
max = -1
for i in range(len(date_time)):
    if max <= len(date_time[i]):
        max = len(date_time[i])
        max_i = i


In [45]:
print("주요 통화 내용 : {} ".format(' '.join(s for s in lines[max_i])))

주요 통화 내용 : 두 세시 네 두 세시 될 것 같습니다. 


In [46]:
print("약속 장소 : {} , 약속 시간 : {}년 {}월 {}일  {}시 {}분 ({})".format(location,year,month,day,hour,minute,WEEKDAY[weekday]))

약속 장소 : 한양대 에리카 , 약속 시간 : 2023년 2월 6일  15시 0분 (월요일)
