In [1]:
import os
import logging
import argparse
from tqdm import tqdm, trange
from datetime import datetime
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import AutoModelForTokenClassification
import predict
from utils import init_logger, load_tokenizer, get_labels, MODEL_CLASSES
import easydict
import re
from transformers import PreTrainedTokenizerFast
WEEKDAY = {0:"월요일",1:"화요일",2:"수요일",3:"목요일",4:"금요일",5:"토요일",6:"일요일"}

In [2]:
def findFirstSecond(arr):
    second = first = -float('inf')
    second_i = first_i = 0
    for i,n in enumerate(arr):
        if n > first:
            second = first
            first = n
            second_i = first_i
            first_i = i
        elif second < n < first:
            second = n
            second_i = i
    return first_i,second_i

In [3]:
pred_config = easydict.EasyDict({
    "input_file":"tel6.txt",
    "output_file":"tel6_out.txt",
    "model_dir":"./model",
    "batch_size":32,
    "no_cuda":"store_true"
})

In [4]:
pred_config

{'input_file': 'tel6.txt',
 'output_file': 'tel6_out.txt',
 'model_dir': './model',
 'batch_size': 32,
 'no_cuda': 'store_true'}

## predict

In [5]:
args = predict.get_args(pred_config)
device = predict.get_device(pred_config)
model = predict.load_model(pred_config, args, device)
label_lst = get_labels(args)

In [6]:
pad_token_label_id = torch.nn.CrossEntropyLoss().ignore_index

In [7]:
#tokenizer = load_tokenizer(args)
tokenizer = MODEL_CLASSES[args.model_type][2].from_pretrained('./tokenizer')

In [8]:
tokenizer

PreTrainedTokenizer(name_or_path='./tokenizer', vocab_size=8002, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [9]:
args

Namespace(adam_epsilon=1e-08, data_dir='./data', do_eval=True, do_train=True, eval_batch_size=64, gradient_accumulation_steps=1, label_file='label.txt', learning_rate=5e-05, logging_steps=1000, max_grad_norm=1.0, max_seq_len=50, max_steps=-1, model_dir='./model', model_name_or_path='monologg/kobert', model_type='kobert', no_cuda=False, num_train_epochs=20.0, pred_dir='./preds', save_steps=1000, seed=42, task='naver-ner', test_file='test.tsv', train_batch_size=32, train_file='train.tsv', warmup_steps=0, weight_decay=0.0, write_pred=False)

In [10]:
pred_config["input_file"] = "./suggestion_test_data/tel4.txt"

In [11]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [12]:
lines = predict.read_input_file(pred_config)

In [13]:
dataset = predict.convert_input_file_to_tensor_dataset(lines, pred_config, args, tokenizer, pad_token_label_id)

In [14]:
sampler = SequentialSampler(dataset)
data_loader = DataLoader(dataset, sampler=sampler, batch_size=pred_config.batch_size)
all_slot_label_mask = None
preds = None

In [15]:
for batch in tqdm(data_loader, desc="Predicting"):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "labels": None}
            if args.model_type != "distilkobert":
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            logits = outputs[0]

            if preds is None:
                preds = logits.detach().cpu().numpy()
                all_slot_label_mask = batch[3].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                all_slot_label_mask = np.append(all_slot_label_mask, batch[3].detach().cpu().numpy(), axis=0)
        

Predicting: 100%|██████████| 2/2 [00:00<00:00,  3.80it/s]


In [16]:
first_pred = []
second_pred = []
for i in range(preds.shape[0]):
    first_pred.append([])
    second_pred.append([])
    for j in range(preds.shape[1]):
        first,second = findFirstSecond(preds[i][j])
        first_pred[i].append(first)
        second_pred[i].append(second)
first_pred = np.array(first_pred)
second_pred = np.array(second_pred)

In [17]:
second_pred[12][0]

29

In [18]:
findFirstSecond(preds[12][1])

(18, 29)

In [19]:
#1,2번째 큰 확률 다 반영

slot_label_map = {i: label for i, label in enumerate(label_lst)}
preds_list = [[] for _ in range(first_pred.shape[0])]
for_loc_list = [[] for _ in range(first_pred.shape[0])]

for i in range(first_pred.shape[0]):
    for j in range(first_pred.shape[1]):
        if all_slot_label_mask[i, j] != pad_token_label_id:
            if first_pred[i][j] not in [16,17] and second_pred[i][j] in [16,17]:
                preds_list[i].append(slot_label_map[second_pred[i][j]])
            else:
                preds_list[i].append(slot_label_map[first_pred[i][j]])
                
            if first_pred[i][j] not in [8,9,10,11,14,15,16,17] and second_pred[i][j] in [8,9,10,11,16,17]:
                for_loc_list[i].append(slot_label_map[second_pred[i][j]])
                if i==12 and j ==0:
                    print(slot_label_map[first_pred[i][j]])
            else:
                for_loc_list[i].append(slot_label_map[first_pred[i][j]])
                

In [36]:
date = []
time = []
date_time_loc = {}
loc = []
second_loc = []
for i,wp in enumerate(zip(lines, preds_list)):
            date_time_loc[i] = []
            for j,(word, p) in enumerate(zip(wp[0], wp[1])):
                #B-I를 같은 리스트에 담아서 연결성 up    ex) 내일 오전 어떠세요? 10시 좋아요
                if p == 'DAT-B':
                    date.append(word)
                elif p == 'DAT-I':
                    if "뒤" in word or "후" in word:
                        date[-1] = date[-1]+word
                    else:
                        date.append(word)
                elif p == 'TIM-B':
                    time.append(word)
                elif p == 'TIM-I':
                    if "뒤" in word or "후" in word:
                        time[-1] = time[-1]+word
                    else:
                        time.append(word)
                elif p == 'LOC-B':
                    if preds_list[i][j-1] in ['ORG-B','LOC-B'] and loc != []:
                        loc[-1] = loc[-1] + " "+ word
                    else:
                        loc.append(word)
                elif p == 'LOC-I':
                    loc[-1] = loc[-1] + " "+ word
                elif p == 'ORG-B':
                    if preds_list[i][j-1] in ['ORG-B','LOC-B'] and loc != []:
                        loc[-1] = loc[-1] + " "+ word
                    else:
                        loc.append(word)
                elif p == 'ORG-I':
                    loc[-1] = loc[-1] + " "+ word
                elif (p == 'NUM-B'or p == 'NUM-I')and "시" in word:
                    time.append(word)
                elif "반" in word and preds_list[i][j-1] in ['TIM-B','TIM-I']:
                    time.append(word)
                elif "뒤" in word and preds_list[i][j-1] in ['TIM-B','TIM-I']:
                    time[-1] = time[-1]+word
                elif "후" in word and preds_list[i][j-1] in ['TIM-B','TIM-I']:
                    time[-1] = time[-1]+word
                else:
                    continue
                date_time_loc[i].append(word)
                

In [37]:
preds_list

[['O', 'O'],
 [],
 ['NUM-B'],
 [],
 ['CVL-B', 'O', 'O', 'O', 'O'],
 [],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'NUM-B',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'DAT-B', 'DAT-I', 'O', 'O', 'O', 'O'],
 [],
 ['DAT-B', 'TIM-B', 'DAT-B', 'O', 'O', 'O', 'O'],
 [],
 ['O', 'DAT-B', 'TIM-B', 'TIM-I', 'O'],
 [],
 ['TIM-B', 'O'],
 [],
 ['DAT-B', 'O', 'TIM-B', 'O', 'O'],
 [],
 ['O', 'O', 'O', 'O'],
 [],
 ['O', 'O', 'DAT-B', 'O', 'O'],
 [],
 ['O', 'O'],
 [],
 ['O', 'O', 'O', 'DAT-B', 'TIM-B', 'O'],
 [],
 ['O'],
 [],
 ['O', 'O', 'DAT-B', 'DAT-B', 'O', 'O', 'DAT-B', 'DAT-B', 'TIM-B', 'O'],
 [],
 ['TIM-B', 'O', 'TIM-B', 'TIM-I', 'O'],
 [],
 ['O'],
 [],
 ['O', 'O', 'DAT-B', 'O']]

In [38]:
now = datetime.now()
year = now.year
month = now.month
day = now.day
date_fix = 0
hour = now.hour
hour_sub = 0   # 없으면 0 오전 1 오후 2
hour_back = 0
hour_flag = 0
minute = 0
minute_sub = ""
weekday = now.weekday()
next_week = 0
next_day = 0
isWeekday= 0

In [39]:
# time calculate
for t in time:
    if "오전" in t:
        hour_sub = 1
    elif "오후" in t:
        hour_sub = 2
    elif "뒤" in t:
        hour_back = re.sub(r'[^0-9]', '', t)
    elif "후" in t:
        hour_back = re.sub(r'[^0-9]', '', t)
    elif "시" in t:
        if re.search('\d',t):
            hour = re.sub(r'[^0-9]', '', t)  
        elif "한시" in t:
            hour = 1
            hour_flag = 1
        elif "두시" in t :
            hour = 2
            hour_flag = 1
        elif "세시" in t :
            hour = 3
            hour_flag = 1
        elif "네시" in t:
            hour = 4
            hour_flag = 1
        elif "다섯시" in t :
            hour = 5
            hour_flag = 1
        elif "여섯시" in t:
            hour = 6
            hour_flag = 1
        elif "일곱시" in t :
            hour = 7
            hour_flag = 1
        elif "여덜시" in t :
            hour = 8
        elif "아홉시" in t:
            hour = 9
        elif "열시" in t:
            hour = 10
        elif "열한시" in t:
            hour = 11
        elif "열두시" in t:
            hour = 12
    elif "분" in t:
        minute = re.sub(r'[^0-9]', '', t)
    elif "반" in t:
        minute = 30


In [40]:
# 토큰화해서 조사인지 확인하고 조사라면 떼어내기
# 근데 이게 경기도 같은거에서 도가 빠지는지 확인.....
# for i in range(20):
#     print(tokenizer.tokenize(lines[51][i]))

In [41]:
for d in date:
    if "다음주" in d or "다음 주" in d:
        next_week = 1
    elif "이번주" in d:
        next_week = 0
    elif "내일" in d and next_day == 0:
        day += 1
        weekday += 1
        next_day = 1
    elif "오늘" in d:
        day = now.day
        weekday = now.weekday()
    elif "월요일" in d:
        promise_week = 0
        isWeekday = 1
    elif "화요일" in d:
        promise_week = 1
        isWeekday = 1
    elif "수요일" in d :
        promise_week = 2
        isWeekday = 1
    elif "목요일" in d: 
        promise_week = 3
        isWeekday = 1
    elif "금요일" in d:
        promise_week = 4
        isWeekday = 1
    elif "토요일" in d:
        promise_week = 5
        isWeekday = 1
    elif "일요일" in d:
        promise_week = 5
        isWeekday = 1
    elif "일" in d and re.search('\d',d):
        day = re.sub(r'[^0-9]', '', d)
        date_fix = 1
    elif "월" in d and re.search('\d',d):
        month = re.sub(r'[^0-9]', '', d)
    elif "년" in d and re.search('\d',d):
        year = re.sub(r'[^0-9]', '', d)
        if year < 2023:
            year = 2023

#대화에 요일이 있을 때
if isWeekday==1:
    if weekday > promise_week or next_week == 1:
        day = int(now.day) + promise_week - weekday + 7
    else:
        day = int(now.day) + promise_week - weekday
    weekday = promise_week

In [42]:
#first확률에서 약속장소가 없을경우 두번때 확률까지 탐색
second_loc = []
if loc == []:
    date_time_loc = {}
    for i,wp in enumerate(zip(lines, for_loc_list)):
            date_time_loc[i] = []
            for j,(word, p) in enumerate(zip(wp[0], wp[1])):
                if p in ['DAT-B','DAT-I','TIM-B','TIM-I'] or ((p == 'NUM-B'or p == 'NUM-I')and "시" in word) or ("반" in word and preds_list[i][j-1] in ['TIM-B','TIM-I']):
                    date_time_loc[i].append(word)
                    continue
                elif p == 'LOC-B':
                    if for_loc_list[i][j-1] in ['ORG-B','LOC-B'] and second_loc != []:
                        second_loc[-1] = second_loc[-1] + " "+ word
                    else:
                        second_loc.append(word)
                elif p == 'LOC-I':
                    second_loc[-1] = second_loc[-1] + " "+ word
                elif p == 'ORG-B':
                    if for_loc_list[i][j-1] in ['ORG-B','LOC-B'] and loc != []:
                        second_loc[-1] = second_loc[-1] + " "+ word
                    else:
                        second_loc.append(word)
                elif p == 'ORG-I':
                    second_loc[-1] = second_loc[-1] + " "+ word
                else:
                    continue
                date_time_loc[i].append(word)
                

In [43]:
time

['오전이나', '오전', '11시', '11시', '11시에', '11시에', '3시는', '3시', '1시간뒤는']

In [44]:

if day == now.day:
    if int(hour) < now.hour:
        hour = int(hour) + 12
if next_day == 1 :
    if 0 < int(hour) < 6:
        hour = int(hour) + 12
weekday = weekday % 7

In [45]:
max_i = -1
max = -1
for i in range(len(date_time_loc)):
    if max <= len(date_time_loc[i]):
        max = len(date_time_loc[i])
        max_i = i

#location processing
ignore = ['에서','라는']
if loc == []:
    location = "미정"
    for i in range(len(for_loc_list[max_i])):
        if for_loc_list[max_i][i] in ['ORG-B','LOC-B']:
            location = lines[max_i][i]
else:
    location = loc[-1]
for s in ignore:
    location = re.sub(s,"",location)



In [46]:
print("주요 통화 내용 : {} ".format(' '.join(s for s in lines[max_i])))

주요 통화 내용 : 아아 저 다음주 화요일은 안될것같고 혹시 이번주 금요일 3시는 어떠신가요 


In [47]:
print("약속 장소 : {} , 약속 시간 : {}년 {}월 {}일  {}시 {}분 ({})".format(location,year,month,day,hour,minute,WEEKDAY[weekday]))

약속 장소 : 미정 , 약속 시간 : 2023년 2월 10일  3시 0분 (금요일)


In [52]:
date_time_loc

{0: [],
 1: [],
 2: [],
 3: [],
 4: [],
 5: [],
 6: [],
 7: ['다음', '주'],
 8: [],
 9: ['화요일', '오전이나', '수요일', '중이'],
 10: [],
 11: ['화수', '화요일', '오전', '11시'],
 12: [],
 13: ['11시'],
 14: [],
 15: ['오는', '11시에'],
 16: [],
 17: [],
 18: [],
 19: ['26일까지'],
 20: [],
 21: ['네네네'],
 22: [],
 23: ['화요일날', '11시에'],
 24: [],
 25: [],
 26: [],
 27: ['다음주', '화요일은', '이번주', '금요일', '3시는'],
 28: [],
 29: ['3시', '1시간', '뒤는'],
 30: [],
 31: [],
 32: [],
 33: ['금요일에']}

In [49]:
lines[12]

[]

In [50]:
location = "스타벅스가에서"

location

'스타벅스가에서'

In [51]:
location

'스타벅스가에서'

In [None]:
import re
import datetime

# Extract time information using regular expressions
def extract_time(text):
    time_pattern = re.compile(r'\d{1,2}:\d{1,2}')
    match = re.search(time_pattern, text)
    if match:
        time_str = match.group()
        hour, minute = map(int, time_str.split(':'))
        return datetime.time(hour, minute)
    return None

# Calculate the time difference between the current time and appointment time
def calculate_time_difference(appointment_time):
    now = datetime.datetime.now().time()
    appointment_datetime = datetime.datetime.combine(datetime.datetime.today(), appointment_time)
    difference = appointment_datetime - datetime.datetime.now()
    return difference

# Example usage
text = "My appointment is at 10:30 on tuesday."
appointment_time = extract_time(text)
if appointment_time:
    time_difference = calculate_time_difference(appointment_time)
    print(f"The appointment is in {time_difference}.")
else:
    print("Unable to extract appointment time.")