# ann 파일을 conll 포맷으로 변환하기

191015

BMES 포맷(or BIO)

어노테이션은 [@AlphaBay Market#Company*] 와 같이 [@텍스트#레이블*] 형식으로 표현되어 있다

입력 문장을 넣으면, conll 포맷으로 변환하여 list로 출력해준다

In [1]:
import os, re
import nltk
from nltk import word_tokenize, pos_tag

In [2]:
# Optional loader (currently disabled): read annotated sentences from an .ann
# file, one stripped line per sentence.
# filename = 'test.ann'
# sentences = []
# with open(filename, 'r', encoding='UTF-8') as f:
#     for line in f.readlines():
#         line = line.strip()
#         sentences.append(line)
        
# sentences[:3]

# Hard-coded sample sentences used instead of the file contents: one fully
# annotated line, one line without annotations, and one line with two inline
# [@text#Label*] annotations.
sentences = ['[@July 06, 2017 01:49:00 PM#Time_Published*]',
             'A Dark Web Marketplace is Down and Users Suspect Foul Play',
             '[@AlphaBay Market#Company*], a popular darknet marketplace has been offline since Tuesday night sparking concerns from users that the site’s operators have [@stolen customer account funds#Attack_Objective*] and disappeared.']
sentences

['[@July 06, 2017 01:49:00 PM#Time_Published*]',
 'A Dark Web Marketplace is Down and Users Suspect Foul Play',
 '[@AlphaBay Market#Company*], a popular darknet marketplace has been offline since Tuesday night sparking concerns from users that the site’s operators have [@stolen customer account funds#Attack_Objective*] and disappeared.']

In [3]:
## function
# input  : sentence with tag
# output : conll format
#     'Evolution S-Company\n',
#     'marketplace O\n',
#     'in O\n',
#     '2015 O\n',

def make_ann2conll(tagedSentence, tagScheme="BMES", seget_nltk_tokenize=True, entityRe=r'\[[\@\$)].*?\#.*?\*\](?!\#)'):
    """Convert one inline-annotated sentence into CoNLL-style "token tag" lines.

    Annotations are embedded in the sentence as ``[@text#Label*]``. Text that
    lies outside every annotation is tokenized and tagged ``O``; the text of
    each annotation is tokenized and tagged by ``outputWithTagScheme``.

    Example (default arguments)::

        '[@AlphaBay Market#Company*], a ...'
        -> ['AlphaBay B-Company\\n', 'Market E-Company\\n', ', O\\n', 'a O\\n', ...]

    Args:
        tagedSentence: the sentence, possibly containing inline annotations.
            Leading/trailing newlines are ignored.
        tagScheme: forwarded unchanged to ``outputWithTagScheme``.
        seget_nltk_tokenize: if true, tokenize with nltk's ``word_tokenize``;
            otherwise split on whitespace with ``str.split``.
        entityRe: regular expression that matches one complete annotation.
            NOTE(review): the ``)`` inside the default marker class looks
            unintended; it is kept as-is for backward compatibility - confirm.

    Returns:
        A list of ``"token tag\\n"`` strings in sentence order.

    Raises:
        ValueError: if a matched annotation contains no ``#`` separator
            (only possible with a custom ``entityRe``).
    """
    newSent = tagedSentence.strip('\n')
    tokenize = word_tokenize if seget_nltk_tokenize else str.split

    pairList = []

    def add_plain_text(text):
        # Everything outside an annotation gets the "O" (outside) tag.
        for token in tokenize(text):
            if token == ' ':
                continue
            pairList.append(token + ' ' + 'O\n')

    # Walk the regex matches directly so that every annotation's position is
    # the one the regex actually matched, instead of re-searching for its text.
    cursor = 0
    for match in re.finditer(entityRe, newSent):
        if match.start() > cursor:
            add_plain_text(newSent[cursor:match.start()])

        content, label = _split_annotation(match.group(0))
        # e.g. ['AlphaBay', 'Market'] + 'Company'
        #   -> ['AlphaBay B-Company\n', 'Market E-Company\n']
        pairList.extend(outputWithTagScheme(tokenize(content), label, tagScheme))
        cursor = match.end()

    # Plain text after the last annotation (or the whole sentence when there
    # is no annotation at all).
    if cursor < len(newSent):
        add_plain_text(newSent[cursor:])

    return pairList


def _split_annotation(annotation):
    """Split one matched annotation, e.g. '[@AlphaBay Market#Company*]', into (text, label)."""
    inner = annotation
    # Remove exactly one opening bracket, one marker character and one closing
    # bracket. A character-set strip would also eat a leading '[', '@' or '$'
    # that belongs to the annotated text itself (e.g. '$100').
    if inner.startswith('['):
        inner = inner[1:]
    if inner[:1] in ('@', '$'):
        inner = inner[1:]
    if inner.endswith(']'):
        inner = inner[:-1]

    # The label follows the LAST '#', so the annotated text may contain '#'.
    parts = inner.rsplit('#', 1)
    if len(parts) != 2:
        raise ValueError(
            "Error: sentence format error! No '#' separator in annotation: {!r}".format(annotation)
        )
    text, label = parts
    # 'Company*' -> 'Company'
    return text, label.strip('*')


## 태그 달기
def outputWithTagScheme(input_list, label, tagScheme="BMES"):
    """Attach a position-dependent tag to every token of one annotated span.

    With ``tagScheme == "BMES"`` a single token gets ``S-<label>``; otherwise
    the first token gets ``B-``, the last ``E-`` and those in between ``M-``.
    Any other ``tagScheme`` value produces ``B-`` for the first token and
    ``I-`` for all following tokens.

    Args:
        input_list: the tokens of the span, in order.
        label: the entity label appended after the prefix.
        tagScheme: ``"BMES"`` or anything else for B/I tagging.

    Returns:
        A list of ``"token PREFIX-label\\n"`` strings, one per input token
        (empty when ``input_list`` is empty).
    """
    final_pos = len(input_list) - 1
    bmes = tagScheme == "BMES"

    tagged = []
    for pos, token in enumerate(input_list):
        if pos == 0:
            # A lone token is a complete entity on its own in BMES.
            prefix = 'S-' if bmes and final_pos == 0 else 'B-'
        elif not bmes:
            prefix = 'I-'
        elif pos == final_pos:
            prefix = 'E-'
        else:
            prefix = 'M-'
        tagged.append(token + ' ' + prefix + label + '\n')
    return tagged

In [4]:
# Demo: convert the third sample sentence (it contains two inline annotations)
# using the default arguments of make_ann2conll.
line = sentences[2]

conll_format = make_ann2conll(line)

# Print the raw annotated sentence, then display the resulting "token tag" list.
print(line)
conll_format

[@AlphaBay Market#Company*], a popular darknet marketplace has been offline since Tuesday night sparking concerns from users that the site’s operators have [@stolen customer account funds#Attack_Objective*] and disappeared.


['AlphaBay B-Company\n',
 'Market E-Company\n',
 ', O\n',
 'a O\n',
 'popular O\n',
 'darknet O\n',
 'marketplace O\n',
 'has O\n',
 'been O\n',
 'offline O\n',
 'since O\n',
 'Tuesday O\n',
 'night O\n',
 'sparking O\n',
 'concerns O\n',
 'from O\n',
 'users O\n',
 'that O\n',
 'the O\n',
 'site O\n',
 '’ O\n',
 's O\n',
 'operators O\n',
 'have O\n',
 'stolen B-Attack_Objective\n',
 'customer M-Attack_Objective\n',
 'account M-Attack_Objective\n',
 'funds E-Attack_Objective\n',
 'and O\n',
 'disappeared O\n',
 '. O\n']