In [11]:
import os
import glob
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
def read_conll(in_file, lowercase=False, max_example=None):
    examples = []
    with open(in_file) as f:
        word, label = [], []
        for line in f.readlines():
            sp = line.strip().split(' ')
            if len(sp) == 4:
                if '-' not in sp[0]:
                    word.append(sp[0]) #.lower() if lowercase else sp[1])
                    label.append(sp[3])
            elif len(word) > 0:
                examples.append({'tokens': word, 'spans': label})
                word, label = [], []
                if (max_example is not None) and (len(examples) == max_example):
                    break
        if len(word) > 0:
            examples.append({'tokens': word, 'spans': label})
    df = pd.DataFrame(examples)
    return df

def get_entities(seq):
    """Gets entities from sequence.
    note: BIO
    Args:
        seq (list): sequence of labels.
    Returns:
        list: list of (chunk_type, chunk_start, chunk_end).
    Example:
        seq = ['B-PER', 'I-PER', 'O', 'B-LOC', 'I-PER']
        get_entity_bio(seq)
        #output
        [['PER', 0,1], ['LOC', 3, 3], ['PER', 4, 4]]
    """
    if any(isinstance(s, list) for s in seq):
        seq = [item for sublist in seq for item in sublist + ['O']]

    prev_tag = 'O'
    prev_type = ''
    begin_offset = 0
    chunks = []
    for i, chunk in enumerate(seq + ['O']):
        tag = chunk[0]
        type_ = chunk.split('-')[-1]

        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, i - 1))
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = i
        prev_tag = tag
        prev_type = type_

    return set(chunks)


def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk ended between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_end: boolean.
    """
    chunk_end = False

    if prev_tag == 'E': chunk_end = True
    # if prev_tag == 'S': chunk_end = True

    if prev_tag == 'B' and tag == 'B': chunk_end = True
    # if prev_tag == 'B' and tag == 'S': chunk_end = True
    if prev_tag == 'B' and tag == 'O': chunk_end = True
    if prev_tag == 'I' and tag == 'B': chunk_end = True
    # if prev_tag == 'I' and tag == 'S': chunk_end = True
    if prev_tag == 'I' and tag == 'O': chunk_end = True

    if prev_tag != 'O' and prev_tag != '.' and prev_type != type_:
        chunk_end = True

    return chunk_end


def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk started between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_start: boolean.
    """
    chunk_start = False

    if tag == 'B': chunk_start = True
    if tag == 'S': chunk_start = True

    if prev_tag == 'E' and tag == 'E': chunk_start = True
    if prev_tag == 'E' and tag == 'I': chunk_start = True
    # if prev_tag == 'S' and tag == 'E': chunk_start = True
    # if prev_tag == 'S' and tag == 'I': chunk_start = True
    if prev_tag == 'O' and tag == 'E': chunk_start = True
    if prev_tag == 'O' and tag == 'I': chunk_start = True

    if tag != 'O' and tag != '.' and prev_type != type_:
        chunk_start = True

    return chunk_start

In [27]:
def template_format(data_path):
    df = read_conll(data_path)
    df['Source sentence'] = [' '.join(x).strip() for x in df['tokens']]
    df['entities'] = [get_entities(x) for x in df['spans']]
    df['len'] = [len(x) for x in df.entities]
    df = df[df['len'] != 0]
    vowels = ('a','e','i','o','u','A','E','I','O','U')
    df['Answer sentence'] = pd.Series(dtype='object')
    for i in range(len(df)):
        li = []
        for x in df.entities.iloc[i]:
            if x[0] == 'O':
                li.append(' '.join(df.tokens.iloc[i][x[1]:x[2]+1]) + ' is not a named entity')
            elif x[0].startswith(vowels):
                li.append(' '.join(df.tokens.iloc[i][x[1]:x[2]+1]) + ' is an ' + x[0] + ' entity')
            else:
                li.append(' '.join(df.tokens.iloc[i][x[1]:x[2]+1]) + ' is a ' + x[0] + ' entity')
        df['Answer sentence'].iloc[i] = li
    return df[['Source sentence','Answer sentence']].explode('Answer sentence')

In [29]:
path = r'/home/tranthh/semeval2023/train_dev' # use your path
all_files = glob.glob(os.path.join(path , "*.conll"))

li = []

for filename in all_files:
    df = template_format(filename)
    name = filename.split('/')[-1].split('.')[0]
    print(name, len(df))
    df.to_csv('/home/tranthh/semeval2023/templated_data/'+ name+'.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


de-dev 820
fa-dev 1230
sv-train 25175
uk-dev 1092
sv-dev 1379
es-train 23758
it-train 26302
en-dev 1284
bn-train 13141
pt-dev 1276
pt-train 24207
zh-dev 770
fr-train 25743
es-dev 1219
de-train 15591
hi-train 12783
fr-dev 1317
uk-train 21321
bn-dev 672
fa-train 23606
it-dev 1389
en-train 25224
hi-dev 676
zh-train 15203
