# In [1]:
import pandas as pd
from doccano_transformer.datasets import NERDataset
from doccano_transformer.utils import read_jsonl

def convert_to_conll2003(input_file, output_file, tokenizer=str.split, encoding='latin-1'):
    """
    Converts a JSONL file to the CoNLL-2003 format and saves it to an output file.

    Args:
        input_file: Path of the JSONL file, handed to ``read_jsonl``.
        output_file: Path of the file to write. It is overwritten if it
            exists, and its parent directory is created when missing.
        tokenizer: Callable forwarded to ``to_conll2003``; defaults to
            ``str.split``.
        encoding: Encoding forwarded to ``read_jsonl`` for the input file.
            The output file is always written as UTF-8.

    Returns:
        None. The result is the file written at ``output_file``.
    """
    # Function-scope import keeps this block self-contained.
    import os

    # NOTE(review): JSONL exports are usually UTF-8; the 'latin-1' default is
    # kept for backward compatibility, but confirm it matches the real input.

    # Create the destination directory up front so that a target such as
    # 'datasets/test.dataset' does not fail with FileNotFoundError.
    # Only path-like targets are inspected; anything else goes to open() as-is.
    if isinstance(output_file, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as file:
        for entry in read_jsonl(filepath=input_file, dataset=NERDataset, encoding=encoding).to_conll2003(tokenizer=tokenizer):
            # Each yielded entry exposes its CoNLL text under the "data" key.
            file.write(entry["data"] + "\n")

def parse_conll2003(data):
    """
    Parses CoNLL-2003 formatted data into a list of sentences with token and tag tuples.

    Sentences are separated by blank lines (including lines that contain only
    whitespace) and by ``-DOCSTART-`` marker lines. A token line must have
    exactly four whitespace-separated columns; the first column is taken as
    the word and the fourth as the tag. Lines with any other column count are
    skipped.

    Args:
        data: The full text of a CoNLL-2003 file as a single string.

    Returns:
        A list of sentences, each a non-empty list of ``(word, tag)`` tuples.
        Returns an empty list when ``data`` holds no token lines.
    """
    sentences = []
    sentence = []
    for raw_line in data.splitlines():
        # Strip first so that whitespace-only lines count as separators and
        # an indented -DOCSTART- marker is not mistaken for a token.
        line = raw_line.strip()
        if not line or line.startswith("-DOCSTART-"):
            # Boundary: flush the sentence collected so far, if any.
            if sentence:
                sentences.append(sentence)
                sentence = []
            continue

        parts = line.split()
        if len(parts) == 4:
            word, _, _, tag = parts
            sentence.append((word, tag))

    # The last sentence may not be followed by a blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

def read_and_parse_conll_file(filename, encoding='utf-8'):
    """
    Load a CoNLL-2003 file from disk and return its parsed sentences.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        Whatever ``parse_conll2003`` returns for the file's full contents.
    """
    with open(filename, encoding=encoding) as handle:
        raw_text = handle.read()
    parsed = parse_conll2003(raw_text)
    return parsed

def convert_to_dataframe(sentences):
    """
    Build a pandas DataFrame with one row per parsed sentence.

    Args:
        sentences: Iterable of sentences, each an iterable of
            ``(word, tag)`` pairs.

    Returns:
        A DataFrame with two columns: ``tokens`` (the list of words of each
        sentence) and ``ner_tags`` (the list of tags, in the same order).
    """
    token_rows = []
    tag_rows = []

    # Single pass: split every sentence into parallel word and tag lists.
    for pairs in sentences:
        token_rows.append([word for word, _ in pairs])
        tag_rows.append([tag for _, tag in pairs])

    return pd.DataFrame({'tokens': token_rows, 'ner_tags': tag_rows})

def main(input_jsonl, output_conll, output_csv, tokenizer=str.split, jsonl_encoding='latin-1', conll_encoding='utf-8'):
    """
    Run the whole pipeline: JSONL -> CoNLL-2003 file -> parsed sentences -> CSV.

    Args:
        input_jsonl: Path of the JSONL file to convert.
        output_conll: Path where the intermediate CoNLL-2003 file is written
            and then read back from.
        output_csv: Path of the CSV file that receives the final table.
        tokenizer: Callable forwarded to ``convert_to_conll2003``.
        jsonl_encoding: Encoding used when reading ``input_jsonl``.
        conll_encoding: Encoding used when reading ``output_conll`` back.

    Returns:
        None. Both output files are written as side effects.
    """
    # Step 1: write the CoNLL-2003 rendition of the JSONL input.
    convert_to_conll2003(input_jsonl, output_conll, tokenizer=tokenizer, encoding=jsonl_encoding)

    # Step 2: read that file back and turn it into one table row per sentence.
    parsed_sentences = read_and_parse_conll_file(output_conll, encoding=conll_encoding)
    frame = convert_to_dataframe(parsed_sentences)

    # Step 3: persist the table; the CSV is always written, without the index.
    frame.to_csv(output_csv, index=False)

if __name__ == "__main__":
    # Default locations used when this file is executed directly.
    input_jsonl = 'admin.jsonl'
    output_conll = 'datasets/test.dataset'
    output_csv = 'parsed_data.csv'

    # Tokenizer and encodings fall back to the defaults declared on main().
    main(input_jsonl=input_jsonl, output_conll=output_conll, output_csv=output_csv)