In [1]:
import pandas as pd
import re
import os

def read_ann_file_content(ann_file_path):
    """Read a tab-separated annotation (.ann) file into a DataFrame.

    Each usable line has exactly three tab-separated fields:
    ``<id>``, ``<entity_type> <position>`` and ``<entity_text>``.
    The position part is kept as the raw string (it may hold several
    ';'-separated fragments) so callers can parse it themselves.

    Lines that do not have exactly three fields, or whose middle field has
    no space separating the entity type from the position, are skipped.

    Parameters
    ----------
    ann_file_path : str
        Path of the annotation file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``entity_type``, ``position`` and ``entity_text``,
        one row per accepted line, in file order.
    """
    records = []
    # Explicit encoding: without it open() falls back to the platform default,
    # which is not UTF-8 everywhere and would garble non-ASCII entity text.
    with open(ann_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) != 3:
                continue
            id_, entity_info, entity_text = parts
            # partition() never raises: a field without a space is treated as
            # malformed and skipped, like any other line of the wrong shape.
            entity_type, separator, position = entity_info.partition(' ')
            if not separator:
                continue
            records.append((id_, entity_type, position, entity_text))
    return pd.DataFrame(records, columns=['id', 'entity_type', 'position', 'entity_text'])

In [2]:
# Paths setup
input_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev/'

# Report, for every .ann file in the folder, the annotations that share the
# same entity type, position and text with at least one other annotation.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order; sorting keeps the printed report reproducible between runs.
for filename in sorted(os.listdir(input_folder_path)):
    if not filename.endswith('.ann'):
        continue
    ann_file_path = os.path.join(input_folder_path, filename)
    df = read_ann_file_content(ann_file_path)
    # keep=False marks every member of a duplicate group, not only the repeats,
    # so the ids of all conflicting annotations show up in the report.
    duplicates = df[df.duplicated(subset=['entity_type', 'position', 'entity_text'], keep=False)]

    if not duplicates.empty:
        print(f"Duplicates in file {filename}:")
        print(duplicates)

Duplicates in file 26145744_en.ann:
     id entity_type position entity_text
24  T51     ANATOMY  180 185       sinus
30  T62     ANATOMY  377 382       sinus
76  T73     ANATOMY  377 382       sinus
77  T83     ANATOMY  180 185       sinus


In [4]:
# NOTE: This version cannot handle disjoint (discontinuous) entity spans properly.

def parse_position(position_str):
    """Turn a position string into a list of integer offset tuples.

    Fragments are separated by ';' and the offsets inside a fragment by
    whitespace, e.g. ``'10 15;20 25'`` becomes ``[(10, 15), (20, 25)]``.
    """
    spans = []
    for fragment in position_str.split(';'):
        offsets = tuple(int(value) for value in fragment.split())
        spans.append(offsets)
    return spans

def tokenize_with_positions_and_sentences(text):
    """Tokenize ``text`` and locate every token.

    Sentences are delimited by one or more spaces that follow '.', '!' or '?'.
    Inside a sentence a token is a decimal number (digits, dot, digits), a
    single character that is neither a word character nor whitespace, or a
    run of word characters.

    Returns a list of ``(token, start, end, sentence_id)`` tuples in text
    order, where ``start``/``end`` are character offsets into ``text`` and
    ``sentence_id`` counts sentences from 0.
    """
    boundary_pattern = re.compile(r'(?<=[.!?]) +')
    token_pattern = re.compile(r'\d+\.\d+|[^\w\s]|\w+')

    # Work out the [begin, finish) character range of each sentence.
    sentence_ranges = []
    begin = 0
    for boundary in boundary_pattern.finditer(text):
        sentence_ranges.append((begin, boundary.start()))
        begin = boundary.end()
    sentence_ranges.append((begin, len(text)))

    # Scan each sentence range in place so match offsets are already offsets into text.
    located_tokens = []
    for sentence_id, (begin, finish) in enumerate(sentence_ranges):
        for match in token_pattern.finditer(text, begin, finish):
            located_tokens.append((match.group(), match.start(), match.end(), sentence_id))
    return located_tokens

def data_annotation(ann_file_path, txt_file_path, destination_folder_path):
    """Convert one .ann/.txt pair into per-sentence CSV files with nested BIO tags.

    The text is tokenized with ``tokenize_with_positions_and_sentences`` and
    every token gets eight tag columns ``NNER_L1`` .. ``NNER_L8`` initialised
    to ``'O'``. Annotation fragments are processed longest first (ties broken
    by smaller start offset); each fragment is written to the lowest level
    whose tags are still ``'O'`` for all tokens it covers, so longer spans
    occupy lower levels and spans nested in them move to higher levels.

    Limitations visible in the logic below:
    * the fragments of a ';'-separated (discontinuous) position are tagged as
      independent spans, each starting with its own ``B-`` tag;
    * a token is covered only when its *start* offset lies in ``[start, end)``,
      so a token that begins before the fragment start is never tagged.

    Parameters
    ----------
    ann_file_path : str
        Annotation file, read with ``read_ann_file_content``.
    txt_file_path : str
        Text file the annotation offsets refer to (read as UTF-8).
    destination_folder_path : str
        Existing folder that receives one ``<txt basename>_S<sentence id>.csv``
        file per sentence, with columns ``ID``, ``Token``, ``Start``, ``End``
        and ``NNER_L1`` .. ``NNER_L8``.

    Raises
    ------
    ValueError
        If a fragment overlaps already-tagged tokens on all eight levels.
    """
    num_levels = 8

    annotations = read_ann_file_content(ann_file_path)
    annotations['parsed_position'] = annotations['position'].apply(parse_position)
    # One row per fragment: a discontinuous annotation becomes several rows.
    annotations = annotations.explode('parsed_position').reset_index(drop=True)
    # Naming the columns explicitly keeps this step working when the .ann file
    # holds no annotations; an unnamed empty frame has no columns to assign.
    fragment_bounds = pd.DataFrame(
        annotations['parsed_position'].tolist(),
        columns=['start', 'end'],
        index=annotations.index,
    )
    annotations['start'] = fragment_bounds['start']
    annotations['end'] = fragment_bounds['end']
    annotations['length'] = annotations['end'] - annotations['start']
    # Longest fragments first so that enclosing spans claim the lowest levels.
    annotations = annotations.sort_values(by=['length', 'start'], ascending=[False, True])

    # Explicit encoding so offsets are counted on the same characters on every platform.
    with open(txt_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    tokens_with_positions_and_sentences = tokenize_with_positions_and_sentences(text)
    token_starts = [token_info[1] for token_info in tokens_with_positions_and_sentences]
    # Tags are kept in plain lists (one per level) while they are being filled;
    # this replaces repeated DataFrame.iterrows() scans with simple list indexing.
    level_tags = [['O'] * len(token_starts) for _ in range(num_levels)]

    for entity_type, start, end in zip(annotations['entity_type'], annotations['start'], annotations['end']):
        covered = [idx for idx, token_start in enumerate(token_starts) if start <= token_start < end]
        if not covered:
            # No token starts inside this fragment, so there is nothing to tag.
            continue

        # Lowest level on which none of the covered tokens is tagged yet.
        target_tags = next(
            (tags for tags in level_tags if all(tags[idx] == 'O' for idx in covered)),
            None,
        )
        if target_tags is None:
            raise ValueError(
                f"More than {num_levels} nested annotation levels are needed for span "
                f"{start}-{end} ({entity_type}) in {ann_file_path}"
            )

        for idx in covered:
            tag_prefix = 'B' if token_starts[idx] == start else 'I'
            target_tags[idx] = f'{tag_prefix}-{entity_type}'

    df_tokens = pd.DataFrame(tokens_with_positions_and_sentences, columns=['Token', 'Start', 'End', 'SentenceID'])
    for level_number, tags in enumerate(level_tags, start=1):
        df_tokens[f'NNER_L{level_number}'] = tags

    # Replace the numeric sentence id by a file-qualified ID placed in the first column.
    filename_without_extension = os.path.splitext(os.path.basename(txt_file_path))[0]
    sentence_ids = [f'{filename_without_extension}_S{sentence_id}' for sentence_id in df_tokens['SentenceID']]
    df_tokens = df_tokens.drop(columns='SentenceID')
    df_tokens.insert(0, 'ID', sentence_ids)

    filename = None
    for uid in df_tokens['ID'].unique():
        df_filtered = df_tokens[df_tokens['ID'] == uid]
        filename = os.path.join(destination_folder_path, f"{uid}.csv")
        df_filtered.to_csv(filename, index=False)
    # Only the last written file is reported (one log line per document). The
    # guard avoids referencing an unbound name when the text produced no tokens.
    if filename is not None:
        print(f"Saved {filename}")

# Paths setup: .ann/.txt pairs are read from the input folder and the generated
# per-sentence CSV files are written to the destination folder.
input_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/annotation testing'
destination_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/annotation testing'

# exist_ok=True creates the folder when it is missing and does nothing otherwise,
# which removes the gap between a separate existence check and the creation.
os.makedirs(destination_folder_path, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order and the printed log reproducible between runs.
for filename in sorted(os.listdir(input_folder_path)):
    if filename.endswith('.ann'):
        base_filename = os.path.splitext(filename)[0]
        ann_file_path = os.path.join(input_folder_path, filename)
        # The text file is expected next to the annotation file, with the same base name.
        txt_file_path = os.path.join(input_folder_path, base_filename + '.txt')

        if os.path.exists(txt_file_path):
            data_annotation(ann_file_path, txt_file_path, destination_folder_path)
        else:
            print(f"Text file for {filename} not found.")


Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/annotation testing/26281196_en_S7.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/annotation testing/26845866_en_S13.csv


In [None]:
# Archived alternative implementation (author's note: works well).
# Everything below is wrapped in a string literal, so nothing in this cell is
# executed or defined when the cell runs.
# In this variant the fragments of a ';'-separated position are not exploded
# into separate rows: annotations are ordered by the distance from the first
# fragment's start to the last fragment's end, a level is chosen for each
# fragment separately, and only a token that starts exactly at the start of
# the first fragment is tagged 'B-'; every other covered token is tagged 'I-'.

'''import os
import re
import pandas as pd

def parse_position(position_str):
    return [tuple(map(int, pos.split())) for pos in position_str.split(';')]

def tokenize_with_positions_and_sentences(text):
    sentences = re.split(r'(?<=[.!?]) +', text)
    tokens_with_positions_and_sentences = []
    current_sentence_id = 0
    current_position = 0
    for sentence in sentences:
        tokens = re.findall(r'\d+\.\d+|[^\w\s]|\w+', sentence)
        for token in tokens:
            start_index = text.find(token, current_position)
            end_index = start_index + len(token)
            tokens_with_positions_and_sentences.append((token, start_index, end_index, current_sentence_id))
            current_position = end_index
        current_sentence_id += 1
    return tokens_with_positions_and_sentences

def data_annotation(ann_file_path, txt_file_path, destination_folder_path):
    def read_ann_file_content(ann_file_path):
        annotations = []
        with open(ann_file_path, 'r') as file:
            for line in file:
                parts = line.strip().split('\t')
                entity_info = parts[1].split(' ')
                entity_type = entity_info[0]
                positions = ' '.join(entity_info[1:])
                entity_text = parts[2]
                annotations.append({'entity_type': entity_type, 'position': positions, 'text': entity_text})
        return pd.DataFrame(annotations)

    annotations = read_ann_file_content(ann_file_path)
    annotations['parsed_position'] = annotations['position'].apply(parse_position)
    annotations['start'] = annotations['parsed_position'].apply(lambda x: x[0][0])
    annotations['end'] = annotations['parsed_position'].apply(lambda x: x[-1][1])
    annotations['length'] = annotations['end'] - annotations['start']
    annotations.sort_values(by=['length', 'start'], ascending=[False, True], inplace=True)

    with open(txt_file_path, 'r') as file:
        text = file.read()

    tokens_with_positions_and_sentences = tokenize_with_positions_and_sentences(text)
    df_tokens = pd.DataFrame(tokens_with_positions_and_sentences, columns=['Token', 'Start', 'End', 'SentenceID'])
    for i in range(1, 9):
        df_tokens[f'NNER_L{i}'] = 'O'

    for _, annotation in annotations.iterrows():
        entity_type = annotation['entity_type']
        entity_positions = annotation['parsed_position']
        is_first_segment = True

        for start, end in entity_positions:
            overlapping_levels = [level for level in range(1, 9) if any((start <= row['Start'] < end) and row[f'NNER_L{level}'] != 'O' for _, row in df_tokens.iterrows())]
            level = min(set(range(1, 9)) - set(overlapping_levels)) if overlapping_levels else 1

            for i, row in df_tokens.iterrows():
                token_start, token_end = row['Start'], row['End']
                if start <= token_start < end:
                    tag_prefix = 'B' if is_first_segment and token_start == start else 'I'
                    df_tokens.at[i, f'NNER_L{level}'] = f'{tag_prefix}-{entity_type}'
            is_first_segment = False

    filename_without_extension = os.path.splitext(os.path.basename(txt_file_path))[0]
    df_tokens['ID'] = df_tokens['SentenceID'].apply(lambda x: f'{filename_without_extension}_S{x}')
    df_tokens.drop('SentenceID', axis=1, inplace=True)
    cols = ['ID'] + [col for col in df_tokens.columns if col != 'ID']
    df_tokens = df_tokens[cols]

    unique_ids = df_tokens['ID'].unique()
    for uid in unique_ids:
        df_filtered = df_tokens[df_tokens['ID'] == uid]
        filename = os.path.join(destination_folder_path, f"{uid}.csv")
        df_filtered.to_csv(filename, index=False)
    print(f"Saved {filename}")

# Paths setup
input_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/annotation testing'
destination_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/annotation testing'

if not os.path.exists(destination_folder_path):
    os.makedirs(destination_folder_path)

for filename in os.listdir(input_folder_path):
    if filename.endswith('.ann'):
        base_filename = os.path.splitext(filename)[0]
        ann_file_path = os.path.join(input_folder_path, filename)
        txt_file_path = os.path.join(input_folder_path, base_filename + '.txt')
        
        if os.path.exists(txt_file_path):
            data_annotation(ann_file_path, txt_file_path, destination_folder_path)
        else:
            print(f"Text file for {filename} not found.")
'''

In [None]:
# Print a banner, padded with blank lines, announcing that processing is complete.
print("\n\n\n *********************  Process Complete ********************\n\n\n")

In [None]:
# Archived earlier version of the pipeline. It is wrapped in a string literal,
# so running this cell executes and defines nothing.
# In this variant the exploded annotation fragments are ordered by
# ('start', 'end') ascending before levels are assigned, instead of being
# ordered by descending length.
'''import pandas as pd
import re
import os

def read_ann_file_content(ann_file_path):
    ids, entity_types, positions, entity_texts = [], [], [], []
    with open(ann_file_path, 'r') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 3:
                id_, entity_info, entity_text = parts
                entity_type, position = entity_info.split(' ', 1)
                ids.append(id_)
                entity_types.append(entity_type)
                positions.append(position)
                entity_texts.append(entity_text)
    return pd.DataFrame({
        'id': ids,
        'entity_type': entity_types,
        'position': positions,
        'entity_text': entity_texts
    })

def parse_position(position_str):
    return [tuple(map(int, pos.split())) for pos in position_str.split(';')]

def tokenize_with_positions_and_sentences(text):
    # Detect sentences (simple approach, can be improved with NLP tools)
    sentences = re.split(r'(?<=[.!?]) +', text)
    tokens_with_positions_and_sentences = []
    current_sentence_id = 0
    current_position = 0
    for sentence in sentences:
        tokens = re.findall(r'\d+\.\d+|[^\w\s]|\w+', sentence)
        for token in tokens:
            start_index = text.find(token, current_position)
            end_index = start_index + len(token)
            tokens_with_positions_and_sentences.append((token, start_index, end_index, current_sentence_id))
            current_position = end_index
        current_sentence_id += 1
    return tokens_with_positions_and_sentences

def data_annotation(ann_file_path, txt_file_path, destination_folder_path):
    annotations = read_ann_file_content(ann_file_path)
    annotations['parsed_position'] = annotations['position'].apply(parse_position)
    annotations = annotations.explode('parsed_position').reset_index(drop=True)
    annotations[['start', 'end']] = pd.DataFrame(annotations['parsed_position'].tolist(), index=annotations.index)
    annotations['length'] = annotations['end'] - annotations['start']
    annotations.sort_values(by=['start', 'end'], inplace=True)

    with open(txt_file_path, 'r') as file:
        text = file.read()

    tokens_with_positions_and_sentences = tokenize_with_positions_and_sentences(text)
    df_tokens = pd.DataFrame(tokens_with_positions_and_sentences, columns=['Token', 'Start', 'End', 'SentenceID'])
    for i in range(1, 9):
        df_tokens[f'NNER_L{i}'] = 'O'

    for _, annotation in annotations.iterrows():
        entity_type = annotation['entity_type']
        start, end = annotation['start'], annotation['end']
        overlapping_levels = [level for level in range(1, 9) if any((start <= row['Start'] < end) for _, row in df_tokens.iterrows() if row[f'NNER_L{level}'] != 'O')]
        level = min(set(range(1, 9)) - set(overlapping_levels)) if overlapping_levels else 1

        for i, row in df_tokens.iterrows():
            token_start, token_end = row['Start'], row['End']
            if start <= token_start < end:
                tag_prefix = 'B' if token_start == start else 'I'
                df_tokens.at[i, f'NNER_L{level}'] = f'{tag_prefix}-{entity_type}'

    filename_without_extension = os.path.splitext(os.path.basename(txt_file_path))[0]
    df_tokens['ID'] = df_tokens['SentenceID'].apply(lambda x: f'{filename_without_extension}_S{x}')
    df_tokens.drop('SentenceID', axis=1, inplace=True)
    cols = ['ID'] + [col for col in df_tokens.columns if col != 'ID']
    df_tokens = df_tokens[cols]

    unique_ids = df_tokens['ID'].unique()
    for uid in unique_ids:
        df_filtered = df_tokens[df_tokens['ID'] == uid]
        filename = os.path.join(destination_folder_path, f"{uid}.csv")
        df_filtered.to_csv(filename, index=False)
    print(f"Saved {filename}")

# Paths setup
input_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/annotation testing'
destination_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/annotation testing'

if not os.path.exists(destination_folder_path):
    os.makedirs(destination_folder_path)

for filename in os.listdir(input_folder_path):
    if filename.endswith('.ann'):
        base_filename = os.path.splitext(filename)[0]
        ann_file_path = os.path.join(input_folder_path, filename)
        txt_file_path = os.path.join(input_folder_path, base_filename + '.txt')
        
        if os.path.exists(txt_file_path):
            data_annotation(ann_file_path, txt_file_path, destination_folder_path)
        else:
            print(f"Text file for {filename} not found.")
'''

In [5]:

import os
import pandas as pd

# Paths setup: .ann/.txt pairs are read from the input folder and the generated
# per-sentence CSV files are written to the destination folder.
input_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev/'

destination_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4'

# exist_ok=True creates the folder when it is missing and does nothing otherwise,
# which removes the gap between a separate existence check and the creation.
os.makedirs(destination_folder_path, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order and the printed log reproducible between runs.
for filename in sorted(os.listdir(input_folder_path)):
    if filename.endswith('.ann'):
        base_filename = os.path.splitext(filename)[0]
        ann_file_path = os.path.join(input_folder_path, filename)
        # The text file is expected next to the annotation file, with the same base name.
        txt_file_path = os.path.join(input_folder_path, base_filename + '.txt')

        if os.path.exists(txt_file_path):
            data_annotation(ann_file_path, txt_file_path, destination_folder_path)
        else:
            print(f"Text file for {filename} not found.")

Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/26489117_en_S14.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/27030325_en_S10.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/26525628_en_S6.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/26356060_en_S12.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/26977908_en_S6.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/26525809_en_S5.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/26485774_en_S12.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/26978233_en_S8.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/26356398_en_S13.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONN

In [6]:
import os
import pandas as pd

# Paths setup: .ann/.txt pairs are read from the input folder and the generated
# per-sentence CSV files are written to the destination folder.
input_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train/'

destination_folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4'

# exist_ok=True creates the folder when it is missing and does nothing otherwise,
# which removes the gap between a separate existence check and the creation.
os.makedirs(destination_folder_path, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order and the printed log reproducible between runs.
for filename in sorted(os.listdir(input_folder_path)):
    if filename.endswith('.ann'):
        base_filename = os.path.splitext(filename)[0]
        ann_file_path = os.path.join(input_folder_path, filename)
        # The text file is expected next to the annotation file, with the same base name.
        txt_file_path = os.path.join(input_folder_path, base_filename + '.txt')

        if os.path.exists(txt_file_path):
            data_annotation(ann_file_path, txt_file_path, destination_folder_path)
        else:
            print(f"Text file for {filename} not found.")


Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/26281196_en_S7.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/26591552_en_S10.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/26120981_en_S6.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/26245096_en_S5.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/26356162_en_S9.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/26978051_en_S12.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/26977625_en_S6.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/26356617_en_S6.csv
Saved Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/26355933_en_S6.csv
Saved Dataset/DATASET_BIONNE_UPDATE

In [None]:
import os
import pandas as pd

def find_empty_columns_in_csv(folder_path):
    """Print the CSV file(s) in ``folder_path`` with the fewest "empty" columns.

    A column counts as empty when every one of its values equals the string
    ``"O"``. All columns of a file are inspected, whatever their name.

    For each file that reaches the minimum count, one line
    ``Filename: <name>, Empty Columns: <count>`` is printed. If the folder
    contains no ``.csv`` file, a short notice is printed instead of failing.

    Parameters
    ----------
    folder_path : str
        Folder whose ``.csv`` files are inspected (not recursive).

    Returns
    -------
    None
    """
    # Maps each CSV filename to its number of all-"O" columns.
    results = {}

    # Sorted so the report order does not depend on the arbitrary order of os.listdir().
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(folder_path, filename))
        results[filename] = sum(1 for column in df.columns if (df[column] == 'O').all())

    # min() raises ValueError on an empty collection, so handle "no CSV files" explicitly.
    if not results:
        print(f"No CSV files found in {folder_path}")
        return

    min_empty_columns = min(results.values())
    for file, count in results.items():
        if count == min_empty_columns:
            print(f"Filename: {file}, Empty Columns: {count}")

# Folder with the generated CSV files to inspect
folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/'

# Report the file(s) that have the fewest all-"O" columns
find_empty_columns_in_csv(folder_path=folder_path)


In [None]:
# Print a short completion marker.
print("Done")

In [12]:
import os
import pandas as pd

def find_files_with_two_empty_columns(folder_path, max_empty_columns=3):
    """Print the CSV files that have at most ``max_empty_columns`` empty columns.

    A column counts as empty when every one of its values equals the string
    ``"O"``. All columns of a file are inspected, whatever their name.

    Despite the historical function name, the selection is a threshold, not an
    exact match: a file is reported when its number of empty columns is less
    than or equal to ``max_empty_columns`` (the default of 3 reproduces the
    previous hard-coded ``< 4`` test). Each reported line shows the count
    actually measured for that file.

    Parameters
    ----------
    folder_path : str
        Folder whose ``.csv`` files are inspected (not recursive).
    max_empty_columns : int, optional
        Largest number of empty columns a file may have to be reported.

    Returns
    -------
    None
    """
    # (filename, measured number of empty columns) for every selected file.
    matching_files = []

    # Sorted so the report order does not depend on the arbitrary order of os.listdir().
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(folder_path, filename))
        empty_columns = sum(1 for column in df.columns if (df[column] == 'O').all())
        if empty_columns <= max_empty_columns:
            matching_files.append((filename, empty_columns))

    if matching_files:
        for file, empty_columns in matching_files:
            # Report the measured count instead of a fixed number.
            print(f"Filename: {file}, Empty Columns: {empty_columns}")
    else:
        print(f"No files with at most {max_empty_columns} empty columns found.")

# Folder with the generated CSV files to inspect
folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/'

# List the files selected by the empty-column check
find_files_with_two_empty_columns(folder_path=folder_path)


Filename: 26281196_en_S5.csv, Empty Columns: 3
Filename: 26591552_en_S9.csv, Empty Columns: 3
Filename: 26485778_en_S9.csv, Empty Columns: 3
Filename: 26845866_en_S12.csv, Empty Columns: 3
Filename: 26978635_en_S0.csv, Empty Columns: 3
Filename: 25842921_en_S6.csv, Empty Columns: 3
Filename: 27029445_en_S6.csv, Empty Columns: 3
Filename: 26081319_en_S3.csv, Empty Columns: 3


In [15]:
import os
import pandas as pd

def print_empty_columns_for_each_file(folder_path):
    """Print every CSV file in ``folder_path`` that has at most two empty columns.

    A column is empty when all of its values equal the string "O". One line
    ``Filename: <name>, Empty Columns: <count>`` is printed per selected file.
    """
    csv_names = (name for name in os.listdir(folder_path) if name.endswith('.csv'))
    for filename in csv_names:
        frame = pd.read_csv(os.path.join(folder_path, filename))

        # Number of columns made up exclusively of "O" values
        empty_columns = sum(1 for name in frame.columns if (frame[name] == 'O').all())

        # Files with more than two empty columns are not reported
        if empty_columns > 2:
            continue
        print(f"Filename: {filename}, Empty Columns: {empty_columns}")

# Report for the train output folder
print("Train:")
folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/train_Nested_entity_separated_V4/'
print_empty_columns_for_each_file(folder_path=folder_path)

# Same report for the dev output folder
print("Dev:")
folder_path = 'Dataset/DATASET_BIONNE_UPDATED/DATASET_BIONNE/en/dev_Nested_entity_separated_V4/'
print_empty_columns_for_each_file(folder_path=folder_path)


Train:
Dev:
Filename: 27030325_en_S4.csv, Empty Columns: 2
Filename: 27030325_en_S7.csv, Empty Columns: 2
Filename: 26485774_en_S0.csv, Empty Columns: 2
Filename: 26145744_en_S1.csv, Empty Columns: 2
Filename: 26145744_en_S2.csv, Empty Columns: 2


In [None]:
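Therefore, the number of nesting levels is 6: every file keeps at least 2 of its 8 NNER_L columns empty (all "O"), so at most 6 levels are ever used.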