In [None]:
from google.colab import drive
from google.colab import userdata

In [None]:
# Mount Google Drive at /content/drive; later cells read files below that path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy annotation_results_to_fix_5000.jsonl from Drive into the current working directory.
# NOTE(review): no later cell visible here reads this local copy - confirm it is still needed.
!cp /content/drive/My\ Drive/papers/structuring_dictionaries/annotation_results_to_fix_5000.jsonl .

In [None]:
# List the working directory: long format, hidden entries included, human-readable sizes.
!ls -lah

total 880K
drwxr-xr-x 1 root root 4.0K Sep 18 16:27 .
drwxr-xr-x 1 root root 4.0K Sep 18 16:25 ..
-rw------- 1 root root 859K Sep 18 16:27 annotation_results_to_fix_5000.jsonl
drwxr-xr-x 4 root root 4.0K Sep 16 13:26 .config
drwx------ 5 root root 4.0K Sep 18 16:27 drive
drwxr-xr-x 1 root root 4.0K Sep 16 13:27 sample_data


In [None]:
# Path of the JSON Lines file on Drive that is loaded into `data` below.
annotations_results_path = '/content/drive/My Drive/papers/structuring_dictionaries/structured_yudakhin_001_manual_annotations.jsonl'

In [None]:
import json

# Load the annotation records: the file holds one JSON object per line.
filename = annotations_results_path
with open(filename, 'r', encoding='utf-8') as file:
    stripped_lines = (line.strip() for line in file)
    # An empty line (e.g. a trailing blank line) is not valid JSON; skip it
    # instead of letting json.loads raise on it.
    data = [json.loads(stripped) for stripped in stripped_lines if stripped]

print(type(data[0]))

<class 'dict'>


In [None]:
# Show the field names of the first loaded record.
data[0].keys()

dict_keys(['custom_id', 'text', '_input_hash', '_task_hash', 'tokens', '_view_id', 'answer', '_timestamp', '_annotator_id', '_session_id'])

In [None]:
def parse_jsonl(jsonl_str):
    """Decode a single JSON document from a string.

    Args:
        jsonl_str (str): Text holding one JSON document, e.g. one line of a
            JSON Lines file.

    Returns:
        The decoded Python object, or None when the text is not valid JSON
        (in that case the offending text is printed).
    """
    try:
        decoded = json.loads(jsonl_str)
    except json.JSONDecodeError:
        print(f"Error parsing JSON: {jsonl_str}")
        decoded = None
    return decoded

In [None]:
def get_content_from_output_jsonl(jsonl_str):
    """Extract the task id and the generated message text from one result line.

    Args:
        jsonl_str (str): One line of a batch result file: a JSON document with
            'custom_id', 'error' and 'response' entries.

    Returns:
        tuple | None: (custom_id, content), where content is read from
        response -> body -> choices[0] -> message -> content. None when the
        line cannot be parsed, or when it carries an error (the error message
        is printed).
    """
    parsed_json = parse_jsonl(jsonl_str)
    if parsed_json is None:
        # parse_jsonl returns None for malformed JSON (and has printed the line);
        # subscripting None below would raise TypeError.
        return None

    # .get(): a line without an 'error' entry is handled like one whose error is empty.
    error = parsed_json.get('error')
    if error:
        print(error['message'])
        return None

    custom_id = parsed_json['custom_id']
    content = parsed_json['response']['body']['choices'][0]['message']['content']
    return custom_id, content

In [None]:
# Folder on Drive that holds the batch result files.
BASE_PATH = '/content/drive/My Drive/papers/structuring_dictionaries/batches/'

# Result files batch_001 through batch_005, in order.
llm_generated_results = [
    f'batch_{batch_no:03d}_result.jsonl' for batch_no in range(1, 6)
]

# custom_id -> generated message content; filled by the loop that reads the result files.
custom_task_id_to_llm_output_json = {}

In [None]:
# Read every batch result file and index the generated content by custom_id.
for llm_output_file in llm_generated_results:
    # Explicit UTF-8 so that decoding does not depend on the platform default.
    with open(BASE_PATH + llm_output_file, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # blank line: nothing to parse
            result = get_content_from_output_jsonl(line)
            if result is None:
                # The helper returns None for lines that carry an error;
                # unpacking None would raise TypeError, so skip such lines.
                continue
            custom_id, content = result
            custom_task_id_to_llm_output_json[custom_id] = content

In [None]:
def build_annotated_texts_and_labels(sample):
    """Render the annotated spans of a sample as "label: text" lines.

    Args:
        sample (dict): Record with a 'text' string and, optionally, a 'spans'
            list whose items carry 'start', 'end' and 'label'.

    Returns:
        str: One "<label>: <text[start:end]>" line per span, joined with
        newlines; an empty string when the sample has no spans.
    """
    def _render(span):
        # Slice the covered fragment out of the sample text.
        begin, finish = span['start'], span['end']
        fragment = sample['text'][begin:finish]
        return f"{span['label']}: {fragment}"

    return "\n".join(_render(span) for span in sample.get('spans', []))

In [None]:
# Separator line; the row-building loop keeps only the part of each sample's text
# that precedes its first occurrence (text.split(DELIMITER)[0]).
DELIMITER = "\n-----------------------------------\n"

In [None]:
import pandas as pd

# Empty frame that fixes the column layout for the collected annotation rows.
df = pd.DataFrame(
    columns=[
        'custom_id',
        'text',
        'orig_json',
        'annotated_texts_and_labels',
        'annotator_id',
        'timestamp',
    ]
)

In [None]:
# Collect the samples that carry no annotation spans and build `df` from them.
# Rows are gathered in a list and the frame is created once: growing a DataFrame
# with pd.concat inside the loop copies it on every iteration, and re-running the
# cell would append the same rows to `df` a second time.
accepted_rows = []
for sample in data:
    custom_id = sample['custom_id']
    text = sample['text']
    orig_json = custom_task_id_to_llm_output_json[custom_id]
    annotated_texts_and_labels = build_annotated_texts_and_labels(sample)

    if len(annotated_texts_and_labels) != 0:
        # we need only correct ones. If there are annotations, then it has errors
        continue

    annotator_id = sample['_annotator_id']
    timestamp = sample['_timestamp']
    accepted_rows.append({
        'custom_id': custom_id,
        # keep only the part of the text that precedes the first delimiter line
        'text': text.split(DELIMITER)[0],
        'orig_json': orig_json,
        'annotated_texts_and_labels': annotated_texts_and_labels,
        'annotator_id': annotator_id,
        'timestamp': timestamp
    })

# Passing `columns` keeps the column layout even when no row was accepted.
df = pd.DataFrame(
    accepted_rows,
    columns=['custom_id', 'text', 'orig_json', 'annotated_texts_and_labels', 'annotator_id', 'timestamp'],
)


print(df.shape)
df.head()

(5583, 6)


Unnamed: 0,custom_id,text,orig_json,annotated_texts_and_labels,annotator_id,timestamp
0,task-0,растирать\tнесов.\nсм. растереть.,"{\n ""ru"": ""растирать"",\n ""meta"": ""несов."",\n...",,yudakhin_001-user1,1721756176
1,task-1,похищать\tнесов.\nсм. похитить.,"{\n ""ru"": ""похищать"",\n ""meta"": ""несов."",\n ...",,yudakhin_001-user1,1721756192
2,task-2,"красящий,\t­ая, -ее\n1. прич. от красить;\n2. ...","{\n ""ru"": ""красящий"",\n ""meta"": ""­ая, -ее"",\...",,yudakhin_001-user1,1721756231
3,task-3,материнство\tср.\nэнелик (1. эненин балага сез...,"{\n ""ru"": ""материнство"",\n ""meta"": ""ср."",\n ...",,yudakhin_001-user1,1721756253
4,task-4,"единение\tср.\nбиригишүү, биригүү, биргелешүү;...","{\n ""ru"": ""единение"",\n ""meta"": ""ср."",\n ""k...",,yudakhin_001-user1,1721756281


In [None]:
# Mark every row whose 'text' value occurs more than once; keep=False flags all
# occurrences, including the first one.
repeated_text_mask = df.duplicated(subset=['text'], keep=False)
duplicates = df[repeated_text_mask]

# Report how many rows are involved
duplicate_row_count = duplicates.shape[0]
print(f"Number of duplicate rows: {duplicate_row_count}")

# Display the flagged rows
duplicates

Number of duplicate rows: 1801


Unnamed: 0,custom_id,text,orig_json,annotated_texts_and_labels,annotator_id,timestamp
68,task-78,"квартирный,\t­ая, -ое\nквартира 1-ге т.;\nквар...","{\n ""ru"": ""квартирный"",\n ""meta"": ""­ая, -ое""...",,yudakhin_001-user2,1721836253
69,task-79,халтурить\tнесов. неодобр.\nхалтура кылуу (1. ...,"{\n ""ru"": ""халтурить"",\n ""meta"": ""несов. нео...",,yudakhin_001-user2,1721836279
70,task-78,"квартирный,\t­ая, -ое\nквартира 1-ге т.;\nквар...","{\n ""ru"": ""квартирный"",\n ""meta"": ""­ая, -ое""...",,yudakhin_001-user3,1721831466
71,task-79,халтурить\tнесов. неодобр.\nхалтура кылуу (1. ...,"{\n ""ru"": ""халтурить"",\n ""meta"": ""несов. нео...",,yudakhin_001-user3,1721831495
180,task-220,обмачивать\tнесов.\nсм. обмочить.,"{\n ""ru"": ""обмачивать"",\n ""meta"": ""несов."",\...",,yudakhin_001-user4,1721846388
...,...,...,...,...,...,...
5578,task-4582,щах\tпредл. п. от щи.,"{\n ""ru"": ""щах"",\n ""meta"": ""предл. п."",\n ""...",,yudakhin_001-user4,1722237961
5579,task-4583,"многоярусный,\t­ая, -ое\nкөп ярустуу;\nмногояр...","{\n ""ru"": ""многоярусный"",\n ""meta"": ""­ая, -о...",,yudakhin_001-user4,1722238083
5580,task-4585,бал\tм.\nбал (бийлөө менен боло турган зоок ке...,"{\n ""ru"": ""бал"",\n ""meta"": ""м."",\n ""ky"": [\...",,yudakhin_001-user4,1722238168
5581,task-4586,"шестигранный,\t­ая, -ое\nалты кырдуу.","{\n ""ru"": ""шестигранный"",\n ""meta"": ""­ая, -о...",,yudakhin_001-user4,1722238181


In [None]:
# Keep the first row for each distinct 'text' value and discard the later repeats.
is_later_repeat = df.duplicated(subset=['text'], keep='first')
df_deduped = df[~is_later_repeat]

print(f"Shape of the deduplicated DataFrame: {df_deduped.shape}")

df_deduped.head()

Shape of the deduplicated DataFrame: (4635, 6)


Unnamed: 0,custom_id,text,orig_json,annotated_texts_and_labels,annotator_id,timestamp
0,task-0,растирать\tнесов.\nсм. растереть.,"{\n ""ru"": ""растирать"",\n ""meta"": ""несов."",\n...",,yudakhin_001-user1,1721756176
1,task-1,похищать\tнесов.\nсм. похитить.,"{\n ""ru"": ""похищать"",\n ""meta"": ""несов."",\n ...",,yudakhin_001-user1,1721756192
2,task-2,"красящий,\t­ая, -ее\n1. прич. от красить;\n2. ...","{\n ""ru"": ""красящий"",\n ""meta"": ""­ая, -ее"",\...",,yudakhin_001-user1,1721756231
3,task-3,материнство\tср.\nэнелик (1. эненин балага сез...,"{\n ""ru"": ""материнство"",\n ""meta"": ""ср."",\n ...",,yudakhin_001-user1,1721756253
4,task-4,"единение\tср.\nбиригишүү, биригүү, биргелешүү;...","{\n ""ru"": ""единение"",\n ""meta"": ""ср."",\n ""k...",,yudakhin_001-user1,1721756281


**Export the deduplicated records to a JSON Lines (`.jsonl`) file**

In [None]:
# Keep the task id, the source text and the LLM output; the latter is exported as 'json'.
export_columns = ['custom_id', 'text', 'orig_json']
df_export = df_deduped[export_columns].rename(columns={'orig_json': 'json'})

# Write one JSON object per line (JSON Lines); force_ascii=False leaves
# non-ASCII characters unescaped in the output.
output_filename = 'structured_json_records_from_unstructured_data.jsonl'
df_export.to_json(output_filename, orient='records', lines=True, force_ascii=False)

print(f"Data successfully exported to {output_filename}")

Data successfully exported to structured_json_records_from_unstructured_data.jsonl


In [None]:
# JSON Schema that the parsed `orig_json` payloads are validated against,
# assembled from small fragment builders. Each builder returns a fresh dict,
# so no fragment object is shared between branches of the schema.

def _string():
    """Schema fragment for a string value."""
    return {"type": "string"}


def _array_of(item_schema):
    """Schema fragment for an array whose items follow `item_schema`."""
    return {"type": "array", "items": item_schema}


def _object(properties, required):
    """Schema fragment for an object with the given properties and required keys."""
    return {"type": "object", "properties": properties, "required": required}


def _description():
    """Schema fragment for a description object with required 'ky' and 'ru' strings."""
    return _object({"ky": _string(), "ru": _string()}, ["ky", "ru"])


json_schema = _object(
    {
        "ru": _string(),
        "meta": _string(),
        # "ky": array of objects, each requiring description, translations and examples
        "ky": _array_of(_object(
            {
                "description": _description(),
                "translations": _array_of(_string()),
                "examples": _array_of(_object(
                    {"ru": _string(), "ky": _array_of(_string())},
                    ["ru", "ky"],
                )),
            },
            ["description", "translations", "examples"],
        )),
        # "ref": array of objects, each requiring a word and its description
        "ref": _array_of(_object(
            {"word": _string(), "description": _description()},
            ["word", "description"],
        )),
    },
    # only "ru" and "meta" are mandatory at the top level
    ["ru", "meta"],
)

In [None]:
!pip install jsonschema



In [None]:
import json

import jsonschema
from jsonschema import validate

# Schema check for a single parsed record
def validate_json_against_schema(json_obj, schema):
    """
    Check a parsed JSON object against a JSON schema.

    Args:
        json_obj (dict): The parsed JSON object to check.
        schema (dict): The JSON schema it has to satisfy.

    Returns:
        tuple: (True, None) when the object is valid; (False, message) when
        validation fails, where message is the string form of the
        ValidationError.
    """
    try:
        validate(instance=json_obj, schema=schema)
    except jsonschema.exceptions.ValidationError as validation_error:
        # Only validation failures are reported; other exceptions propagate.
        return False, str(validation_error)
    else:
        return True, None

# Validate the stored LLM output of every row in `df` and tally the outcome.
corrects = 0
incorrects = 0

for idx, row in df.iterrows():
    try:
        json_dict = json.loads(row['orig_json'])
    except json.JSONDecodeError:
        # Output that is not parseable JSON cannot satisfy the schema either:
        # count it as incorrect instead of aborting the whole pass.
        incorrects += 1
        continue

    is_valid, error_message = validate_json_against_schema(json_dict, json_schema)

    if is_valid:
        corrects += 1
    else:
        incorrects += 1

print('corrects: ', corrects)

print('incorrects: ', incorrects)

corrects:  5583
incorrects:  0
