In [None]:
!pip install openai



In [None]:
import random
import numpy as np
from collections import defaultdict
from google.colab import drive
from google.colab import userdata

In [None]:
# Mount Google Drive at /content/drive; the input/output paths defined later
# in this notebook all live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
SYSTEM_PROMPT = """You are an assistant tasked with formatting old Russian-Kyrgyz dictionary entries into a computer-readable format.

In the following messages, you will be given partially formatted dictionary entries, which you need to convert into JSON format using the SCHEMA DEFINITION provided below.

SCHEMA DEFINITION:
```
{
  "type": "object",
  "properties": {
    "ru": {
      "type": "string"
    },
    "meta": {
      "type": "string"
    },
    "ky": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "description": {
            "type": "object",
            "properties": {
              "ky": {
                "type": "string"
              },
              "ru": {
                "type": "string"
              }
            },
            "required": ["ky", "ru"]
          },
          "translations": {
            "type": "array",
            "items": {
              "type": "string"
            }
          },
          "examples": {
            "type": "array",
            "items": {
              "type": "object",
              "properties": {
                "ru": {
                  "type": "string"
                },
                "ky": {
                  "type": "array",
                  "items": {
                    "type": "string"
                  }
                }
              },
              "required": ["ru", "ky"]
            }
          }
        },
        "required": ["description", "translations", "examples"]
      }
    },
    "ref": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "word": {
            "type": "string"
          },
          "description": {
            "type": "object",
            "properties": {
              "ky": {
                "type": "string"
              },
              "ru": {
                "type": "string"
              }
            },
            "required": ["ky", "ru"]
          }
        },
        "required": ["word", "description"]
      }
    }
  },
  "required": ["ru", "meta", "ky"]
}
```

DETAILS ON FIELDS:
* "ru" string field: This is the dictionary entry's key, representing the word or phrase in Russian. It acts as the main reference for the entry.
* "meta" string field: This field contains metadata about the word, providing details such as part of speech, grammatical gender, and usage notes. These details help understand the grammatical and contextual aspects of the word in Russian. For example: "ср.", "сов.", "несов.", "женск. р.", "сов. разг.", "ж. разг.", "­ая, -ое", "м.", "сов. чего, груб.", "сов. что, чего, разг.", "несов.", "нареч.", "сов. что, чего", "ср.", "несов.", "сов. что чем, офиц.", "несов.", "сов. кого", "м. лит." … etc. In most cases, this information follows the dictionary's key.
* "ky" array: Contains multiple objects each comprising fields for "description", "translations", and "examples" related to the word's usage in Kyrgyz.
    * "translations" array: Lists the translations of the Russian word into Kyrgyz. Each translation is provided as a separate string. This section should include only the translations without any additional comments or explanatory texts. REMEMBER! PUT ONLY KYRGYZ TEXTS.
    * "description" object:
        * "ky": This subfield should contain explanations, comments, or additional information about the word exclusively in Kyrgyz. It should not include translations but may contain context, usage examples, or cultural notes. REMEMBER! PUT HERE ONLY KYRGYZ TEXTS, BUT DO NOT PUT TRANSLATIONS HERE.
        * "ru": This subfield should include explanations, comments, or additional information exclusively in Russian. Like the Kyrgyz description, this should provide context, usage examples, or cultural notes relevant to the Russian language. Can only include text in Russian. REMEMBER! PUT HERE ONLY RUSSIAN TEXTS. Examples that usually go into this field: "полит.", "перен.", "кого", "что", "церк.", "театр." etc.
    * "examples" array: Contains objects that provide example sentences or phrases showcasing the use of the word and corresponding translations:
        * "ru" — Includes an example in Russian.
        * "ky" — Includes translations of the Russian example. In this array multiple examples can be listed to show different usages or nuances.
* "ref" array — Used when referring to another dictionary entry, linking related words or entries to provide additional context. Each item holds the referenced "word" and a "description" object with "ky" and "ru" subfields. This can help users understand connections between words and see related vocabulary.

EXAMPLES:
———
Entry:
обновление ср.\n1. жаңыруу, жаңыртуу, жаңылоо, жаңылануу;\nобновление методов работы иштин методдорун жаңылоо;\n2. (починка) оңдоо, түзөтүү;\n3. перен. (пополнение) жаңыртуу, калыбына келтирилүү;\nобновление знаний билимин жаңыртуу, билимин калыбына келтирүү.

Comments: Look at the "translations" fields below. Multiple Kyrgyz translations are listed in each of them there.

Computer-readable format:
```
{
  "ru": "обновление",
  "meta":  "ср.",
  "ky": [
    {
      "description": {
        "ky": "",
        "ru": ""
      },
      "translations": [
        "жаңыруу",
        "жаңыртуу",
        "жаңылоо",
        "жаңылануу"
      ],
      "examples": [
        {
          "ru": "обновление методов работы",
          "ky": [
            "иштин методдорун жаңылоо"
          ]
        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "починка"
      },
      "translations": [
        "оңдоо",
        "түзөтүү"
      ],
      "examples": [

      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "перен. (пополнение)"
      },
      "translations": [
        "жаңыртуу",
        "калыбына келтирилүү"
      ],
      "examples": [
        {
          "ru": "обновление знаний",
          "ky": [
            "билимин жаңыртуу",
            "билимин калыбына келтирүү"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
анархизм м.\nанархизм (1. полит. майда буржуазиялык реакциялык агым; бул агым мамлекттин кандайынын болсо да, ошонун ичинде пролетариат диктатурасынын да зарылдыгын, пролетардык партиянып уюшулган саясий күрөшүнүн жана жетекчилик ролунун зарылдыгын танат; 2. перен. ээнбаштык, авторитетти тоотпоочулук; тартипке, дисциплинага моюн сунбоочулук).

Comments: Look at the "translations" field below. Since the word has the same translation as the original, Kyrgyz version looks the same.

Computer-readable format:
```
{
  "ru": "анархизм",
  "meta": "м.",
  "ky": [
    {
      "description": {
        "ky": "майда буржуазиялык реакциялык агым; бул агым мамлекттин кандайынын болсо да, ошонун ичинде пролетариат диктатурасынын да зарылдыгын, пролетардык партиянын уюшулган саясий күрөшүнүн жана жетекчилик ролунун зарылдыгын танат",
        "ru": "полит."
      },
      "translations": [
        "анархизм"
      ],
      "examples": []
    },
    {
      "description": {
        "ky": "ээнбаштык, авторитетти тоотпоочулук; тартипке, дисциплинага моюн сунбоочулук",
        "ru": "перен."
      },
      "translations": [
        "анархизм"
      ],
      "examples": []
    }
  ]
}
```
———
Entry:
обновляться несов.\n1. см. обновиться;\n2. страд. к обновлять.

Comment: In the output below, no Kyrgyz translation is provided, but references to other dictionary entries are included.

Computer-readable format:
```
{
  "ru": "обновляться",
  "meta":  "несов.",
  "ref": [
    {
      "word": "обновиться",
      "description": {
        "ky": "",
        "ru": "см."
      }
    },
    {
      "word": "обновлять",
      "description": {
        "ky": "",
        "ru": "страд. к"
      }
    }
  ]
}
```
———
Entry:
чужеземка женск. р. к чужеземец.

Comment: In the example below there's no Kyrgyz translation, but it has a reference to another dictionary entry.

Computer-readable format:
```
{
  "ru": "чужеземка",
  "meta":  "женск. р.",
  "ref": [
    {
      "word": "чужеземец",
      "description": {
        "ky": "",
        "ru": "к"
      }
    }
  ]
}
```
———
Entry:
обноситься сов. разг.\n1. (износить одежду) кийими жыртылып бүтүү;\n2. (стать удобным) кийилип жүрүп ык алуу, калып алуу;\nваленки обносились кийиз өтүк бутка калып алып калды;\n3. (обветшать от носки) эскирүү, тамтыгы чыгуу;\nплатье обносилось көйнөк эскирди.

Comments: Look at the "translations" field below. Multiple Kyrgyz translations are listed in each of them there.

Computer-readable format:
```
{
  "ru": "обноситься",
  "meta": "сов. разг.",
  "ky": [
    {
      "description": {
        "ky": "",
        "ru": "износить одежду"
      },
      "translations": [
        "кийими жыртылып бүтүү"
      ],
      "examples": [

      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "стать удобным"
      },
      "translations": [
        "кийилип жүрүп ык алуу",
        "калып алуу"
      ],
      "examples": [
        {
          "ru": "валенки обносились",
          "ky": [
            "кийиз өтүк бутка калып алып калды"
          ]
        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "обветшать от носки"
      },
      "translations": [
        "эскирүү",
        "тамтыгы чыгуу"
      ],
      "examples": [
        {
          "ru": "платье обносилось",
          "ky": [
            "көйнөк эскирди"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
обновка ж. разг.\nжаңы алынган буюм (мис. жаңы кийим).

Comments: Look at the "translations" field below. A Kyrgyz translation is listed there.

Computer-readable format:
```
{
  "ru": "обновка",
  "meta": "ж. разг.",
  "ky": [
    {
      "description": {
        "ky": "мис. жаңы кийим",
        "ru": ""
      },
      "translations": [
        "жаңы алынган буюм"
      ],
      "examples": [

      ]
    }
  ]
}
```
———
Entry:
православный, ­ая, -ое\n1. православие-ге т.;\nправославная вера православие дини;\n2. в знач. сущ. м., ж. православие дининдеги киши.

Comment: In the example below, "translations" is empty in one instance because the translation is not provided separately but is given through the "examples".

Computer-readable format:
```
{
  "ru": "православный",
  "meta": "­ая, -ое",
  "ky": [
    {
      "description": {
        "ky": "православие-ге т.",
        "ru": ""
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "православная вера",
          "ky": [
            "православие дини"
          ]
        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "в знач. сущ. м., ж."
      },
      "translations": [
        "православие дининдеги киши"
      ],
      "examples": [

      ]
    }
  ]
}
```
———
Entry:
автопарк м.\nавтопарк (автомобилдер туруучу, ремонттолуучу жай).

Comments: Look at the "translations" field below. Since the word has the same translation as the original, Kyrgyz version looks the same.

Computer-readable format:
```
{
  "ru": "автопарк",
  "meta": "м.",
  "ky": [
    {
      "description": {
        "ru": "",
        "ky": "автомобилдер туруучу, ремонттолуучу жай"
      },
      "translations": [
        "автопарк"
      ],
      "examples": [

      ]
    }
  ]
}
```
———
Entry:
нажраться сов. чего, груб.\nтоюу (алпылдап сугунуп, алпылдап жеп-ичип).

Comments: Look at the "translations" field below. A Kyrgyz translation is listed there.

Computer-readable format:
```
{
  "ru": "нажраться",
  "meta": "сов. чего, груб.",
  "ky": [
    {
      "description": {
        "ru": "",
        "ky": "алпылдап сугунуп, алпылдап жеп-ичип"
      },
      "translations": [
        "тоюу"
      ],
      "examples": [

      ]
    }
  ]
}
```
———
Entry:
накидать сов. что, чего, разг.\nчачып таштоо, таштай берүү, ыргыта берүү;\nсм. набросать I.

Comments: Look at the "translations" field below. Multiple Kyrgyz translations are listed there.

Computer-readable format:
```
{
  "ru": "накидать",
  "meta": "сов. что, чего, разг.",
  "ref": [
    {
      "word": "набросать",
      "description": {
        "ru": "см.",
        "ky": ""
      }

    }
  ],
  "ky": [
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [
        "чачып таштоо",
        "таштай берүү",
        "ыргыта берүү"
      ],
      "examples": [

      ]
    }
  ]
}
```
———
Entry:
налегать несов.\n1. см. налечь;\n2. (лежать сверху) кабатталып жатуу;\nгорные породы налегают одна на другую тоо тектери биринин үстүндө бири кабатталып жатат.

Comments: Look at the "translations" field below. A Kyrgyz translation is listed there.

Computer-readable format:
```
{
  "ru": "налегать",
  "meta": "несов.",
  "ref": [
    {
      "word": "налечь",
      "description": {
        "ru": "см.",
        "ky": ""
      }
    }
  ],
  "ky": [
    {
      "description": {
        "ru": "лежать сверху",
        "ky": ""
      },
      "translations": [
        "кабатталып жатуу"
      ],
      "examples": [
        {
          "ru": "горные породы налегают одна на другую",
          "ky": [
            "тоо тектери биринин үстүндө бири кабатталып жатат"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
налево нареч.\nсолго, сол тарапка, сол жакка (в левую сторону); сол тарапта, сол жакта (на левой стороне);\nпрохожий свернул налево жолоочу сол жакка бурулуп кетти;\nналево от дома - лес үйдүн сол жагында токой бар.

Comments: Look at the "translations" fields below. Multiple Kyrgyz translations are listed in each of them.

Computer-readable format:
```
{
  "ru": "налево",
  "meta": "нареч.",
  "ky": [
    {
      "description": {
        "ru": "в левую сторону",
        "ky": ""
      },
      "translations": [
        "солго",
        "сол тарапка",
        "сол жакка"
      ],
      "examples": [
        {
          "ru": "прохожий свернул налево",
          "ky": [
            "жолоочу сол жакка бурулуп кетти"
          ]
        }
      ]
    },
    {
      "description": {
        "ru": "на левой стороне",
        "ky": ""
      },
      "translations": [
        "сол тарапта",
        "сол жакта"
      ],
      "examples": [
        {
          "ru": "налево от дома - лес",
          "ky": [
            "үйдүн сол жагында токой бар"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
намариновать сов. что, чего\nмаринаддоо (түрдүү жемишти ачык сууга салуу, ачытуу, кычкылдантуу).

Comments: In the example below, the explanation "(түрдүү жемишти ачык сууга салуу, ачытуу, кычкылдантуу)" has been included in the "ky" section of the dictionary's "description" because it is an explanation of the translation.

Computer-readable format:
```
{
  "ru": "намариновать",
  "meta": "сов. что, чего",
  "ky": [
    {
      "description": {
        "ru": "",
        "ky": "түрдүү жемишти ачык сууга салуу, ачытуу, кычкылдантуу"
      },
      "translations": [
        "маринаддоо"
      ],
      "examples": [

      ]
    }
  ]
}
```
———
Entry:
материнство ср.\nэнелик (1. эненин балага сезими, туйгусу; 2. аялдын боюнда бар, төрөт жана бала эмизүү убагындагы абалы).

Comments: See the "translations" field? A Kyrgyz translation is listed there. Also, see the "description" field's usage example.

Computer-readable format:
```
{
  "ru": "материнство",
  "meta": "ср.",
  "ky": [
    {
      "description": {
        "ru": "",
        "ky": "эненин балага сезими, туйгусу; аялдын боюнда бар, төрөт жана бала эмизүү убагындагы абалы"
      },
      "translations": [
        "энелик"
      ],
      "examples": [

      ]
    }
  ]
}
```
———
Entry:
вертеть несов.\n1. что, чем (вращать) дөңгөлөтүү, айландыруу, тегеретүү;\nвертеть колесо дөңгөлөктү айландыруу (тегеретүү, дөңгөлөтүү);\nвертеть тростью колу менен таякты тегеретүү;\n2. кем-чем, перен. разг. чайкоо, өз билгенин кылып бирөөнү башкаруу.

Comments: In this example, the dictionary entry gives the phrase as "вертеть колесо дөңгөлөктү айландыруу (тегеретүү, дөңгөлөтүү)". The words "тегеретүү" and "дөңгөлөтүү" in parentheses are alternatives to "айландыруу", meaning they can replace it. Therefore, the translations using those words are listed separately.

Computer-readable format:
```
{
  "ru": "вертеть",
  "meta": "несов.",
  "ky": [
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [
        "дөңгөлөтүү",
        "айландыруу",
        "тегеретүү"
      ],
      "examples": [
        {
          "ru": "вертеть колесо",
          "ky": [
            "дөңгөлөктү айландыруу",
            "дөңгөлөктү тегеретүү",
            "дөңгөлөктү дөңгөлөтүү"
          ]
        },
        {
          "ru": "вертеть тростью",
          "ky": [
            "колу менен таякты тегеретүү"
          ]
        }
      ]
    },
    {
      "description": {
        "ru": "кем-чем, перен. разг.",
        "ky": ""
      },
      "translations": [
        "чайкоо",
        "өз билгенин кылып бирөөнү башкаруу"
      ],
      "examples": [

      ]
    }
  ]
}
```
———
Entry:
сопроводить сов. что чем, офиц.\nбирге (кошо) жиберүү;\nсопроводить заявление справкой арызды справка менен кошо жиберүү.

Comments: Look at the "translations" field below. Multiple Kyrgyz translation are listed there. Also, see the "description" field's usage example.

Computer-readable format:
```
{
  "ru": "сопроводить",
  "meta": "сов. что чем, офиц.",
  "ky": [
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [
        "бирге жиберүү",
        "кошо жиберүү"
      ],
      "examples": [
        {
          "ru": "сопроводить заявление справкой",
          "ky": [
            "арызды справка менен кошо жиберүү"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
класть несов.\n1. кого-что коюу;\nкласть на место ордуна коюу;\n2. кого (помещать - напр. в больницу) жаткыруу (мис. төшөккө, ооруканага);\n3. что (напр. в банк) салуу (мис. капка, банкага);\n4. что (возводить, строить) тургузуу, жасоо, салуу (мис. дубалды, мешти);\nкласть в рот толук тушүндүрүү, кулагына куюу;\nкласть яйца жумуртка салуу (канаттуулар менен курт-кумурскалардын ургаачылары жөнүндө);\nкласть начало баштоо;\nкласть основание негиздөө, түптөө;\nкласть печать печать басуу;\nкласть клеймо тамга салуу;\nкласть все силы на работу бардык күчун ишке сарп кылуу.

Comments: In this example, "translations" is empty in some instances because there the translation is given through the "examples". Also, pay attention to how "description" field is used.

Computer-readable format:
```
{
  "ru": "класть",
  "meta": "несов.",
  "ky": [
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [
        "коюу"
      ],
      "examples": [
        {
          "ru": "класть на место",
          "ky": [
            "ордуна коюу"
          ]
        }
      ]
    },
    {
      "description": {
        "ru": "кого (помещать - напр. в больницу)",
        "ky": "мис. төшөккө, ооруканага"
      },
      "translations": [
        "жаткыруу"
      ],
      "examples": [

      ]
    },
    {
      "description": {
        "ru": "что (напр. в банк)",
        "ky": "мис. капка, банкага"
      },
      "translations": [
        "салуу"
      ],
      "examples": [

      ]
    },
    {
      "description": {
        "ru": "что (возводить, строить)",
        "ky": "мис. дубалды, мешти"
      },
      "translations": [
        "тургузуу",
        "жасоо",
        "салуу"
      ],
      "examples": [

      ]
    },
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "класть в рот",
          "ky": [
            "толук тушүндүрүү",
            "кулагына куюу"
          ]
        }
      ]
    },
    {
      "description": {
        "ru": "",
        "ky": "канаттуулар менен курт-кумурскалардын ургаачылары жөнүндө"
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "класть яйца",
          "ky": [
            "жумуртка салуу"
          ]
        }
      ]
    },
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "класть начало",
          "ky": [
            "баштоо"
          ]
        }
      ]
    },
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "класть основание",
          "ky": [
            "негиздөө",
            "түптөө"
          ]
        }
      ]
    },
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "класть печать",
          "ky": [
            "печать басуу"
          ]
        }
      ]
    },
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "класть клеймо",
          "ky": [
            "тамга салуу"
          ]
        }
      ]
    },
    {
      "description": {
        "ru": "",
        "ky": ""
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "класть все силы на работу",
          "ky": [
            "бардык күчун ишке сарп кылуу"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
антивоенный, ­ая, -ое\nсогушка каршы;\nантивоенная демонстрация согушка каршы демонстрация.

Comments: Pay attention to the "description", "translations" and "examples" and how they are used.

Computer-readable format:
```
{
  "ru": "антивоенный",
  "meta": "­ая, -ое",
  "ky": [
    {
      "description": {
        "ky": "",
        "ru": ""
      },
      "translations": [
        "согушка каршы"
      ],
      "examples": [
        {
          "ru": "антивоенная демонстрация",
          "ky": [
            "согушка каршы демонстрация"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
анонимный, анонимдүү, тоголок, туюк (автордун аты жазылбаган);\nанонимное письмо анонимдүү кат, тоголок кат.

Comments: Look at the "translations" field below. Multiple Kyrgyz translation are listed there. Also, see the "description" field's usage example.

Computer-readable format:
```
{
  "ru": "анонимный",
  "meta": "",
  "ky": [
    {
      "description": {
        "ky": "автордун аты жазылбаган",
        "ru": ""
      },
      "translations": [
        "анонимдүү",
        "тоголок",
        "туюк"
      ],
      "examples": [
        {
          "ru": "анонимное письмо",
          "ky": [
            "анонимдүү кат",
            "тоголок кат"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
безграничность ж.\nчексиздик, учу-кыйыры жоктук.

Comments: Look at the "translations" field below. Multiple Kyrgyz translation are listed there.

Computer-readable format:
```
{
  "ru": "безграничность",
  "meta": "ж.",
  "ky": [
    {
      "description": {
        "ky": "",
        "ru": ""
      },
      "translations": [
        "чексиздик",
        "учу-кыйыры жоктук"
      ],
      "examples": [

      ]
    }
  ]
}
```
———
Entry:
безбожно нареч. разг.\nыксыз, кудайды карабай, чактабай;\nбезбожно врать ыксыз калп айтуу.

Comments: Look at the "translations" field below. Multiple Kyrgyz translation are listed there. Also, see the "description" field's usage example.

Computer-readable format:
```
{
  "ru": "безбожно",
  "meta": "нареч. разг.",
  "ky": [
    {
      "description": {
        "ky": "",
        "ru": ""
      },
      "translations": [
        "ыксыз",
        "кудайды карабай",
        "чактабай"
      ],
      "examples": [
        {
          "ru": "безбожно врать",
          "ky": [
            "ыксыз калп айтуу"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
благодарный ­ая, -ое\n1. ыраазы;\nя вам очень благодарен мен сизге абдан ыраазымын;\n2. ийгиликтүү, убайлуу;\nблагодарный труд ийгиликтүү эмгек.

Comments: Look at the "translations" field below. Multiple Kyrgyz translation are listed in each of them. Also, see the "description" field's usage example.

Computer-readable format:
```
{
  "ru": "благодарный",
  "meta": "­ая, -ое",
  "ky": [
    {
      "description": {
        "ky": "",
        "ru": ""
      },
      "translations": [
        "ыраазы"
      ],
      "examples": [
        {
          "ru": "я вам очень благодарен",
          "ky": [
            "мен сизге абдан ыраазымын"
          ]
        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": ""
      },
      "translations": [
        "ийгиликтүү",
        "убайлуу"
      ],
      "examples": [
        {
          "ru": "благодарный труд",
          "ky": [
            "ийгиликтүү эмгек"
          ]
        }
      ]
    }
  ]
}
```
———
Entry:
бок м.\nкаптал, жан; жак (сторона);\nу меня колет в боку менин капталым сайгылашып ооруп турат;\nвьюк свесился на левый бок жүк сол жакка ооп калды;\nбок о бок жанаша, катар;\nлежать на боку разг. (бездельничать) бекер жүрүү;\nпод боком разг. эң жакын, жанында;\nвзять за бока разг. асылуу, кекиртектен алуу;\nпереваливатъся с боку на бок (при ходьбе) оонап басуу, чайкалып басуу, өрдөкчө басуу;\nсомнения по боку! разг. ыргылжыңды кой!

Comments: In this example, "translations" is empty in some instances because there the translation is given through the "examples". Also, pay attention to how "description" field is used.

Computer-readable format:
```
{
  "ru": "бок",
  "meta": "м.",
  "ky": [
    {
      "description": {
        "ky": "",
        "ru": "сторона"
      },
      "translations": [
        "каптал",
        "жан",
        "жак"
      ],
      "examples": [
        {
          "ru": "у меня колет в боку",
          "ky": [
            "менин капталым сайгылашып ооруп турат"
          ]
        },
        {
          "ru": "вьюк свесился на левый бок",
          "ky": [
            "жүк сол жакка ооп калды"
          ]
        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": ""
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "бок о бок",
          "ky": [
            "жанаша",
            "катар"
          ]

        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "разг. (бездельничать)"
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "лежать на боку",
          "ky": [
            "бекер жүрүү"
          ]
        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "разг."
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "под боком",
          "ky": [
            "эң жакын",
            "жанында"
          ]
        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "разг."
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "взять за бока",
          "ky": [
            "асылуу",
            "кекиртектен алуу"
          ]
        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "при ходьбе"
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "переваливатъся с боку на бок",
          "ky": [
            "оонап басуу",
            "чайкалып басуу",
            "өрдөкчө басуу"
          ]
        }
      ]
    },
    {
      "description": {
        "ky": "",
        "ru": "разг."
      },
      "translations": [

      ],
      "examples": [
        {
          "ru": "сомнения по боку!",
          "ky": [
            "ыргылжыңды кой!"
          ]
        }
      ]
    }
  ]
}
```

INSTRUCTIONS:
When dictionary entries are provided, use the SCHEMA DEFINITION to structure the output. Don't forget to follow the format as shown in the EXAMPLES.
REMEMBER: "translations" array should contain only Kyrgyz words or phrases. DO NOT ADD RUSSIAN words and phrases into "translations"! DO NOT INCLUDE USAGE EXAMPLE AS A TRANSLATION!
Output should only contain a JSON structure without any accompanying texts or comments. Wait until an article is given to proceed."""

# New approach (with batch processing)

In [None]:
import pandas as pd
import json
import os
import time
from google.colab import drive
from google.colab import userdata
from openai import OpenAI

# Mount Google Drive
drive.mount('/content/drive')

# Text file with the source articles; load_articles() reads it line by line.
ARTICLES_FILE = '/content/drive/My Drive/papers/structuring_dictionaries/12000.txt'
# Output directory (note the trailing slash: save_article() concatenates the file name onto it).
SAVE_DIRECTORY = '/content/drive/My Drive/papers/structuring_dictionaries/processed/'
# JSON file holding the resume index written by save_progress() and read by load_progress().
PROGRESS_FILE = '/content/drive/My Drive/papers/structuring_dictionaries/progress.json'

SLEEP_TIME = 0.3  # seconds to pause after each request (at most ~200 requests/min; the original note cites a 5000 RPM limit)

def setup_openai_client():
    """Create an OpenAI client using the Colab secret named ``OPENAI_API_KEY``.

    Returns:
        The ``OpenAI`` client built with the key returned by ``userdata.get``.
    """
    api_key = userdata.get('OPENAI_API_KEY')
    return OpenAI(api_key=api_key)

def load_articles(num_articles=None, path=None):
    """Read dictionary articles from a text file, one article per line.

    Args:
        num_articles: Maximum number of leading lines to return. ``None``
            (the default) returns every line in the file.
        path: File to read. When ``None`` (the default) the module-level
            ``ARTICLES_FILE`` is used, which keeps existing callers working.

    Returns:
        list[str]: The lines exactly as produced by ``readlines()``; trailing
        newline characters are kept, callers strip them as needed.
    """
    source_path = ARTICLES_FILE if path is None else path
    # The articles are Cyrillic text. Pin the encoding so the result does not
    # depend on the platform's locale default (assumes the file is UTF-8 — TODO confirm).
    with open(source_path, 'r', encoding='utf-8') as f:
        articles = f.readlines()
    if num_articles is not None:
        return articles[:num_articles]
    return articles

def load_progress():
    """Return the saved progress record, or a fresh one if none exists.

    Returns:
        dict: The JSON content of ``PROGRESS_FILE`` when that file exists,
        otherwise ``{'last_processed_index': 0}``.
    """
    if not os.path.exists(PROGRESS_FILE):
        # Nothing saved yet: start from the first article.
        return {'last_processed_index': 0}
    with open(PROGRESS_FILE, 'r') as progress_fh:
        return json.load(progress_fh)

def save_progress(index):
    """Overwrite ``PROGRESS_FILE`` with the given resume index.

    Args:
        index: Value stored under the ``last_processed_index`` key.
    """
    state = {'last_processed_index': index}
    with open(PROGRESS_FILE, 'w') as progress_fh:
        json.dump(state, progress_fh)

def process_article(client, word, model_name):
    """Send one dictionary article to the chat model and return its raw reply.

    Args:
        client: Object exposing ``chat.completions.create`` (the OpenAI client).
        word: Article text sent as the user message.
        model_name: Model identifier passed through to the API.

    Returns:
        The ``content`` of the first choice's message. If anything raises
        along the way, the exception is printed and ``""`` is returned.
    """
    try:
        create_completion = client.chat.completions.create
        conversation = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": word},
        ]
        completion = create_completion(
            model=model_name,
            response_format={"type": "json_object"},  # request a JSON object reply
            messages=conversation,
            temperature=0,
            max_tokens=4096,
            top_p=1,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f'Exception processing word "{word}": {e}')
        return ""

def parse_response(response_text):
    """Decode the model's reply into a dict, or return ``None`` if unusable.

    Args:
        response_text: Raw reply text. May be ``""`` (process_article's
            error value) or ``None`` (a reply without text content).

    Returns:
        dict | None: The decoded JSON object, or ``None`` when the text is
        not valid JSON, is not a string at all, or decodes to something
        other than an object.
    """
    try:
        parsed = json.loads(response_text)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers None / non-string input, which json.loads rejects
        # before it even tries to decode.
        print(f"Error parsing JSON: {response_text}")
        return None
    if not isinstance(parsed, dict):
        # save_article() calls .get() on the result, so only objects are usable.
        print(f"Error parsing JSON (expected an object): {response_text}")
        return None
    return parsed

def save_article(word, parsed_response, index, save_directory=None):
    """Write one structured article to its own TSV file.

    The file is named ``word_<index zero-padded to 5 digits>.tsv`` and holds a
    single row with the columns ``word``, ``ru``, ``meta``, ``ky_json`` and
    ``ref_json``. ``ref_json`` carries the entry's cross-references; without
    it, entries that consist only of a reference would be saved empty.

    Args:
        word: Original article text; stored stripped of surrounding whitespace.
        parsed_response: Decoded model reply (a dict). A falsy value (``None``
            or an empty dict) is reported as a failure and nothing is written.
        index: Position of the article, used in the file name and log lines.
        save_directory: Target directory. When ``None`` (the default) the
            module-level ``SAVE_DIRECTORY`` is used.
    """
    if not parsed_response:
        print(f"Failed to process word {index}: {word.strip()}")
        return

    target_dir = SAVE_DIRECTORY if save_directory is None else save_directory
    # Create the output directory on first use instead of failing in to_csv.
    os.makedirs(target_dir, exist_ok=True)

    data = {
        'word': word.strip(),
        'ru': parsed_response.get('ru', ''),
        'meta': parsed_response.get('meta', ''),
        'ky_json': json.dumps(parsed_response.get('ky', []), ensure_ascii=False),
        'ref_json': json.dumps(parsed_response.get('ref', []), ensure_ascii=False),
    }
    df = pd.DataFrame([data])
    filename = os.path.join(target_dir, f'word_{index:05d}.tsv')
    df.to_csv(filename, sep='\t', index=False)
    print(f"Word {index} processed and saved: {word.strip()}")

def generate_completions(words, model_name):
    """Run the articles through the model, saving results and resumable progress.

    Processing starts at the index returned by ``load_progress`` and the
    progress file is rewritten after every article. After every tenth article
    the user is asked whether to continue; answering 'stop' returns early.

    Args:
        words: Sequence of raw article strings.
        model_name: Model identifier forwarded to ``process_article``.

    Raises:
        Exception: Any error raised inside the loop is re-raised after the
            resume point has been written to the progress file.
    """
    client = setup_openai_client()
    total_words = len(words)
    progress = load_progress()
    start_index = progress['last_processed_index']
    # Index of the first article that has not been fully handled yet. The
    # error handler saves this value rather than the loop variable, so a
    # failure after an article was already saved (e.g. during the sleep or the
    # prompt) does not rewind past it, and the handler still works if the
    # loop variable was never bound.
    next_index = start_index

    try:
        for i in range(start_index, total_words):
            word = words[i].strip()
            response = process_article(client, word, model_name)
            parsed_response = parse_response(response)
            save_article(word, parsed_response, i)
            next_index = i + 1
            save_progress(next_index)
            print(f"Processed {i+1} out of {total_words} words")

            time.sleep(SLEEP_TIME)  # Rate limiting

            if (i + 1) % 10 == 0:  # Ask every 10 words
                if input("Press Enter to continue, or type 'stop' to pause: ").lower() == 'stop':
                    print("Processing paused. You can resume later from this point.")
                    return

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        print("Saving current progress before exiting...")
        save_progress(next_index)
        raise

    print("All words processed and saved.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Parameters
NUM_ARTICLES = 10  # Set to None to process all articles
MODEL_NAME = "gpt-4o"  # model identifier handed to process_article / generate_completions

In [None]:

# # Parameters
# NUM_ARTICLES = 10  # Set to None to process all articles
# MODEL_NAME = "gpt-4o"
# Main execution
# words = load_articles(NUM_ARTICLES)
# generate_completions(words, MODEL_NAME)

# Testing JSON output

In [None]:
# Smoke test: send the first five articles to the model and collect the raw
# reply strings (an empty string marks a request that raised an exception).
client = setup_openai_client()
words = load_articles(5)
res = []
for word in words:
  response = process_article(client, word, MODEL_NAME)
  res.append(response)


# Show the raw replies so the JSON structure can be inspected by eye.
print(res)

['{\n  "ru": "растирать",\n  "meta": "несов.",\n  "ref": [\n    {\n      "word": "растереть",\n      "description": {\n        "ky": "",\n        "ru": "см."\n      }\n    }\n  ]\n}', '{\n  "ru": "похищать",\n  "meta": "несов.",\n  "ref": [\n    {\n      "word": "похитить",\n      "description": {\n        "ky": "",\n        "ru": "см."\n      }\n    }\n  ]\n}', '{\n  "ru": "красящий",\n  "meta": "\xadая, -ее",\n  "ky": [\n    {\n      "description": {\n        "ky": "",\n        "ru": "прич. от красить"\n      },\n      "translations": [],\n      "examples": []\n    },\n    {\n      "description": {\n        "ky": "",\n        "ru": "прил."\n      },\n      "translations": [\n        "боёочу",\n        "боёгуч",\n        "боёк түшүрүүчү"\n      ],\n      "examples": [\n        {\n          "ru": "красящие вещества",\n          "ky": [\n            "боёгуч заттар"\n          ]\n        }\n      ]\n    }\n  ]\n}', '{\n  "ru": "материнство",\n  "meta": "ср.",\n  "ky": [\n    {\n      "de

# Script for JSONL batch file preparation

In [None]:
import json
import os
import math

# Configuration
ARTICLES_FILE = '/content/drive/My Drive/papers/structuring_dictionaries/12000.txt'  # source articles, read line by line
OUTPUT_DIR = 'output_jsonl'  # local directory the batch_NNN.jsonl files are written to
TASKS_PER_FILE = 1000  # maximum number of tasks per JSONL file; adjust as needed

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_articles(num_articles=None):
    """Read the lines of ARTICLES_FILE as UTF-8 text.

    Args:
        num_articles: Optional cap on how many leading lines to return;
            ``None`` (the default) returns every line. Accepting it keeps
            this redefinition call-compatible with code that passes a count,
            e.g. ``load_articles(5)``.

    Returns:
        list[str]: The raw lines, each still carrying its trailing newline.
    """
    with open(ARTICLES_FILE, 'r', encoding='utf-8') as f:
        articles = f.readlines()
    if num_articles is not None:
        return articles[:num_articles]
    return articles

# Load all lines from the file
lines = load_articles()

# Calculate the number of files needed
num_files = math.ceil(len(lines) / TASKS_PER_FILE)

# Each pass writes one JSONL file holding up to TASKS_PER_FILE consecutive lines.
for file_num in range(num_files):
    tasks = []
    start_idx = file_num * TASKS_PER_FILE
    # Clamp so the last file only covers the lines that are left.
    end_idx = min((file_num + 1) * TASKS_PER_FILE, len(lines))

    for index in range(start_idx, end_idx):
        line = lines[index].strip()  # Remove leading/trailing whitespace

        # One chat-completions request per source line. custom_id embeds the
        # line's position in the whole file, not its position within this batch.
        task = {
            "custom_id": f"task-{index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o",
                "temperature": 0,
                "response_format": {
                    "type": "json_object"
                },
                "messages": [
                    {
                        "role": "system",
                        "content": SYSTEM_PROMPT
                    },
                    {
                        "role": "user",
                        "content": line
                    }
                ],
            }
        }

        tasks.append(task)

    # Write tasks to JSONL file
    # Files are numbered from 1 with three digits (batch_001.jsonl, ...).
    output_file = os.path.join(OUTPUT_DIR, f'batch_{file_num+1:03d}.jsonl')
    with open(output_file, 'w', encoding='utf-8') as f:
        for task in tasks:
            # ensure_ascii=False keeps non-ASCII text unescaped in the output.
            f.write(json.dumps(task, ensure_ascii=False) + '\n')

    print(f"Created {output_file} with {len(tasks)} tasks")

print(f"Batch preparation complete. Created {num_files} JSONL files in {OUTPUT_DIR}")

Created output_jsonl/batch_001.jsonl with 1000 tasks
Created output_jsonl/batch_002.jsonl with 1000 tasks
Created output_jsonl/batch_003.jsonl with 1000 tasks
Created output_jsonl/batch_004.jsonl with 1000 tasks
Created output_jsonl/batch_005.jsonl with 1000 tasks
Created output_jsonl/batch_006.jsonl with 1000 tasks
Created output_jsonl/batch_007.jsonl with 1000 tasks
Created output_jsonl/batch_008.jsonl with 1000 tasks
Created output_jsonl/batch_009.jsonl with 1000 tasks
Created output_jsonl/batch_010.jsonl with 1000 tasks
Created output_jsonl/batch_011.jsonl with 1000 tasks
Created output_jsonl/batch_012.jsonl with 1000 tasks
Batch preparation complete. Created 12 JSONL files in output_jsonl


In [None]:
# Copy all files from the local output_jsonl folder into /content/drive/My Drive/papers/structuring_dictionaries/ (the command targets that folder itself, not an output_jsonl/ subfolder)

!cp -r output_jsonl/* /content/drive/My\ Drive/papers/structuring_dictionaries/

## Script for merging input and output files into one df

In [None]:
# Separator placed between the original dictionary entry and the LLM's JSON
# answer inside the combined `text` field.
DELIMITER = "\n-----------------------------------\n"

In [None]:
import os
import json
import pandas as pd


# Drive folder holding the batch request files and their result files.
BASE_PATH = '/content/drive/My Drive/papers/structuring_dictionaries/batches/'
# Maps each batch request file (batches 1-5) to its result file.
in_out_map = {
    f'batch_{batch_no:03d}.jsonl': f'batch_{batch_no:03d}_result.jsonl'
    for batch_no in range(1, 6)
}

# Report the on-disk size of every request/result file pair; os.path.getsize
# raises if one of the files is missing from BASE_PATH.
for in_file, out_file in in_out_map.items():
    in_path = os.path.join(BASE_PATH, in_file)
    out_path = os.path.join(BASE_PATH, out_file)
    # print file stats
    in_size = os.path.getsize(in_path)
    out_size = os.path.getsize(out_path)
    print(f"{in_file}: {in_size} bytes")
    print(f"{out_file}: {out_size} bytes")


def parse_jsonl(jsonl_str):
    """Decode a single JSONL line; report it and return None when it is not valid JSON."""
    try:
        decoded = json.loads(jsonl_str)
    except json.JSONDecodeError:
        print(f"Error parsing JSON: {jsonl_str}")
        decoded = None
    return decoded


def get_dictionary_entry(jsonl_str):
    """Extract the task id and the user-message text from one batch request line.

    Returns:
        tuple: ``(custom_id, dictionary_entry)`` where the entry is the
        content of the second message in the request body.
    """
    request = parse_jsonl(jsonl_str)
    task_id = request['custom_id']
    # messages[0] is the system prompt; messages[1] carries the dictionary entry.
    user_message = request['body']['messages'][1]
    return task_id, user_message['content']



def get_content_from_output_jsonl(jsonl_str):
    """Extract the task id and the model's reply from one batch result line.

    Args:
        jsonl_str: One line of a batch result JSONL file.

    Returns:
        tuple: ``(custom_id, content)``. ``content`` is ``None`` when the line
        reports an error, and both items are ``None`` when the line is not
        valid JSON. A 2-tuple is always returned so that callers can unpack
        the result without crashing on failed lines.
    """
    parsed_json = parse_jsonl(jsonl_str)
    if parsed_json is None:
        # parse_jsonl has already reported the malformed line.
        return None, None

    custom_id = parsed_json.get('custom_id')
    # Tolerate a missing 'error' key as well as an explicit null.
    error = parsed_json.get('error')
    if error:
        print(error.get('message', error) if isinstance(error, dict) else error)
        return custom_id, None

    content = parsed_json['response']['body']['choices'][0]['message']['content']
    return custom_id, content


input_data = []   # rows of {'custom_id', 'dictionary_entry'} from the request files
output_data = []  # rows of {'custom_id', 'content'} from the result files

# Read each request/result file pair and collect the fields of interest.
for in_file, out_file in in_out_map.items():
    in_path = os.path.join(BASE_PATH, in_file)
    out_path = os.path.join(BASE_PATH, out_file)
    with open(in_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # ignore blank lines
            custom_id, dictionary_entry = get_dictionary_entry(line)
            input_data.append({'custom_id': custom_id, 'dictionary_entry': dictionary_entry})
    with open(out_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # ignore blank lines
            result = get_content_from_output_jsonl(line)
            # A failed request yields no usable content (the helper signals
            # this with None); skip it instead of crashing on the unpacking.
            if result is None or result[1] is None:
                continue
            custom_id, content = result
            output_data.append({'custom_id': custom_id, 'content': content})

# Create dataframes from lists of data
input_df = pd.DataFrame(input_data)
output_df = pd.DataFrame(output_data)

# Merge dataframes on 'custom_id' (default inner join: tasks without a usable
# result are dropped)
dictionary_entry_and_content_df = pd.merge(input_df, output_df, on='custom_id')

# Combine 'dictionary_entry' and 'content' into a single column 'text'
dictionary_entry_and_content_df['text'] = dictionary_entry_and_content_df.apply(
    lambda row: f"{row['dictionary_entry']}{DELIMITER}{row['content']}", axis=1
)

# Keep only the columns needed for annotation and display the result
dictionary_entry_and_content_df = dictionary_entry_and_content_df[['custom_id', 'text']]
print(dictionary_entry_and_content_df)

batch_001.jsonl: 38147870 bytes
batch_001_result.jsonl: 1479687 bytes
batch_002.jsonl: 38154142 bytes
batch_002_result.jsonl: 1518991 bytes
batch_003.jsonl: 38148664 bytes
batch_003_result.jsonl: 1490038 bytes
batch_004.jsonl: 38151375 bytes
batch_004_result.jsonl: 1487152 bytes
batch_005.jsonl: 38139356 bytes
batch_005_result.jsonl: 1444635 bytes
      custom_id                                               text
0        task-0  растирать\tнесов.\nсм. растереть.\n-----------...
1        task-1  похищать\tнесов.\nсм. похитить.\n-------------...
2        task-2  красящий,\t­ая, -ее\n1. прич. от красить;\n2. ...
3        task-3  материнство\tср.\nэнелик (1. эненин балага сез...
4        task-4  единение\tср.\nбиригишүү, биригүү, биргелешүү;...
...         ...                                                ...
4993  task-4995  коротышка\tм. и ж.\nсм. коротыш.\n------------...
4994  task-4996  боярышня\tж. ист.\nбоярышня (бояриндин күйөөгө...
4995  task-4997  провожать\tнесов.\nсм. проводи

## Write to JSONL

In [None]:
# dump dictionary_entry_and_content_df into a jsonl file
def write_to_jsonl(df, output_file):
    """Write each row of ``df`` to ``output_file`` as one JSON object per line.

    The file is written as UTF-8 and non-ASCII characters are kept unescaped.
    """
    serialized_rows = (
        json.dumps(record.to_dict(), ensure_ascii=False)
        for _, record in df.iterrows()
    )
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(f"{entry}\n" for entry in serialized_rows)


In [None]:
# Dump the merged custom_id/text frame to a local JSONL file.
output_file = 'dataset_for_annotation_5000.jsonl'
write_to_jsonl(dictionary_entry_and_content_df, output_file)

In [None]:
!cp dataset_for_annotation_5000.jsonl /content/drive/My\ Drive/papers/structuring_dictionaries/batches/

## Next step: annotate `dataset_for_annotation_5000.jsonl` using the Prodigy annotation tool

## Clean up items and pick items that need manual fix

In [None]:
from google.colab import drive
from google.colab import userdata
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# JSONL file with the manual annotation results; read line by line below.
annotations_results_path = '/content/drive/My Drive/papers/structuring_dictionaries/structured_yudakhin_001_manual_annotations.jsonl'

In [None]:
import json

# Read the JSONL file and parse each line as a JSON object.
data = []
# The annotations contain Cyrillic text, so decode explicitly as UTF-8.
with open(annotations_results_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # a blank line would make json.loads fail
        data.append(json.loads(line))

In [None]:
# Inspect the first ten parsed records.
for record in data[:10]:
    print(record)

print('num of items', len(data))

{'custom_id': 'task-0', 'text': 'растирать\tнесов.\\nсм. растереть.\n-----------------------------------\n{\n  "ru": "растирать",\n  "meta": "несов.",\n  "ref": [\n    {\n      "word": "растереть",\n      "description": {\n        "ky": "",\n        "ru": "см."\n      }\n    }\n  ]\n}', '_input_hash': -1033217742, '_task_hash': 1889584374, 'tokens': [{'text': 'растирать', 'start': 0, 'end': 9, 'id': 0, 'ws': False}, {'text': '\t', 'start': 9, 'end': 10, 'id': 1, 'ws': False}, {'text': 'несов.\\nсм', 'start': 10, 'end': 20, 'id': 2, 'ws': False}, {'text': '.', 'start': 20, 'end': 21, 'id': 3, 'ws': True}, {'text': 'растереть', 'start': 22, 'end': 31, 'id': 4, 'ws': False}, {'text': '.', 'start': 31, 'end': 32, 'id': 5, 'ws': False}, {'text': '\n', 'start': 32, 'end': 33, 'id': 6, 'ws': False}, {'text': '-----------------------------------', 'start': 33, 'end': 68, 'id': 7, 'ws': False}, {'text': '\n', 'start': 68, 'end': 69, 'id': 8, 'ws': False}, {'text': '{', 'start': 69, 'end': 70, '

In [None]:
# Take the record at index 9 as a worked example and print every field it carries.
sample = data[9]
for key, value in sample.items():
    print(f"{key}: {value}")

custom_id: task-9
text: печься	печься I\nнесов.\n1. (о хлебе и т.п.) бышуу, какталып бышырылуу (мис. токон, самса);\n2. (напр. на солнце) какталуу, ысуу (мас. аптапка);\n3. страд. к печь I 1.\nпечься II\nнесов. о ком-чём, разг.\n(заботиться) камын жеш, кам көрүү;\nона очень печётся о своём ребёнке ал (аял) өзүнүн баласы жөнүндө өтө камкордук кылат.
-----------------------------------
{
  "ru": "печься",
  "meta": "несов.",
  "ky": [
    {
      "description": {
        "ru": "о хлебе и т.п.",
        "ky": "мис. токон, самса"
      },
      "translations": [
        "бышуу",
        "какталып бышырылуу"
      ],
      "examples": []
    },
    {
      "description": {
        "ru": "напр. на солнце",
        "ky": "мас. аптапка"
      },
      "translations": [
        "какталуу",
        "ысуу"
      ],
      "examples": []
    }
  ],
  "ref": [
    {
      "word": "печь",
      "description": {
        "ru": "страд. к",
        "ky": ""
      }
    }
  ]
}

_input_hash: -847074412
_t

In [None]:
sample['spans']

[{'start': 165,
  'end': 326,
  'token_start': 42,
  'token_end': 72,
  'label': 'Жетишпейт'}]

In [None]:
# Slice the sample's text with the character offsets of its first span to see
# exactly which fragment was annotated.
start = sample['spans'][0]['start']
end = sample['spans'][0]['end']
text = sample['text']
print(text[start:end])

1.\nпечься II\nнесов. о ком-чём, разг.\n(заботиться) камын жеш, кам көрүү;\nона очень печётся о своём ребёнке ал (аял) өзүнүн баласы жөнүндө өтө камкордук кылат.


In [None]:
def build_annotated_texts_and_labels(sample):
    """Render every span of an annotated record as a "label: fragment" line.

    Args:
        sample: Record with a 'text' string and an optional 'spans' list whose
            items carry 'start', 'end' and 'label'.

    Returns:
        str: One line per span joined with newlines; an empty string when the
        record has no spans.
    """
    def render(span):
        # Offsets are character positions into the record's full text.
        first, last = span['start'], span['end']
        fragment = sample['text'][first:last]
        return f"{span['label']}: {fragment}"

    return "\n".join(render(span) for span in sample.get('spans', []))

In [None]:
# Drive folder holding the batch result files.
BASE_PATH = '/content/drive/My Drive/papers/structuring_dictionaries/batches/'
# Result files of batches 1-5.
llm_generated_results = [f'batch_{batch_no:03d}_result.jsonl' for batch_no in range(1, 6)]

# Filled later: custom_id -> raw JSON string produced by the LLM.
custom_task_id_to_llm_output_json = dict()

In [None]:
# Read every result file listed in llm_generated_results and index the LLM
# output by custom_id.

for llm_output_file in llm_generated_results:
    # The result files contain Cyrillic text, so decode explicitly as UTF-8.
    with open(BASE_PATH + llm_output_file, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # ignore blank lines
            result = get_content_from_output_jsonl(line)
            # A failed request yields no usable content (the helper signals
            # this with None); skip it instead of crashing on the unpacking.
            if result is None or result[1] is None:
                continue
            custom_id, content = result
            custom_task_id_to_llm_output_json[custom_id] = content

In [None]:
custom_task_id_to_llm_output_json['task-4999']

'{\n  "ru": "чистосортный",\n  "meta": "\xadая, -ое",\n  "ky": [\n    {\n      "description": {\n        "ky": "",\n        "ru": ""\n      },\n      "translations": [\n        "таза сорттуу"\n      ],\n      "examples": [\n        {\n          "ru": "чистосортное зерно",\n          "ky": [\n            "таза сорттуу дан"\n          ]\n        }\n      ]\n    }\n  ]\n}'

In [None]:
import pandas as pd

# Start from an empty frame that already carries the final column layout.
df = pd.DataFrame(columns=['custom_id', 'text', 'orig_json', 'annotated_texts_and_labels', 'annotator_id', 'timestamp'])

In [None]:
# Build one output row per annotated record in `data`. Rows are collected in
# a list and turned into a DataFrame once: growing a frame with pd.concat
# inside the loop is quadratic and duplicates rows whenever the cell is re-run.
annotation_rows = []
for sample in data:
    annotated_texts_and_labels = build_annotated_texts_and_labels(sample)

    # Records without any annotated span need no manual fix.
    if len(annotated_texts_and_labels) == 0:
        continue

    custom_id = sample['custom_id']
    annotation_rows.append({
        'custom_id': custom_id,
        # Keep only the original dictionary entry (the part before the delimiter).
        'text': sample['text'].split(DELIMITER)[0],
        'orig_json': custom_task_id_to_llm_output_json[custom_id],
        'annotated_texts_and_labels': annotated_texts_and_labels,
        'annotator_id': sample['_annotator_id'],
        'timestamp': sample['_timestamp'],
    })

# Passing `columns` fixes the column order and keeps the layout even when no
# record was annotated.
df = pd.DataFrame(
    annotation_rows,
    columns=['custom_id', 'text', 'orig_json', 'annotated_texts_and_labels', 'annotator_id', 'timestamp'],
)


print(df.shape)
df.head()

(526, 6)


Unnamed: 0,custom_id,text,orig_json,annotated_texts_and_labels,annotator_id,timestamp
0,task-9,печься\tпечься I\nнесов.\n1. (о хлебе и т.п.) ...,"{\n ""ru"": ""печься"",\n ""meta"": ""несов."",\n ""...","Жетишпейт: 1.\nпечься II\nнесов. о ком-чём, ра...",yudakhin_001-user1,1721756480
1,task-16,комбинировать\tнесов.\n1. что (соединять) комб...,"{\n ""ru"": ""комбинировать"",\n ""meta"": ""несов....",Жетишпейт: что,yudakhin_001-user1,1721792232
2,task-18,"скрепление\tср.\n1. бириктирүү, эптештирүү;\nс...","{\n ""ru"": ""скрепление"",\n ""meta"": ""ср."",\n ...",Ашыкча берилген: (мис. кол коюп)\nАшыкча берил...,yudakhin_001-user1,1721792387
3,task-19,якутка\tженск. р. к якут;\nякутка.,"{\n ""ru"": ""якутка"",\n ""meta"": ""женск. р."",\n...",Жетишпейт: якут;\nякутка,yudakhin_001-user1,1721792476
4,task-27,"слабосильный,\t­ая, -ое\n1. (о людях, животных...","{\n ""ru"": ""слабосильный"",\n ""meta"": ""­ая, -о...","Ашыкча берилген: бейкаруу, алы жок, күчү жок",yudakhin_001-user1,1721792699


In [None]:
df['text'][0]

'печься\tпечься I\\nнесов.\\n1. (о хлебе и т.п.) бышуу, какталып бышырылуу (мис. токон, самса);\\n2. (напр. на солнце) какталуу, ысуу (мас. аптапка);\\n3. страд. к печь I 1.\\nпечься II\\nнесов. о ком-чём, разг.\\n(заботиться) камын жеш, кам көрүү;\\nона очень печётся о своём ребёнке ал (аял) өзүнүн баласы жөнүндө өтө камкордук кылат.'

In [None]:
output_file = 'annotation_results_to_fix_5000.jsonl'
# Add an `html` column holding a single space in every row; this mutates `df`
# in place. NOTE(review): presumably a placeholder field expected by the
# follow-up annotation step - confirm.
df['html'] = ' '

write_to_jsonl(df, output_file)

# Spot-check one exported row: the row at position 123 (raises IndexError if
# df has fewer than 124 rows).
df.iloc[123]

custom_id                                                              task-749
text                          ведомо:\tс ведома билдирип, айтып;\nбез ведома...
orig_json                     {\n  "ru": "ведомо",\n  "meta": "",\n  "ky": [...
annotated_texts_and_labels                                         Жетишпейт: с
annotator_id                                                 yudakhin_001-user1
timestamp                                                            1721928317
html                                                                           
Name: 123, dtype: object

In [None]:
!cp annotation_results_to_fix_5000.jsonl /content/drive/My\ Drive/papers/structuring_dictionaries/