In [16]:
import os
from pathlib import Path
from lxml import etree


import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd

# Load the OpenAI API key from a local `.env` file of the form `NAME=value`.
# Only the first line is used, and it is split on the FIRST '=' only, so a key value
# that itself contains '=' is kept intact and any further lines in the file are ignored.
with open('.env') as f:
    first_line = f.read().strip().splitlines()[0]

openai.api_key = first_line.split('=', 1)[1].strip()



In [104]:
filename="data/udhr/udhr_afr.xml"  # NOTE(review): not used in this cell; kept in case another cell reads it

udhr_dir = Path('data/udhr')

# One record per translation file: language code, language name, and the article 19 text.
rows = []

for g in sorted(udhr_dir.glob('*.xml')):
    # File names look like `udhr_<code>.xml`; drop the `udhr_` prefix to obtain the code.
    stem = g.stem[len('udhr_'):]

    # Give lxml the path instead of a text-mode file handle: the parser then reads the raw
    # bytes and decodes them according to the XML declaration, rather than relying on the
    # platform's default text encoding.
    tree = etree.parse(str(g))

    name = tree.getroot().get('n')
    print(name, end='; ')
    # First <para> under article number 19, in any namespace; None when no such element exists.
    article19 = tree.find('{*}article[@number="19"]/{*}para')
    if article19 is not None:
        article19 = article19.text

    rows.append({'code': stem, 'name': name, 'text': article19})

Languages = pd.DataFrame(rows).set_index('code')
# Empty placeholder columns, filled in later by the identification loop.
Languages['identification'] = ''
Languages['reason'] = ''
Languages[:50]

Afrikaans; Albanian, Tosk; Arabic, Standard; Azerbaijani, North (Latin); Belarusan; Bengali; Breton; Bulgarian; Catalan-Valencian-Balear; Cebuano; Czech; Chinese, Mandarin (Simplified); Chinese, Mandarin (Traditional); Corsican; Welsh; Danish; German, Standard (1996); Greek (monotonic); English; Esperanto; Estonian; Basque; Faroese; Finnish; French; Frisian, Western; Friulian; Gaelic, Irish; Galician; Haitian Creole French (Kreyol); Hausa; Hawaiian; Hebrew; Hindi; Croatian; Sorbian, Upper; Hungarian; Armenian; Igbo; Interlingua; Indonesian; Icelandic; Italian; Javanese (Latin); Japanese; Georgian; Kazakh; Kabardian; Mongolian, Halh (Cyrillic); Kirghiz; Khakas; Kurdish, Northern; Korean; Ladino; Latin; Latvian; Lingala; Lithuanian; Luxembourgeois; Marathi; Macedonian; Mixtec, Metlatónoc; Navajo; Saxon, Low; Nahuatl, Central; Dutch; Norwegian, Nynorsk; Norwegian, Bokmål; Panjabi, Eastern; Papiamentu; Pidgin, Nigerian; Farsi, Western; Polish; Portuguese (Brazil); Portuguese (Portugal); K'

Unnamed: 0_level_0,name,text,identification,reason
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
afr,Afrikaans,Elkeen het die reg tot vryheid van opinie en u...,,
als,"Albanian, Tosk",Gjithkush ka të drejtën e lirisë së mendimit d...,,
arb,"Arabic, Standard",لكل شخص الحق في حرية الرأي والتعبير. ويشمل هذا...,,
azj_latn,"Azerbaijani, North (Latin)",Hər bir şəxs əqidə və onu sərbəst ifadə etmək ...,,
bel,Belarusan,Кожны чалавек мае права на свабоду пераканання...,,
ben,Bengali,প্রত্যেকের‌ই মতামত পোষণ এবং মতামত প্রকাশের স্ব...,,
bre,Breton,Pep hini en deus gwir d’ar frankiz d’ober e ve...,,
bul,Bulgarian,Всеки човек има право на свобода на убеждение ...,,
cat,Catalan-Valencian-Balear,Tota persona té dret a la llibertat d’opinió i...,,
ceb,Cebuano,Ang matag usa adunay katungod sa hunahuna ug p...,,


In [59]:
# Spot-check a single entry: the stored article text for the code 'khk'.
Languages.loc['khk', 'text']

'Хүн бүр өөрийн зэл бодолтой байж, түүнийгээ чөлөөтэй илэрхийлэх эрхтэй, үүнд үзэл бодлоо ямар ч хориг цээргүй баримтлах, аливаа үзэл санаа, мэдээллийг улсын хилийн заагаар үл хязгаарлан боломжтой арга замаар эрж сурвалжлах, олж ашиглах, түгээн дэлгэрүүлэх эрх багтана.'

In [60]:
def identifyLanguage(code):
  """Ask the chat model which language the stored text for `code` is written in.

  The request is one-shot: a French sample text is shown first as a worked example
  (answered "French"), followed by the text to identify.

  Args:
    code: Index label of a row in the global `Languages` table.

  Returns:
    The content of the model's first reply, as a string (at most 20 tokens are requested).

  Raises:
    KeyError: if `code` is not present in `Languages`.
    ValueError: if the row holds no usable text.
  """
  sample_prompt = "Tout individu a droit à la liberté d’opinion et d’expression, ce qui implique le droit de ne pas être inquiété pour ses opinions et celui de chercher, de recevoir et de répandre, sans considérations de frontières, les informations et les idées par quelque moyen d’expression que ce soit."
  custom_prompt = Languages.loc[code, 'text']

  # A row whose article text was not found holds None. Formatting that into the prompt
  # would send the literal string "None" to the model, so fail before spending a request.
  if not isinstance(custom_prompt, str) or not custom_prompt.strip():
    raise ValueError(f"No text available for language code {code!r}")

  completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    max_tokens=20,
    messages=[
      {"role": "system", "content": "You are a language identification system."},
      {"role": "user", "content": sample_prompt},
      {"role": "assistant", "content": "French"},
      {"role": "user", "content": custom_prompt},
    ]
  )

  return completion.choices[0]['message']['content']

In [102]:
def identifyLanguageWithReason(code):
  """Identify the language of the stored text for `code` and ask the model to justify it.

  Two chat requests are made. The first is the one-shot identification (a French sample
  answered "French", then the text to identify). The second replays that conversation,
  appends the model's answer, and asks for an explanation, requesting three candidate
  completions.

  Args:
    code: Index label of a row in the global `Languages` table.

  Returns:
    A `(identification, reason)` tuple of strings, where `reason` is the longest of the
    candidate explanations returned by the second request.

  Raises:
    KeyError: if `code` is not present in `Languages`.
    ValueError: if the row holds no usable text.
  """
  sample_prompt = "Tout individu a droit à la liberté d’opinion et d’expression, ce qui implique le droit de ne pas être inquiété pour ses opinions et celui de chercher, de recevoir et de répandre, sans considérations de frontières, les informations et les idées par quelque moyen d’expression que ce soit."
  custom_prompt = Languages.loc[code, 'text']

  # A row whose article text was not found holds None. Formatting that into the prompt
  # would send the literal string "None" to the model, so fail before spending a request.
  if not isinstance(custom_prompt, str) or not custom_prompt.strip():
    raise ValueError(f"No text available for language code {code!r}")

  messages = [
    {"role": "system", "content": "You are a language identification system."},
    {"role": "user", "content": sample_prompt},
    {"role": "assistant", "content": "French"},
    {"role": "user", "content": custom_prompt},
  ]

  # Step 1: short answer naming the language.
  completion1 = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    max_tokens=20,
    messages=messages
  )

  response1 = completion1.choices[0]['message']['content']

  # Step 2: feed the answer back and request several explanations for it.
  completion2 = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    n=3,
    max_tokens=256,
    messages=messages + [
      {"role": "assistant", "content": response1},
      {"role": "user", "content": "Explain your reasoning for the last one."}
    ]
  )

  explanations = [choice['message']['content'] for choice in completion2.choices]

  # Keep only the longest candidate explanation.
  return (response1, max(explanations, key=len))

# Try the function on a single language code; the returned (identification, reason) tuple is displayed.
identifyLanguageWithReason('als')


('Albanian.',
 'The text seems to be a translation of the second sentence of Article 19 of the Universal Declaration of Human Rights, which states:\n\n"Everyone has the right to freedom of opinion and expression; this right includes freedom to hold opinions without interference and to seek, receive and impart information and ideas through any media and regardless of frontiers."\n\nThe text matches the structure and wording of the original sentence and uses similar language to express the same ideas. Additionally, the use of diacritics and the presence of certain words such as "kërkimit" are characteristic of Albanian.')

In [69]:
# Run the two-step identification for every language code and record both answers in the table.
for code in Languages.index:
    print(code, end=' ')

    try:
        language, reason = identifyLanguageWithReason(code)
    except Exception as exc:
        # Best-effort batch: one failing language must not abort the run, but say what went
        # wrong. Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print(f'!!! ({type(exc).__name__}: {exc})')
    else:
        # Assign through a single .loc[row, column] call. The chained form
        # `Languages.loc[code][...] = value` writes to a temporary row object and may never
        # reach the DataFrame itself.
        Languages.loc[code, 'identification'] = language
        Languages.loc[code, 'reason'] = reason

afr als arb azj_latn bel ben bre bul cat ceb ces cmn_hans cmn_hant cos cym dan deu_1996 ell_monotonic eng epo est eus fao fin fra fri fur gle glg hat_kreyol hau_3 haw heb hin hrv hsb hun hye ibo ina ind isl ita jav jpn kat kaz kbd khk kir kjh kmr kor lad lat lav lin lit ltz mar mkd mxv nav nds nhn nld nno nob pan pap pcm pes_1 pol por_BR por_PT quc rmy roh ron_2006 run rus sah sco slk slv spa src srp_cyrl srp_latn swe swh tam tel tgk tgl tha tur tyv uig_latn ukr urd uzn_cyrl vec vie wol xho yor ztu zul 

In [70]:
# Display the table to inspect the recorded identifications.
Languages

Unnamed: 0_level_0,name,text,identification
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
afr,Afrikaans,Elkeen het die reg tot vryheid van opinie en u...,Afrikaans
als,"Albanian, Tosk",Gjithkush ka të drejtën e lirisë së mendimit d...,Albanian
arb,"Arabic, Standard",لكل شخص الحق في حرية الرأي والتعبير. ويشمل هذا...,Arabic
azj_latn,"Azerbaijani, North (Latin)",Hər bir şəxs əqidə və onu sərbəst ifadə etmək ...,Azeri (Azerbaijani)
bel,Belarusan,Кожны чалавек мае права на свабоду пераканання...,Belarusian
...,...,...,...
wol,Wolof,Nit ku ne am na sañ-sañ wax mbaa bind lu ko so...,"This is Wolof, a language spoken in Senegal, T..."
xho,Xhosa,Wonke umntu unelungelo lokucinga nokuthetha ng...,Xhosa
yor,Yoruba,Ẹnì kọ̀ọ̀kan ló ní ẹ̀tọ́ sí òmì nira láti ní ì...,Yoruba.
ztu,"Zapotec, Güilá",Ra'ta' bu:unny ra:a'p deree'ch pahr ga:a'p opi...,This seems to be a transcription of the origin...


In [81]:
# Write the selected columns to CSV, indexed by language code; the 'reason' column is not
# part of this selection.
export_columns = ['name', 'identification', 'text']
Languages.loc[:, export_columns].to_csv('data/language_id.csv')