### 0) Setting up environment

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install fasttext
!pip install nltk

### 1) Getting the data

In [None]:
import fasttext
from sklearn.model_selection import train_test_split
import nltk
import csv
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:


# labels for all languages
labels_filename = '/content/gdrive/My Drive/wili-2018/labels.csv'

with open(labels_filename,'r') as data: 
  reader = csv.reader(data)
  keys = ((next(reader))[0]).split(';')  

  labels_dict = {}
  for line in reader: 

    # 'Label', 'English', 'Wiki Code', 'ISO 369-3', 'German', 'Language family', 'Writing system', 'Remarks', 'Synonyms'
    label, lang, wiki_code, _, _, lang_family, _, _, _ = (line[0]).split(';') 

    lang_dict = {'lang': lang, 'wiki_code': wiki_code, 'lang_family': lang_family}
    labels_dict[label] = lang_dict

print("label dictionary: ", labels_dict)

label dictionary:  {'ace': {'lang': 'Achinese', 'wiki_code': 'ace', 'lang_family': 'Austronesian'}, 'afr': {'lang': 'Afrikaans', 'wiki_code': 'af', 'lang_family': 'Indo-European'}, 'als': {'lang': 'Alemannic German', 'wiki_code': 'als', 'lang_family': 'Indo-European'}, 'amh': {'lang': 'Amharic', 'wiki_code': 'am', 'lang_family': 'Afro-Asiatic'}, 'ang': {'lang': 'Old English ', 'wiki_code': 'ang', 'lang_family': 'Indo-European'}, 'ara': {'lang': 'Arabic', 'wiki_code': 'ar', 'lang_family': 'Afro-Asiatic'}, 'arg': {'lang': 'Aragonese', 'wiki_code': 'an', 'lang_family': 'Indo-European'}, 'arz': {'lang': 'Egyptian Arabic', 'wiki_code': 'arz', 'lang_family': 'Afro-Asiatic'}, 'asm': {'lang': 'Assamese', 'wiki_code': 'as', 'lang_family': 'Indo-European'}, 'ast': {'lang': 'Asturian', 'wiki_code': 'ast', 'lang_family': 'Indo-European'}, 'ava': {'lang': 'Avar', 'wiki_code': 'av', 'lang_family': 'Northeast Caucasian'}, 'aym': {'lang': 'Aymara', 'wiki_code': 'ay', 'lang_family': 'Aymaran'}, 'azb': 

In [None]:
# dict that maps name to lang code
name_to_label = {}
for label in labels_dict:
  name = labels_dict[label]['lang']
  name_to_label[name] = label

# example
name_to_label['Udmurt']

'udm'

In [None]:
# save train data into two lists, labels and articles
x_train_path = '/content/gdrive/My Drive/wili-2018/x_train.txt'
y_train_path = '/content/gdrive/My Drive/wili-2018/y_train.txt'

with open(x_train_path, 'r') as fp:
  x_train_list = fp.read().split('\n')

with open(y_train_path, 'r') as fp:
  y_train_list = fp.read().split('\n')


# save test data into two lists, labels and articles
x_test_path = '/content/gdrive/My Drive/wili-2018/x_test.txt'
y_test_path = '/content/gdrive/My Drive/wili-2018/y_test.txt'

with open(x_test_path, 'r') as fp:
  x_test_list = fp.read().split('\n')

with open(y_test_path, 'r') as fp:
  y_test_list = fp.read().split('\n')

# NOTE: test and train data are split 50/50 originally
print(len(x_test_list))
print(len(x_train_list))

117501
117501


In [None]:
# combine provided train/test into single list
x_list = x_train_list + x_test_list
y_list = y_train_list + y_test_list

print(len(x_list))
print(len(y_list))

235002
235002


In [None]:
# combine x,y lists into list of tuples: (label, text)
list_double = [ (label, text) for text, label in zip(x_list, y_list)]

# dict of languages to list of articles for each respective lang
lang_dict = {}
for key in labels_dict:
   article_list = []
   for label, text in list_double:
     if key == label:
       article_list.append(text)
   lang_dict[key] = article_list

# example
lang_dict['ukr'][:5]

["Він веде успішну кар'єру вокаліста: бере участь у концертах та збірниках, і випускає сольні альбоми та сингли. Крім того, він виконує теми в «Анжеліку» OVA-3, «Таємничої грі: Легенда Вічного Світла» OVA-3 і «Білому Хресті»",
 'Протягом цього часу авторство багатьох композиції в репертуарі групи приписувалося Nanker Phelge — що означало, що вона є плодом спільних зусиль Джаггера-Джонса-Річардса-Уоттса-Вайман.',
 'Про затвердження переліку автомобільних доріг загального користування державного значення: Кабінет Міністрів України; Постанова, Перелік від 16.09.2015 № 712',
 'Рональд Рейган — єдиний президент Сполучених Штатів, який має власну зірку (за роль у фільмі Ковбой з Брукліна), а також один з двох губернаторів Каліфорнії (другим став Арнольд Шварценеггер).',
 'Артемов А. А Памятники градостроительства и архитектуры Украинской ССР. Иллюстрированный справочник-каталог. В 4-х т. Т.2. Винницкая, Волынская, Ворошиловградская, Днепропетровская, Донецкая, Житомирская, Закарпатская, запо

### 2) FastText Pretrained Model - Baseline Comparision

In [None]:
# langs to include in data since 235 is too many
# top 10 langs minus Mandarin and Japanese (for sake of tokenization)
# English, Spanish, Russian, Bengali, Portuquese, Punjabi, Hindi
lang_list = ['eng', 'spa', 'rus', 'ben', 'por', 'pan', 'hin']

In [None]:
# get data only for languages in list
x = []
y = []
for text, label in zip(x_list, y_list):
  if label in lang_list:
    x.append(text)
    y.append(label)

# split into train and test data - 80/20
x_train, x_test, y_train, y_test = train_test_split(x_list, y_list, test_size = 0.2, stratify = y_list)

In [None]:
# build train data.txt in label text format
output_file_1 = open('baseline_train_data.txt', 'w')
for text, label in zip(x_train, y_train):
  line = '__label__' + label + ' ' + text
  output_file_1.write(line + '\n')

In [None]:
# build test data.txt
output_file_2 = open('baseline_test_data.txt', 'w')
for text, label in zip(x_test, y_test):
  line = '__label__' + label + ' ' + text
  output_file_2.write(line + '\n')

In [None]:
# model_pretrained = fasttext.train_supervised('baseline_train_data.txt')

In [None]:
# print(model_pretrained.test('baseline_test_data.txt'))
# (46998, 0.8701859653602281, 0.8701859653602281)

### 2) FastText with 10 Most Common Unigrams + Bag-of-Words


In [None]:
import fasttext
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# langs to include in data since 235 is too many
# top 10 langs minus Mandarin and Japanese (for sake of tokenization)
# English, Spanish, Russian, Bengali, Portuquese, Punjabi, Hindi
lang_list = ['eng', 'spa', 'rus', 'ben', 'por', 'pan', 'hin']

In [None]:
# get data only for languages in list
x = []
y = []
for text, label in zip(x_list, y_list):
  if label in lang_list:
    x.append(text)
    y.append(label)

# resplit into desired breakdown - 80/20
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)

In [None]:
# tokenize given doc - lowercase, discard of nums and punctuation, discard long tokens
def tokenize(document):
  tokens = nltk.word_tokenize(document)
  words = [tok.lower() for tok in tokens if (tok.isalpha()) and (len(tok) < 10)]
  return words

In [None]:
# build features based on x_train data
# iterate thru langs, find 10 most common tokens for each
word_features = []
for lang in lang_list:
  articles = []
  for label, text in zip(y_train, x_train):
    if label == lang:
      articles.append(text)
  document = ' '.join(articles)
  tokens = tokenize(document)

  lang_fd = nltk.FreqDist(tokens)
  most_common = lang_fd.most_common(10)

  for pair in most_common:
    word = pair[0]
    word_features.append(word)

# remove duplicate tokens from feature list
word_features = sorted(list(set(word_features)))

In [None]:
# get train text file
output_file_1 = open('train_data.txt', 'w')
for text, label in zip(x_train, y_train):
  # tokenize text
  tokens = tokenize(text)

  # iterate through feature words
  # get num occurrences and add
  feature_list = []
  for num, word in enumerate(word_features):
    count = tokens.count(word)
    feature_list.append('ftr' + str(num) + ':' + str(count))

  # get complete line, write to file
  feature_line = ' '.join(feature_list)
  lable_line = '__label__' + label 
  line = lable_line + ' ' + feature_line

  output_file_1.write(line + '\n')

In [None]:
# get test text file
output_file_2 = open('test_data.txt', 'w')
for text, label in zip(x_train, y_train):
  # tokenize text
  tokens = tokenize(text)
  
  # iterate through feature words
  # get num occurrences and add
  feature_list = []
  for num, word in enumerate(word_features):
    count = tokens.count(word)
    feature_list.append('ftr' + str(num) + ':' + str(count))

  # get complete line, write to file
  feature_line = ' '.join(feature_list)
  lable_line = '__label__' + label 
  line = lable_line + ' ' + feature_line

  output_file_2.write(line + '\n')

In [None]:
# model_bow = fasttext.train_supervised('train_data.txt', epoch=20)

In [None]:
# answer = model_bow.test('test_data.txt')
# print(answer)
# (5593, 0.9429644198104774, 0.9429644198104774)

### 3) Putting everything together: 3 Resulting Models

In [None]:
"""
Get accuracy using given set of languages
Using pretrained model
params:
lang_list     - languages to include in model
x_list        - full x data from Wikipedia
y_list        - full y data from Wikipedia
"""
def accuracy_for_pretrained(lang_list, x_list, y_list):
  # get data only for languages in list
  x = []
  y = []
  for text, label in zip(x_list, y_list):
    if label in lang_list:
      x.append(text)
      y.append(label)

  # split into train and test data - 80/20
  x_train, x_test, y_train, y_test = train_test_split(x_list, y_list, test_size = 0.2, stratify = y_list)

  # build train data.txt in label text format
  output_file_1 = open('baseline_train_data.txt', 'w')
  for text, label in zip(x_train, y_train):
    line = '__label__' + label + ' ' + text
    output_file_1.write(line + '\n')

  # build test set
  output_file_2 = open('baseline_test_data.txt', 'w')
  for text, label in zip(x_test, y_test):
    line = '__label__' + label + ' ' + text
    output_file_2.write(line + '\n')

  model = fasttext.train_supervised('baseline_train_data.txt')
  answer = model.test('baseline_test_data.txt')

  return answer[1]

In [None]:
"""
Get accuracy using given set of languages
Features: most common words
params:
lang_list     - languages to include in model
num_common    - num most common words in features
x_list        - full x data from Wikipedia
y_list        - full y data from Wikipedia
"""
def accuracy_for_langs(lang_list, num_common, x_list, y_list):
  x = []
  y = []
  for text, label in zip(x_list, y_list):
    if label in lang_list:
      x.append(text)
      y.append(label)

  # resplit into desired breakdown
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)
  
  # get most common words in train set
  word_features = []
  for lang in lang_list:
    articles = []
    for label, text in zip(y_train, x_train):
      if label == lang:
        articles.append(text)
    document = ' '.join(articles)
    
      tokens = nltk.word_tokenize(document)
      tokens = [tok.lower() for tok in tokens if (tok.isalpha()) and (len(tok) < 10)]

    lang_fd = nltk.FreqDist(tokens)
    most_common = lang_fd.most_common(num_common)

    for pair in most_common:
      word = pair[0]
      word_features.append(word)

  # sort features and remove all duplicates
  word_features = sorted(list(set(word_features)))
  
  # get train text file
  output_file_1 = open('train_data.txt', 'w')
  for text, label in zip(x_train, y_train):
    # tokenize text
    tokens = tokenize(text)

    # iterate through feature words
    # get num occurrences and add
    feature_list = []
    for num, word in enumerate(word_features):
      count = tokens.count(word)
      feature_list.append('ftr' + str(num) + ':' + str(count))

    # get complete line, write to file
    feature_line = ' '.join(feature_list)
    lable_line = '__label__' + label 
    line = lable_line + ' ' + feature_line

    output_file_1.write(line + '\n')

  # get test text file
  output_file_2 = open('test_data.txt', 'w')
  for text, label in zip(x_train, y_train):
    # tokenize text
    tokens = tokenize(text)
    
    # iterate through feature words
    # get num occurrences and add
    feature_list = []
    for num, word in enumerate(word_features):
      count = tokens.count(word)
      feature_list.append('ftr' + str(num) + ':' + str(count))

    # get complete line, write to file
    feature_line = ' '.join(feature_list)
    lable_line = '__label__' + label 
    line = lable_line + ' ' + feature_line

    output_file_2.write(line + '\n')

  # train model and get accurracy for test data
  model = fasttext.train_supervised('train_data.txt', epoch=20)
  answer = model.test('test_data.txt')

  return answer[1]

In [None]:
"""
Get accuracy using given set of languages
Features: most common words + endings
params:
lang_list     - languages to include in model
num_common    - num most common words in features
x_list        - full x data from Wikipedia
y_list        - full y data from Wikipedia
"""
def better_accuracy_for_langs(lang_list, num_common, x_list, y_list):
  x = []
  y = []
  for text, label in zip(x_list, y_list):
    if label in lang_list:
      x.append(text)
      y.append(label)

  # resplit into desired breakdown
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y)
  
  # get most common words in train set
  word_features = []
  ending_features = []
  for lang in lang_list:
    articles = []
    for label, text in zip(y_train, x_train):
      if label == lang:
        articles.append(text)
    document = ' '.join(articles)
    
    # tokenize
    tokens = nltk.word_tokenize(document)
    tokens = [tok.lower() for tok in tokens if (tok.isalpha())]

    # get most common words
    short_tokens = [tok for tok in tokens if len(tok)<10]
    lang_fd = nltk.FreqDist(short_tokens)
    most_common_words = lang_fd.most_common(10)

    for pair in most_common_words:
      word = pair[0]
      word_features.append(word)

    # get most common endings
    endings = [tok[-4:] for tok in tokens if len(tok)>4]
    endings = [tok[-3:] for tok in tokens if len(tok)>3]
    endings = [tok[-2:] for tok in tokens if len(tok)>2]
    end_fd = nltk.FreqDist(endings)
    most_common_endings = end_fd.most_common(5)

    for pair in most_common_endings:
      word = pair[0]
      ending_features.append(word)

  # sort features and remove all duplicates
  features = sorted(list(set(word_features + ending_features)))

  # get train text file
  output_file_1 = open('train_data.txt', 'w')
  for text, label in zip(x_train, y_train):
    # tokenize text
    tokens = nltk.word_tokenize(text)
    tokens = [tok.lower() for tok in tokens if (tok.isalpha()) and (len(tok)<10)]

    endings = [tok[-4:] for tok in tokens if len(tok)>4]
    endings = [tok[-3:] for tok in tokens if len(tok)>3]
    endings = [tok[-2:] for tok in tokens if len(tok)>2]

    tokens = tokens + endings

    # iterate through feature words
    # get num occurrences and add
    feature_list = []
    for num, word in enumerate(features):
      count = tokens.count(word)
      feature_list.append('ftr' + str(num) + ':' + str(count))


    # get complete line, write to file
    feature_line = ' '.join(feature_list)
    lable_line = '__label__' + label 
    line = lable_line + ' ' + feature_line

    output_file_1.write(line + '\n')

  # get test text file
  output_file_2 = open('test_data.txt', 'w')
  for text, label in zip(x_train, y_train):
    # tokenize text
    tokens = nltk.word_tokenize(text)
    tokens = [tok.lower() for tok in tokens if (tok.isalpha()) and (len(tok)<10)]

    endings = [tok[-4:] for tok in tokens if len(tok)>4]
    endings = [tok[-3:] for tok in tokens if len(tok)>3]
    endings = [tok[-2:] for tok in tokens if len(tok)>2]

    tokens = tokens + endings

    # iterate through feature words
    # get num occurrences and add
    feature_list = []
    for num, word in enumerate(features):
      count = tokens.count(word)
      feature_list.append('ftr' + str(num) + ':' + str(count))

    # get complete line, write to file
    feature_line = ' '.join(feature_list)
    lable_line = '__label__' + label 
    line = lable_line + ' ' + feature_line

    output_file_2.write(line + '\n')

  # train model and get accurracy for test data
  model = fasttext.train_supervised('train_data.txt', epoch=20)
  answer = model.test('test_data.txt')

  return answer[1]

### 4) Let's look at the results for different language groups and families:

In [None]:
# Slavic: Russian, Ukrainian, Polish, Bulgarian, Serbian, Slovak, Macedonian, Croatian, Czech
slavic_langs = ['rus','ukr','pol','bul','bel','srp','slk','mkd','slv','hrv','ces']

# accuracy_for_langs(slavic_langs,10,x_list,y_list)
# 0.9082422586520947

# better_accuracy_for_langs(slavic_langs,10,x_list,y_list)
# 0.9518181818181818

# accuracy_for_pretrained(slavic_langs,x_list,y_list)
# 0.8696642124526536

In [None]:
# Romance: Spanish, Portuguese, French, Romanian, Italian, Catalan, Galician, Lombard, Sardinian
romance_langs = ['spa','por','fra','ron','ita','cat','glg','lmo','srd']

# accuracy_for_langs(romance_langs,10,x_list,y_list)
# 0.9388379204892966

# better_accuracy_for_langs(romance_langs,10,x_list,y_list)
# 0.9567514949242109

#accuracy_for_pretrained(romance_langs,x_list,y_list)
# 0.8684283313120997

In [None]:
# Indic: Bengali, Oriya, Bhojpuri, Maithili, Sinhala, Gujarati, Hindi, Urdu, Panjabi, Sindhi
indic_langs = ['ben','ori','bho','mai','sin','guj','hin','urd','pan','snd']

# accuracy_for_langs(indic_langs,10,x_list,y_list)
# 0.8380880880880881

# better_accuracy_for_langs(indic_langs,10,x_list,y_list)
# 0.8533533533533534

# accuracy_for_pretrained(indic_langs,x_list,y_list)
# 0.8672001362049885

In [None]:
# Germanic: English, German, Bavarian, Low German, Swedish, Danish, Afrikaans, Norwegian
germanic_langs = ['eng','deu','bar','nds','swe','dan','afr','nob']

# accuracy_for_langs(germanic_langs,10,x_list,y_list)
# 0.94078125

# better_accuracy_for_langs(germanic_langs,10,x_list,y_list)
# 0.945244055068836

# accuracy_for_pretrained(germanic_langs,x_list,y_list)
# 0.8701091466139018

In [None]:
# Uralic: Finnish, Estonian, Hungarian, Komi, Udmurt
uralic_langs = ['fin','est','hun','kom','udm']
# accuracy_for_langs(uralic_langs,10,x_list,y_list)
# 0.9121044701155199

# better_accuracy_for_langs(uralic_langs,10,x_list,y_list)
# 0.966

# accuracy_for_pretrained(uralic_langs,x_list,y_list)
# 0.8685464992551607

In [None]:
# Turkic: Turkish, Uzbek, Azerbaijani, Kazakh, Turkmen, Tatar, Kyrgyz, Bashkir, Chuvash, Karakalpak, Crimean Tatar, Tuvan
turkic_langs = ['tur','uzb','aze','kaz','tuk','tat','kir','bak','chv','kaa','crh','sah','tuv']

# accuracy_for_langs(turkic_langs,10,x_list,y_list)
# 0.7941666666666667

# better_accuracy_for_langs(turkic_langs,10,x_list,y_list)
# 0.8903125

# accuracy_for_pretrained(turkic_langs,x_list,y_list)
# 0.8680100434079496

In [None]:
# Afroasiatic: Arabic, Hausa, Oromo, Amharic, Solami, Kabyle, Afar
afroasiatic_langs = ['ara','hau','orm','amh','som','kab','aar']

# accuracy_for_langs(afroasiatic_langs,10,x_list,y_list)
# 0.9672096908939014

# better_accuracy_for_langs(afroasiatic_langs,10,x_list,y_list)
# 0.9774859287054409

# accuracy_for_pretrained(afroasiatic_langs,x_list,y_list)
# 0.8681533812827169

### 5) Now, let's combine them and see how we do:

In [None]:
west_european_list = romance_langs + germanic_langs

# accuracy_for_langs(west_european_list,10,x_list,y_list)
# 0.9284348337746396

# better_accuracy_for_langs(west_european_list,10,x_list,y_list)
# 0.9414662842856092

# accuracy_for_pretrained(west_european_list,x_list,y_list)
# 0.8694782534683803

In [None]:
east_west_european_list = romance_langs + germanic_langs + slavic_langs

# accuracy_for_langs(east_west_european_list,10,x_list,y_list)
# 0.8897767857142858

# better_accuracy_for_langs(east_west_european_list,10,x_list,y_list)
# 0.9031163496740781

# accuracy_for_pretrained(east_west_european_list,x_list,y_list)
# 0.8688325780302547

In [None]:
european_list = slavic_langs + romance_langs + germanic_langs + uralic_langs

# accuracy_for_langs(european_list,10,x_list,y_list)
# 0.8218181818181818

# better_accuracy_for_langs(european_list,10,x_list,y_list)
# 0.838939393939394

# accuracy_for_pretrained(european_list,x_list,y_list)
# 0.8719591358944344

In [None]:
indo_european_list = slavic_langs + romance_langs + germanic_langs + indic_langs

# accuracy_for_langs(indo_european_list,10,x_list,y_list)
# 0.6882894736842106

# better_accuracy_for_langs(indo_european_list,10,x_list,y_list)
# 0.5898684210526316

# accuracy_for_pretrained(indo_european_list,x_list,y_list)
# 0.8698742365884281

### 6) Lastly, combine all the languages and evaluate performance

In [None]:
super_list = slavic_langs + romance_langs + indic_langs + germanic_langs + uralic_langs + turkic_langs + afroasiatic_langs

# accuracy_for_langs(super_list,10,x_list,y_list)
# 0.0984671502930448

# better_accuracy_for_langs(super_list,10,x_list,y_list)
# 0.17260245901639346

#accuracy_for_pretrained(super_list,x_list,y_list)
# 0.869256145578376

In [None]:
super_list = slavic_langs + romance_langs + indic_langs + germanic_langs + uralic_langs + turkic_langs + afroasiatic_langs

#accuracy_for_langs(super_list,5,x_list,y_list)
# 0.5948565573770492

# better_accuracy_for_langs(super_list,5,x_list,y_list)
# 0.1794467213114754

In [None]:
super_list = slavic_langs + romance_langs + indic_langs + germanic_langs + uralic_langs + turkic_langs + afroasiatic_langs

# accuracy_for_langs(super_list,15,x_list,y_list)
# 0.044221311475409836

# better_accuracy_for_langs(super_list,15,x_list,y_list)
# 0.16649590163934427

In [None]:
super_list = slavic_langs + romance_langs + indic_langs + germanic_langs + uralic_langs + turkic_langs + afroasiatic_langs

#accuracy_for_langs(super_list,20,x_list,y_list)
# 0.08336065573770492

# better_accuracy_for_langs(super_list,20,x_list,y_list)
# 0.17413934426229508

In [None]:
super_list = slavic_langs + romance_langs + indic_langs + germanic_langs + uralic_langs + turkic_langs + afroasiatic_langs

#accuracy_for_langs(super_list,25,x_list,y_list)
# 0.16362704918032786

# better_accuracy_for_langs(super_list,25,x_list,y_list)
# 0.17461065573770493

In [None]:
super_list = slavic_langs + romance_langs + indic_langs + germanic_langs + uralic_langs + turkic_langs + afroasiatic_langs

#accuracy_for_langs(super_list,30,x_list,y_list)
# 0.043852459016393446

# better_accuracy_for_langs(super_list,30,x_list,y_list)
# 0.1737295081967213

In [None]:
super_list = slavic_langs + romance_langs + indic_langs + germanic_langs + uralic_langs + turkic_langs + afroasiatic_langs

# accuracy_for_langs(super_list,35,x_list,y_list)
# 0.07338114754098361

#better_accuracy_for_langs(super_list,35,x_list,y_list)
# 0.18