In [7]:
import pandas as pd
import pickle as pkl
import json
from random import random
import jieba
import string

In [9]:
with open('/mnt/nas_home/mrgj4/marvl-code/data/en/annotations/train.jsonl', 'r', encoding="utf-8") as json_file:
    json_list = list(json_file)

nlvr2_data = [json.loads(jline) for jline in json_list]

with open("/mnt/nas_home/mrgj4/wit_dataset/infusion_exp/en_caption_to_pos.pickle",'rb') as handle_1:
    nlvr2_sentence_to_pos = pkl.load(handle_1)


In [10]:
nb_nlvr2_words = 0

for entry in nlvr2_data:
    sentence = entry["sentence"]
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    words = sentence.split(" ")
    nb_nlvr2_words += len(words)

print(nb_nlvr2_words)

1161773


In [28]:
def check_nlvr2_coverage(en_lang_dict, valid_pos_tag = set(["ADV", "ADJ", "NOUN", "VERB"])):
    nb_changed = 0
    entries_code_switched = set ()

    for entry in nlvr2_data:

        sentence = entry["sentence"]

        if "\n" in sentence:
            sentence = sentence.replace("\n", " ")

        if sentence in nlvr2_sentence_to_pos:
            pos_tokens = nlvr2_sentence_to_pos[sentence]
        else:
            continue

        init_sentence = str(sentence)

        for init_word, pos in pos_tokens:
            if pos in valid_pos_tag:
                word = init_word.lower()
                dict_pair = (word, pos)
                if dict_pair in en_lang_dict:
                    entries_code_switched.add(dict_pair)
                    nb_changed += 1
                    if random() < 0.5:
                        new_word = en_lang_dict[dict_pair]
                        if init_word[0].isupper():
                            new_word = new_word.capitalize()
                        sentence = sentence.replace(init_word, new_word)
        
    return nb_changed, entries_code_switched

In [30]:
def code_switch_word(init_word, caption, dict_pair, dictionary):
    if random() < 0.5:
        new_word = dictionary[dict_pair]
        if init_word[0].isupper():
            new_word = new_word.capitalize()
        caption = caption.replace(init_word, new_word)
    return caption

In [31]:
def check_WIT_captions_coverage(captions, lang_en_dict, lang_caption_to_pos, lang_word_to_reduced_version=None, valid_pos_tag = set(["ADV", "ADJ", "NOUN", "VERB"])):

    nb_changed = 0
    nb_changed_after_reduction = 0
    entries_code_switched = set ()

    for key, caption in captions.items():

        if "\n" in caption:
            caption = caption.replace("\n", " ")

        if caption in lang_caption_to_pos:
            pos_tokens = lang_caption_to_pos[caption]


            for init_word, pos in pos_tokens:
                if pos in valid_pos_tag:
                    word = init_word.lower()
                    dict_pair = (word, pos)
                    if dict_pair in lang_en_dict:
                        nb_changed += 1
                        entries_code_switched.add(dict_pair)
                        if random() < 0.5:
                            new_word = lang_en_dict[dict_pair]
                            if init_word[0].isupper():
                                new_word = new_word.capitalize()
                            caption = caption.replace(init_word, new_word)

                    elif lang_word_to_reduced_version:
                        if word in lang_word_to_reduced_version:
                            reduced_word = lang_word_to_reduced_version[word]

                            dict_pair = (reduced_word, pos)
                            if dict_pair in lang_en_dict:
                                entries_code_switched.add(dict_pair)
                                nb_changed += 1
                                nb_changed_after_reduction +=1
                                if random() < 0.5:
                                    new_word = lang_en_dict[dict_pair]
                                    if init_word[0].isupper():
                                        new_word = new_word.capitalize()
                                    caption = caption.replace(init_word, new_word)

    return nb_changed, nb_changed_after_reduction, entries_code_switched

In [32]:
def count_nb_words(captions):
    nb_words = 0
    for key, caption in captions.items():
        caption = caption.translate(str.maketrans('', '', string.punctuation))
        words = caption.split(" ")
        nb_words += len(words)
    return nb_words

In [33]:
def get_nb_noncomposite_words(dictionary):
    non_composite_word = 0
    for key in dictionary.keys():
        if " " not in key[0]:
           non_composite_word += 1
    return non_composite_word

### SW

In [16]:
with open('/mnt/nas_home/mrgj4/wit_dataset/captions/sw_captions.json', 'r', encoding="utf-8") as json_file:
    sw_captions = json.load(json_file)

with open("/mnt/nas_home/mrgj4/wit_dataset/infusion_exp/sw_caption_to_pos.pickle",'rb') as handle_1:
    sw_caption_to_pos = pkl.load(handle_1)

with open("./panlex_dictionaries/sw_to_en_dict.pickle",'rb') as handle_2:
    sw_to_en_dict = pkl.load(handle_2)

with open("./panlex_dictionaries/en_to_sw_dict.pickle",'rb') as handle_3:
    en_to_sw_dict = pkl.load(handle_3)

SW -> EN (WIT captions)

In [37]:
non_composite_words_sw_en = get_nb_noncomposite_words(sw_to_en_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_words_sw_en))

There are 8347 non-composite words in the dictionary.


In [29]:
nb_changed_sw_captions, nb_changed_after_reduction_sw, entries_code_switched_sw = check_WIT_captions_coverage(sw_captions, 
                                                                                    sw_to_en_dict, 
                                                                                    sw_caption_to_pos)

In [38]:
print("There are {n} distinct Swahili words that were code-switched.".format(n = len(entries_code_switched_sw)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_sw) / non_composite_words_sw_en * 100))

There are 2004 distinct Swahili words that were code-switched.
It corresponds to 24.008625853600098% of the total dictionary words


In [12]:
nb_words_sw_captions = count_nb_words(sw_captions)

print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_sw_captions, n_2 = nb_words_sw_captions))
print("{n} % of Swahili words can be code-switched to English.".format(n = nb_changed_sw_captions / nb_words_sw_captions * 100))

# 24733 words changed | # 128350 total words.
19.26996493961823 % of Swahili words can be code-switched to English.


EN -> SW (NLVR2 sentences)

In [40]:
non_composite_word_en_sw = get_nb_noncomposite_words(en_to_sw_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_en_sw))

There are 12839 non-composite words in the dictionary.


In [31]:
nb_changed_to_sw, entries_code_switched_sw_nlvr2 = check_nlvr2_coverage(en_to_sw_dict)

In [41]:
print("There are {n} distinct English words that were code-switched.".format(n = len(entries_code_switched_sw_nlvr2)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_sw_nlvr2) / non_composite_word_en_sw * 100))

There are 2158 distinct English words that were code-switched.
It corresponds to 16.808162629488276% of the total dictionary words


In [42]:
print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_to_sw, n_2 = nb_nlvr2_words))
print("{n} % of English words can be code-switched to Swahili.".format(n = nb_changed_to_sw / nb_nlvr2_words * 100))

# 331843 words changed | # 1161773 total words.
28.563497344145546 % of English words can be code-switched to Swahili.


### TA

In [43]:
with open('/mnt/nas_home/mrgj4/wit_dataset/captions/ta_captions.json', 'r', encoding="utf-8") as json_file:
    ta_captions = json.load(json_file)

with open("/mnt/nas_home/mrgj4/wit_dataset/infusion_exp/ta_caption_to_pos.pickle",'rb') as handle_1:
    ta_caption_to_pos = pkl.load(handle_1)

with open("./panlex_dictionaries/ta_to_en_dict.pickle",'rb') as handle_2:
    ta_to_en_dict = pkl.load(handle_2)

with open("./panlex_dictionaries/en_to_ta_dict.pickle",'rb') as handle_3:
    en_to_ta_dict = pkl.load(handle_3)

with open("./panlex_dictionaries/ta_word_to_reduced_version.pickle",'rb') as handle_4:
    ta_word_to_reduced_version = pkl.load(handle_4)

TA -> EN (WIT captions)

In [44]:
non_composite_word_ta_en = get_nb_noncomposite_words(ta_to_en_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_ta_en))

There are 8889 non-composite words in the dictionary.


In [45]:
nb_changed_ta_captions, nb_changed_after_reduction_ta, entries_code_switched_ta = check_WIT_captions_coverage(ta_captions, 
                                                                                    ta_to_en_dict, 
                                                                                    ta_caption_to_pos,
                                                                                    ta_word_to_reduced_version
                                                                                    )

In [46]:
print("There are {n} distinct Tamil words that were code-switched.".format(n = len(entries_code_switched_ta)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_ta) / non_composite_word_ta_en * 100))

There are 2635 distinct Tamil words that were code-switched.
It corresponds to 29.64337945775678% of the total dictionary words


In [48]:
nb_words_ta_captions = count_nb_words(ta_captions)

print("# {n_1} words changed | # {n_2} words changed after reduction | # {n_3} total words.".format(n_1 = nb_changed_ta_captions, 
                                                                                                   n_2 = nb_changed_after_reduction_ta,
                                                                                                   n_3 = nb_words_ta_captions))
print("{n} % of Tamil words can be code-switched to English.".format(n = nb_changed_ta_captions / nb_words_ta_captions * 100))

# 42885 words changed | # 4890 words changed after reduction | # 285031 total words.
15.045731867761752 % of Tamil words can be code-switched to English.


EN -> TA (NLVR2 sentences)

In [50]:
non_composite_word_en_ta = get_nb_noncomposite_words(en_to_ta_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_en_ta))

There are 14615 non-composite words in the dictionary.


In [51]:
nb_changed_to_ta, entries_code_switched_ta_nlvr2 = check_nlvr2_coverage(en_to_ta_dict)

In [52]:
print("There are {n} distinct English words that were code-switched.".format(n = len(entries_code_switched_ta_nlvr2)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_ta_nlvr2) / non_composite_word_en_ta * 100))

There are 2236 distinct English words that were code-switched.
It corresponds to 15.299349982894286% of the total dictionary words


In [63]:
print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_to_ta, n_2 = nb_nlvr2_words))
print("{n} % of English words can be code-switched to Tamil.".format(n = nb_changed_to_ta / nb_nlvr2_words * 100))

# 339761 words changed | # 1161773 total words.
29.24504184552404 % of English words can be code-switched to Tamil.


### TR

In [17]:
with open('/mnt/nas_home/mrgj4/wit_dataset/captions/tr_captions.json', 'r', encoding="utf-8") as json_file:
    tr_captions = json.load(json_file)

with open("/mnt/nas_home/mrgj4/wit_dataset/infusion_exp/tr_caption_to_pos.pickle",'rb') as handle_1:
    tr_caption_to_pos = pkl.load(handle_1)

with open("./panlex_dictionaries/tr_to_en_dict.pickle",'rb') as handle_2:
    tr_to_en_dict = pkl.load(handle_2)

with open("./panlex_dictionaries/en_to_tr_dict.pickle",'rb') as handle_3:
    en_to_tr_dict = pkl.load(handle_3)

with open("./panlex_dictionaries/tr_word_to_reduced_version.pickle",'rb') as handle_4:
    tr_word_to_reduced_version = pkl.load(handle_4)

TR -> EN (WIT captions)

In [18]:
non_composite_word_tr_en = get_nb_noncomposite_words(tr_to_en_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_tr_en))

There are 13649 non-composite words in the dictionary.


In [19]:
nb_changed_tr_captions, nb_changed_after_reduction_tr, entries_code_switched_tr = check_WIT_captions_coverage(tr_captions, 
                                                                                    tr_to_en_dict, 
                                                                                    tr_caption_to_pos,
                                                                                    tr_word_to_reduced_version
                                                                                    )

In [20]:
print("There are {n} distinct Turkish words that were code-switched.".format(n = len(entries_code_switched_tr)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_tr) / non_composite_word_tr_en * 100))

There are 6597 distinct Turkish words that were code-switched.
It corresponds to 48.33321122426552% of the total dictionary words


In [21]:
nb_words_tr_captions = count_nb_words(tr_captions)

print("# {n_1} words changed | # {n_2} words changed after reduction | # {n_3} total words.".format(n_1 = nb_changed_tr_captions, 
                                                                                                   n_2 = nb_changed_after_reduction_tr,
                                                                                                   n_3 = nb_words_tr_captions))
print("{n} % of Turkish words can be code-switched to English.".format(n = nb_changed_tr_captions / nb_words_tr_captions * 100))

# 235050 words changed | # 148810 words changed after reduction | # 766292 total words.
30.673685749035617 % of Turkish words can be code-switched to English.


EN -> TR (NLVR2 sentences)

In [22]:
non_composite_word_en_tr = get_nb_noncomposite_words(en_to_tr_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_en_tr))

There are 22691 non-composite words in the dictionary.


In [23]:
nb_changed_to_tr, entries_code_switched_tr_nlvr2 = check_nlvr2_coverage(en_to_tr_dict)

In [24]:
print("There are {n} distinct English words that were code-switched.".format(n = len(entries_code_switched_tr_nlvr2)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_tr_nlvr2) / non_composite_word_en_tr * 100))

There are 2572 distinct English words that were code-switched.
It corresponds to 11.334890485214402% of the total dictionary words


In [25]:
print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_to_tr, n_2 = nb_nlvr2_words))
print("{n} % of English words can be code-switched to Turkish.".format(n = nb_changed_to_tr / nb_nlvr2_words * 100))

# 349236 words changed | # 1161773 total words.
30.060605643271103 % of English words can be code-switched to Turkish.


### ID

In [65]:
with open('/mnt/nas_home/mrgj4/wit_dataset/captions/id_captions.json', 'r', encoding="utf-8") as json_file:
    id_captions = json.load(json_file)

with open("/mnt/nas_home/mrgj4/wit_dataset/infusion_exp/id_caption_to_pos.pickle",'rb') as handle_1:
    id_caption_to_pos = pkl.load(handle_1)

with open("./panlex_dictionaries/id_to_en_dict.pickle",'rb') as handle_2:
    id_to_en_dict = pkl.load(handle_2)

with open("./panlex_dictionaries/en_to_id_dict.pickle",'rb') as handle_3:
    en_to_id_dict = pkl.load(handle_3)

ID -> EN (WIT captions)

In [66]:
non_composite_word_id_en = get_nb_noncomposite_words(id_to_en_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_id_en))

There are 11251 non-composite words in the dictionary.


In [67]:
nb_changed_id_captions, nb_changed_after_reduction_id, entries_code_switched_id = check_WIT_captions_coverage(id_captions, 
                                                                                    id_to_en_dict,
                                                                                    id_caption_to_pos)

In [81]:
print("There are {n} distinct Indonesian words that were code-switched.".format(n = len(entries_code_switched_id)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_id) / non_composite_word_id_en * 100))

There are 5787 distinct Indonesian words that were code-switched.
It corresponds to 51.435427961958936% of the total dictionary words


In [69]:
nb_words_id_captions = count_nb_words(id_captions)

print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_id_captions, n_2 = nb_words_id_captions))
print("{n} % of Indonesian words can be code-switched to English.".format(n = nb_changed_id_captions / nb_words_id_captions * 100))

# 215227 words changed | # 1053265 total words.
20.434268678822516 % of Indonesian words can be code-switched to English.


EN -> ID (NLVR2 sentences)

In [70]:
non_composite_word_en_id = get_nb_noncomposite_words(en_to_id_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_en_id))

There are 20574 non-composite words in the dictionary.


In [71]:
nb_changed_to_id, entries_code_switched_id_nlvr2 = check_nlvr2_coverage(en_to_id_dict)

In [72]:
print("There are {n} distinct English words that were code-switched.".format(n = len(entries_code_switched_id_nlvr2)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_id_nlvr2) / non_composite_word_en_id * 100))

There are 2542 distinct English words that were code-switched.
It corresponds to 12.355400019442014% of the total dictionary words


In [78]:
print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_to_id, n_2 = nb_nlvr2_words))
print("{n} % of English words can be code-switched to Indonesian.".format(n = nb_changed_to_id / nb_nlvr2_words * 100))

# 345980 words changed | # 1161773 total words.
29.780344352984617 % of English words can be code-switched to Indonesian.


### ZH

In [73]:
with open('/mnt/nas_home/mrgj4/wit_dataset/captions/zh_captions.json', 'r', encoding="utf-8") as json_file:
    zh_captions = json.load(json_file)

with open("/mnt/nas_home/mrgj4/wit_dataset/infusion_exp/zh_caption_to_pos.pickle",'rb') as handle_1:
    zh_caption_to_pos = pkl.load(handle_1)

with open("./panlex_dictionaries/zh_to_en_dict.pickle",'rb') as handle_2:
    zh_to_en_dict = pkl.load(handle_2)

with open("./panlex_dictionaries/en_to_zh_dict.pickle",'rb') as handle_3:
    en_to_zh_dict = pkl.load(handle_3)

ZH -> EN (WIT captions)

In [74]:
non_composite_word_zh_en = get_nb_noncomposite_words(zh_to_en_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_zh_en))

There are 29637 non-composite words in the dictionary.


In [75]:
nb_changed_zh_captions, nb_changed_after_reduction_zh, entries_code_switched_zh = check_WIT_captions_coverage(zh_captions, 
                                                                                    zh_to_en_dict, 
                                                                                    zh_caption_to_pos)

In [76]:
print("There are {n} distinct Mandarin Chinese words that were code-switched.".format(n = len(entries_code_switched_zh)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_zh) / non_composite_word_zh_en * 100))

There are 8597 distinct Mandarin Chinese words that were code-switched.
It corresponds to 29.007659344737995% of the total dictionary words


In [77]:
nb_words_zh_captions = 0
for key, caption in zh_captions.items():
    words = jieba.lcut(caption)
    nb_words_zh_captions += len(words)


print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_zh_captions, n_2 = nb_words_zh_captions))
print("{n} % of Chinese words can be code-switched to English.".format(n = nb_changed_zh_captions / nb_words_zh_captions * 100))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.997 seconds.
Prefix dict has been built successfully.


# 268467 words changed | # 1213778 total words.
22.118295108331175 % of Chinese words can be code-switched to English.


EN -> ZH (NLVR2 sentences)

In [78]:
non_composite_word_en_zh = get_nb_noncomposite_words(en_to_zh_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_en_zh))

There are 28586 non-composite words in the dictionary.


In [79]:
nb_changed_to_zh, entries_code_switched_zh_nlvr2 = check_nlvr2_coverage(en_to_zh_dict)

In [80]:
print("There are {n} distinct English words that were code-switched.".format(n = len(entries_code_switched_zh_nlvr2)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_zh_nlvr2) / non_composite_word_en_zh * 100))

There are 2701 distinct English words that were code-switched.
It corresponds to 9.448681172601974% of the total dictionary words


In [84]:
print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_to_zh, n_2 = nb_nlvr2_words))
print("{n} % of English words can be code-switched to Chinese.".format(n = nb_changed_to_zh / nb_nlvr2_words * 100))

# 354989 words changed | # 1161773 total words.
30.555797044689452 % of English words can be code-switched to Chinese.


### TR - NOUNs only

TR -> EN (WIT captions)

In [34]:
non_composite_word_tr_en = get_nb_noncomposite_words(tr_to_en_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_tr_en))

There are 13649 non-composite words in the dictionary.


In [35]:
nb_changed_tr_captions, nb_changed_after_reduction_tr, entries_code_switched_tr = check_WIT_captions_coverage(tr_captions, 
                                                                                    tr_to_en_dict, 
                                                                                    tr_caption_to_pos,
                                                                                    tr_word_to_reduced_version,
                                                                                    valid_pos_tag = set(["NOUN"])
                                                                                    )

In [36]:
print("There are {n} distinct Turkish words that were code-switched.".format(n = len(entries_code_switched_tr)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_tr) / non_composite_word_tr_en * 100))

There are 4733 distinct Turkish words that were code-switched.
It corresponds to 34.67653307934647% of the total dictionary words


In [37]:
nb_words_tr_captions = count_nb_words(tr_captions)

print("# {n_1} words changed | # {n_2} words changed after reduction | # {n_3} total words.".format(n_1 = nb_changed_tr_captions, 
                                                                                                   n_2 = nb_changed_after_reduction_tr,
                                                                                                   n_3 = nb_words_tr_captions))
print("{n} % of Turkish words can be code-switched to English.".format(n = nb_changed_tr_captions / nb_words_tr_captions * 100))

# 175836 words changed | # 118351 words changed after reduction | # 766292 total words.
22.946344213432997 % of Turkish words can be code-switched to English.


EN -> TR (NLVR2 sentences)

In [38]:
non_composite_word_en_tr = get_nb_noncomposite_words(en_to_tr_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_en_tr))

There are 22691 non-composite words in the dictionary.


In [40]:
nb_changed_to_tr, entries_code_switched_tr_nlvr2 = check_nlvr2_coverage(en_to_tr_dict, valid_pos_tag = set(["NOUN"]))

In [41]:
print("There are {n} distinct English words that were code-switched.".format(n = len(entries_code_switched_tr_nlvr2)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_tr_nlvr2) / non_composite_word_en_tr * 100))

There are 1753 distinct English words that were code-switched.
It corresponds to 7.725529945793487% of the total dictionary words


In [42]:
print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_to_tr, n_2 = nb_nlvr2_words))
print("{n} % of English words can be code-switched to Turkish.".format(n = nb_changed_to_tr / nb_nlvr2_words * 100))

# 229177 words changed | # 1161773 total words.
19.726487015966114 % of English words can be code-switched to Turkish.


### TR - VERBs only

TR -> EN (WIT captions)

In [43]:
non_composite_word_tr_en = get_nb_noncomposite_words(tr_to_en_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_tr_en))

There are 13649 non-composite words in the dictionary.


In [44]:
nb_changed_tr_captions, nb_changed_after_reduction_tr, entries_code_switched_tr = check_WIT_captions_coverage(tr_captions, 
                                                                                    tr_to_en_dict, 
                                                                                    tr_caption_to_pos,
                                                                                    tr_word_to_reduced_version,
                                                                                    valid_pos_tag = set(["VERB"])
                                                                                    )

In [45]:
print("There are {n} distinct Turkish words that were code-switched.".format(n = len(entries_code_switched_tr)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_tr) / non_composite_word_tr_en * 100))

There are 845 distinct Turkish words that were code-switched.
It corresponds to 6.190929738442376% of the total dictionary words


In [46]:
nb_words_tr_captions = count_nb_words(tr_captions)

print("# {n_1} words changed | # {n_2} words changed after reduction | # {n_3} total words.".format(n_1 = nb_changed_tr_captions, 
                                                                                                   n_2 = nb_changed_after_reduction_tr,
                                                                                                   n_3 = nb_words_tr_captions))
print("{n} % of Turkish words can be code-switched to English.".format(n = nb_changed_tr_captions / nb_words_tr_captions * 100))

# 29501 words changed | # 26576 words changed after reduction | # 766292 total words.
3.849837920792596 % of Turkish words can be code-switched to English.


EN -> TR (NLVR2 sentences)

In [47]:
non_composite_word_en_tr = get_nb_noncomposite_words(en_to_tr_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_en_tr))

There are 22691 non-composite words in the dictionary.


In [49]:
nb_changed_to_tr, entries_code_switched_tr_nlvr2 = check_nlvr2_coverage(en_to_tr_dict, valid_pos_tag = set(["VERB"]))

In [50]:
print("There are {n} distinct English words that were code-switched.".format(n = len(entries_code_switched_tr_nlvr2)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_tr_nlvr2) / non_composite_word_en_tr * 100))

There are 207 distinct English words that were code-switched.
It corresponds to 0.9122559605129787% of the total dictionary words


In [51]:
print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_to_tr, n_2 = nb_nlvr2_words))
print("{n} % of English words can be code-switched to Turkish.".format(n = nb_changed_to_tr / nb_nlvr2_words * 100))

# 7691 words changed | # 1161773 total words.
0.6620054003665088 % of English words can be code-switched to Turkish.


### TR - NOUNs + VERBs only

TR -> EN (WIT captions)

In [52]:
non_composite_word_tr_en = get_nb_noncomposite_words(tr_to_en_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_tr_en))

There are 13649 non-composite words in the dictionary.


In [53]:
nb_changed_tr_captions, nb_changed_after_reduction_tr, entries_code_switched_tr = check_WIT_captions_coverage(tr_captions, 
                                                                                    tr_to_en_dict, 
                                                                                    tr_caption_to_pos,
                                                                                    tr_word_to_reduced_version,
                                                                                    valid_pos_tag = set(["NOUN", "VERB"])
                                                                                    )

In [54]:
print("There are {n} distinct Turkish words that were code-switched.".format(n = len(entries_code_switched_tr)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_tr) / non_composite_word_tr_en * 100))

There are 5578 distinct Turkish words that were code-switched.
It corresponds to 40.86746281778885% of the total dictionary words


In [55]:
nb_words_tr_captions = count_nb_words(tr_captions)

print("# {n_1} words changed | # {n_2} words changed after reduction | # {n_3} total words.".format(n_1 = nb_changed_tr_captions, 
                                                                                                   n_2 = nb_changed_after_reduction_tr,
                                                                                                   n_3 = nb_words_tr_captions))
print("{n} % of Turkish words can be code-switched to English.".format(n = nb_changed_tr_captions / nb_words_tr_captions * 100))

# 205337 words changed | # 144927 words changed after reduction | # 766292 total words.
26.796182134225592 % of Turkish words can be code-switched to English.


EN -> TR (NLVR2 sentences)

In [56]:
non_composite_word_en_tr = get_nb_noncomposite_words(en_to_tr_dict)
print("There are {n} non-composite words in the dictionary.".format(n = non_composite_word_en_tr))

There are 22691 non-composite words in the dictionary.


In [57]:
nb_changed_to_tr, entries_code_switched_tr_nlvr2 = check_nlvr2_coverage(en_to_tr_dict, valid_pos_tag = set(["NOUN", "VERB"]))

In [58]:
print("There are {n} distinct English words that were code-switched.".format(n = len(entries_code_switched_tr_nlvr2)))
print("It corresponds to {x}% of the total dictionary words".format(x = len(entries_code_switched_tr_nlvr2) / non_composite_word_en_tr * 100))

There are 1960 distinct English words that were code-switched.
It corresponds to 8.637785906306465% of the total dictionary words


In [59]:
print("# {n_1} words changed | # {n_2} total words.".format(n_1 = nb_changed_to_tr, n_2 = nb_nlvr2_words))
print("{n} % of English words can be code-switched to Turkish.".format(n = nb_changed_to_tr / nb_nlvr2_words * 100))

# 236868 words changed | # 1161773 total words.
20.388492416332625 % of English words can be code-switched to Turkish.
