In [1]:
%load_ext autoreload
%autoreload 2

import os
import nltk
import joblib
import stanza
import transformers
import numpy as np
import genanki

from datetime import datetime
from lang import prompts
from gtts import gTTS
from tqdm.notebook import tqdm
from lang.language import Language
from lang.person import Person
from lang import utils
from lang import statistics
from huggingface_hub import login

from string import Template

# ------------------------------------------

token = "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"
login(token=token)

data_dir = "lang/data"

use_lang = "hindi"
compute_for = "sentences"

detailed = False

nltk.download('punkt')

# ------------------------------------------

LLM = None

if use_lang == "hindi":
    stanza.download('hi')
    
elif use_lang == "farsi":
    stanza.download('fa')
    
else: # german
    stanza.download('de')

farsi_nouns = [
    ("کتاب", "Book"),
    ("خانه", "House"),
    ("درخت", "Tree"),
    ("آب", "Water"),
    ("خورشید", "Sun"),
    ("ماه", "Moon"),
    ("زمین", "Earth"),
    ("دریا", "Sea"),
    ("آسمان", "Sky"),
    ("گل", "Flower"),
    ("ماشین", "Car"),
    ("پنجره", "Window"),
    ("در", "Door"),
    ("صندلی", "Chair"),
    ("میز", "Table"),
    ("کامپیوتر", "Computer"),
    ("تلویزیون", "Television"),
    ("موبایل", "Mobile phone"),
    ("کفش", "Shoe"),
    ("لباس", "Clothes"),
    ("کیف", "Bag"),
    ("خیابان", "Street"),
    ("پارک", "Park"),
    ("پل", "Bridge"),
    ("کتابخانه", "Library"),
    ("مدرسه", "School"),
    ("دانشگاه", "University"),
    ("بیمارستان", "Hospital"),
    ("غذا", "Food"),
    ("سبزیجات", "Vegetables"),
    ("میوه", "Fruit"),
    ("گوشت", "Meat"),
    ("ماهی", "Fish"),
    ("نان", "Bread"),
    ("شیر", "Milk"),
    ("قهوه", "Coffee"),
    ("چای", "Tea"),
    ("شکر", "Sugar"),
    ("نمک", "Salt"),
    ("هواپیما", "Airplane"),
    ("قطار", "Train"),
    ("اتوبوس", "Bus"),
    ("دوچرخه", "Bicycle"),
    ("بچه", "Child"),
    ("مرد", "Man"),
    ("زن", "Woman"),
    ("دوست", "Friend"),
    ("خانواده", "Family"),
    ("پدر", "Father"),
    ("مادر", "Mother"),
    ("برادر", "Brother"),
    ("خواهر", "Sister"),
    ("سگ", "Dog"),
    ("گربه", "Cat"),
    ("پرنده", "Bird"),
    ("خرگوش", "Rabbit"),
    ("اسب", "Horse"),
    ("گاو", "Cow"),
    ("گوسفند", "Sheep"),
    ("خوک", "Pig"),
    ("مرغ", "Chicken"),
    ("خروس", "Rooster"),
    ("اردک", "Duck"),
    ("قورباغه", "Frog"),
    ("مورچه", "Ant"),
    ("زنبور", "Bee"),
    ("پروانه", "Butterfly"),
    ("مار", "Snake"),
    ("لاکپشت", "Turtle"),
    ("هوا", "Air"),
    ("خاک", "Soil"),
    ("آتش", "Fire"),
    ("باد", "Wind"),
    ("باران", "Rain"),
    ("برف", "Snow"),
    ("رعد", "Thunder"),
    ("برق", "Lightning"),
    ("روز", "Day"),
    ("شب", "Night"),
    ("هفته", "Week"),
    ("ماه", "Month"),
    ("سال", "Year"),
    ("ساعت", "Hour"),
    ("دقیقه", "Minute"),
    ("ثانیه", "Second"),
    ("زمان", "Time"),
    ("عشق", "Love"),
    ("دوستی", "Friendship"),
    ("شادی", "Happiness"),
    ("غم", "Sadness"),
    ("ترس", "Fear"),
    ("خشم", "Anger"),
    ("سلام", "Hello"),
    ("خداحافظ", "Goodbye"),
    ("بله", "Yes"),
    ("نه", "No"),
    ("لطفا", "Please"),
    ("متشکرم", "Thank you"),
    ("ببخشید", "Sorry")
]


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/heilts/.cache/huggingface/token
Login successful


[nltk_data] Downloading package punkt to /home/heilts/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-07-04 09:17:23 INFO: Downloaded file to /home/heilts/stanza_resources/resources.json
2024-07-04 09:17:23 INFO: Downloading default packages for language: hi (Hindi) ...
2024-07-04 09:17:24 INFO: File exists: /home/heilts/stanza_resources/hi/default.zip
2024-07-04 09:17:28 INFO: Finished downloading models and saved to /home/heilts/stanza_resources


In [2]:
use_stored_language_obj = True
    
lang_path = os.path.join(data_dir, "results", f"language_{use_lang}")

if use_stored_language_obj and os.path.exists(lang_path):
    language = joblib.load(lang_path)
else:
    if use_lang == "farsi":
        nouns = farsi_nouns
    else:
        nouns = None
    
    language = Language(data_dir, use_lang, nouns=nouns)
    joblib.dump(language, lang_path)

_______________________
#### Select sentences/words

In [3]:
if compute_for == "words":
    params = [
        {"repetition_threshold":1, "word_length_eps":1.0, 'letter_scores_eps':1.0},
    ]
else: 
    params = [
        {"repetition_threshold": 10, "sentence_lengths_eps": 0.0, "compute_rating":"addition"},
    ]

In [4]:
result_path = os.path.join(data_dir, "results")

if compute_for == "words":
    num_new_items_to_select = len(language.words_for_letters)
else:
    num_new_items_to_select = 0
    
load_existing = True        # check if it still adds words even though theyre already present

type = compute_for

detailed_str = "_detailed" if detailed else ""

for param in params:
    
    path = os.path.join(result_path, f"{use_lang}_{compute_for[:-1]}_data_{utils.dict_to_str(param)}{detailed_str}")
    person = Person(language)

    if load_existing:
        if os.path.isfile(path):
            person.load_data(path, type) 
            print(f"Loading existing data. Num items already selected: {len(person.get_data(type).selected)}")
        else:
            print(f"Data file in path {path} was not found. Creating new results")          
            person.create_data(type, language, param)
            
    else:
        person.create_data(type, language, param)
        
    for idx in tqdm(range(0, num_new_items_to_select)):
        
        output = person.choose_next(type)
        
        if output is None:
            break
            
        if idx < 10:
            print(output)

        if idx % 20 == 0:
            joblib.dump(person.get_data(type), path)

    joblib.dump(person.get_data(type), path)
    
    
"""
'टॉम नहीं है।', ["It's not Tom.", "It isn't Tom.", "Tom isn't there."], 9376)
('मैं भारत में हूँ।', ['I am in India.', "I'm in India."], 7265)
('वह तुम हो।', ['That is you.'], 8208)
('आप ईरान के हैं।', ["You're from Iran."], 5618)
('मुझे यह चाहिए था।', ['I wanted this.'], 6726)
('क्या?', ['What?'], 4622)
('और जोर से।', ['Louder.'], 9850)
('टॉम ने एक गलती की।', ['Tom made a mistake.'], 9744)
('उसने तस्वीर को देखा।', ['She looked at the picture.'], 2261)
('जल्दी कर।', ['Please hurry.'], 10297)
"""


Loading existing data. Num items already selected: 1000


0it [00:00, ?it/s]

'\n\'टॉम नहीं है।\', ["It\'s not Tom.", "It isn\'t Tom.", "Tom isn\'t there."], 9376)\n(\'मैं भारत में हूँ।\', [\'I am in India.\', "I\'m in India."], 7265)\n(\'वह तुम हो।\', [\'That is you.\'], 8208)\n(\'आप ईरान के हैं।\', ["You\'re from Iran."], 5618)\n(\'मुझे यह चाहिए था।\', [\'I wanted this.\'], 6726)\n(\'क्या?\', [\'What?\'], 4622)\n(\'और जोर से।\', [\'Louder.\'], 9850)\n(\'टॉम ने एक गलती की।\', [\'Tom made a mistake.\'], 9744)\n(\'उसने तस्वीर को देखा।\', [\'She looked at the picture.\'], 2261)\n(\'जल्दी कर।\', [\'Please hurry.\'], 10297)\n'


#### Generate Explanations

In [5]:
if compute_for == "words":

    task = f"""
    You will be given a word in the {use_lang} language along with a translations in english. 
    I want you to act as an expert linguist and explain every letter of the word of the {use_lang} word in terms of grammatics, 
    how, the sound of the letter, the romanization and how adjacent letters combine to form the letters in the word.   
    Please make sure you're using exactly the given {use_lang} word and refer to the translation."""

    prompt_template = Template(\
        "<s>[INST]$task $examples[/INST]</s>" + \
        "[INST]\n Word: $input \n Translation: $translation \n[/INST]"
    )
    
else:
    task = f"""You will be given a sentence in the {use_lang} language along with one or multiple possible translations in english. 
    I want you to act as an expert linguist and explain every word of the {use_lang} sentence in terms of grammatics.  
    Please make sure you're using exactly the given {use_lang} sentence and refer to one or all of the translations given to you."""

    prompt_template = Template(\
        "<s>[INST]$task $examples[/INST]</s>" + \
        "[INST]\n Sentence: $input \n Translation: $translation \n[/INST]"
    )

In [6]:
generate_explanations = True

if generate_explanations:
    if ("LLM" in locals() and LLM is None) or "LLM" not in locals():
        LLM = utils.load_LLM(token=token)

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
recompute = False

if generate_explanations:
    type_ = compute_for[:-1]
    
    detailed_str = "_detailed" if detailed else ""
    
    stop_after_n = None

    login(token=token)
    transformers.logging.set_verbosity_error()
    lang_path = os.path.join(data_dir, "results", f"language_{use_lang}")

    for param in params:

        person_save_path = os.path.join(result_path, f"{use_lang}_{type_}_data_" + utils.dict_to_str(param) + detailed_str)
        
        person_data = joblib.load(person_save_path)

        examples = prompts.get_examples_for_prompt(type_, use_lang, detailed=detailed)

        if compute_for == "sentences":
            if detailed:
                num_tokens_per_word = 150
            else:
                num_tokens_per_word = 75
        else:
            if detailed:
                num_tokens_per_word = 1000
            else:
                num_tokens_per_word = 500
            
        utils.generate_explanation(person_data, person_save_path, LLM, prompt_template, task, examples, lang_path, language, 
                                type_, recompute_if_exists=recompute, stop_after_n=stop_after_n, num_tokens_per_word=num_tokens_per_word,
                                print_output=False)
# 1750

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/heilts/.cache/huggingface/token
Login successful


  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
if "LLM" in locals() and LLM is not None:
    del LLM
    utils.flush()

### Create Sound Files

In [9]:

use_sound = use_lang != "farsi" 

create_sound_files = True

if create_sound_files and use_sound:
    detailed_str = "_detailed" if detailed else ""
    
    date = datetime.today().strftime('%y_%m_%d')
    
    sound_file_path = os.path.join(data_dir, "results", "sound_files")
    base_str = f"{use_lang}_{compute_for[:-1]}_data_" 

    type_ = compute_for[:-1]

    for param in params:

        path = os.path.join(result_path, base_str + utils.dict_to_str(param) + detailed_str)
        person = Person(language)
        person.load_data(path, type)
        
        # remove existing mp3s
        utils.delete_mp3_files(sound_file_path)

        data = person.get_data(compute_for)
        
        if use_lang == "german":
            lang_ = "de"
        else:
            lang_ = use_lang[:2]
        
        for idx, item in enumerate(data.selected):
            
            fn = f"{use_lang}_{compute_for}_{date}_{idx}{detailed_str}.mp3"
            
            text = item[type_]
            
            if not os.path.exists(os.path.join(sound_file_path, fn)):
                # pass
                myobj = gTTS(text=text, lang=lang_, slow=False)
                myobj.save(os.path.join(sound_file_path, fn))
            # else:
                # print("already exists?", os.path.join(sound_file_path, fn))

### Create Anki Deck

In [10]:
import genanki

font_size = 20 

if use_lang == "farsi":
    font_size = 40


if use_sound: 
    template = {
            'name': 'Card 1',
            'qfmt': '''
            
                <div style="background-color: white; padding: 20px; text-align: center;">
                    <div style="font-family: Arial; font-size: 20px; color: black;">
                        {{Question}}
                    </div>
                    <br>
                    {{Sound1}}
                </div>
            ''',
            'afmt': '''
                <div style="background-color: white; padding: 20px;">
                    <div style="font-family: Arial; font-size: 20px; color: black;">
                        {{FrontSide}}
                    </div>
                    <hr id="answer">
                    <div style="font-family: Arial; font-size: 18px; color: black; text-align: center;">
                        {{Answer}}
                    </div>
                    
                    <div style="text-align:center;">
                        {{Sound2}}
                    </div>
                    
                    <br>
                    <hr id="divider">
                    <div style="font-family: Arial; font-size: 16px; color: black;">
                        Explanation: {{Explanation}}
                    </div>
                </div>
            '''
        }

    fields = [
        {'name': 'Question'},
        {'name': 'Answer'},
        {'name': 'Explanation'},  
        {'name': 'Sound1'},
        {'name': 'Sound2'}
    ]
else:
    template = {
            'name': 'Card 1',
            'qfmt': '''
            
                <div style="background-color: white; padding: 20px; text-align: center;">
                    <div style="font-family: Arial; font-size: 20px; color: black;">
                        {{Question}}
                    </div>
                    <br>
                </div>
            ''',
            'afmt': '''
                <div style="background-color: white; padding: 20px;">
                    <div style="font-family: Arial; font-size: 20px; color: black;">
                        {{FrontSide}}
                    </div>
                    <hr id="answer">
                    <div style="font-family: Arial; font-size: 18px; color: black; text-align: center;">
                        {{Answer}}
                    </div>
                    
                    <br>
                    <hr id="divider">
                    <div style="font-family: Arial; font-size: 16px; color: black;">
                        Explanation: {{Explanation}}
                    </div>
                </div>
            '''
        }

    fields = [
        {'name': 'Question'},
        {'name': 'Answer'},
        {'name': 'Explanation'},  
    ]

# Define a model with additional fields in afmt
model1 = genanki.Model(
    1607392321,
    'Model with Additional Fields in Answer Format',
    fields=fields,
  
    templates=[
        template,
    ]
)


In [13]:

create_anki_deck = True

if create_anki_deck:

    date = datetime.today().strftime('%y_%m_%d')

    sound_file_path = os.path.join(data_dir, "results", "sound_files")

    type_ = compute_for[:-1]

    detailed_str = "_detailed" if detailed else ""
    
    deck_title = f'{use_lang.title()} Deck - {compute_for.title()}{detailed_str}' 
    
    
    for param in params:

        deck = genanki.Deck(
            2059400113, deck_title
        )
        
        path = os.path.join(result_path, f"{use_lang}_{type_}_data_" + utils.dict_to_str(param) + detailed_str)
        person = Person(language)
        person.load_data(path, type)
        data = person.get_data(compute_for)
        
        media_files = []
        
        # num_sentences_in_deck = 1750
        
        for idx, item in enumerate(data.selected):
            
            # if idx == num_sentences_in_deck+1:
            #     break
            
            sentence = item[compute_for[:-1]] 
            explanation = item['explanation']
            translations = item['translations'] 
            
            if explanation is None:
                print("skip")
                continue
            
            translations_str = ""
            if compute_for == "sentences":
                for trans in translations:
                    translations_str += trans + "<br>"
            else:
                translations_str = translations
                print(translations_str)
                    
            explanation = explanation.replace("\n", "<br>")
            
            fn = f"{use_lang}_{compute_for}_{date}_{idx}{detailed_str}.mp3"

            if use_sound:
                media_files.append(f"{sound_file_path}/{fn}")

                card = genanki.Note(model=model1, fields=[sentence, translations_str, explanation, f'[sound:{fn}]', ''])
                deck.add_note(card)

                card = genanki.Note(model=model1, fields=[translations_str, sentence, explanation, '', f'[sound:{fn}]'])
                deck.add_note(card)
            else:
                card = genanki.Note(model=model1, fields=[sentence, translations_str, explanation])
                deck.add_note(card)

                card = genanki.Note(model=model1, fields=[translations_str, sentence, explanation])
                deck.add_note(card)
            
        # Save the deck to a file, including the media files
        if use_sound:
            genanki.Package(deck, media_files).write_to_file(deck_title + ".apkg")
        else:
            genanki.Package(deck).write_to_file(deck_title + ".apkg")
            

 The given Hindi sentence 'टॉम नहीं है' can be broken down as follows:

1. 'टॉम' (Tom) is a proper noun, referring to the name of a person.
2. 'नहीं' (nahin) is a negation particle, which negates the meaning of the following verb.
3. 'है' (hai) is the present tense form of the verb 'होना' (hona - to be), which indicates that the subject (implicitly 'it') does not exist or is not present.
 The given Hindi sentence 'वह नहीं है' can be broken down as follows:

1. 'वह' (vah) is a pronoun meaning 'that' or 'he' or'she' or 'it,' depending on the context. In this sentence, it refers to the object of the previous sentence that is being denied as the speaker's referent.
2. 'नहीं' (nahin) is a negation particle, which negates the meaning of the verb 'होना' (hona - to be), indicating that the subject does not exist or is not present.
3. 'है' (hai) is the present tense form of the verb 'होना' (hona - to be), which is used to express the existence or presence of a subject.

Therefore, the sentence 


### Plots

In [12]:
plot = True

if plot:
    
    if compute_for == "words":
        xlabel = "Words"
        ylabel = "Letters"
    else:
        xlabel = "Sentences"
        ylabel = "Words"
        
    lists_of_sum_items_per_iter, lists_of_new_items_per_sentence, lists_of_sum_covered_freq_per_iter, lists_of_covered_freq_per_iter, legend_labels = \
    utils.extract_data(params, language, result_path, use_lang, type=compute_for)
    
    statistics.interactive_line_plot(lists_of_new_items_per_sentence, xlabel=xlabel, ylabel=ylabel, title=f"New {ylabel} per {xlabel}", legend_labels=legend_labels)
    statistics.interactive_line_plot(lists_of_sum_items_per_iter, xlabel=xlabel, ylabel=ylabel, title=f"Total num unique {ylabel} over {xlabel}", legend_labels=legend_labels)

    if compute_for == "words":
        total_freq = sum([v for (_,v) in language.letter_freq_dict.items()])
    else:
        total_freq = sum([v for (_,v) in language.word_freq_dict.items()])
        
    statistics.interactive_line_plot(lists_of_covered_freq_per_iter, xlabel=xlabel, ylabel="Covered Frequency", title=f"Covered Frequency per {xlabel}", legend_labels=legend_labels) 

    lists_of_sum_covered_freq_per_iter_ = []

    for sum_covered_freq_per_iter in lists_of_sum_covered_freq_per_iter:
        
        sum_covered_freq_per_iter_ = np.array(sum_covered_freq_per_iter, dtype=np.float32)
        sum_covered_freq_per_iter_ /= total_freq
        sum_covered_freq_per_iter_ *= 100
        
        lists_of_sum_covered_freq_per_iter_.append(sum_covered_freq_per_iter_)

    statistics.interactive_line_plot(lists_of_sum_covered_freq_per_iter_, xlabel=xlabel, ylabel="Covered Frequency (%)", title="Total percentage of covered frequency", ylim=(0,100), legend_labels=legend_labels)


interactive(children=(IntSlider(value=1000, description='X Range:', max=1000, min=2), IntSlider(value=1, descr…

interactive(children=(IntSlider(value=1000, description='X Range:', max=1000, min=2), IntSlider(value=1, descr…

interactive(children=(IntSlider(value=1000, description='X Range:', max=1000, min=2), IntSlider(value=1, descr…

interactive(children=(IntSlider(value=1000, description='X Range:', max=1000, min=2), IntSlider(value=1, descr…