## IMPORTS AND STANZA MODEL DOWNLOAD

In [1]:
import os
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

cwd = os.getcwd()

In [2]:
import stanza

# Fetch the default Stanza packages for English plus the Indian languages
# used in this notebook (Hindi, Marathi, Tamil, Telugu).
for lang_code in ("en", "hi", "mr", "ta", "te"):
    stanza.download(lang_code)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-15 13:44:22 INFO: Downloading default packages for language: en (English) ...
2023-06-15 13:44:23 INFO: File exists: /home/kapilrk04/stanza_resources/en/default.zip
2023-06-15 13:44:26 INFO: Finished downloading models and saved to /home/kapilrk04/stanza_resources.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-15 13:44:26 INFO: Downloading default packages for language: hi (Hindi) ...
2023-06-15 13:44:27 INFO: File exists: /home/kapilrk04/stanza_resources/hi/default.zip
2023-06-15 13:44:28 INFO: Finished downloading models and saved to /home/kapilrk04/stanza_resources.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-15 13:44:28 INFO: Downloading default packages for language: mr (Marathi) ...
2023-06-15 13:44:32 INFO: File exists: /home/kapilrk04/stanza_resources/mr/default.zip
2023-06-15 13:44:35 INFO: Finished downloading models and saved to /home/kapilrk04/stanza_resources.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-15 13:44:35 INFO: Downloading default packages for language: ta (Tamil) ...
2023-06-15 13:44:36 INFO: File exists: /home/kapilrk04/stanza_resources/ta/default.zip
2023-06-15 13:44:37 INFO: Finished downloading models and saved to /home/kapilrk04/stanza_resources.


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-15 13:44:37 INFO: Downloading default packages for language: te (Telugu) ...
2023-06-15 13:44:38 INFO: File exists: /home/kapilrk04/stanza_resources/te/default.zip
2023-06-15 13:44:40 INFO: Finished downloading models and saved to /home/kapilrk04/stanza_resources.


## AWESOME-ALIGN ALIGNER

In [3]:
from auxilaries.utils import awesomealign


# Shared aligner instance used by the alignment helpers below; the model and
# the tokenizer are both loaded from the same multilingual BERT checkpoint.
aligner = awesomealign(
    modelpath='bert-base-multilingual-cased',
    tokenizerpath='bert-base-multilingual-cased',
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## TOKENIZE AND POS

In [4]:
# Loaded Stanza pipelines keyed by language code. Building a pipeline reloads
# its models from disk, so each language is built once and then reused.
_STANZA_PIPELINES = {}


def _get_stanza_pipeline(language):
    """Return the tokenize+POS pipeline for `language`, building it on first use."""
    if language not in _STANZA_PIPELINES:
        _STANZA_PIPELINES[language] = stanza.Pipeline(lang=language, processors='tokenize, pos')
    return _STANZA_PIPELINES[language]


def get_stanza_info(text, language):
    """Sentence-split, tokenize and POS-tag `text` with Stanza.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    language : str
        Stanza language code, e.g. "en", "hi", "mr", "ta" or "te".

    Returns
    -------
    dict
        "sentences": one space-joined string per sentence, built from
            `sentence.tokens`.
        "tokens": one list of word texts per sentence, built from
            `sentence.words`.
        "postags": one list of UPOS tags per sentence, parallel to "tokens".
    """
    doc = _get_stanza_pipeline(language)(text)

    sents, tokens, postags = [], [], []

    for sentence in doc.sentences:
        # NOTE(review): "sentences" reads sentence.tokens while the other two
        # read sentence.words; these can differ if a multi-word-token
        # processor is active. Kept as in the original; confirm intent.
        sents.append(' '.join([f'{token.text}' for token in sentence.tokens]))
        tokens.append([f'{word.text}' for word in sentence.words])
        postags.append([f'{word.upos}' for word in sentence.words])

    return {"sentences": sents,
            "tokens": tokens,
            "postags": postags}

def create_alignments_token_map(sent_src, sent_tgt, alignments):
    """Turn index alignments into a bidirectional token-to-token dictionary.

    Parameters
    ----------
    sent_src, sent_tgt : str
        Whitespace-tokenized source and target sentences.
    alignments : str
        Whitespace-separated "i-j" pairs, where i indexes `sent_src` tokens
        and j indexes `sent_tgt` tokens.

    Returns
    -------
    dict or None
        A single dict holding both directions (source token -> target token
        and target token -> source token). Later pairs overwrite earlier ones
        for the same surface form. Returns None if any pair refers to an index
        outside either sentence; diagnostics are printed in that case.
    """
    src_tokens = sent_src.split()
    tgt_tokens = sent_tgt.split()

    token_map = {}
    for pair in alignments.split():
        parts = pair.split("-")
        try:
            tgt_tok = tgt_tokens[int(parts[1])]
            src_tok = src_tokens[int(parts[0])]
        except IndexError:
            print("index error")
            print(src_tokens, tgt_tokens, alignments)
            print("-"*20)
            # Stop right away: continuing with a None map would raise
            # TypeError on the next pair.
            return None
        token_map[src_tok] = tgt_tok
        token_map[tgt_tok] = src_tok

    return token_map

def get_alignment_token_map(en_sent, hi_sent):
    """Align one sentence pair and derive its token map.

    Calls the module-level `aligner` (awesome-align) on the two sentences and
    feeds the result to `create_alignments_token_map`.

    Returns a tuple `(alignments, token_alignment_map)`.
    """
    # fast-align alternative, kept for reference:
    #   line = f"{en_sent} ||| {hi_sent}"
    #   alignments = aligner.align(line.strip())
    word_alignments = aligner.get_alignments_sentence_pair(en_sent, hi_sent)
    return word_alignments, create_alignments_token_map(en_sent, hi_sent, word_alignments)

In [5]:
# TOKENIZE-POS CODE

df_translations_set = pd.read_json("unique_utterances_en_hi_transltions.json")
# df_translations_set = df_translations_set[:1]  # uncomment to smoke-test on a single row
print(df_translations_set.shape)

# The two column names are also the language codes passed to get_stanza_info.
# Resolve them once, before the loop, so they exist even when the frame has no
# rows (previously they were only bound inside the loop body).
lang1, lang2 = [str(lang) for lang in df_translations_set.columns]

lang1_tokenized, lang1_pos, lang2_tokenized, lang2_pos = [], [], [], []

# `total` lets tqdm show real progress; iterrows() alone has no length.
for ind, row in tqdm(df_translations_set.iterrows(), total=len(df_translations_set)):

    lang1_feats = get_stanza_info(row[lang1], lang1)
    lang2_feats = get_stanza_info(row[lang2], lang2)

    lang1_tokenized.append(lang1_feats["tokens"])
    lang2_tokenized.append(lang2_feats["tokens"])

    lang1_pos.append(lang1_feats["postags"])
    lang2_pos.append(lang2_feats["postags"])

# Attach the language codes and the per-sentence token / POS lists to the frame.
df_translations_set["lang1"] = lang1
df_translations_set["lang1_tokens"] = lang1_tokenized
df_translations_set["lang1_pos"] = lang1_pos

df_translations_set["lang2"] = lang2
df_translations_set["lang2_tokens"] = lang2_tokenized
df_translations_set["lang2_pos"] = lang2_pos

(1, 2)


0it [00:00, ?it/s]2023-06-15 13:44:59 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-15 13:44:59 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2023-06-15 13:44:59 INFO: Using device: cpu
2023-06-15 13:44:59 INFO: Loading: tokenize
2023-06-15 13:44:59 INFO: Loading: pos
2023-06-15 13:44:59 INFO: Done loading processors!
2023-06-15 13:44:59 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-15 13:45:00 INFO: Loading these models for language: hi (Hindi):
| Processor | Package |
-----------------------
| tokenize  | hdtb    |
| pos       | hdtb    |

2023-06-15 13:45:00 INFO: Using device: cpu
2023-06-15 13:45:00 INFO: Loading: tokenize
2023-06-15 13:45:00 INFO: Loading: pos
2023-06-15 13:45:00 INFO: Done loading processors!
1it [00:01,  1.51s/it]


In [6]:
# Show the frame with the language, token and POS columns added above.
df_translations_set

Unnamed: 0,en,hi,lang1,lang1_tokens,lang1_pos,lang2,lang2_tokens,lang2_pos
0,"Okay, how much does it cost?","ठीक है, इसकी लागत कितनी है?",en,"[[Okay, ,, how, much, does, it, cost, ?]]","[[INTJ, PUNCT, ADV, ADJ, AUX, PRON, VERB, PUNCT]]",hi,"[[ठीक, है, ,, इसकी, लागत, कितनी, है, ?]]","[[ADJ, AUX, PUNCT, PRON, NOUN, PRON, AUX, PUNCT]]"


In [7]:
list_of_alignments, list_of_token_alignment_map = [], []

for ind, row in tqdm(df_translations_set.iterrows(), total=len(df_translations_set)):
    # Re-join each tokenized sentence so the aligner sees the same tokens as the POS tags.
    lang1_tokenized_sent = [" ".join(sent_list) for sent_list in row["lang1_tokens"]]
    lang2_tokenized_sent = [" ".join(sent_list) for sent_list in row["lang2_tokens"]]

    # Sentences are aligned pairwise, so both sides must split into the same
    # number of sentences. Rows that do not are recorded as None. This is an
    # explicit check rather than assert/except so it still runs under
    # `python -O` and does not hide assertion failures raised by the aligner.
    if len(lang1_tokenized_sent) != len(lang2_tokenized_sent):
        list_of_alignments.append(None)
        list_of_token_alignment_map.append(None)
        continue

    alignment_row, token_alignment_map_row = [], []
    for lang1sent, lang2sent in zip(lang1_tokenized_sent, lang2_tokenized_sent):
        alignments, token_alignment_map = get_alignment_token_map(lang1sent, lang2sent)
        alignment_row.append(alignments)
        token_alignment_map_row.append(token_alignment_map)
    list_of_alignments.append(alignment_row)
    list_of_token_alignment_map.append(token_alignment_map_row)

df_translations_set["alignments_awesomealign"] = list_of_alignments
df_translations_set["token_alignment_map_awesomealign"] = list_of_token_alignment_map

df_translations_set

1it [00:00,  9.41it/s]


Unnamed: 0,en,hi,lang1,lang1_tokens,lang1_pos,lang2,lang2_tokens,lang2_pos,alignments_awesomealign,token_alignment_map_awesomealign
0,"Okay, how much does it cost?","ठीक है, इसकी लागत कितनी है?",en,"[[Okay, ,, how, much, does, it, cost, ?]]","[[INTJ, PUNCT, ADV, ADJ, AUX, PRON, VERB, PUNCT]]",hi,"[[ठीक, है, ,, इसकी, लागत, कितनी, है, ?]]","[[ADJ, AUX, PUNCT, PRON, NOUN, PRON, AUX, PUNCT]]",[0-0 0-1 1-2 2-5 3-4 4-6 5-3 6-4 7-7 ],"[{'Okay': 'है', 'ठीक': 'Okay', 'है': 'does', '..."


## HEURISTIC FUNCTION FOR CODEMIX SENTENCE GENERATION

In [42]:
#heuristic
def replace_noun_adj_single_aligned(sent, postags, token_map):
    """Build a code-mixed sentence by swapping aligned nouns and adjectives.

    Every token whose POS tag contains "NOUN", "ADJ" or "PROPN" is replaced by
    its entry in `token_map` when one exists; all other tokens are kept as-is.

    Parameters
    ----------
    sent : list of str
        Tokens of one sentence.
    postags : list of str
        POS tags parallel to `sent`.
    token_map : dict or None
        Token-to-token alignment map. None (no usable alignment) is treated as
        an empty map, so the sentence is returned unchanged.

    Returns
    -------
    str
        The tokens joined with single spaces, each preceded by a space (the
        result starts with " " unless `sent` is empty).
    """
    if token_map is None:
        token_map = {}

    switch_tags = ("NOUN", "ADJ", "PROPN")
    pieces = []
    for token, token_pos in zip(sent, postags):
        if any(tag in token_pos for tag in switch_tags):
            pieces.append(token_map.get(token, token))
        else:
            pieces.append(token)

    # Keep the leading space per token: callers concatenate sentence results directly.
    return "".join(f" {piece}" for piece in pieces)


In [43]:
def get_codemix_candidate(row):
    """Build the code-mixed candidate for one dataframe row.

    Applies `replace_noun_adj_single_aligned` to every lang2 sentence of the
    row, using the sentence's token alignment map, and concatenates the
    results.

    Parameters
    ----------
    row : mapping
        Must provide "lang2_tokens", "lang2_pos" and
        "token_alignment_map_awesomealign".

    Returns
    -------
    str or None
        The concatenated code-mixed sentences, or None when the row has no
        token alignment maps (rows whose two sides had different sentence
        counts are stored as None by the alignment step).
    """
    token_maps = row["token_alignment_map_awesomealign"]
    if token_maps is None:
        # Nothing to mix without alignments; mark the row as missing, not a crash.
        return None

    sentence = ""
    for hi_sent, hi_pos, token_alignment_map in zip(row["lang2_tokens"], row["lang2_pos"], token_maps):
        sentence += replace_noun_adj_single_aligned(hi_sent, hi_pos, token_alignment_map)
    return sentence

            
def get_codemix_candidates_for_dataframe(df):
    """Return one code-mixed candidate per row of `df`, in row order.

    Each row is passed to `get_codemix_candidate`; progress is reported via tqdm.
    """
    return [get_codemix_candidate(record) for _, record in tqdm(df.iterrows())]



# Generate one code-mixed candidate per row and store it as a new column.
codemix_candidates = get_codemix_candidates_for_dataframe(df_translations_set)

df_translations_set["codemixed-sentences"] = codemix_candidates

# Display the generated candidates.
codemix_candidates

1it [00:00, 1920.47it/s]

ठीक ADJ
है AUX
, PUNCT
इसकी PRON
लागत NOUN
कितनी PRON
है AUX
? PUNCT





[' Okay है , इसकी cost कितनी है ?']

## OUTPUT

In [44]:
# Write the enriched frame as a list of records; force_ascii=False keeps the
# non-ASCII text readable in the file instead of \u-escaping it.
df_translations_set.to_json(
    "train_unique_utterances_en_hi_transltions_token_pos_alignments.json",
    orient="records",
    force_ascii=False,
    indent=4,
)

In [45]:
# Show the final frame, including the "codemixed-sentences" column.
df_translations_set


Unnamed: 0,en,hi,lang1,lang1_tokens,lang1_pos,lang2,lang2_tokens,lang2_pos,alignments_awesomealign,token_alignment_map_awesomealign,codemixed-sentences
0,"Okay, how much does it cost?","ठीक है, इसकी लागत कितनी है?",en,"[[Okay, ,, how, much, does, it, cost, ?]]","[[INTJ, PUNCT, ADV, ADJ, AUX, PRON, VERB, PUNCT]]",hi,"[[ठीक, है, ,, इसकी, लागत, कितनी, है, ?]]","[[ADJ, AUX, PUNCT, PRON, NOUN, PRON, AUX, PUNCT]]",[0-0 0-1 1-2 2-5 3-4 4-6 5-3 6-4 7-7 ],"[{'Okay': 'है', 'ठीक': 'Okay', 'है': 'does', '...","Okay है , इसकी cost कितनी है ?"
