In [1]:
import lib.get_assets as assets

N1, N2, N3, N4, N9, NA, VPr
V1, V2, V3, V4, V5, VPr
C0, C1
Po, N9, Pr, Pr1
A0, A1, A2, 
nan
Ad
No, Nu
Sp, NA
N5, N6, N7, N8, NA


In [2]:
import lib.model0_9 as m0_9

### Rule 1

In [3]:

# Rule 1 cases: each 'farsi' input is passed to m0_9.general_search with
# pos_neg='Verb' and must come back as the matching 'trans' value.
tests = [{'farsi': 'ییلاق‌نشین', 'trans': 'yeylAqneSin'},
         {'farsi': 'ییدیش', 'trans': 'yidiS'},
         {'farsi': 'یوونتوس', 'trans': 'yuventus'},
         {'farsi': 'یونید', 'trans': 'یونید'} # case verb: the input is expected back unchanged
        ]

for t in tests:   
    assert m0_9.general_search(t['farsi'], pos_neg='Verb') == t['trans']


### Rule 2 (Nouns)

In [4]:

# Rule 2 cases: whole inputs (some contain a space) go through
# m0_9.run_transcription_0 and must equal the expected transcription.
tests = [{'farsi': 'پیامبرت', 'trans': 'payAmbarat'},
         {'farsi': 'مزايایی', 'trans': 'mazAyAyi'},
         {'farsi': 'بی شرف', 'trans': 'bi Saraf'},
         {'farsi': 'بی عقل', 'trans':'bi \'aql'}]

for t in tests:
    assert m0_9.run_transcription_0(t['farsi']) == t['trans']


### Rule 3 (Verbs basis)

In [5]:

# Rule 3 cases for m0_9.run_transcription_0. Only the first case is active;
# the commented-out entries below are not executed.
tests = [{'farsi': 'بخواهند بروند', 'trans': 'bexAhand beravand'},
         # {'farsi': 'می‌تواند بخوابد', 'trans': 'mitavAnad bexAbad'}, # \u200c
         # {'farsi': 'نشد بپریم', 'trans': 'naSod beparim'}, # می
         # {'farsi': 'می روید', 'trans': 'miravid'} # 
        # {'farsi': 'می ترسد', 'trans': 'mitarsad'}
        ]

for t in tests:
    assert m0_9.run_transcription_0(t['farsi']) == t['trans']


### Rule 4 (Frequency based prioritization for collisions)

In [6]:

# Rule 4 cases: the same single letter is expected to come back as 'ye' from
# m0_9.general_search but as 'i' from m0_9.affix_search.
tests = [{'farsi': 'ی', 'trans': 'ye'}]

for t in tests:
    assert m0_9.general_search(t['farsi']) == t['trans']


# Same input, looked up through the affix search instead.
tests = [{'farsi': 'ی', 'trans': 'i'}]
for t in tests:
    assert m0_9.affix_search(t['farsi']) == t['trans']


### Rule 5 (Verbs joined with _, as in خواهند_كرد)

In [7]:

# Rule 5, part 1: an underscore-joined verb is normalised with m0_9.normalise
# and then passed to m0_9.process_verb.
tests = [{'farsi': 'خواهند_كرد', 'trans': 'xAhand kard'}]

for t in tests:
    assert m0_9.process_verb(m0_9.normalise(t['farsi'])) == t['trans']

# Rule 5, part 2: space-separated inputs through the full
# m0_9.run_transcription_0 entry point.
tests = [{'farsi': 'خواهند كرد', 'trans': 'xAhand kard'},
         {'farsi': 'بخواهند بروند', 'trans': 'bexAhand beravand'}]

for t in tests:
    assert m0_9.run_transcription_0(t['farsi']) == t['trans']
    

### Rule 6 (normalisation ي, ك)

In [8]:

# Rule 6 cases: here 'trans' holds the expected *normalised* spelling (still in
# the original script), not a Latin transcription.
tests = [{'farsi': 'گزارشي', 'trans': 'گزارشی'},
         {'farsi': 'ك', 'trans': 'ک'}]

for t in tests:
    assert m0_9.normalise(t['farsi']) == t['trans']


### Rule 7 (affix ی)

In [9]:

def rule_7(wrd, pos=None, pos_last=None):
    """Transcribe a word ending in 'ی', turning a final 'i' into 'ye' unless ``pos_last`` is 'Verb'.

    Args:
        wrd: Word to transcribe.
        pos: Part-of-speech tag forwarded to ``m0_9.process_wrd``.
        pos_last: Tag compared against 'Verb'; presumably the tag of a
            neighbouring word -- TODO confirm against the caller.

    Returns:
        ``wrd`` unchanged when it is empty or does not end in 'ی'; otherwise
        the ``m0_9.process_wrd`` result, with a trailing 'i' replaced by 'ye'
        when ``pos_last`` is not 'Verb'.
    """
    # endswith() is False for the empty string, whereas wrd[-1] would raise.
    if not wrd.endswith('ی'):
        return wrd

    transcribed = m0_9.process_wrd(wrd, pos)
    if pos_last != 'Verb' and transcribed.endswith('i'):
        transcribed = transcribed[:-1] + 'ye'
    return transcribed
    
# Rule 7 cases: the final 'i' becomes 'ye' when pos_last is not 'Verb' and is
# kept when pos_last is 'Verb'.
tests = [{'farsi': 'بانوی', 'trans': 'bAnuye', 'pos': 'Noun', 'pos_last': 'Adjective'}, 
         {'farsi': 'چیزی', 'trans': 'Cizi', 'pos': 'Noun', 'pos_last': 'Verb'}]

for t in tests:
    assert rule_7(t['farsi'], pos=t['pos'],  pos_last=t['pos_last']) == t['trans']


### Rule 8 (letter ۀ)

In [10]:

def rule_8(wrd, pos=None):
    """Transcribe a word whose last character is one of the two markers 'ٔ' / 'ۀ'.

    The last character is dropped, the remainder is transcribed with
    ``m0_9.process_wrd`` and 'ye' (after a final 'e') or 'eye' is appended.

    Args:
        wrd: Word to transcribe.
        pos: Part-of-speech tag forwarded to ``m0_9.process_wrd``.

    Returns:
        ``wrd`` unchanged when it is empty or does not end in a marker,
        otherwise the transcribed stem plus 'ye' / 'eye'.
    """
    # Guard the empty string explicitly: wrd[-1] would raise IndexError.
    if not wrd or wrd[-1] not in ('ٔ', 'ۀ'):
        return wrd

    stem = m0_9.process_wrd(wrd[:-1], pos=pos)
    # endswith() also copes with an empty transcription of the stem.
    return stem + ('ye' if stem.endswith('e') else 'eye')

# Rule 8 cases: both inputs end in one of the markers handled by rule_8 and
# are expected to end in 'eye' after transcription.
tests = [{'farsi': 'همۀ', 'trans': 'hameye', 'pos': 'Determiner'}, 
         {'farsi': 'بچهٔ', 'trans': 'baCCeye', 'pos': 'Verb'}]

for t in tests:
    # print(rule_8(t['farsi'], pos=t['pos']))
    assert rule_8(t['farsi'], pos=t['pos']) == t['trans']



### Rule 9 (collision affix ات)

In [11]:

def rule_9(wrd, pos=None):
    """Transcribe a word containing 'ات', splitting it around that marker.

    Three marker variants are tried in order: 'ات' followed by U+200C,
    U+200C followed by 'ات', and plain 'ات'. The word is split with
    ``m0_9.get_affixes``; the longer side goes through
    ``m0_9.general_search`` and the other through ``m0_9.affix_search``.

    Args:
        wrd: Word to transcribe.
        pos: Tag forwarded to ``m0_9.general_search`` as ``pos_pos``.

    Returns:
        ``wrd`` unchanged when it does not contain 'ات', otherwise the
        concatenation prefix + marker transcription + suffix.
    """
    if 'ات' not in wrd:
        return wrd

    # Choose the split marker together with its transcription.
    if 'ات\u200c' in wrd:
        marker, middle = 'ات\u200c', 'At'
    elif '\u200cات' in wrd:
        marker, middle = '\u200cات', "'at"
    else:
        marker, middle = 'ات', 'At'

    parts = m0_9.get_affixes(wrd, marker)
    before, after = parts['prefix'], parts['suffix']

    # The longer side is treated as the lexical word, the other as an affix.
    if len(before) > len(after):
        head = m0_9.general_search(before, pos_pos=pos)
        tail = m0_9.affix_search(after)
    else:
        head = m0_9.affix_search(before)
        tail = m0_9.general_search(after, pos_pos=pos)
    return head + middle + tail


# Rule 9 cases: inputs are normalised with m0_9.normalise first; rule_9 is
# called without a pos argument.
tests = [{'farsi': 'مزخرفات', 'trans': 'mozaxrafAt'},
         {'farsi': 'خاطرات‌مان', 'trans': 'xAterAtemAn'},
         {'farsi': 'عمه‌ات', 'trans': "'amme'at"}]


for t in tests:
    assert rule_9(m0_9.normalise(t['farsi'])) == t['trans']


### Rule 10 (collision affix ان)

In [12]:

def rule_10(wrd, pos=None):
    """Transcribe a word ending in the suffix 'ان'.

    The stem (everything before the suffix) is looked up with
    ``m0_9.general_search``; 'yAn' is appended when the letter directly
    before the suffix is 'ی', otherwise 'An'.

    Args:
        wrd: Word to transcribe.
        pos: Tag forwarded to ``m0_9.general_search`` as ``pos_pos``.

    Returns:
        ``wrd`` unchanged when it does not end in 'ان', 'An' for the bare
        suffix, otherwise the transcribed stem plus 'yAn' / 'An'.
    """
    # Test the suffix, not mere containment: the code below strips the last
    # two characters, which is only meaningful when they are the suffix.
    if not wrd.endswith('ان'):
        return wrd
    if len(wrd) == 2:
        return 'An'

    stem = m0_9.general_search(wrd[:-2], pos_pos=pos)
    # Look at the letter right before the suffix (wrd[-3]); a fixed index of 3
    # only matched six-letter words and raised IndexError on three-letter ones.
    return stem + ('yAn' if wrd[-3] == 'ی' else 'An')

        
# Rule 10 cases. NOTE(review): each case carries a 'pos' key, but the loop
# below calls rule_10 without it, so pos is always None here.
tests = [{'farsi': 'بانیان', 'trans': 'bAniyAn', 'pos': 'Noun'},
         {'farsi': 'دیگران', 'trans': 'digarAn', 'pos': 'Preposition'}]


for t in tests:
    assert rule_10(m0_9.normalise(t['farsi'])) == t['trans']


### Rule 11 (collision affix ش)

In [13]:

def rule_11(wrd, pos=None):
    """Transcribe a word ending in 'ش', choosing the ending by part of speech.

    Args:
        wrd: Word to transcribe.
        pos: 'Noun' routes the stem through ``m0_9.process_noun`` and appends
            'aS'; 'Verb' uses ``m0_9.process_verb`` and appends 'eS'; any
            other value uses ``m0_9.general_search`` (with ``pos_pos=pos``)
            and appends 'eS'.

    Returns:
        ``wrd`` unchanged when it is empty or does not end in 'ش', otherwise
        the transcribed stem plus the ending.
    """
    # endswith() is False for the empty string, whereas wrd[-1] would raise.
    if not wrd.endswith('ش'):
        return wrd

    stem = wrd[:-1]
    if pos == 'Noun':
        return m0_9.process_noun(stem) + 'aS'
    if pos == 'Verb':
        return m0_9.process_verb(stem) + 'eS'
    return m0_9.general_search(stem, pos_pos=pos) + 'eS'

# Rule 11 cases: the noun is expected to end in 'aS', the verb in 'eS'.
tests = [{'farsi': 'وسایلش', 'trans':'vasAyelaS', 'pos': 'Noun'},
         {'farsi': 'بردندش', 'trans': 'bordandeS', 'pos': 'Verb'}]

for t in tests:
    assert rule_11(t['farsi'], t['pos']) == t['trans']


### Rule 12 (affix م)

In [14]:

# Rule 12 case: no local rule function; m0_9.process_wrd is checked directly
# with pos='Number'.
tests = [{'farsi': 'چندم', 'trans':'Candom', 'pos': 'Number'}]

for t in tests:
    assert m0_9.process_wrd(t['farsi'], t['pos']) == t['trans']


### Rule 13 (affix مان)

In [15]:

def rule_13(wrd, pos=None):
    """Transcribe a word ending in U+200C followed by 'مان'.

    The four-character suffix is removed, the remainder is transcribed with
    ``m0_9.process_wrd`` and 'mAn' is appended after one of the letters
    a/e/o/A/i/u, 'emAn' after anything else.

    Args:
        wrd: Word to transcribe.
        pos: Part-of-speech tag forwarded to ``m0_9.process_wrd``.

    Returns:
        ``wrd`` unchanged when it does not carry the suffix, otherwise the
        transcribed stem plus 'mAn' / 'emAn'.
    """
    if not wrd.endswith('\u200cمان'):
        return wrd

    stem = m0_9.process_wrd(wrd[:-4], pos)
    if stem[-1] in ('a', 'e', 'o', 'A', 'i', 'u'):
        ending = 'mAn'
    else:
        ending = 'emAn'
    return stem + ending

# Rule 13 cases: one stem whose transcription already ends in 'e' (only 'mAn'
# is added) and one where 'emAn' is added. The third case is disabled.
tests = [{'farsi': 'برای‌مان', 'trans':'barAyemAn', 'pos': 'Preposition'},
         {'farsi': 'خاطرات‌مان', 'trans':'xAterAtemAn', 'pos': 'Noun'},
         # {'farsi': 'کردن‌مان', 'trans': 'kardanemAn', 'pos': 'Noun'}
        ]

for t in tests:
    # print(rule_13(t['farsi'], t['pos']))
    assert rule_13(t['farsi'], t['pos']) == t['trans']


### Rule 14 (affix می)

In [16]:

def rule_14(wrd, pos=None):
    """Transcribe the affix 'می' as a prefix (followed by U+200C) or as a suffix.

    Args:
        wrd: Word to transcribe.
        pos: Part-of-speech tag forwarded to ``m0_9.process_wrd`` in the
            prefix case. It is not used in the suffix case.

    Returns:
        'mi' + transcription of the remainder for the prefix form; the
        ``m0_9.general_search`` result for the stem + 'omi' for the suffix
        form; ``wrd`` unchanged in every other case.
    """
    if wrd[:3] == 'می\u200c':
        return 'mi' + m0_9.process_wrd(wrd[3:], pos)
    if wrd[-2:] == 'می':
        return m0_9.general_search(wrd[:-2]) + 'omi'
    # Words that merely contain 'می' elsewhere used to fall off the end of the
    # function and return None; hand them back unchanged like other non-matches.
    return wrd
        
# Rule 14 cases: one prefix form (with pos) and one suffix form (no 'pos' key,
# so t.get passes None). The third case is disabled.
tests = [{'farsi': 'می‌تواند', 'trans': 'mitavAnad', 'pos': 'Verb'},
         {'farsi': 'چندمی', 'trans': 'Candomi'}#, 
         #{'farsi': 'sdd', 'trans': 'Amjsnkjs'}
        ]

for t in tests:
    assert rule_14(t['farsi'], t.get('pos', None)) == t['trans']


### Rule 15 (affix آ)

In [17]:

# Rule 15 has no implementation or assertion in this cell yet: only a sample
# sentence, its intended transcription and a disabled tagger call.
text = 'بیا در آغوشم بیارام و دیگران را نیازار'
# 'biyA dar 'AquSam biyArAm va digarAn rA nayAzAr'
#assets.tagger.tag(assets.word_tokenize(text))

### Rule 16 (affix ون)

In [69]:

def rule_16(wrd, pos=None):
    """Transcribe a word ending in 'ون'.

    The whole word is tried with ``m0_9.process_wrd`` first. Only when that
    returns the input unchanged is the suffix stripped, the stem transcribed
    and 'yun' (after a final 'i') or 'un' appended.

    Args:
        wrd: Word to transcribe.
        pos: Part-of-speech tag forwarded to ``m0_9.process_wrd``.

    Returns:
        ``wrd`` unchanged when it does not end in 'ون', otherwise the
        transcription described above.
    """
    if wrd[-2:] != 'ون':
        return wrd

    whole = m0_9.process_wrd(wrd, pos)
    if whole != wrd:
        # process_wrd changed the word, so its result is used as is.
        return whole

    stem = m0_9.process_wrd(wrd[:-2], pos)
    if stem[-1] == 'i':
        return stem + 'yun'
    return stem + 'un'
        
# Rule 16 cases: expected endings are 'un' and, after a stem ending in 'i',
# 'yun'.
tests = [{'farsi': 'سرنگون', 'trans': 'sarnegun', 'pos': 'Adjective'},
         {'farsi': 'حواریون', 'trans': 'havAriyun', 'pos': 'Verb'},
         {'farsi': 'منافقون', 'trans': 'monAfequn', 'pos': 'Noun'}]


for t in tests:
    # print(rule_16(t['farsi'], t.get('pos', None)))
    assert rule_16(t['farsi'], t.get('pos', None)) == t['trans']


### Rule 17 (affix ید)

In [15]:

def rule_17(wrd, pos=None):
    """Transcribe a verb ending in the suffix 'ید' as '...id' or '...yad'.

    The choice depends on the lemmatizer output: when any '#'-separated
    lemma is a prefix of the word, 'id' is appended to the transcribed stem;
    otherwise 'yad' is appended.

    Args:
        wrd: Word to transcribe.
        pos: Part-of-speech tag; only 'Verb' is handled.

    Returns:
        ``wrd`` unchanged when it does not end in 'ید' or ``pos`` is not
        'Verb'; otherwise the ``m0_9.process_verb`` result for the stem plus
        'id' / 'yad'.
    """
    suffix = 'ید'
    # Test the suffix, not mere containment: the stem is built by cutting the
    # last len(suffix) characters, which is wrong when 'ید' sits mid-word.
    if pos != 'Verb' or not wrd.endswith(suffix):
        return wrd

    lemmas = assets.lemmatizer.lemmatize(wrd).split('#')
    lemma_is_prefix = any(wrd.startswith(lemma) for lemma in lemmas)
    stem = m0_9.process_verb(wrd[:-len(suffix)])
    return stem + ('id' if lemma_is_prefix else 'yad')
    
# Rule 17 cases: one verb expected to end in 'id' and one in 'yad'. The third
# case is disabled.
tests = [{'farsi': 'رفتید', 'trans': 'raftid', 'pos': 'Verb'},
         {'farsi': 'بگوید', 'trans': 'beguyad', 'pos': 'Verb'},
         #{'farsi': 'می‌آید', 'trans': 'beguyad', 'pos': 'Verb'}
        ]

for t in tests:
    assert rule_17(t['farsi'], t.get('pos', None)) == t['trans']


### Rule 18 (verb with یم)

In [18]:

# Rule 18 cases: no local rule function; m0_9.process_wrd is checked directly.
# The two verbs are expected to end in 'im', the noun in 'yam'.
tests = [{'farsi': 'بپریم', 'trans': 'beparim', 'pos': 'Verb'},
         {'farsi': 'رفتیم', 'trans': 'raftim', 'pos': 'Verb'},
         {'farsi': 'خدایم', 'trans': 'xodAyam', 'pos': 'Noun'}]

for t in tests:
    assert m0_9.process_wrd(t['farsi'], t.get('pos', None)) == t['trans']


### Rule 19 (Semispace u200c, improve implementation in 0.9)

In [49]:

def rule_19(wrd, pos):
    """Transcribe a word made of parts separated by U+200C.

    Every part whose length equals the maximum part length is transcribed
    with ``m0_9.process_wrd``. Shorter parts are tried with
    ``m0_9.process_verb`` and, when that returns them unchanged, passed to
    ``recu_affixes``. The results are joined without a separator.

    NOTE(review): ``recu_affixes`` is defined in a later cell of this
    notebook, so it must have been executed before this function is called.

    Args:
        wrd: Word to transcribe.
        pos: Part-of-speech tag forwarded to ``m0_9.process_wrd``.

    Returns:
        ``wrd`` unchanged when it contains no U+200C, otherwise the joined
        transcriptions of its parts.
    """
    if '\u200c' not in wrd:
        return wrd

    def _short_part(part):
        # Prefer the verb transcription; fall back to the affix chain.
        as_verb = m0_9.process_verb(part)
        if part != as_verb:
            return as_verb
        return recu_affixes(part)

    parts = wrd.split('\u200c')
    longest = max(map(len, parts))
    return ''.join(
        m0_9.process_wrd(part, pos) if len(part) == longest else _short_part(part)
        for part in parts
    )
                    
# Rule 19 cases: inputs with one or two U+200C separators. The first two spell
# the separator as an escape, the third embeds the character itself.
tests = [{'farsi': 'دیده_می\u200cشود', 'trans': 'dide miSavad', 'pos': 'Verb'},
         {'farsi': 'بی\u200cبصیرتی\u200cهایی', 'trans': 'bibasiratihAyi', 'pos': 'Noun'},
         {'farsi': 'چراغ‌علی', 'trans': "CerAq'ali", 'pos': 'Noun'}]

for t in tests:
    #print(rule_19(t['farsi'], t['pos']))
    assert rule_19(t['farsi'], t['pos']) == t['trans']


### Rule 20 (Recursive search, improve implementation in 0.9)

In [533]:

def recu_affix_wrd(wrd):
    """Transcribe ``wrd`` as a chain of affixes, longest matching prefix first.

    The longest prefix of ``wrd`` found in the 'Affix' column of
    ``assets.df_Affixes`` is transcribed with ``m0_9.votation_entries``
    (``entries=False``); the remainder is handled recursively.

    Args:
        wrd: String to decompose.

    Returns:
        The concatenated transcriptions. When no prefix of ``wrd`` is a known
        affix (including the empty string) the result is ''.
    """
    for end in range(len(wrd), 0, -1):
        matches = assets.df_Affixes[assets.df_Affixes['Affix'] == wrd[:end]]
        if matches.shape[0] > 0:
            head = m0_9.votation_entries(matches.to_dict('records'), entries=False)
            return head + recu_affix_wrd(wrd[end:])
    # No prefix matched: nothing is emitted for the remaining characters.
    return ''

# Rule 20 cases: strings expected to decompose fully into known affixes.
tests = [{'farsi': 'هایی', 'trans': 'hAyi'},
         {'farsi': 'هایتان', 'trans': 'hAyetAn'}]

for t in tests:
    assert recu_affix_wrd(t['farsi']) == t['trans']


### Rule 21 (affix ن)

In [39]:

def rule_21(wrd, pos=None):
    """Peel leading/trailing 'ن' off a word and transcribe them as 'na' / 'an'.

    A trailing 'ن' is handled first (appending 'an'), then a leading one
    (prepending 'na'); the recursion passes ``pos='Verb'`` for the remainder.
    A word with neither is transcribed with ``m0_9.process_wrd``.

    Args:
        wrd: Word to transcribe.
        pos: Part-of-speech tag forwarded to ``m0_9.process_wrd`` when no 'ن'
            is peeled off at this level.

    Returns:
        The transcription; the empty string is returned unchanged.
    """
    # The recursion shrinks the word by one character per step and can reach
    # the empty string (e.g. for the single letter 'ن'); indexing it would
    # raise IndexError.
    if not wrd:
        return wrd
    if wrd[-1] == 'ن':
        return rule_21(wrd[:-1], pos='Verb') + 'an'
    if wrd[0] == 'ن':
        return 'na' + rule_21(wrd[1:], pos='Verb')
    return m0_9.process_wrd(wrd, pos)


# Rule 21 cases: both inputs start and end with 'ن', so 'na' is expected at
# the front and 'an' at the end. The third case is disabled.
tests = [{'farsi': 'نخوردن', 'trans': 'naxordan'},
         {'farsi': 'نخوابیدن', 'trans': 'naxAbidan'},
         #{'farsi': 'نمان', 'trans': 'namAn'}
         ]

for t in tests:
    #print(rule_21(t['farsi']))
    assert rule_21(t['farsi']) == t['trans']


### Rule 22 (affixes بی and نی)

In [52]:

def rule_22(wrd, pos=None):
    """Transcribe a word starting with one of the two-letter affixes 'بی' / 'نی'.

    The remainder after the affix is decomposed with ``recu_affixes`` and
    every apostrophe is removed from that result; the affix itself is
    transcribed with ``m0_9.affix_search``.

    NOTE(review): ``recu_affixes`` is defined in a later cell of this
    notebook, so it must have been executed before this function is called.

    Args:
        wrd: Word to transcribe.
        pos: Accepted but not used by this rule.

    Returns:
        ``wrd`` unchanged when it does not start with one of the affixes,
        otherwise affix transcription + cleaned remainder.
    """
    head = wrd[:2]
    if head not in ('بی', 'نی'):
        return wrd

    tail = recu_affixes(wrd[2:]).replace("'", '')
    return m0_9.affix_search(head) + tail


# Rule 22 cases: only the first is active; the remaining three are disabled.
tests = [{'farsi': 'بیا', 'trans': 'biyA'},
         #{'farsi': 'نیازار', 'trans': 'nayAzAr'},
         #{'farsi': 'نمان', 'trans': 'namAn'}
         #{'farsi': 'بیارام', 'trans': 'biyArAm'}
         ]

for t in tests:
    # print(rule_22(t['farsi']))
    assert rule_22(t['farsi']) == t['trans']



### Rule 23 (root of the verb is رو)

In [53]:

def rule_23(wrd, pos=None):
    """Transcribe a word ending in 'رو' as <prefix transcription> + 'ro'.

    Args:
        wrd: Word to transcribe.
        pos: Accepted but not used by this rule.

    Returns:
        ``wrd`` unchanged when it does not end in 'رو', otherwise the
        ``m0_9.affix_search`` result for everything before the ending,
        followed by 'ro'.
    """
    if wrd[-2:] != 'رو':
        return wrd
    # The previous version lemmatized the word and split it with
    # m0_9.get_affixes, but never used those results: both branches returned
    # this same expression. The lemma lookup could also raise IndexError when
    # several lemmas were returned and none of them was 'رو'.
    return m0_9.affix_search(wrd[:-2]) + 'ro'
    
# Rule 23 cases: one-letter prefixes before the ending, expected as 'na' and
# 'be' followed by 'ro'.
tests = [{'farsi': 'نرو', 'trans': 'naro'},
         {'farsi': 'برو', 'trans': 'bero'}]

for t in tests:
    assert rule_23(t['farsi']) == t['trans']


### Rule 24 (Recursion, improve v0.9)

In [54]:

def recu_entries(wrd):
    """Transcribe ``wrd`` as a chain of dictionary entries, longest prefix first.

    The longest prefix of ``wrd`` found in the 'WrittenForm' column of
    ``assets.df_Entries`` is transcribed with ``m0_9.votation_entries``; the
    remainder is handled recursively.

    Args:
        wrd: String to decompose.

    Returns:
        The concatenated transcriptions. A remainder with no matching prefix
        (including the empty string) is returned as is.
    """
    for end in range(len(wrd), 0, -1):
        # Filter the table once per candidate prefix instead of twice.
        matches = assets.df_Entries[assets.df_Entries['WrittenForm'] == wrd[:end]]
        if matches.shape[0] > 0:
            head = m0_9.votation_entries(matches.to_dict('records'))
            return head + recu_entries(wrd[end:])

    return wrd


def recu_affixes(wrd):
    """Transcribe ``wrd`` as a chain of affixes, longest matching prefix first.

    The longest prefix of ``wrd`` found in the 'Affix' column of
    ``assets.df_Affixes`` is transcribed with ``m0_9.votation_entries``
    (``entries=False``); the remainder is handled recursively.

    Args:
        wrd: String to decompose.

    Returns:
        The concatenated transcriptions. A remainder with no matching prefix
        (including the empty string) is returned as is.
    """
    for end in range(len(wrd), 0, -1):
        # Filter the table once per candidate prefix instead of twice.
        matches = assets.df_Affixes[assets.df_Affixes['Affix'] == wrd[:end]]
        if matches.shape[0] > 0:
            head = m0_9.votation_entries(matches.to_dict('records'), entries=False)
            return head + recu_affixes(wrd[end:])

    return wrd


# Rule 24, part 1: decomposition over dictionary entries (recu_entries).
tests = [{'farsi': 'فیسبوک', 'trans': 'fisbuke'}]

for t in tests:
    assert recu_entries(t['farsi']) == t['trans']

# Rule 24, part 2: decomposition over affixes (recu_affixes).
tests = [{'farsi': 'ازار', 'trans': 'AzAr'}]

for t in tests:
    assert recu_affixes(t['farsi']) == t['trans']


In [None]:
# text = 'دستشان در دستتان و دستم در دستت! واتساپ و فیسبوک با ما چه کرده‌اند؟ دست بزنید!'
# 'dasteSAn dar dastetAn va dastam dar dastat! vAtsAp va fisbuk bA mA Ce karde'and? dast bezanid!'