In [1]:
import pickle
import re
from suffixtree import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [143]:
# Stage 1 Filtering (Don't Run).
def read_filter_write(file, source_dir, target_dir, un_dir):
    with open(source_dir + un_dir + '/' + file, 'r') as f:
        data = f.read().replace('\n', ' ')    
    
    data = filtering(data)
   
    with open(target_dir + un_dir + '/' + file, "w") as f:
        f.write(data)

    return data

In [144]:
# Removing stopword english numericals, keeping only unicode of devanagri, removing devanagri numericals, 
# removing extra white spaces
def filtering(data):
    patterns = {r'<.*>+': '',
                r'[!@#$%^&*()_+<>|,.:;()+=…&×{}<>"→?\'0-9।-]': '', 
                r'[^\u0900-\u097F ]': '', 
                r'[\u0964-\u096F]': '', 
                r'[\s+]': ' '}

    for pattern, result in patterns.items():    
        data = re.sub(pattern, result, data)
        
    return data

In [265]:
source_dir = 'Unfiltered/'
unfiltered_dirs = os.listdir(source_dir)
target_dir = 'Filtered/'

for un_dir in unfiltered_dirs:
    files = os.listdir(source_dir + un_dir)
    for file in files:
        read_filter_write(file, source_dir, target_dir, un_dir)        

In [175]:
# Stage 2 Reading filtered files for each language and storing only unique words into a new folder. (Don't Run)
# Keep unique words with len > 2 as pickled lists
filtered_dirs = os.listdir(target_dir)

for f_dir in filtered_dirs:
    files = os.listdir(target_dir + f_dir)
    word_list = []
    
    for file in files:
        with open(target_dir + f_dir + '/' + file, 'r') as f:
            data = f.read()
        words = data.split(' ')
        words = list((filter(None, words)))   
        word_list += [word for word in words if len(word) > 2]
    
    word_list = list(set(word_list))
    
    with open('Words List/' + f_dir + '.pkl', 'wb') as f:
        pickle.dump(word_list, f)                    

In [2]:
# Run notebook from here
with open('Words List/Hindi.pkl', 'rb') as f:
    hindi_list = pickle.load(f)    
    
with open('Words List/Bhojpuri.pkl', 'rb') as f:
    bhojpuri_list = pickle.load(f)    
    
with open('Words List/Magahi.pkl', 'rb') as f:
    magahi_list = pickle.load(f)        
    
with open('Words List/Maithili.pkl', 'rb') as f:
    maithili_list = pickle.load(f)        

In [3]:
# Unique words in each language
print(len(hindi_list))
print(len(bhojpuri_list))
print(len(magahi_list))
print(len(maithili_list))

135305
36551
33497
25661


In [4]:
# LCS Substring Matching
ortho_languages = [bhojpuri_list, magahi_list, maithili_list]
langs = ['Bhojpuri', 'Magahi', 'Maithili']
cols = ['Language', 'Word', 'Length', 'Empty Match', 'Partial Matches', 'Correct Match']

In [264]:
# pickle tree ...
tree = SuffixTree(True, hindi_list)

In [228]:
def process_word(language, word, match_list):
    datum = {'Language': language, 'Word': word, 'Length': len(word), 'Empty Match': True, 
          'Partial Matches': match_list, 'Correct Match': False}
    
    if not match_list:
        return datum
    
    datum['Empty Match'] = False
    correct_match = list(set(match_list).intersection([word]))
    if not correct_match:
        return datum
    else:
        datum['Correct Match'] = True
        return datum

In [2]:
def average_matches(df):
    return len(df[df['Empty Match'] == False])/len(df['Empty Match'])

In [271]:
def plot_graph(df, lang):
    df1 = df.groupby(['Length']).sum()
    y_pos = np.arange(len(df1))
    plt.figure(figsize=(9,6))
    bars = plt.bar(y_pos, df1['Correct Match'], alpha=0.7, align='center', color='lightgreen')
    plt.xticks(y_pos,df1.index )
    plt.subplots_adjust(bottom=0.3)    
    plt.title(f'Exact Matches vs. Length of Words for {lang}')
    plt.tick_params(top=False, bottom=False, left=False, right=False, labelleft=False, labelbottom=True)

    for spine in plt.gca().spines.values():
        spine.set_visible(False)

    for bar in bars:
        plt.gca().text(bar.get_x() + bar.get_width()/2, bar.get_height(), 
                       bar.get_height(), ha='center', fontsize=8)

    x = plt.gca().xaxis
    plt.savefig(f'Stat Images/Stat for{lang}.png')    

In [273]:
for lang, lang_list in zip(langs, ortho_languages):
    print(lang)
    df  = pd.DataFrame(columns = cols)
    for word in lang_list:
        match_list = tree.findString(word)
        datum = process_word(lang, word, match_list)    
        df = df.append(datum, ignore_index=True)
        
    plot_graph(df, lang)
    df.to_csv('LCS Stats/' + lang + '_stats.csv', index=False)    

Bhojpuri
<function average_matches at 0x7ff0738bfb70>


<IPython.core.display.Javascript object>

Magahi
<function average_matches at 0x7ff0738bfb70>


<IPython.core.display.Javascript object>

Maithili
<function average_matches at 0x7ff0738bfb70>


<IPython.core.display.Javascript object>

In [6]:
df_maithili = pd.read_csv('LCS Stats/Maithili_stats.csv')
print(average_matches(df_maithili))
df_bhojpuri = pd.read_csv('LCS Stats/Bhojpuri_stats.csv')
print(average_matches(df_bhojpuri))
df_magahi = pd.read_csv('LCS Stats/Magahi_stats.csv')
print(average_matches(df_magahi))

0.3914500604029461
0.44269650625153895
0.41890318535988297


In [13]:
print(100*len(df_maithili[df_maithili['Correct Match'] == True])/len(df_maithili))
print(100*len(df_bhojpuri[df_bhojpuri['Correct Match'] == True])/len(df_bhojpuri))
print(100*len(df_magahi[df_magahi['Correct Match'] == True])/len(df_magahi))

30.228751802345972
34.60917621952888
34.558318655401976


In [5]:
# n-gram similarity
x = 'afad'
y = 'fwads'
def ngram_similarity(x, y, n=1):
    k = len(x)
    l = len(y)
    L = [[0]*(l+1) for i in range(k+1)] 
    for i in range(k+1):
        for j in range(l+1):
            if i == 0 or j == 0:
                L[i][j] = 0        
            else:
                count = 0
                for u in range(n):
                    if i+u <= k and j+u <= l:
                        if x[i-1+u] == y[j-1+u]:
                            count +=1

                pos_ngram = (1/n)*count
                L[i][j] = max(L[i-1][j], L[i][j-1], L[i-1][j-1] + pos_ngram)

    return round(L[k][l]/max(k,l), 3)

ngram_similarity(x, y, 1)

0.6

In [55]:
# Character n-gram model
# dice = 2*|ngram(x) intersection ngram(y)|/|ngram(x) union ngram(y)|
def ngram(q, n=2):
    return [q[i:i+n] for i in range(len(q)-n+1)]
print(ngram('हरलाखीसँ'))
print(ngram('लीलादेवी'))


def dice(x, y, n=2):
    intersection = len(set(ngram(x, n)).intersection(ngram(y, n)))
    union = len(ngram(x, n) + ngram(y, n))
    dice  = 2*intersection/union
    return round(dice, 3)

x = 'colour'
y = 'couleur'
n = 2
dice(x, y)

['sa', 'ad', 'da', 'as']
['sa', 'ad', 'da', 'as']


1.0

In [65]:
# n-gram similarity between orthographic languages
cols = ['Hindi Words']
words_taken = 300
for lang, lang_list in zip(langs, ortho_languages):
    cols = ['Hindi Words']
    cols += lang_list[:words_taken]
    df = pd.DataFrame(columns = cols)
    df['Hindi Words'] = hindi_list[:words_taken]
    for hindi_word in hindi_list[:words_taken]:
        for lang_word in lang_list[:words_taken]:            
            dice_val = dice(hindi_word, lang_word)
            df.loc[np.where(df['Hindi Words'] == hindi_word)[0][0], lang_word] = dice_val
                                
    print(lang)    
    df['Max Similarity'] = df.iloc[:, 1:-1].max(axis=1)
    df['Similarity'] = df.iloc[:, 1:-1].astype('float64').idxmax(axis=1)
    
    df.to_csv('Dice stats/' + lang + '_stats.csv', index=False)    

Bhojpuri
Magahi
Maithili


In [74]:
df_maithili = pd.read_csv('Dice stats/Maithili_stats.csv')
df_bhojpuri = pd.read_csv('Dice stats/Bhojpuri_stats.csv')
df_magahi = pd.read_csv('Dice stats/Magahi_stats.csv')
langs = ['Bhojpuri', 'Magahi', 'Maithili']

dataframes = [df_bhojpuri, df_magahi, df_maithili]
for lang, df in zip(langs, dataframes):    
    df = df[df['Max Similarity'] != 1].sort_values('Max Similarity', ascending=False)
    df = df[['Hindi Words', 'Max Similarity', 'Similarity']][:10]
    df.to_csv('Dice stats/' + lang + '_final.csv')

In [98]:
df = pd.read_csv('Dice stats/Maithili_final.csv')
df

Unnamed: 0.1,Unnamed: 0,Hindi Words,Max Similarity,Similarity
0,182,निर्माणजो,0.75,नवनिर्माण
1,135,पहाड़ियां,0.545,उड़िया
2,234,मान्यनहीं,0.533,मान्यताक
3,92,सिद्धता,0.533,सुप्रसिद्ध
4,85,कहाआदमी,0.5,आदम
5,51,शलाकाएँ,0.5,कलाकारस
6,159,पूरी,0.5,मंजूरी
7,165,प्रांगण,0.5,प्रसंगक
8,201,अकाली,0.5,चालीस
9,255,दांत,0.5,दांवपर


In [81]:
# DICE similarity between orthographic languages
words_taken = 300

for lang, lang_list in zip(langs, ortho_languages):
    cols = ['Hindi Words']
    cols += lang_list[:words_taken]
    df = pd.DataFrame(columns=cols)
    df['Hindi Words'] = hindi_list[:words_taken]
    for hindi_word in hindi_list[:words_taken]:
        for lang_word in lang_list[:words_taken]:
            dice_val = ngram_similarity(hindi_word, lang_word, 3)
            df.loc[np.where(df['Hindi Words'] == hindi_word)
                   [0][0], lang_word] = dice_val

    print(lang)
    df['Max Similarity'] = df.iloc[:, 1:-1].max(axis=1)
    df['Similarity'] = df.iloc[:, 1:-1].astype('float64').idxmax(axis=1)
    df.to_csv('n-gram similarity stats/' + lang + '_stats.csv', index=False)

Bhojpuri
Magahi
Maithili


In [87]:
df_maithili = pd.read_csv('n-gram similarity stats/Maithili_stats.csv')
df_bhojpuri = pd.read_csv('n-gram similarity stats/Bhojpuri_stats.csv')
df_magahi = pd.read_csv('n-gram similarity stats/Magahi_stats.csv')

dataframes = [df_bhojpuri, df_magahi, df_maithili]

for lang, df in zip(langs, dataframes):
    df = df[df['Max Similarity'] != 1].sort_values(
        'Max Similarity', ascending=False)
    df = df[['Hindi Words', 'Max Similarity', 'Similarity']][:40]
    df.to_csv('n-gram similarity stats/' + lang + '_final.csv')

In [92]:
df = pd.read_csv('n-gram similarity stats/Bhojpuri_final.csv')
df

Unnamed: 0.1,Unnamed: 0,Hindi Words,Max Similarity,Similarity
0,163,सुप्रसिद्ध,0.95,सुप्रसिद्ध
1,100,संवेदनशील,0.944,संवेदनशील
2,49,अलकनन्दा,0.938,अलकनन्दा
3,2,अनुभूति,0.929,अनुभूति
4,203,पचहत्तर,0.929,पचहत्तर
5,238,अमेरिका,0.929,अमेरिका
6,267,जालीदार,0.929,जालीदार
7,273,उत्पादन,0.929,उत्पादन
8,36,चतुर्थ,0.917,चतुर्थ
9,160,दारोगा,0.917,दारोगा


In [None]:
# Thank You