In [26]:
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [27]:
# Project root is taken as the directory one level above the current working
# directory; edit this line if your layout differs.
ROOT_PATH = Path(".").resolve().parents[0]
# Directory holding the training images of the bengaliai-cv19 dataset.
INPUT_PATH = ROOT_PATH.joinpath("input", "bengaliai-cv19", "train_images")
# Label table for the training set.
train_df = pd.read_csv(ROOT_PATH.joinpath("input", "bengaliai-cv19", "train.csv"))

In [37]:
# Fit an encoder on the grapheme strings (fit() returns the encoder itself),
# then store each row's integer code in the 'char' column.
le = LabelEncoder().fit(train_df['grapheme'])
train_df['char'] = le.transform(train_df['grapheme'])

In [73]:
# Collapse the three component labels into one integer key per
# (grapheme_root, vowel_diacritic, consonant_diacritic) triple.
# NOTE(review): the multipliers only keep the three fields from colliding while
# grapheme_root < 1000 and vowel_diacritic < 100 — confirm against the label ranges.
train_df['unique_label'] = (
    train_df["grapheme_root"] * 1
    + train_df["vowel_diacritic"] * 1000
    + train_df["consonant_diacritic"] * 100000
)

# Split the frame by key once; running query() inside the loop would re-scan every
# row of train_df for each distinct key.
rows_by_label = train_df.groupby("unique_label")
total_rows = len(train_df)

# Visit keys from most to least frequent (value_counts order) and report each key
# that is attached to more than one distinct grapheme string.
for label in train_df["unique_label"].value_counts().index:
    label_rows = rows_by_label.get_group(label)
    if label_rows["grapheme"].nunique() > 1:
        print(label_rows["grapheme"].value_counts())
        print(label_rows["grapheme_root"].value_counts())
        print(label_rows["vowel_diacritic"].value_counts())
        print(label_rows["consonant_diacritic"].value_counts())
        # Share of the whole training set covered by this key, in percent.
        print(len(label_rows) / total_rows * 100)
        print("===========================================")

র্তে      153
র্ত্রে    150
Name: grapheme, dtype: int64
64    303
Name: grapheme_root, dtype: int64
7    303
Name: vowel_diacritic, dtype: int64
2    303
Name: consonant_diacritic, dtype: int64
0.15086636128261302
র্দ্র    151
র্দ      146
Name: grapheme, dtype: int64
72    297
Name: grapheme_root, dtype: int64
0    297
Name: vowel_diacritic, dtype: int64
2    297
Name: consonant_diacritic, dtype: int64
0.14787890858394742
র্ত্রী    145
র্তী      144
Name: grapheme, dtype: int64
64    289
Name: grapheme_root, dtype: int64
3    289
Name: vowel_diacritic, dtype: int64
2    289
Name: consonant_diacritic, dtype: int64
0.14389563831905994


## g, v, c = 72, 0, X (g = grapheme_root, v = vowel_diacritic, c = consonant_diacritic; X = the component left free)

In [61]:
# How the rows with grapheme_root 72 and vowel_diacritic 0 are spread over
# the consonant_diacritic classes.
(
    train_df
    .query("grapheme_root==72 and vowel_diacritic==0")
    ["consonant_diacritic"]
    .value_counts()
)

2    297
4    150
5    149
0    148
6    139
Name: consonant_diacritic, dtype: int64

In [66]:
# For every grapheme with grapheme_root 72 and vowel_diacritic 0, print the
# grapheme, its row count, and the individual characters it is built from.
grapheme_counts = (
    train_df
    .query("grapheme_root==72 and vowel_diacritic==0")
    ["grapheme"]
    .value_counts()
)
for grapheme, n_rows in grapheme_counts.items():
    print(grapheme, n_rows, list(grapheme))

র্দ্র 151 ['র', '্', 'দ', '্', 'র']
দ্য 150 ['দ', '্', 'য']
দ্র 149 ['দ', '্', 'র']
দ 148 ['দ']
র্দ 146 ['র', '্', 'দ']
দ্র্য 139 ['দ', '্', 'র', '্', 'য']


## g, v, c = 72, X, 2

In [55]:
# How the rows with grapheme_root 72 and consonant_diacritic 2 are spread over
# the vowel_diacritic classes.
(
    train_df
    .query("grapheme_root==72 and consonant_diacritic==2")
    ["vowel_diacritic"]
    .value_counts()
)

0    297
2    166
7    165
6    150
4    145
3    144
9    143
1    143
Name: vowel_diacritic, dtype: int64

In [67]:
# For every grapheme with grapheme_root 72 and consonant_diacritic 2, print the
# grapheme, its row count, and the individual characters it is built from.
grapheme_counts = (
    train_df
    .query("grapheme_root==72 and consonant_diacritic==2")
    ["grapheme"]
    .value_counts()
)
for grapheme, n_rows in grapheme_counts.items():
    print(grapheme, n_rows, list(grapheme))

র্দি 166 ['র', '্', 'দ', 'ি']
র্দে 165 ['র', '্', 'দ', 'ে']
র্দ্র 151 ['র', '্', 'দ', '্', 'র']
র্দৃ 150 ['র', '্', 'দ', 'ৃ']
র্দ 146 ['র', '্', 'দ']
র্দু 145 ['র', '্', 'দ', 'ু']
র্দী 144 ['র', '্', 'দ', 'ী']
র্দা 143 ['র', '্', 'দ', 'া']
র্দো 143 ['র', '্', 'দ', 'ো']


Which grapheme does the label (g, v, c) = (72, 0, 2) denote?

Both র্দ্র and র্দ carry this label -> র্দ

## g, v, c = 64, X, 2

In [48]:
# How the rows with grapheme_root 64 and consonant_diacritic 2 are spread over
# the vowel_diacritic classes.
(
    train_df
    .query("grapheme_root==64 and consonant_diacritic==2")
    ["vowel_diacritic"]
    .value_counts()
)

7    303
3    289
2    167
4    151
1    150
0    150
6    147
Name: vowel_diacritic, dtype: int64

In [65]:
# For every grapheme with grapheme_root 64 and consonant_diacritic 2, print the
# grapheme, its row count, and the individual characters it is built from.
grapheme_counts = (
    train_df
    .query("grapheme_root==64 and consonant_diacritic==2")
    ["grapheme"]
    .value_counts()
)
for grapheme, n_rows in grapheme_counts.items():
    print(grapheme, n_rows, list(grapheme))

র্তি 167 ['র', '্', 'ত', 'ি']
র্তে 153 ['র', '্', 'ত', 'ে']
র্তু 151 ['র', '্', 'ত', 'ু']
র্তা 150 ['র', '্', 'ত', 'া']
র্ত্রে 150 ['র', '্', 'ত', '্', 'র', 'ে']
র্ত 150 ['র', '্', 'ত']
র্তৃ 147 ['র', '্', 'ত', 'ৃ']
র্ত্রী 145 ['র', '্', 'ত', '্', 'র', 'ী']
র্তী 144 ['র', '্', 'ত', 'ী']


## g, v, c = 64, 7, X

In [70]:
# How the rows with grapheme_root 64 and vowel_diacritic 7 are spread over
# the consonant_diacritic classes.
(
    train_df
    .query("grapheme_root==64 and vowel_diacritic==7")
    ["consonant_diacritic"]
    .value_counts()
)

2    303
4    170
5    166
0    155
1    147
Name: consonant_diacritic, dtype: int64

In [71]:
# For every grapheme with grapheme_root 64 and vowel_diacritic 7, print the
# grapheme, its row count, and the individual characters it is built from.
grapheme_counts = (
    train_df
    .query("grapheme_root==64 and vowel_diacritic==7")
    ["grapheme"]
    .value_counts()
)
for grapheme, n_rows in grapheme_counts.items():
    print(grapheme, n_rows, list(grapheme))

ত্যে 170 ['ত', '্', 'য', 'ে']
ত্রে 166 ['ত', '্', 'র', 'ে']
তে 155 ['ত', 'ে']
র্তে 153 ['র', '্', 'ত', 'ে']
র্ত্রে 150 ['র', '্', 'ত', '্', 'র', 'ে']
তেঁ 147 ['ত', 'ে', 'ঁ']
