In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Project root = parent of the (resolved) current working directory.
# Edit this line if your data lives somewhere else.
ROOT_PATH = Path().resolve().parents[0]
# Load train.csv from <ROOT_PATH>/input/bengaliai-cv19/.
train_df = pd.read_csv(ROOT_PATH.joinpath("input", "bengaliai-cv19", "train.csv"))

In [8]:
# Integer-encode the grapheme strings; the fitted encoder is kept in `le`
# so the ids can be mapped back to graphemes later.
le = LabelEncoder()
train_df["unique_label"] = le.fit_transform(train_df["grapheme"])

In [3]:
# Second id scheme: graphemes are numbered 0, 1, 2, ... in the order in which
# they first appear in train_df.
unique_graphemes = train_df["grapheme"].unique()
grapheme2idx = dict(zip(unique_graphemes, range(len(unique_graphemes))))
train_df["grapheme_id"] = train_df["grapheme"].map(grapheme2idx)

In [9]:
# Preview the first five rows (head() defaults to n=5).
train_df.head()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,grapheme_id,unseen,unique_label
0,Train_0,15,9,5,ক্ট্রো,0,0,44
1,Train_1,159,0,0,হ,1,0,1248
2,Train_2,22,3,5,খ্রী,2,0,103
3,Train_3,53,2,2,র্টি,3,0,888
4,Train_4,71,9,5,থ্রো,4,0,438


In [24]:
# Number of rows that belong to the last 55 entries of the per-label counts.
# value_counts() sorts by descending frequency, so these are the rarest labels.
int(train_df["unique_label"].value_counts().tail(55).sum())

7313

In [36]:
# Labels of the 55 least frequent classes; these are the ones flagged as
# "unseen" further down.
label_counts = train_df["unique_label"].value_counts()
unseen_indices = label_counts.tail(55).index.tolist()

In [42]:
# Flag rows whose label is in `unseen_indices`: 1 = unseen class, 0 = otherwise.
train_df["unseen"] = train_df["unique_label"].isin(unseen_indices).astype(np.int64)

In [45]:
# Keep only the rows that were not flagged as unseen, then preview the result.
df = train_df.query("unseen==0")
df.head()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,grapheme_id,unseen,unique_label
0,Train_0,15,9,5,ক্ট্রো,0,0,44
1,Train_1,159,0,0,হ,1,0,1248
2,Train_2,22,3,5,খ্রী,2,0,103
3,Train_3,53,2,2,র্টি,3,0,888
4,Train_4,71,9,5,থ্রো,4,0,438


In [46]:
# Seeded split of the non-unseen rows: `ratio` of them go to validation,
# stratified on unique_label so both parts share the same label proportions.
ratio = 0.1
random_state = 1116
train, valid = train_test_split(
    df,
    stratify=df["unique_label"],
    test_size=ratio,
    random_state=random_state,
)

In [47]:
# Add every row of the unseen classes to the validation set, so those classes
# appear in validation only.
# `valid` is rebuilt from its unseen==0 rows first: on the first run that is
# all of `valid` (it was split from rows with unseen==0), and on a re-run it
# drops the unseen rows appended earlier, so the cell can be executed
# repeatedly without duplicating them.
valid = pd.concat([valid.query("unseen==0"), train_df.query("unseen==1")])

In [51]:
# Preview the first five rows of the training split (head() defaults to n=5).
train.head()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,grapheme_id,unseen,unique_label
141176,Train_141176,97,7,0,প্টে,401,0,646
61731,Train_61731,107,2,4,ব্যি,361,0,729
7555,Train_7555,107,1,6,ব্র্যা,1197,0,740
82663,Train_82663,113,8,0,ভৈ,505,0,756
115625,Train_115625,29,0,0,ঘ,215,0,148


In [50]:
# Preview the first five rows of the validation split (head() defaults to n=5).
valid.head()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,grapheme_id,unseen,unique_label
99048,Train_99048,109,0,0,ব্দ,34,0,715
145964,Train_145964,122,10,0,যৌ,1108,0,834
56761,Train_56761,64,8,5,ত্রৈ,252,0,423
131816,Train_131816,79,0,2,র্ধ,645,0,932
8660,Train_8660,69,7,0,ত্বে,1060,0,405


In [52]:
# Preview the last five rows of the validation split (tail() defaults to n=5);
# the unseen-class rows were appended at the end.
valid.tail()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,grapheme_id,unseen,unique_label
200564,Train_200564,109,3,0,ব্দী,18,1,718
200578,Train_200578,27,0,0,গ্ম,246,1,129
200769,Train_200769,74,10,0,দ্দৌ,495,1,459
200794,Train_200794,74,10,0,দ্দৌ,495,1,459
200829,Train_200829,81,4,0,নু,786,1,522


In [53]:
# Write both splits to <ROOT_PATH>/input/bengaliai-cv19/, with image_id as
# the index column of each CSV.
output_dir = ROOT_PATH / "input" / "bengaliai-cv19"
out_train = train.set_index("image_id")
out_valid = valid.set_index("image_id")
out_train.to_csv(output_dir / "moco_train_unseen.csv")
out_valid.to_csv(output_dir / "moco_valid_unseen.csv")