# Cleaning Volubilis data
Website: https://belisan-volubilis.blogspot.com/   
Download link: https://sourceforge.net/projects/belisan/files/VOLUBILIS.ods/download  

In [1]:
import pandas as pd
from tqdm import tqdm
from pythainlp.util import isthaichar

## import data

SUPER LONG!!

Recomended to comment out this cell if runned once.

If updating data, delete "en_th_dictionary.csv" file and run this again

In [None]:
df_original = pd.read_excel("VOLUBILIS.ods", engine="odf")

# extract only Thai and English
th = df_original["THA"][15:]
en = df_original["ENG"][15:]

df = pd.DataFrame([th, en]).T
df.to_csv("en_th_dictionary.csv", index=False)

In [2]:
# for handling skipping
# don't comment out this one

df = pd.read_csv("en_th_dictionary.csv")
df['THA'] = df['THA'].astype(str)
df['ENG'] = df['ENG'].astype(str)
df

Unnamed: 0,THA,ENG
0,อ-,un- (pref.)
1,อะ,[final particle]
2,อ่ะ,[expression of resignation]
3,อ๊ะ,What! ; Oh! ; Eh!
4,อา,uncle ; aunt ; father's younger brother ; fath...
...,...,...
116418,สน. (สกลนคร),Sakon Nakhon Province
116419,ตปท. (ต่างประเทศ),abroad ; overseas
116420,"ทบ. (กองทัพบก, ทหารบก)",Royal Thai Army
116421,ตร.,police


In [5]:
df.shape

(115675, 2)

## Clean

### rid NA

In [3]:
#rid NA
df = df[df['THA'] != 'nan']
df.shape

(116422, 2)

### rid -
prefix words has "-" at last

In [4]:
df = df[~df['THA'].str.endswith('-')]
df.shape

(115675, 2)

### Total cleaning
look one by one

In [6]:
th_new = []
en_new = []
for cnt in tqdm(range(df.shape[0])):
    th = df.iloc[cnt]["THA"]
    en = df.iloc[cnt]["ENG"]
    
    if en[0] == "[":
        continue
    
    ths = [th]
    
    _cnt = 0
    #repeat until all words checked
    while _cnt < len(ths):
        th = ths[_cnt]
        
        #seperate ; to 2 words
        if ";" in th:
            del ths[_cnt]
        
            ths += th.split(";")
        
            continue
        
        #seperate to 2 words
        if "=" in th:
            del ths[_cnt]
            
            ths += th.split("=")
        
            continue
            
        # seperate word inside ()
        if "("in th:
            del ths[_cnt]
            
            sp = th.split("(")
            
            ths.append(sp[0])
            ths.append(sp[1].split(")")[0])
            
            continue
            
        # seperate , to 2 words
        if "," in th:            
            del ths[_cnt]
            
            sp = th.split(",")
            
            ths.append(sp[0])
            ths.append(sp[1])
            
            continue
        
        # all conditions checked for this word
        _cnt += 1
    
    #final cleaning
    for th in ths:
        th = th.replace("…", "")
        th = th.replace(" ", "")
        th = th.replace("[", "")
        th = th.replace("]", "")
        
        th_new.append(th)
        en_new.append(en)
        
df_new = pd.DataFrame({"THA":th_new, "ENG":en_new})
df_new

100%|██████████| 115675/115675 [00:18<00:00, 6096.72it/s]


Unnamed: 0,THA,ENG
0,อ๊ะ,What! ; Oh! ; Eh!
1,อา,uncle ; aunt ; father's younger brother ; fath...
2,อา,Ah! ; Hmm ; Oh!
3,อา,?
4,อ๋า,?
...,...,...
121116,ต่างประเทศ,abroad ; overseas
121117,ทบ.,Royal Thai Army
121118,กองทัพบก,Royal Thai Army
121119,ทหารบก,Royal Thai Army


### Save data

In [None]:
with open("wiki_titles.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(list(ths)))