# Outline

## North

- ลบ note หรือ comment กำกับข้อมูลใน column 4-5
- ใช้ regex ลบ ดู, ก็ว่า, ก็เรียก

## South

- มีการใช้เลขไทย [๑., ๑).] เพื่อแสดง POS ที่หลากหลาย
- "อาลปนะ."
- "แสลง." instead of "POS"

## Isan

- change from "สรรพ." to "ส."
- change from "สำ." to "สำนวน."
- change from "ลัก." to "น."
- change from "อุ" to "อ."
- remove any note or comment in 5th column
- clear the first row

## Central

# Main Program

## Load the Data

In [None]:
# downloading packages

# !pip3 install numpy
# !pip3 install pandas
# csv is part of the Python standard library; it is not installed with pip
# !pip3 install openpyxl
# !pip3 install pythainlp

In [None]:
# loading the data

import pandas as pd
import numpy as np
import os
import re

# Locations of the source workbooks, relative to the working directory.
# The paths are assembled with os.path.join so the separator matches the host
# OS; a literal backslash is only a separator on Windows.
north_path = os.path.join("datasets", "พจนานุกรมภาษาถิ่นเหนือ - Copy.xlsx")
isan_path = os.path.join("datasets", "พจนานุกรมภาษาถิ่นอีสาน - Copy.xlsx")
south_path = os.path.join("datasets", "พจนานุกรมภาษาถิ่นใต้ - Copy.xlsx")
# Unlike the three workbooks above, this one is a directory of files.
orst_path = os.path.join("datasets", "ราชบัณฑิตฯ - Copy")

def load_sheets(file_path, exclude_sheets):
    """Read every sheet of a workbook except the excluded ones.

    Args:
        file_path: Path of the Excel workbook.
        exclude_sheets: Container of sheet names to skip (tested with ``in``).

    Returns:
        dict mapping each remaining sheet name to a header-less DataFrame.
    """
    # Open the workbook once and close it deterministically; the same handle
    # serves both the sheet listing and the actual read.
    with pd.ExcelFile(file_path) as workbook:
        importing_sheets = [
            sheet for sheet in workbook.sheet_names if sheet not in exclude_sheets
        ]
        return pd.read_excel(workbook, sheet_name=importing_sheets, header=None)

# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame};
# header=None keeps the first spreadsheet row as data instead of column labels.
_READ_ALL_RAW = dict(sheet_name=None, header=None)

north_df = pd.read_excel(north_path, **_READ_ALL_RAW)
isan_df = pd.read_excel(isan_path, **_READ_ALL_RAW)
south_df = pd.read_excel(south_path, **_READ_ALL_RAW)

def load_excel(directory):
    """Load every Excel workbook found directly inside ``directory``.

    Only regular files with a spreadsheet extension are read. Office lock
    files (``~$name.xlsx``), sub-directories and unrelated files are skipped
    instead of being handed to ``pd.read_excel``, which would raise on them.
    Entries are visited in sorted order so the result does not depend on the
    order returned by the file system.

    Args:
        directory: Path of the folder to scan (not recursive).

    Returns:
        dict mapping each file name without its extension to the header-less
        DataFrame that ``pd.read_excel`` returns for that file.
    """
    excel_suffixes = (".xlsx", ".xlsm", ".xlsb", ".xls", ".ods")
    excel_files = {}
    for filename in sorted(os.listdir(directory)):
        full_path = os.path.join(directory, filename)
        name, suffix = os.path.splitext(filename)
        if filename.startswith("~$") or suffix.lower() not in excel_suffixes:
            continue
        if not os.path.isfile(full_path):
            continue
        excel_files[name] = pd.read_excel(full_path, header=None)
    return excel_files

# Load the files of the ORST directory into a dict keyed by file name
# (extension stripped), one DataFrame per file.
orst_df = load_excel(orst_path)

## Inspect the Data

In [None]:
# run to copy

# dict.copy() is shallow: the new dict would still hold the very same
# DataFrame objects, so in-place edits made while cleaning could leak back into
# the freshly loaded data. Copy each DataFrame as well so this cell really
# resets the working state.
north = {sheet: frame.copy() for sheet, frame in north_df.items()}
isan = {sheet: frame.copy() for sheet, frame in isan_df.items()}
south = {sheet: frame.copy() for sheet, frame in south_df.items()}
orst = {sheet: frame.copy() for sheet, frame in orst_df.items()}

In [None]:
# inspect the data

def inspect(num=10, *dfs):
    """Print a preview of every table in each of the given mappings.

    Args:
        num: Number of leading rows to show per table.
        *dfs: Mappings of name -> table; each table must provide ``head``.
    """
    for tables in dfs:
        for name in tables.keys():
            preview = tables[name].head(num)
            print(f"{name}\n{preview}\n")

# Preview the first 5 rows of every sheet in each working dataset.
inspect(5, north, south, isan, orst)

## Cleaning

In [None]:
# cleaning
def shift_left(row):
    """Pack the non-missing values of ``row`` to the front.

    Returns a new Series of the same length: the values for which
    ``pd.notna`` is true, in their original order, followed by ``None``
    padding.
    """
    present = list(filter(pd.notna, row))
    padding = [None] * (len(row) - len(present))
    return pd.Series(present + padding)

# Substrings that mark a header / column-label row in the dialect sheets.
# NOTE(review): '2' and '3' are matched as substrings, so any row with a cell
# containing one of those digits is dropped as well — confirm this is intended.
_HEADER_MARKERS = (
    'ช่อง 1', 'ลูกคำ/ ความหมาย', 'แม่คำ', 'ความหมายลูกคำ คำสื่อ',
    'headword', '2', '3',
)


def _clean_dialect_sheet(frame):
    """Return one north/south/isan sheet reduced to 'word' and 'meaning'."""
    is_header = frame.apply(
        lambda row: any(
            marker in str(cell) for marker in _HEADER_MARKERS for cell in row
        ),
        axis=1,
    )
    frame = frame[~is_header].dropna(how='all')
    # Only the first two columns are kept; everything to their right is dropped.
    frame = frame.drop(frame.columns[2:], axis=1).fillna("")
    frame.columns = ['word', 'meaning']
    return frame


def _clean_orst_sheet(frame):
    """Return one ORST sheet reduced to 'word' and 'meaning'."""
    # Work on a copy so the caller's original frame is never written through.
    frame = frame.iloc[:, :-1].copy()
    first, second = frame.columns[0], frame.columns[1]
    # Where the first column is empty, take the word from the second column.
    frame[first] = frame[first].where(frame[first].notna(), frame[second])
    frame = frame.drop(columns=[second]).reset_index(drop=True)
    frame.columns = ['word', 'meaning']
    # The first row is discarded.
    return frame.iloc[1:]


def cleaning(*dfs):
    """Normalise every sheet of the given datasets to 'word' / 'meaning'.

    Each argument is a dict of sheet name -> DataFrame and is updated in
    place. The layout rules are chosen by identity against the module-level
    ``north``, ``south``, ``isan`` and ``orst`` dicts; any other dict is left
    untouched.

    Identity (``is``) is used on purpose: ``==`` between two dicts of
    DataFrames compares the frames themselves and raises as soon as two
    datasets share a sheet name.
    """
    for df in dfs:
        if df is north or df is south or df is isan:
            clean_sheet = _clean_dialect_sheet
        elif df is orst:
            clean_sheet = _clean_orst_sheet
        else:
            continue

        for key in df.keys():
            df[key] = clean_sheet(df[key])

# Clean all four datasets; cleaning() stores the results back into the dicts.
cleaning(north, isan, south, orst)

# Preview the first 10 rows of every sheet after cleaning.
inspect(10, north, isan, south, orst)

In [None]:
def func(df):
    """Print the ``shape`` of every table stored in the mapping ``df``."""
    shapes = [df[name].shape for name in df.keys()]
    for shape in shapes:
        print(shape)

# Print the (rows, columns) shape of every ORST table.
func(orst)

# orst["หมวด_ฬ(1)"]

## Split POS

In [None]:
# Leading part-of-speech tag (group 1) followed by the rest of the cell
# (group 2). "สำนวน.*" is listed before "สำนวน." because alternation is tried
# left to right; in the opposite order the starred tag can never be captured.
# NOTE(review): "อุทาน." is not part of this pattern, so cells starting with it
# are left unsplit — confirm whether it should be added.
_POS_PATTERN = (
    r"^("
    r"น\.|\*น\.|นิ\.|ก\.|\*ก\.|ว\.|\*ว\.|สัน\.|ส\.|บุรพ\.|\*ส\.|สำ\.|"
    r"สำนวน\.\*|สำนวน\.|"
    r"คำลงท้าย\.|คำยกย่อง\.|อาลปนะ\.|ลัก\.|สรรพ\.|อ\.|\*อ\.|อุ\.|"
    r"ความเปรียบ\.|คำถาม\.|บุพ\.|บ\."
    r")(.*)$"
)


def rearrange(df):
    """Split each sheet's 'meaning' column into a POS tag and the remainder.

    ``df`` is a dict of sheet name -> DataFrame with 'word' and 'meaning' as
    its first two columns; it is updated in place. Each sheet ends up with:

    * ``คำ``       - copy of 'word'
    * ``POS``      - the leading tag matched by ``_POS_PATTERN``, or "" if none
    * ``ความหมาย`` - the text after the tag, or the whole 'meaning' cell when
      no tag was found

    The first two columns of the sheet are then dropped.
    """
    for key in df.keys():
        frame = df[key].copy()
        meaning = frame['meaning']

        # One extract gives both the tag and the remainder; rows that do not
        # match come back as NaN in both columns.
        parts = meaning.str.extract(_POS_PATTERN, expand=True)
        has_pos = parts[0].notna()

        frame["คำ"] = frame['word']
        frame["POS"] = parts[0].fillna("")
        frame["ความหมาย"] = parts[1].where(has_pos, meaning)

        # remove first two columns
        df[key] = frame.iloc[:, 2:]

# Split the POS tag out of the 'meaning' column of every dataset; each dict
# is modified in place.
rearrange(orst)
rearrange(north)
rearrange(south)
rearrange(isan)

def clean_whitespace(df):
    """Collapse whitespace in every object-dtype column of every table.

    For string cells, runs of whitespace become a single space and leading
    or trailing whitespace is removed; non-string cells are left as they are.
    The tables in ``df`` are modified in place and ``df`` itself is returned.
    """
    def _squeeze(text):
        if not isinstance(text, str):
            return text
        return ' '.join(text.split())

    for key in df.keys():
        frame = df[key]
        for column in frame.columns:
            if frame[column].dtype != 'object':
                continue
            frame[column] = frame[column].apply(_squeeze)
    return df

# clean_whitespace() edits the tables in place and returns the same dict, so
# these assignments rebind each name to the object it already refers to.
orst = clean_whitespace(orst)
north = clean_whitespace(north)
south = clean_whitespace(south)
isan = clean_whitespace(isan)

# Preview the first 10 rows of every sheet after the POS split and whitespace pass.
inspect(10, north, isan, south, orst)

# orst['หมวด_ก(3881)'].head(n=40)
# north['ก'].head(n=30)

In [None]:
from pythainlp.util import collate
def fill_blank_words(iter=2, *dfs):
    """Fill blank cells, de-duplicate and sort every sheet of every dataset.

    For each sheet: drop fully empty rows, turn whitespace-only cells into
    NaN, forward-fill from the row above, drop duplicate rows, and order the
    rows by the ``คำ`` column using Thai collation (``pythainlp`` ``collate``).

    Args:
        iter: Number of passes over each dataset (name kept for existing
            callers although it shadows the builtin).
        *dfs: Dicts of sheet name -> DataFrame with a ``คำ`` column; each is
            updated in place.

    Returns:
        The first dataset passed (``None`` if none were given), which keeps
        single-dataset callers that use the return value working.
    """
    for df in dfs:
        for _ in range(iter):
            for key in df.keys():
                frame = df[key].dropna(how='all')
                frame = frame.infer_objects(copy=False)
                frame = frame.replace(r'^\s*$', np.nan, regex=True)
                # NOTE(review): the forward fill covers every column, so a
                # blank POS / ความหมาย also inherits the value above — confirm.
                frame = frame.ffill().drop_duplicates()

                # Sort whole rows by Thai collation order. Writing collate()'s
                # sorted list straight into the column would re-order the
                # words alone and detach them from their POS and meaning.
                words = frame["คำ"].dropna().unique().tolist()
                rank = {word: position for position, word in enumerate(collate(words))}
                frame = frame.sort_values(
                    by="คำ",
                    key=lambda column: column.map(rank),
                    kind="stable",  # keep the entries of one word in their order
                )
                df[key] = frame.reset_index(drop=True)
    # The return used to sit inside the dataset loop, which ended the function
    # after the first dataset and left the others unprocessed.
    return dfs[0] if dfs else None

# Run the blank-fill / de-duplicate / sort pass with two iterations; the
# return value is not used here.
fill_blank_words(2, north, isan, south, orst)

# Preview the first 10 rows of every sheet after the pass.
inspect(10, north, isan, south, orst)

## Output

In [None]:
# output

def df_to_csv(dfs, region, path):
    """Write every sheet of one dataset to ``<path>/<region>_<sheet>.csv``.

    Args:
        dfs: Dict of sheet name -> DataFrame.
        region: Prefix used in each output file name.
        path: Output directory; created (with parents) if it does not exist.

    Files are written without the index as UTF-8 with a BOM ('utf-8-sig').
    """
    # to_csv refuses to write into a missing directory, so create it first.
    os.makedirs(path, exist_ok=True)
    for sheet, frame in dfs.items():
        target = os.path.join(path, f"{region}_{sheet}.csv")
        frame.to_csv(target, index=False, encoding='utf-8-sig')

# Output file prefix -> cleaned dataset ('cen' is the ORST data).
regions = {
    'north': north,
    'isan': isan,
    'south': south,
    'cen': orst
    }

# Assembled with os.path.join so the separator matches the host OS; a literal
# backslash is only a path separator on Windows.
output_dir = os.path.join("output", "csv_test1")

for region, dfs in regions.items():
    df_to_csv(dfs, region, output_dir)

# Test