In [None]:
from pathlib import Path
import pandas as pd
import sys
import os
import re
from ydata_profiling import ProfileReport
import ipywidgets as widgets
# Project root is taken to be the parent of the current working directory
# (assumes the kernel is started from a sub-folder such as notebooks/ -- TODO confirm).
projRoot = Path.cwd().parent
root_stage1 = projRoot.joinpath('data/stage1')
# parents=True: on a fresh checkout the intermediate data/ folder may not exist
# yet, and a plain mkdir() would raise FileNotFoundError in that case.
root_stage1.mkdir(parents=True, exist_ok=True)
root_rawdata = projRoot.joinpath('data/raw')

### 資料清理工作

先使用 `notebooks/view_stage0.ipynb` 檢查 raw data。

接著執行下方的清理程式碼；執行完畢後，再用「資料表檢索」區塊的功能檢視結果。

#### 統一執行

In [None]:
def import_data(dt_path):
    """Read one raw CSV file into a DataFrame.

    Parameters
    ----------
    dt_path : path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table. The two code columns are read with the pandas
        'string' dtype, and the literal text '...' is treated as a missing
        value.
    """
    code_columns = ('學校統計處代碼', '系所代碼')
    # Reading the codes as text keeps any leading zeros intact.
    string_dtypes = {column: 'string' for column in code_columns}
    return pd.read_csv(dt_path, dtype=string_dtypes, na_values=['...'])

def export_data(dataframe, stage1_path):
    """Write a DataFrame to CSV and print a confirmation line.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to save; the index is not written.
    stage1_path : path-like
        Destination file, written with UTF-8 encoding.
    """
    dataframe.to_csv(stage1_path, encoding='utf8', index=False)
    confirmation = f'{stage1_path} 輸出成功～'
    print(confirmation)

def RoctoCE(dataframe):
    """Convert ROC-calendar year columns to Common Era years.

    Every column whose name ends with '年度' has 1911 added to its values.

    The input frame is not modified: the conversion is applied to a copy,
    so calling this twice on the same raw frame cannot shift the years
    twice.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the year columns converted; all other columns are
        unchanged.
    """
    # Select the columns whose name ends with '年度'.
    year_cols = dataframe.columns[dataframe.columns.str.contains('年度$')]
    converted = dataframe.copy()
    if len(year_cols) > 0:
        converted[year_cols] = converted[year_cols] + 1911
    return converted

# def cleanDot(dataframe, cols):
#     dataframe[cols] = dataframe[cols].apply()

#### Stage1 資料輸出

In [None]:
def funct1(dataset_names):
    """Clean each raw CSV and write the result to the stage-1 folder.

    For every name the file is read from ``root_rawdata``, passed through
    ``RoctoCE`` and written to the same relative location under
    ``root_stage1``.

    Parameters
    ----------
    dataset_names : iterable of str
        File names, or paths relative to ``root_rawdata``, of the CSV files
        to process.
    """
    for fnm in dataset_names:
        raw_path = root_rawdata.joinpath(fnm)
        outpath = root_stage1.joinpath(fnm)
        # A name may contain sub-directories; to_csv does not create missing
        # parent folders, so make sure the destination folder exists first.
        outpath.parent.mkdir(parents=True, exist_ok=True)
        cleaned = import_data(raw_path).pipe(RoctoCE)
        export_data(cleaned, outpath)

# NOTE: an exclusion-based approach would probably be more efficient here.
# Names are built relative to the raw-data root with pathlib rather than by
# regex-matching "raw/" in the path string: the regex returns None (and then
# raises AttributeError) when the path uses "\" separators, and it picks the
# wrong segment when a deeper folder name also ends in "raw".
dfs = sorted(f.relative_to(root_rawdata).as_posix() for f in root_rawdata.rglob('*.csv'))

funct1(dfs)

### 資料表檢索

In [None]:
# Every CSV found under the stage-1 folder (searched recursively), in sorted order.
filepaths = list(root_stage1.rglob('*.csv'))
filepaths.sort()

In [None]:
# One category per distinct leading character of the stage-1 file names.
# The values are sorted because iterating a plain set of strings gives an
# order that can change between kernel sessions, which would shuffle the
# buttons (and the option selected by default) from run to run.
options_cate = sorted({fp.name[0] for fp in filepaths})

cate_togl = widgets.ToggleButtons(
    options=options_cate,
    description='選取類別:',
    disabled=False
)
cate_togl

In [None]:
# Tables belonging to the currently selected category. The pattern is limited
# to *.csv so that stray non-CSV files or folders starting with the same
# character are not offered (they would fail in the read_csv cell below).
# The toggle's value is read once here, so re-run this cell after changing
# the selected category.
options_lt = [fp.name for fp in sorted(root_stage1.rglob(f'{cate_togl.value}*.csv'))]

dropdwn_dt = widgets.Dropdown(
    options=options_lt,
    description='請點選資料表:'
)
dropdwn_dt

In [None]:
# Load the table chosen in the dropdown; the two code columns are read as
# strings, matching how the raw files are read.
selected_csv = root_stage1.joinpath(dropdwn_dt.value)
code_dtypes = {'學校統計處代碼': 'string', '系所代碼': 'string'}
df = pd.read_csv(selected_csv, dtype=code_dtypes)

# Build the profiling report in its minimal configuration and embed it in the
# notebook output.
profile = ProfileReport(df, title="Pandas Profiling Report", minimal=True)
profile.to_notebook_iframe()

In [None]:
# Copy the selected table name, wrapped in double quotes, to the clipboard.
import pyperclip
quoted_table_name = f'"{dropdwn_dt.value}"'
pyperclip.copy(quoted_table_name)