### Vectorize Data Sets

In [1]:
from numba import jit, autojit # accelerate for loops
import numpy as np
import numpy.random as npr
import re # regular expressions
import glob
import pandas as pd

In [9]:
###--- Methods ---###

#### bootstrapping: resampling (virtual example)

nb = 500  # how many samples ?
ln = 3000 # how many words for each sample ? (3000 ~ length of each text)

def btp_ori(tot,nb,ln):
    if len(tot) == 0: return np.empty(0)
    else:
        idx = npr.randint(0, len(tot), (nb,ln)) # randomly pick indices from the 'at' array, 'nb' rows
        return np.array(tot[idx])

#### extract function words
# function word list: Bei Yu 2012: Function Words for Chinese Authorship Attribution
fw1 = ["的","是","不","了","在","有","這","為","地","也","得","就","那","以","著","之","可","于"
       "麼","而","然","没","於","還","只","無","又","如","但","其","此","與","把","全","被","卻"]

# difficult word list: 司法院裁判書類通俗化小組
fw2 = ["殆","似","難認無過失","難認有理由","尚難謂為於法無違","非無理由","非有理由","尚非無疑","尚非不可能",
       "可徵","徵諸","洵堪","迭於","係經","係指","洵屬","伊","渠","致","爰","遽認","矧","略以","按","第按","是","茲",
       "揆諸","皆無該條適用","即","無庸","縱",
       "上開","足資","可稽","堪以認定","業據","云云","等語","之","次查","復查","再查","系爭","固非無見","旋","惟查","均係"]
fwt = fw1+fw2

@jit
def ext_fw(txt,fw):
    # txt = input texts; fw = function word list
    j = np.empty(0)
    for i in fw:
        l = re.findall(i,txt)
        j = np.concatenate((j,np.array(l)))
    return j

In [10]:
#### generate csv files
def fw_csv(path):
    list_of_files = glob.glob(path)
    
    at = np.empty(0)
    for file_n in list_of_files:
        with open(file_n,'r') as myfile:         
            sl = myfile.read().replace('\n','')   # returns a string
            sa = np.array(re.findall(r'\S+', sl)) # returns an array of the above string
            
            # all texts as one
            at = np.concatenate((at,sa))
        
    btp = btp_ori(at,nb,ln) # virtual examples: # = nb, length = ln
    
    for i in range(btp.shape[0]):
        bts = " ".join(btp[i])
        fwr = ext_fw(bts,fw1+fw2)
        
        # frequency table
        fwr_str = "\t".join(fwr).split("\t")
        fwr_tb  = pd.Series(fwr_str).value_counts()
        
        # store in dictionary
        dict1 = {}
        dict1.update(fwr_tb)
        dict1.update({"NAME": "%s" %f})

        tp.append(dict1)
        
    return tp
    

In [11]:
#### read corpus
list_of_jud   = ["翁岳生","城仲模","林永謀","王和雄","余雪明","廖義男","曾有田","楊仁壽",
                 "彭鳳至","賴英照","謝在全","徐璧湖","林子儀","許宗力","許玉秀","林錫堯",
                 "池啟明","李震山","蔡清遊","賴浩敏","蘇永欽","黃茂榮","陳新民","陳春生","陳敏","葉百修"]

tp = [] # an empty list
for f in list_of_jud:
    # for each author ...
    path = './segmented_txt_2/%s/*.txt' %f 
    rs   = fw_csv(path)
    
df = pd.DataFrame(rs) 
# output csv
df.to_csv("./vectorized.csv", encoding="utf-8")

---
### Examine results:

In [12]:
import pandas as pd
test = pd.read_csv("./vectorized.csv", encoding="utf-8")
test

Unnamed: 0.1,Unnamed: 0,NAME,上開,不,之,也,了,云云,以,伊,...,致,與,茲,著,被,足資,這,遽認,還,那
0,0,翁岳生,4,40,384,,,,47,,...,4,37,,2,3,,,,,
1,1,翁岳生,1,45,356,,,,55,,...,4,45,,5,1,,,,,
2,2,翁岳生,3,48,360,,,,40,,...,2,34,,2,1,,,,,
3,3,翁岳生,,42,372,,,,51,,...,,29,,,1,,,,,
4,4,翁岳生,4,50,360,,,,42,,...,6,35,,2,1,,,,,
5,5,翁岳生,,52,314,,,,50,,...,4,35,,3,1,,,,,
6,6,翁岳生,1,51,344,,,,51,,...,3,37,,4,2,,,,,
7,7,翁岳生,4,42,346,,,,49,,...,5,32,,2,,,,,,
8,8,翁岳生,,51,360,,,,39,,...,6,40,,5,,,,,,
9,9,翁岳生,,63,348,,,,55,,...,3,37,,1,1,,,,,


---
### Generate testing data from judicial Yuan (cf. parser.ipynb)

In [13]:
tp = [] # an empty list
path = './test_d/*.txt'
rs   = fw_csv(path)
    
df = pd.DataFrame(rs) 
# output csv
df.to_csv("./test_ds.csv", encoding="utf-8")