### Vectorize Data Sets

In [5]:
from numba import jit, autojit # accelerate for loops
import numpy as np
import numpy.random as npr
import re # regular expressions
import glob
import pandas as pd

In [6]:
###--- Methods ---###

#### bootstrapping: resampling (virtual example)

nb = 500  # how many samples ?
ln = 3000 # how many words for each sample ? (3000 ~ length of each text)

def btp_ori(tot,nb,ln):
    idx = npr.randint(0, len(tot), (nb,ln)) # randomly pick indices from the 'at' array, 'nb' rows
    return np.array(tot[idx])

#### extract function words
# function word list: Bei Yu 2012: Function Words for Chinese Authorship Attribution
fw1 = ["的","是","不","了","在","有","這","為","地","也","得","就","那","以","著","之","可","于"
       "麼","而","然","没","於","還","只","無","又","如","但","其","此","與","把","全","被","卻"]

# difficult word list: 司法院裁判書類通俗化小組
fw2 = ["殆","似","難認無過失","難認有理由","尚難謂為於法無違","非無理由","非有理由","尚非無疑","尚非不可能",
       "可徵","徵諸","洵堪","迭於","係經","係指","洵屬","伊","渠","致","爰","遽認","矧","略以","按","第按","是","茲",
       "揆諸","皆無該條適用","即","無庸","縱",
       "上開","足資","可稽","堪以認定","業據","云云","等語","之","次查","復查","再查","系爭","固非無見","旋","惟查","均係"]

@jit
def ext_fw(txt,fw):
    # txt = input texts; fw = function word list
    j = np.empty(0)
    for i in fw:
        l = re.findall(i,txt)
        j = np.concatenate((j,np.array(l)))
    return j

In [8]:
#### read corpus
list_of_jud   = ["翁岳生","城仲模","林永謀","王和雄","余雪明","廖義男","曾有田","楊仁壽",
                 "彭鳳至","賴英照","謝在全","徐璧湖","林子儀","許宗力","許玉秀","林錫堯",
                 "池啟明","李震山","蔡清遊","賴浩敏","蘇永欽","黃茂榮","陳新民","陳春生","陳敏","葉百修"]
for f in list_of_jud:
    # for each author ...
    list_of_files = glob.glob('./segmented_txt_2/%s/*.txt' %f )
    
    for file_n in list_of_files:
        with open(file_n,'r') as myfile:
            st = ""
            at = np.empty(0)
            
            sl = myfile.read().replace('\n','')   # returns a string
            sa = np.array(re.findall(r'\S+', sl)) # returns an array of the above string
            
            # all texts as one
            at = np.concatenate((at,sa))
        
    btp = btp_ori(at,nb,ln) # virtual examples: # = nb, length = ln

    #with open("%s.csv" %f,"w") as fout:
    #    for i in range(btp.shape[0]):
    #        bts = " ".join(btp[i]) # convert to a single string for 'ext_fw'
    #        fwr = ext_fw(bts,fw1+fw2) # default using BOTH function word list AND difficult words
            
    #        fout.write(fwr)
    
    tp = pd.DataFrame() # an empty dataframe
    for i in range(btp.shape[0]):
        bts = " ".join(btp[i])
        fwr = ext_fw(bts,fw1+fw2)
        
        tp = tp.append(pd.DataFrame.transpose(pd.DataFrame(fwr)))
    
    tp.to_csv("./vectorized/%s.csv" %f, encoding="utf-8")

In [9]:
## examine results: example
import pandas as pd
test = pd.read_csv("./vectorized/余雪明.csv", encoding="utf-8")
test

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,1098,1099,1100,1101,1102,1103,1104,1105,1106,1107
0,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
1,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
2,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
3,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
4,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
5,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
6,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
7,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
8,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
9,0,的,的,的,的,的,的,的,的,的,...,,,,,,,,,,
