### Vectorize Data Sets

In [2]:
from numba import jit, autojit # accelerate for loops
import numpy as np
import numpy.random as npr
import re # regular expressions
import glob
import pandas as pd

In [11]:
###--- Methods ---###

#### bootstrapping: resampling (virtual example)

nb = 100  # how many samples ?
ln = 3000 # how many words for each sample ? (3000 ~ length of each text)

def btp_ori(tot,nb,ln):
    if len(tot) == 0: return np.empty(0)
    else:
        idx = npr.randint(0, len(tot), (nb,ln)) # randomly pick indices from the 'at' array, 'nb' rows
        return np.array(tot[idx])

#### extract function words
# function word list: Bei Yu 2012: Function Words for Chinese Authorship Attribution
fw1 = ["的","是","不","了","在","有","這","為","地","也","得","就","那","以","著","之","可","于"
       "麼","而","然","没","於","還","只","無","又","如","但","其","此","與","把","全","被","卻"]

# difficult word list: 司法院裁判書類通俗化小組
fw2 = ["殆","似","難認無過失","難認有理由","尚難謂為於法無違","非無理由","非有理由","尚非無疑","尚非不可能",
       "可徵","徵諸","洵堪","迭於","係經","係指","洵屬","伊","渠","致","爰","遽認","矧","略以","按","第按","茲",
       "揆諸","皆無該條適用","即","無庸","縱",
       "上開","足資","可稽","堪以認定","業據","云云","等語","次查","復查","再查","系爭","固非無見","旋","惟查","均係"]
fwt = fw1+fw2

@jit
def ext_fw(txt,fw):
    # txt = input texts; fw = function word list
    j = np.empty(0)
    for i in fw:
        l = re.findall(i,txt)
        j = np.concatenate((j,np.array(l)))
    return j

In [12]:
#### generate csv files
global file_idx
file_idx = 0
def fw_csv(path,typ):
    list_of_files = glob.glob(path)

    for file_n in list_of_files:
        with open(file_n,'r') as myfile:         
            sl = myfile.read().replace('\n','')   # returns a string
            at = np.array(re.findall(r'\S+', sl)) # returns an array of the above string
            
            ## all texts as one
            # at = np.concatenate((at,sa))
        
        if   typ == 0: btp = btp_ori(at,nb,ln) # virtual examples: # = nb, length = ln
        elif typ == 1: btp = np.array(at,ndmin=2) # for test data
            
        for i in range(btp.shape[0]):
            bts = " ".join(btp[i])
            fwr = ext_fw(bts,fw1+fw2)

            # frequency table
            fwr_str = "\t".join(fwr).split("\t")
            fwr_tb  = pd.Series(fwr_str).value_counts()

            # store in dictionary
            dict1 = {}
            dict1.update(fwr_tb)
            dict1.update({"NAME": "%s" %f})
            dict1.update({"file_n": file_idx})
            dict1.update({"total wc": btp[i].shape[0]}) # word count
            tp.append(dict1)
        global file_idx
        file_idx+=1
       
    return tp

In [13]:
#### Orderring columns
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

def reord_col(df):
    for a in fwt:
        if a not in [x.encode('utf-8') for x in df.columns]:
            df = df.append(pd.DataFrame(columns=[a]))

    col = ["NAME","file_n","total wc"]+fwt
    df = df[col]
    return df

In [14]:
#########################
###-- Training Data --###
#########################

#### read corpus
list_of_jud   = ["翁岳生","城仲模","林永謀","王和雄","余雪明","廖義男","曾有田","楊仁壽",
                 "彭鳳至","賴英照","謝在全","徐璧湖","林子儀","許宗力","許玉秀","林錫堯",
                 "池啟明","李震山","蔡清遊","賴浩敏","蘇永欽","黃茂榮","陳新民","陳春生","陳敏","葉百修"]

tp = [] # an empty list
file_idx = 0
for f in list_of_jud:
    # for each author ...
    path = './segmented_txt_2/%s/*.txt' %f 
    rs   = fw_csv(path,0)
    
df = pd.DataFrame(rs) 
df = reord_col(df)
# output csv
df.to_csv("./vectorized.csv", encoding="utf-8")

---
### Examine results:

In [15]:
import pandas as pd
test = pd.read_csv("./vectorized.csv", encoding="utf-8")
test

Unnamed: 0.1,Unnamed: 0,NAME,file_n,total wc,的,是,不,了,在,有,...,云云,等語,次查,復查,再查,系爭,固非無見,旋,惟查,均係
0,0,翁岳生,0,3000,17,11,46,,19,67,...,,,,,,4,,,,
1,1,翁岳生,0,3000,22,11,50,,18,68,...,,,,,,3,,,,
2,2,翁岳生,0,3000,17,9,32,,13,69,...,,,,,,6,,,,
3,3,翁岳生,0,3000,23,11,55,,15,56,...,,,,,,2,,,,
4,4,翁岳生,0,3000,20,14,50,,25,67,...,,,,,,4,,,,
5,5,翁岳生,0,3000,17,15,46,,15,75,...,,,,,,4,,,,
6,6,翁岳生,0,3000,21,5,44,,20,63,...,,,,,,6,,,,
7,7,翁岳生,0,3000,15,9,46,,12,71,...,,,,,,6,,,,
8,8,翁岳生,0,3000,18,11,47,,18,84,...,,,,,,5,,,,
9,9,翁岳生,0,3000,24,8,52,,18,58,...,,,,,,2,,,,


---
### Generate testing data from judicial Yuan (cf. parser.ipynb)

In [16]:
#####################
###-- Test Data --###
#####################

file_idx = 573 ## interpretation label
tp = [] # an empty list
path = './test_d/*.txt'
f    = "test data"
rs   = fw_csv(path,1)
    
df = pd.DataFrame(rs)
df = reord_col(df)

# output csv
df.to_csv("./test_ds.csv", encoding="utf-8")

In [17]:
import pandas as pd
test = pd.read_csv("./test_ds.csv", encoding="utf-8")
test

Unnamed: 0.1,Unnamed: 0,NAME,file_n,total wc,的,是,不,了,在,有,...,云云,等語,次查,復查,再查,系爭,固非無見,旋,惟查,均係
0,0,test data,573,674,1,2,8,,4,13,...,,,,,,1,,,,
1,1,test data,574,559,2,3,9,,3,8,...,,,,,,,,,,
2,2,test data,575,513,1,1,9,,4,8,...,,,,,,,,,,
3,3,test data,576,388,2,2,16,,2,7,...,,,,,,1,,,,
4,4,test data,577,554,2,2,10,1,2,10,...,,,,,,,,,,
5,5,test data,578,544,2,,4,,3,11,...,,,,,,,,,,
6,6,test data,579,472,1,2,7,,3,6,...,,,,,,1,,,,
7,7,test data,580,759,1,2,13,,3,15,...,,,,,,,,,,
8,8,test data,581,252,,,8,,4,6,...,,,,,,1,,,,
9,9,test data,582,913,2,1,15,,5,16,...,,1,,,,2,,,,
