# Data Preprocessing
將原始的資料做前處理，最後輸出成 pickle 檔供後續 model、分析使用。

In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import os
import re

import pickle

from tqdm import tqdm

path: raw data 存放的位置

savepath: 處理好的資料儲存檔名

In [2]:
# --- Input / output locations ---
# Raw spreadsheet, resolved relative to the current working directory.
path = os.path.join("..", "data", "./newdata_clean.xlsx")
# File name the processed pickle is written to at the end of the notebook.
savepath = "processed_data"

# --- Parallelism settings ---
# Number of worker processes used by the multi-process preprocessing step.
n_cpu = 4
# NOTE(review): not referenced by any of the visible cells; confirm whether
# it is still needed.
batch_size = 10000

## Read Data
使用 catName 當作資料的 label，可以用來做 supervised 的訓練。

In [3]:
df = pd.read_excel(path)

# Drop every row that has a missing value in any column.
# To inspect the affected rows first, run (before the dropna):
#   df[pd.isnull(df).any(axis=1)]
df = df.dropna()

# Map each distinct category name to an integer class id.
le = preprocessing.LabelEncoder()
le.fit(df['catName'].unique())
num_classes = len(le.classes_)
class_list = list(le.classes_)  # class_list[i] is the name encoded as id i

print("number of classes:",num_classes)

# Replace the whole column rather than writing through `.loc[:, ...]`.
# On recent pandas versions `.loc[:, col] = values` sets the values in place
# and keeps the column's existing dtype where it can, which would leave the
# encoded ids stored in an `object` column. Plain column assignment lets the
# column take the integer dtype returned by `transform`.
df['catName'] = le.transform(df['catName'])

# The free-text questions are what the rest of the notebook preprocesses.
data = df.question

number of classes: 64


## Preprocess function
下方的 function 分別負責不同的前處理項目，可以視情況選用；像是 bert 我就沒有使用下方的 4, 5, 6，以及部分 `_filter()` 的內容。
1. `_expandContractions()`:
    會根據 cList 的內容將出現在句子中的縮寫展開。
2. `_removeRedundant()`:
    因為原始的資料來自數個產品，往往會夾帶許多不一樣但是制式的內容，因此這邊利用 regular expression 找到後將之濾掉。
3. `_filter()`:
    這個 function 會把像是網址、上傳檔案等冗餘資訊濾掉，同時因為像是 tfidf 是針對不同的字做處理，所以把像是 "power dvd" 合成一個專有名詞 "powerdvd"。
4. `_correct_word()`:
    這個 function 會嘗試把打錯的字校正，然而速度非常慢！
5. `_lemmatization()`:
    這個 function 會把動詞的三態還原、複數名詞轉成單數等。
    它的原理是先用 `_get_wordnet_pos()` 將字做詞性的分類，之後再根據詞性去掉字尾。
6. `_remove_stopword()`:
    這個 function 會以 nltk 預設的 english 為基準去掉句子中的 stopword。可是因為並不是非常足夠，所以有加上我自己觀察資料所挑出的 stopword，以及人名的 list。

In [4]:
# Contraction -> expansion table used by `_expandContractions`.
cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you will",
  "you'll've": "you will have",
  "you're": "you are",
  "you've": "you have",
}

# Lower-cased view of the table, used to resolve case-insensitive matches
# (e.g. the key "I'm" must also expand an already lower-cased "i'm").
_c_lookup = {key.lower(): value for key, value in cList.items()}

# One alternation over all contractions:
#   * longest keys first, so "can't've" wins over its prefix "can't";
#   * keys are escaped so they are always matched literally;
#   * the look-arounds stop a contraction from matching inside a longer word;
#   * IGNORECASE because callers lower-case the text before expanding.
c_re = re.compile(
    r"(?<!\w)(%s)(?!\w)"
    % "|".join(re.escape(key) for key in sorted(cList, key=len, reverse=True)),
    flags=re.IGNORECASE,
)


def _expandContractions(text, c_re=c_re):
    """Expand every contraction listed in ``cList`` that occurs in ``text``.

    Args:
        text: The string to process.
        c_re: Compiled pattern whose matches are looked up in ``cList``;
            defaults to the module-level pattern built from the table.

    Returns:
        ``text`` with each matched contraction replaced by its expansion.
        A match that is written entirely in lower case gets a lower-case
        expansion, so lower-cased input stays lower-cased.

    Raises:
        KeyError: If a custom ``c_re`` matches text that is not in the table.
    """
    def replace(match):
        found = match.group(0)
        if found in cList:
            # Exact-case hit: use the table entry as written.
            return cList[found]
        expansion = _c_lookup[found.lower()]
        return expansion.lower() if found == found.lower() else expansion
    return c_re.sub(replace, text)

In [5]:
# Boilerplate markers as (compiled pattern, keep_tail) pairs, applied in order.
# keep_tail=False: keep only the text BEFORE the marker.
# keep_tail=True:  keep only the text AFTER the marker.
# Patterns are compiled once and are case-insensitive: callers lower-case the
# text first, so a pattern containing capitals ("Attach File") could otherwise
# never match.
# NOTE(review): in the "regards" pattern both prefixes are optional, so any
# whitespace followed by "regards" (e.g. "with regards to ...") also triggers
# the cut. Confirm that this is intended.
_REDUNDANT_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), keep_tail)
    for pattern, keep_tail in (
        (r"dear\s+valued\s+customer,", False),
        (r"order\s+confirmation\s+order\s+number:", False),
        (r"<li>comment:</li>", True),
        (r"time\s+of\s+this\s+report:", False),
        (r"this\s+correspondence\s+is\s+from", False),
        (r"forwarded\s+message", False),
        (r"(best)?(kind)?\s+regards", False),
        (r"media\s+source\s+error\s+report", False),
        (r"<br><br>Attach\s+File\s+:", False),
    )
)


def _removeRedundant(x):
    """Cut templated boilerplate out of a message.

    Each rule is searched in turn on the text left over from the previous
    rules. At the first occurrence of a marker the text is truncated: most
    markers discard everything from the marker onwards, while the
    ``<li>comment:</li>`` marker discards everything up to and including it.

    Args:
        x: The message text.

    Returns:
        The text with the boilerplate sections removed (unchanged when no
        marker is found).
    """
    for pattern, keep_tail in _REDUNDANT_RULES:
        hit = pattern.search(x)
        if hit is None:
            continue
        x = x[hit.end():] if keep_tail else x[:hit.start()]
    return x


# A file name is a run of non-space characters ending in ".<ext>", where the
# extension is 2-5 alphanumerics with at least one letter. Requiring a real
# extension keeps sentence-final words ("work.") and version numbers ("3.5"),
# which a bare "anything containing a dot" pattern would also delete.
_FILENAME_RE = re.compile(r'\S+\.(?=\d*[a-z])[a-z0-9]{2,5}\b\s?', re.IGNORECASE)

# Optional product-name normalisations, applied in order. They merge
# multi-word product names into one token (useful for bag-of-words models
# such as tf-idf) and expand a few abbreviations.
_PRODUCT_NAME_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r"power\s+dvd", "powerdvd"),
        (r"power\s+director", "powerdirector"),
        (r"audio\s+director", "audiodirector"),
        (r"color\s+director", "colordirector"),
        (r"action\s+director", "actiondirector"),
        (r"makeup\s+director", "makeupdirector"),
        (r"director\s+suite", "directorsuite"),
        (r"photo\s+director", "photodirector"),
        (r"blue?[-\s]*rays?", "bluray"),
        (r"power\s*(2|(to))\s*go", "power2go"),
        (r"cyber\s+link", "cyberlink"),
        # Abbreviations are matched as whole words only.
        (r"\bpdr\b", "powerdirector"),
        (r"\bpdvd\b", "powerdvd"),
        (r"\bpls\b", "please"),
        (r"add[-\s]*ons?", "addon"),
        (r"media[-\s]*suite", "mediasuite"),
    )
)


def _filter(x, *, merge_product_names=False):
    """Strip markup, URLs, e-mail addresses and file names from a message.

    The substitutions run in a fixed order: HTML tags, licence-key-like
    tokens (replaced by ``[KEY]``), ``http(s)://`` URLs, e-mail addresses,
    then file names. Everything except the key is replaced by one space.

    Args:
        x: The message text.
        merge_product_names: When true, additionally apply the product-name
            normalisations (e.g. "power dvd" -> "powerdvd"). Off by default.

    Returns:
        The filtered text.
    """
    x = re.sub(r'<[^<]*?/?>', ' ', x)                # remove all html tags
    x = re.sub(r"\w{5}(-\w{5}){5}", "[KEY]", x)      # e.g. sd2kk-33j7v-na4m2-m8n5s-sjaxq-bramm
    x = re.sub(r'https?:\/\/[^ ]*', ' ', x)          # remove all urls
    x = re.sub(r'\S*@\S*\s?', ' ', x)                # remove all email addresses
    x = _FILENAME_RE.sub(' ', x)                     # remove all file names
    if merge_product_names:
        for pattern, replacement in _PRODUCT_NAME_RULES:
            x = pattern.sub(replacement, x)
    return x

In [6]:
# Three or more repeats of the same character ("cooool").
_LENGTHENING_RE = re.compile(r"(.)\1{2,}")


def _correct_word(text1, speller=None):
    """Reduce character lengthening in a word, then spell-correct it.

    Args:
        text1: The word to correct.
        speller: Callable mapping a word to its corrected spelling. When
            omitted, a module-level ``spell`` callable is used. No such name
            is defined in the visible part of this file; presumably it came
            from a spell-correction package imported elsewhere (TODO confirm).

    Returns:
        The corrected word, lower-cased.

    Raises:
        NameError: If no ``speller`` is given and no ``spell`` callable exists.
    """
    if speller is None:
        try:
            speller = spell
        except NameError:
            raise NameError(
                "_correct_word needs a spelling function: pass `speller=` or "
                "define a module-level `spell` callable"
            ) from None
    shortened = _LENGTHENING_RE.sub(r"\1\1", text1)  # "cooool" -> "cool"
    return speller(shortened).lower()

In [7]:
import nltk
# Fetch every NLTK resource the helpers in this notebook rely on, not only the
# tokenizer model: `nltk.pos_tag` needs a tagger model and `WordNetLemmatizer`
# needs the WordNet corpus, and both raise LookupError when they are missing.
# NOTE(review): resource names differ between NLTK releases (newer ones use
# e.g. 'punkt_tab'); confirm against the installed version.
nltk.download('punkt')                       # used by nltk.word_tokenize
nltk.download('averaged_perceptron_tagger')  # used by nltk.pos_tag
nltk.download('wordnet')                     # used by WordNetLemmatizer
from nltk.corpus import wordnet
wnl = nltk.stem.WordNetLemmatizer()

def _get_wordnet_pos(tag):
    """Map the first letter of a POS tag to the matching WordNet constant.

    Args:
        tag: A single character: 'J', 'V', 'N' or 'R'.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        respectively, or ``None`` for any other value.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag)

def _lemmatization(tokens):
    """Lemmatize a list of tokens using their part of speech.

    The tokens are tagged with ``nltk.pos_tag``, which yields pairs such as
    ``('striped', 'JJ')``. The first letter of each tag selects the WordNet
    part of speech, and tags with no mapping are treated as nouns.

    Args:
        tokens: List of word tokens.

    Returns:
        A list with the lemma of each token, in the same order.
    """
    return [
        wnl.lemmatize(word, pos=_get_wordnet_pos(pos_tag[0]) or wordnet.NOUN)
        for word, pos_tag in nltk.pos_tag(tokens)
    ]

[nltk_data] Downloading package punkt to
[nltk_data]     /home/student/05/b05505004/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
nltk.download('stopwords')

# Personal names that `_remove_stopword` drops from the token stream.
namelist = [
    "paul", "jim", "larry", "ken", "wright",
    "peter", "donna", "ian", "rick", "richard",
    "william", "john", "chris", "tony", "joseph",
]

# NLTK's English stop words followed by extra, hand-picked entries.
stw = [
    *nltk.corpus.stopwords.words('english'),
    "nbsp", "would", "cant", "hey", "quot", "dont",
    "cyberlink", "guy", "wont", "didnt", "doesnt",
]
print(stw)
print("Stopwords length: {}".format(len(stw)))
def _remove_stopword(tokens):
    """Filter names, stop words and very short tokens out of a token list.

    A token found in ``namelist`` is dropped and reported on stdout. Any
    other token is kept only if it is not in ``stw`` and is longer than two
    characters.

    Args:
        tokens: List of word tokens.

    Returns:
        A new list with the surviving tokens, in their original order.
    """
    kept = []
    for token in tokens:
        if token in namelist:
            print("skip name {}".format(token))
        elif token not in stw and len(token) > 2:
            kept.append(token)
    return kept

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/05/b05505004/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


下方的區塊會正式對資料做前處理；因為資料量很大，所以使用 multi-process 加速。
1. 因為資料裡混有像是法文、韓文等非英文語言，所以使用 `langid` 套件把非英文語言濾掉（判定為非英文且分數低於 -100 的句子才會被濾掉）
2. 也有一個 `if` 可以濾掉過短句子（token 數不超過 5 的句子）


* `clean_data`: list 裡面的內容是原始 data 的句子。換句話說沒有經過任何預處理，僅濾掉過短、非英文的句子。
* `reduced_data`: list 裡的內容是 clean_data 的句子經過預處理。也就是根據選擇的 function 去掉 stopword、還原三態...。
* `token_data`: list 裡的每個元素是對應句子經過預處理後，再用 `nltk.word_tokenize` 斷詞所得到的 token list。

In [9]:
def preprocess(sentence):
    """Run the sentence-level cleaning steps and tokenize the result.

    The sentence is lower-cased and then passed through
    ``_removeRedundant``, ``_expandContractions`` and ``_filter``, in that
    order. The cleaned sentence is tokenized with ``nltk.word_tokenize``.

    Args:
        sentence: The raw sentence.

    Returns:
        A ``(cleaned_sentence, tokens)`` tuple. The cleaned sentence is
        returned as-is rather than re-joined from the tokens (BERT setup).
    """
    text = sentence.lower()
    for step in (_removeRedundant, _expandContractions, _filter):
        text = step(text)
    tokens = nltk.word_tokenize(text)

    # Optional token-level steps, disabled for the BERT setup. To enable
    # them, apply them to `tokens` here and return " ".join(tokens) as the
    # sentence instead of `text`:
    #   tokens = [_correct_word(word) for word in tokens]   # spell correction
    #   tokens = _lemmatization(tokens)                     # lemmatization
    #   tokens = _remove_stopword(tokens)                   # remove stopwords
    return text, tokens


import langid
def process_batch(batch):
    """Filter and preprocess one batch of sentences.

    A sentence is skipped when ``langid`` classifies it as non-English with a
    score below -100 (threshold as chosen by the original author), or when
    it has 5 or fewer tokens after preprocessing.

    Args:
        batch: Iterable of raw sentences (strings).

    Returns:
        A ``(clean_batch, reduced_batch, token_batch)`` tuple of equally long,
        index-aligned lists:
          * ``clean_batch``: the original sentences that passed the filters;
          * ``reduced_batch``: the same sentences after ``preprocess``;
          * ``token_batch``: the token list of each preprocessed sentence.

    NOTE: earlier versions of this function filled ``clean_batch`` and
    ``reduced_batch`` the other way round, so pickles written by them have
    the two lists swapped.
    """
    clean_batch = []
    reduced_batch = []
    token_batch = []
    for ori_s in tqdm(batch):
        lowered = ori_s.lower()
        lang, score = langid.classify(lowered)
        if lang != "en" and score < -100:  # remove languages other than English
            continue

        processed, tokens = preprocess(lowered)
        if len(tokens) <= 5:  # remove too-short sentences
            continue

        clean_batch.append(ori_s)        # untouched original sentence
        reduced_batch.append(processed)  # preprocessed sentence
        token_batch.append(tokens)
    return clean_batch, reduced_batch, token_batch

# Accumulators for the merged results of all workers.
clean_data, reduced_data, token_data = [], [], []

n_workers = n_cpu
from multiprocessing import Pool
n_data = len(data)
print("Origin Data length:",n_data)

# Split the data into one contiguous slice per worker. All slices have
# n_data // n_workers items, except the last one, which also takes the
# remainder.
chunk_len = n_data // n_workers
bounds = [
    (chunk_len * idx, n_data if idx == n_workers - 1 else chunk_len * (idx + 1))
    for idx in range(n_workers)
]

with Pool(processes=n_workers) as pool:
    # Submit one asynchronous job per slice, then wait for all of them.
    ret = [
        pool.apply_async(process_batch, [data[start:stop]])
        for start, stop in bounds
    ]
    pool.close()
    pool.join()

# Merge in submission order so the three lists stay index-aligned.
for pending in ret:
    part_clean, part_reduced, part_tokens = pending.get()
    clean_data.extend(part_clean)
    reduced_data.extend(part_reduced)
    token_data.extend(part_tokens)
print("done")

Origin Data length: 106478


100%|██████████| 26619/26619 [05:47<00:00, 76.63it/s] 
100%|██████████| 26619/26619 [05:58<00:00, 88.52it/s] 
100%|██████████| 26621/26621 [06:00<00:00, 73.78it/s] 
100%|█████████▉| 26571/26619 [06:01<00:00, 89.65it/s]
100%|██████████| 26619/26619 [06:02<00:00, 73.45it/s] 

done


In [10]:
# Bundle the three result lists into one dict and serialise it to `savepath`.
output = dict(
    clean_data=clean_data,
    reduced_data=reduced_data,
    token_data=token_data,
)
with open(savepath, "wb") as f:
    pickle.dump(output, f)