In [1]:
import dill
import pickle
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
from sklearn.preprocessing import MinMaxScaler
from tqdm._tqdm_notebook import tqdm_notebook

import jieba
jieba.set_dictionary('helper/dict.txt.big')
jieba.initialize()

Building prefix dict from C:\Users\Jiazhi\Kaggle Repo\CRSum+SF\helper\dict.txt.big ...
Loading model from cache C:\Users\Jiazhi\AppData\Local\Temp\jieba.ue68fbcada6799c78c240a9f241fbd839.cache
Loading model cost 1.119 seconds.
Prefix dict has been built succesfully.


### 讀取文本、打亂、並刪除null data

In [2]:
# Fixed seed so the shuffle -- and therefore the train/valid/test split, which is
# taken from row order -- is reproducible after a kernel restart.
RANDOM_SEED = 42

# Load the pickled corpus, drop every row that contains a null, and shuffle all rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
corpus_df = pd.read_pickle("data/corpus.pkl").dropna().sample(frac=1.0, random_state=RANDOM_SEED)
print("Shape:", corpus_df.shape)

Shape: (39990, 4)


## 對所有文本做分句分詞
### 檢查文章開頭是否為特別標籤

In [3]:
# Marker strings looked for by contains_header_words().
header_words = ["中國時報", "工商時報", "旺報", "▲"]


def contains_header_words(string):
    """Return True when `string` contains at least one entry of `header_words`."""
    return any(marker in string for marker in header_words)

In [4]:
def parse_to_sentences(document):
    """Split a raw document string into a list of sentence strings.

    The text is scanned character by character:
      * newlines are replaced by a single space;
      * before the first sentence has been emitted, an opening bracket
        ``(``, ``（`` or ``【`` that starts the text (or follows a header word)
        opens a header tag, which is emitted as its own sentence when the
        matching closing bracket ``】``, ``）`` or ``)`` is reached;
      * before the first sentence has been emitted, a buffer containing both
        '／' and '報導' is cut right after the character '導';
      * otherwise a sentence ends at any of ``，。?？!！:：;；``; a closing quote
        ``」`` directly after the punctuation is kept with that sentence.

    Args:
        document: the text to split.

    Returns:
        A list of non-empty, stripped sentence strings, or None when no
        sentence could be produced or the input could not be processed
        (e.g. it is None or a float NaN).
    """
    try:
        buffer, sentences = "", []
        header_tag, skip_flag = False, False

        for i, char in enumerate(document):
            # The previous iteration already consumed this closing quote.
            if skip_flag:
                skip_flag = False
                continue

            buffer += ' ' if char == '\n' else char

            # Check if head of corpus is special tag
            if not sentences:
                if char in "(（【":
                    if not buffer[:-1].strip() or contains_header_words(buffer):
                        header_tag = True
                        continue

                if not header_tag and char == '導' and '／' in buffer and '報導' in buffer:
                    sentence = buffer.strip()
                    if sentence:
                        sentences.append(sentence)
                        buffer = ""

            if header_tag:
                if char in "】）)":
                    sentence = buffer.strip()
                    sentences.append(sentence)
                    buffer, header_tag = "", False
                    continue

            # Punctuation-based sentence separation
            if char in '，。?？!！:：;；':
                # Explicit bounds check instead of relying on an exception for
                # the last character of the document.
                if i + 1 < len(document) and document[i + 1] == "」":
                    buffer += document[i + 1]
                    skip_flag = True

                sentence = buffer.strip()
                if sentence:
                    sentences.append(sentence)
                    buffer = ""

        # Flush whatever is left after the last punctuation mark.
        sentence = buffer.strip()
        if sentence:
            sentences.append(sentence)

        return sentences if sentences else None
    except Exception:
        # Best effort: unusable input yields None so the caller can drop the row.
        # Unlike a bare `except:`, KeyboardInterrupt / SystemExit still propagate.
        return None

### 分句並刪除null data

In [5]:
tqdm_notebook.pandas(desc="分句")
# parse_to_sentences returns None for documents it cannot split. Those values are
# kept in the column so the row-wise dropna() below removes the whole row.
# (A Series-level .dropna() before the assignment would be undone by index
# alignment on assignment, so it is not applied here.)
corpus_df["document"] = corpus_df["document"].progress_apply(parse_to_sentences)
corpus_df = corpus_df.dropna()

print("Shape:", corpus_df.shape)
corpus_df.sample(5)

HBox(children=(IntProgress(value=0, description='分句', max=39990), HTML(value='')))


Shape: (39863, 4)


Unnamed: 0,id,source,summary,document
21823,21823,中央社,首輛菸害體驗車 用遊戲扎根菸害防制,"[（中央社記者許秩維台北4日電）, 為了讓學生瞭解菸害造成的影響和傷害，, 教育部與衛生福利..."
27391,27391,聯合新聞網_即時新聞,7旬翁痛失愛車 女警細心辦案及時尋回,"[嘉義縣7旬吳姓民眾年歲已大，, 行動較不靈活，, 無法步行離家太遠，, 平時就喜歡騎乘心愛..."
24605,24605,蘋果日報,葉文淇改名求轉運 變身26號葉家淇,"[桃猿左投葉文淇為求轉運，, 去年球季後段改名葉家淇，, 今年背號也由75號改成26號。, ..."
18152,18152,三立新聞,逃不過警犬鼻子…槍擊犯拒捕　掐牠脖子狠咬頭部還是輸了,"[美國新罕布夏州有一名槍擊犯拒捕，, 藏匿在一間民宅，, 警方找來警犬吠陀（Veda）搜捕，..."
26482,26482,聯合新聞網_即時新聞,MIT地板清潔神器登場 吸擦噴拖樣樣在行,"[年前掃除話題夯，, 想要輕鬆搞定地板清潔，, 現在又有款台灣研發製造的新商品強勢登場！, ..."


### 對內文與標題作分詞

In [6]:
%%time
# Tokenise every sentence of every document: each document becomes a list of token lists.
tqdm_notebook.pandas(desc="內文分詞")
corpus_df.document = corpus_df.document.progress_apply(
    lambda sentences: [jieba.lcut(sentence) for sentence in sentences]
)
# Tokenise the summary (title) string into a flat token list.
tqdm_notebook.pandas(desc="標題分詞")
corpus_df.summary = corpus_df.summary.progress_apply(lambda title: jieba.lcut(title))

HBox(children=(IntProgress(value=0, description='內文分詞', max=39863), HTML(value='')))




HBox(children=(IntProgress(value=0, description='標題分詞', max=39863), HTML(value='')))


Wall time: 1min 44s


### 刪除標題長度不足2之數據（會無法計算bigram）

In [7]:
def delete_invalid_summary(summary):
    """Return `summary` unchanged when it has at least two items, else None.

    Args:
        summary: a sized collection (the notebook passes a token list).

    Returns:
        `summary` itself when ``len(summary) >= 2``; None when it is shorter or
        when it has no length at all (e.g. None or a float NaN).
    """
    try:
        return summary if len(summary) >= 2 else None
    except TypeError:
        # len() is not defined for this value; treat it as invalid instead of
        # hiding every possible error behind a bare `except:`.
        return None

In [8]:
# Replace every summary that has fewer than two tokens with None
# (delete_invalid_summary), then drop the rows that now contain a null.
tqdm_notebook.pandas(desc="標題驗證")
corpus_df.summary = corpus_df.summary.progress_apply(delete_invalid_summary)
corpus_df = corpus_df.dropna()
print("Shape:", corpus_df.shape)

HBox(children=(IntProgress(value=0, description='標題驗證', max=39863), HTML(value='')))


Shape: (39833, 4)


### 儲存id, summary備用

In [9]:
# Persist only the id and summary columns (summary is a token list at this point).
corpus_df[['id', 'summary']].to_pickle("data/summary.pkl")

## 每個來源各取總筆數的1/200作為驗證集與測試集（全部來源合計約各佔10%）

In [10]:
# Per-source size of the validation and the test split: 1/200 of the whole corpus.
# NOTE(review): despite its name this is not a tenth of anything by itself; the
# splits add up to roughly 10% each only across all sources.
one_tenth = len(corpus_df) // 200

# Collect the per-source slices and concatenate once at the end, instead of
# growing three DataFrames inside the loop (which copies them on every iteration
# and starts from an empty, column-less frame).
test_parts, valid_parts, train_parts = [], [], []
for source in corpus_df.source.unique():
    source_rows = corpus_df.loc[corpus_df.source == source]  # mask computed once per source
    test_parts.append(source_rows[:one_tenth])
    valid_parts.append(source_rows[one_tenth:one_tenth*2])
    train_parts.append(source_rows[one_tenth*2:])

# pd.concat rejects an empty list, so fall back to an empty frame when there is no source.
train_df = pd.concat(train_parts) if train_parts else pd.DataFrame()
valid_df = pd.concat(valid_parts) if valid_parts else pd.DataFrame()
test_df = pd.concat(test_parts) if test_parts else pd.DataFrame()

print("Train shape:", train_df.shape)
print("Valid shape:", valid_df.shape)
print("Test shape: ", test_df.shape)

Train shape: (31873, 4)
Valid shape: (3980, 4)
Test shape:  (3980, 4)


## 建立每篇文章的tf table

In [11]:
%%time
# id_word2tf maps a document id to a Counter of relative token frequencies
# (token count divided by the total number of tokens in that document).
id_word2tf = {}
for doc_id, sentences in zip(corpus_df['id'], corpus_df['document']):
    tokens = [token for sentence in sentences for token in sentence]
    total = len(tokens)

    frequencies = Counter(tokens)
    for token in frequencies:
        frequencies[token] /= total  # turn the raw count into a fraction

    # Kept as a Counter so that looking up an unseen token yields 0.
    id_word2tf[doc_id] = frequencies

Wall time: 7.42 s


# 訓練集

### 建立並儲存df table

In [12]:
%%time
# word2df maps a token to the fraction of training documents that contain it.
word2df = defaultdict(lambda: 0)
for document in train_df.document:
    # A set, so each token is counted at most once per document.
    vocabulary = set(token for sentence in document for token in sentence)
    for token in vocabulary:
        word2df[token] += 1

# Normalise the document counts by the number of training documents.
doc_count = len(train_df)
for token in word2df:
    word2df[token] /= doc_count

max_df_word = max(word2df, key=word2df.get)
print("最大df的詞：'%s' - %.4f" % (max_df_word, word2df[max_df_word]))

最大df的詞：'，' - 0.9801
Wall time: 4.77 s


In [13]:
# Serialise the document-frequency table with dill.
# NOTE(review): presumably dill rather than pickle because word2df is a defaultdict
# whose default factory is a lambda -- confirm before switching serialisers.
with open("helper/word2df.dill", 'wb') as file:
    dill.dump(word2df, file)

### 把文章堆砌成本文與前後文、padding、計算pos

In [14]:
# Context window size: M sentences before and N sentences after the target sentence.
M, N = 5, 5


def get_sen_labels():
    """Return the window column labels in order: 'stm<M>' ... 'stm1', 'st', 'stn1' ... 'stn<N>'."""
    preceding = ['stm' + str(offset) for offset in range(M, 0, -1)]
    following = ['stn' + str(offset) for offset in range(1, N + 1)]
    return preceding + ['st'] + following

In [15]:
def padding(sentence):
    """Return a new list holding `sentence` wrapped in '<L>' / '<R>' boundary tokens.

    The input is left untouched. Mutating it in place would also change the
    token lists stored in the DataFrame the sentence came from, so re-running a
    cell that pads the same documents would add the boundary tokens twice.

    Args:
        sentence: an iterable of tokens.

    Returns:
        A new list: ['<L>', *tokens, '<R>'].
    """
    return ['<L>'] + list(sentence) + ['<R>']

In [16]:
%%time
# Flatten the training articles into one record per sentence:
#   id_list        - id of the article the sentence belongs to
#   sen_list       - index of the sentence inside its article
#   sentences_list - the sentence with its M preceding / N following sentences
#   pos_list       - relative position of the sentence inside the article
id_list, sen_list, sentences_list, pos_list = [], [], [], []
for _, row in train_df.iterrows():
    # Only row.id is read below; the builtin `id` is deliberately not rebound.
    document = row.document
    doc_len = sum(len(sentence) for sentence in document)

    # pos = share of the article's tokens seen up to and including this sentence,
    # measured before any boundary token is added.
    word_cursor = 0
    for sentence in document:
        word_cursor += len(sentence)
        pos_list.append(word_cursor / doc_len)

    # Pad copies of the sentences so the token lists stored in train_df are not
    # modified and re-running this cell yields the same result. Missing context
    # at either end of the article is represented by empty lists.
    window_source = (
        [[] for _ in range(M)]
        + [padding(list(sentence)) for sentence in document]
        + [[] for _ in range(N)]
    )

    for i in range(len(window_source)-M-N):
        id_list.append(row.id)
        sen_list.append(i)
        sentences_list.append(window_source[i:i+M+N+1])

Wall time: 16.9 s


In [17]:
# One single-purpose frame per collected list; pos_df is joined in later together
# with the other features.
id_df, sen_df, sentences_df, pos_df = (
    pd.DataFrame(values) for values in (id_list, sen_list, sentences_list, pos_list)
)

# Side-by-side join: article id, sentence index, then the M+1+N window columns.
sentence_df = pd.concat([id_df, sen_df, sentences_df], axis=1)
sentence_df.columns = ['id', 'sen'] + get_sen_labels()

### 計算其他surface features與ROUGE-2

In [18]:
def get_tf(row):
    """Return the mean term frequency of the tokens in `row.st`.

    Frequencies come from the per-document table `id_word2tf[row.id]`. The sum is
    divided by ``len(row.st)``, i.e. by every element of the list as stored in the row.
    """
    frequencies = id_word2tf[row.id]
    tokens = row.st

    accumulated = 0
    for token in tokens:
        accumulated = accumulated + frequencies[token]
    return accumulated / len(tokens)

In [19]:
def get_df(row):
    """Return the mean document frequency of the tokens in `row.st`.

    Tokens missing from `word2df` contribute 0. They are read with ``.get`` so
    that a lookup never inserts a new key into the shared table (plain indexing
    on a defaultdict would add one entry per unseen token).

    Args:
        row: an object exposing the token list as ``row.st``.

    Returns:
        Sum of the tokens' document frequencies divided by ``len(row.st)``.
    """
    df = 0
    for word in row.st:
        df += word2df.get(word, 0)
    return df / len(row.st)

In [20]:
def get_bigrams(sentence):
    """Return the set of adjacent token pairs (bigrams) of `sentence`.

    A sentence with fewer than two tokens yields an empty set.
    """
    # Pair every token with its successor; zip stops at the shorter sequence.
    return set(zip(sentence, sentence[1:]))

In [21]:
# Work on an explicit copy: assigning a column on a slice of corpus_df raises
# pandas' SettingWithCopyWarning and leaves it ambiguous which frame is modified.
summary_bigrams_df = corpus_df[['id', 'summary']].copy()
# Replace every summary token list by its set of bigrams.
summary_bigrams_df['summary'] = summary_bigrams_df['summary'].apply(get_bigrams)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [22]:
# Cache used by _summary_bigrams_by_id(): the frame the lookup was built from,
# and the lookup itself.
_summary_bigram_cache = {'frame': None, 'by_id': {}}


def _summary_bigrams_by_id():
    """Return an {id: summary bigram set} dict for `summary_bigrams_df`, rebuilt when that name is rebound."""
    frame = summary_bigrams_df
    if _summary_bigram_cache['frame'] is not frame:
        by_id = {}
        for doc_id, bigrams in zip(frame['id'], frame['summary']):
            # Keep the first row of a repeated id, like a positional [0] lookup would.
            by_id.setdefault(doc_id, bigrams)
        _summary_bigram_cache['frame'] = frame
        _summary_bigram_cache['by_id'] = by_id
    return _summary_bigram_cache['by_id']


def get_rouge_2(row):
    """Return the share of the summary's bigrams that also occur in `row.st`.

    The summary bigrams are read from a dictionary keyed by id instead of
    scanning the whole `summary_bigrams_df` with a boolean mask for every row.

    Args:
        row: an object exposing ``row.id`` and the token list ``row.st``.

    Returns:
        0 when `row.st` has fewer than two tokens or the summary has no bigram;
        otherwise (number of distinct bigrams of `row.st` found in the summary)
        divided by (number of distinct summary bigrams).

    Raises:
        KeyError: when `row.id` is not present in `summary_bigrams_df`.
    """
    if len(row.st) < 2:
        return 0

    st_bigrams = get_bigrams(row.st)
    summary_bigrams = _summary_bigrams_by_id()[row.id]
    if not summary_bigrams:
        # Nothing to match against; avoid dividing by zero.
        return 0

    overlap = sum(bigram in summary_bigrams for bigram in st_bigrams)
    return overlap / len(summary_bigrams)

In [23]:
%%time
# Each tqdm_notebook.pandas(...) call sets the progress-bar label for the
# progress_apply that follows it, so the statement order matters.

# Sentence length: the stored sentence minus 2 (the '<L>' and '<R>' boundary tokens).
tqdm_notebook.pandas(desc="len")
len_df = sentence_df.st.progress_apply(lambda x: len(x)-2)
# Mean term frequency of the target sentence (get_tf).
tqdm_notebook.pandas(desc="tf")
tf_df = sentence_df.progress_apply(get_tf, axis=1)
# Mean document frequency of the target sentence (get_df).
tqdm_notebook.pandas(desc="df")
df_df = sentence_df.progress_apply(get_df, axis=1)
# Bigram overlap between the target sentence and the article summary (get_rouge_2).
tqdm_notebook.pandas(desc="ROUGE-2")
rouge_2_df = sentence_df.progress_apply(get_rouge_2, axis=1)

HBox(children=(IntProgress(value=0, description='len', max=1249873), HTML(value='')))




HBox(children=(IntProgress(value=0, description='tf', max=1249873), HTML(value='')))




HBox(children=(IntProgress(value=0, description='df', max=1249873), HTML(value='')))




HBox(children=(IntProgress(value=0, description='ROUGE-2', max=1249873), HTML(value='')))


Wall time: 13min 27s


In [24]:
# Join the window columns and every computed feature side by side.
# Note: this rebinds train_df from the article-level frame to the sentence-level table.
# NOTE(review): the join relies on all pieces sharing the same default index -- confirm.
train_df = pd.concat([sentence_df, len_df, pos_df, tf_df, df_df, rouge_2_df], axis=1)
train_df.columns = ['id', 'sen'] + get_sen_labels() + ['len', 'pos', 'tf', 'df', 'rouge_2']

print("Shape:", train_df.shape)
# Preview a subset of the columns only (the two nearest context sentences on each side).
train_df[['id', 'sen', 'stm2', 'stm1', 'st', 'stn1', 'stn2', 'len', 'pos', 'tf', 'df', 'rouge_2']].head()

Shape: (1249873, 18)


Unnamed: 0,id,sen,stm2,stm1,st,stn1,stn2,len,pos,tf,df,rouge_2
0,6939,0,[],[],"[<L>, (, 新增, ：, <R>]","[<L>, 動, 新聞, ), <R>]","[<L>, 台北市, 中心, 又, 見, 隨機性, 侵狼, ，, <R>]",3,0.007634,0.002545,0.146939,0.0
1,6939,1,[],"[<L>, (, 新增, ：, <R>]","[<L>, 動, 新聞, ), <R>]","[<L>, 台北市, 中心, 又, 見, 隨機性, 侵狼, ，, <R>]","[<L>, 這次, 嫌犯, 竟是, 曾, 在, 海軍陸戰隊, 服役, 的, 壯漢, ！, <R>]",3,0.015267,0.002036,0.112396,0.0
2,6939,2,"[<L>, (, 新增, ：, <R>]","[<L>, 動, 新聞, ), <R>]","[<L>, 台北市, 中心, 又, 見, 隨機性, 侵狼, ，, <R>]","[<L>, 這次, 嫌犯, 竟是, 曾, 在, 海軍陸戰隊, 服役, 的, 壯漢, ！, <R>]","[<L>, 一名, 年輕, 女子, 數天, 前, 凌晨, 2, 點多, ，, <R>]",7,0.033079,0.015267,0.146139,0.0
3,6939,3,"[<L>, 動, 新聞, ), <R>]","[<L>, 台北市, 中心, 又, 見, 隨機性, 侵狼, ，, <R>]","[<L>, 這次, 嫌犯, 竟是, 曾, 在, 海軍陸戰隊, 服役, 的, 壯漢, ！, <R>]","[<L>, 一名, 年輕, 女子, 數天, 前, 凌晨, 2, 點多, ，, <R>]","[<L>, 獨自, 在, 台北市, 市民, 大道, 、, 中山北路, 口, 附近, 遛狗, ...",10,0.058524,0.006573,0.180605,0.0
4,6939,4,"[<L>, 台北市, 中心, 又, 見, 隨機性, 侵狼, ，, <R>]","[<L>, 這次, 嫌犯, 竟是, 曾, 在, 海軍陸戰隊, 服役, 的, 壯漢, ！, <R>]","[<L>, 一名, 年輕, 女子, 數天, 前, 凌晨, 2, 點多, ，, <R>]","[<L>, 獨自, 在, 台北市, 市民, 大道, 、, 中山北路, 口, 附近, 遛狗, ...","[<L>, 突遭, 一名, 壯漢, 尾隨, 、, 持刀, 抵住, 脖子, 搶劫, ，, <R>]",9,0.081425,0.014573,0.15938,0.0


### feature scaling

In [25]:
# Fit a MinMaxScaler (default settings) on the training sentence lengths and
# overwrite the column with the scaled values. reshape(-1, 1) turns the 1-D
# column into the single-feature 2-D array that is passed to the scaler.
len_scaler = MinMaxScaler()
train_df.len = len_scaler.fit_transform(train_df.len.values.reshape(-1, 1))



In [26]:
# Persist the fitted length scaler so the same scaling can be reused later.
with open("helper/len-scaler.pkl", 'wb') as file:
    pickle.dump(len_scaler, file)

In [27]:
# Persist the sentence-level training table (window columns plus features).
train_df.to_pickle('data/train_set.pkl')

# 驗證集
### df table與scaler拿訓練集的來用

In [28]:
# Flatten the validation articles into one record per sentence:
#   id_list        - id of the article the sentence belongs to
#   sen_list       - index of the sentence inside its article
#   sentences_list - the sentence with its M preceding / N following sentences
#   pos_list       - relative position of the sentence inside the article
id_list, sen_list, sentences_list, pos_list = [], [], [], []
for _, row in valid_df.iterrows():
    # Only row.id is read below; the builtin `id` is deliberately not rebound.
    document = row.document
    doc_len = sum(len(sentence) for sentence in document)

    # pos = share of the article's tokens seen up to and including this sentence,
    # measured before any boundary token is added.
    word_cursor = 0
    for sentence in document:
        word_cursor += len(sentence)
        pos_list.append(word_cursor / doc_len)

    # Pad copies of the sentences so the token lists stored in valid_df are not
    # modified and re-running this cell yields the same result. Missing context
    # at either end of the article is represented by empty lists.
    window_source = (
        [[] for _ in range(M)]
        + [padding(list(sentence)) for sentence in document]
        + [[] for _ in range(N)]
    )

    for i in range(len(window_source)-M-N):
        id_list.append(row.id)
        sen_list.append(i)
        sentences_list.append(window_source[i:i+M+N+1])

In [29]:
# One single-purpose frame per collected list; pos_df is joined in later together
# with the other features.
id_df, sen_df, sentences_df, pos_df = (
    pd.DataFrame(values) for values in (id_list, sen_list, sentences_list, pos_list)
)

# Side-by-side join: article id, sentence index, then the M+1+N window columns.
sentence_df = pd.concat([id_df, sen_df, sentences_df], axis=1)
sentence_df.columns = ['id', 'sen'] + get_sen_labels()

In [30]:
%%time
# Same feature computation as for the training set, applied to the validation
# sentences. Each tqdm_notebook.pandas(...) call sets the progress-bar label
# for the progress_apply that follows it.

# Sentence length: the stored sentence minus 2 (the '<L>' and '<R>' boundary tokens).
tqdm_notebook.pandas(desc="len")
len_df = sentence_df.st.progress_apply(lambda x: len(x)-2)
# Mean term frequency of the target sentence (get_tf).
tqdm_notebook.pandas(desc="tf")
tf_df = sentence_df.progress_apply(get_tf, axis=1)
# Mean document frequency (get_df); word2df was built from the training articles.
tqdm_notebook.pandas(desc="df")
df_df = sentence_df.progress_apply(get_df, axis=1)
# Bigram overlap between the target sentence and the article summary (get_rouge_2).
tqdm_notebook.pandas(desc="ROUGE-2")
rouge_2_df = sentence_df.progress_apply(get_rouge_2, axis=1)

HBox(children=(IntProgress(value=0, description='len', max=155691), HTML(value='')))




HBox(children=(IntProgress(value=0, description='tf', max=155691), HTML(value='')))




HBox(children=(IntProgress(value=0, description='df', max=155691), HTML(value='')))




HBox(children=(IntProgress(value=0, description='ROUGE-2', max=155691), HTML(value='')))


Wall time: 1min 36s


In [31]:
# Join the window columns and every computed feature side by side.
# Note: this rebinds valid_df from the article-level frame to the sentence-level table.
valid_df = pd.concat([sentence_df, len_df, pos_df, tf_df, df_df, rouge_2_df], axis=1)
valid_df.columns = ['id', 'sen'] + get_sen_labels() + ['len', 'pos', 'tf', 'df', 'rouge_2']
# transform only (no refit): reuse the scaler fitted on the training lengths.
valid_df.len = len_scaler.transform(valid_df.len.values.reshape(-1, 1))
print("Shape:", valid_df.shape)

Shape: (155691, 18)


In [32]:
# Persist the sentence-level validation table (window columns plus features).
valid_df.to_pickle('data/valid_set.pkl')

# 測試集
### 與驗證集做一樣的處理

In [33]:
# Flatten the test articles into one record per sentence:
#   id_list        - id of the article the sentence belongs to
#   sen_list       - index of the sentence inside its article
#   sentences_list - the sentence with its M preceding / N following sentences
#   pos_list       - relative position of the sentence inside the article
id_list, sen_list, sentences_list, pos_list = [], [], [], []
for _, row in test_df.iterrows():
    # Only row.id is read below; the builtin `id` is deliberately not rebound.
    document = row.document
    doc_len = sum(len(sentence) for sentence in document)

    # pos = share of the article's tokens seen up to and including this sentence,
    # measured before any boundary token is added.
    word_cursor = 0
    for sentence in document:
        word_cursor += len(sentence)
        pos_list.append(word_cursor / doc_len)

    # Pad copies of the sentences so the token lists stored in test_df are not
    # modified and re-running this cell yields the same result. Missing context
    # at either end of the article is represented by empty lists.
    window_source = (
        [[] for _ in range(M)]
        + [padding(list(sentence)) for sentence in document]
        + [[] for _ in range(N)]
    )

    for i in range(len(window_source)-M-N):
        id_list.append(row.id)
        sen_list.append(i)
        sentences_list.append(window_source[i:i+M+N+1])

In [34]:
# One single-purpose frame per collected list; pos_df is joined in later together
# with the other features.
id_df, sen_df, sentences_df, pos_df = (
    pd.DataFrame(values) for values in (id_list, sen_list, sentences_list, pos_list)
)

# Side-by-side join: article id, sentence index, then the M+1+N window columns.
sentence_df = pd.concat([id_df, sen_df, sentences_df], axis=1)
sentence_df.columns = ['id', 'sen'] + get_sen_labels()

In [35]:
%%time
# Same feature computation as for the training set, applied to the test
# sentences. Each tqdm_notebook.pandas(...) call sets the progress-bar label
# for the progress_apply that follows it.

# Sentence length: the stored sentence minus 2 (the '<L>' and '<R>' boundary tokens).
tqdm_notebook.pandas(desc="len")
len_df = sentence_df.st.progress_apply(lambda x: len(x)-2)
# Mean term frequency of the target sentence (get_tf).
tqdm_notebook.pandas(desc="tf")
tf_df = sentence_df.progress_apply(get_tf, axis=1)
# Mean document frequency (get_df); word2df was built from the training articles.
tqdm_notebook.pandas(desc="df")
df_df = sentence_df.progress_apply(get_df, axis=1)
# Bigram overlap between the target sentence and the article summary (get_rouge_2).
tqdm_notebook.pandas(desc="ROUGE-2")
rouge_2_df = sentence_df.progress_apply(get_rouge_2, axis=1)

HBox(children=(IntProgress(value=0, description='len', max=156228), HTML(value='')))




HBox(children=(IntProgress(value=0, description='tf', max=156228), HTML(value='')))




HBox(children=(IntProgress(value=0, description='df', max=156228), HTML(value='')))




HBox(children=(IntProgress(value=0, description='ROUGE-2', max=156228), HTML(value='')))


Wall time: 1min 37s


In [36]:
# Join the window columns and every computed feature side by side.
# Note: this rebinds test_df from the article-level frame to the sentence-level table.
test_df = pd.concat([sentence_df, len_df, pos_df, tf_df, df_df, rouge_2_df], axis=1)
test_df.columns = ['id', 'sen'] + get_sen_labels() + ['len', 'pos', 'tf', 'df', 'rouge_2']
# transform only (no refit): reuse the scaler fitted on the training lengths.
test_df.len = len_scaler.transform(test_df.len.values.reshape(-1, 1))
print("Shape:", test_df.shape)

Shape: (156228, 18)


In [37]:
# Persist the sentence-level test table (window columns plus features).
test_df.to_pickle('data/test_set.pkl')