In [144]:
import numpy as np
from collections import Counter
import re
import jieba
from gensim.models import FastText
from scipy.spatial.distance import cosine
from sklearn.decomposition import TruncatedSVD

In [145]:
def split_sentence(text):
    #remove strange character
    text = text.replace("\r\n","")
    #split sentences
    sentences = re.split('([。?!！？.])', text)
    new_sentences = []
    for i in range(len(sentences)//2):
        sent = sentences[2*i] + sentences[2*i+1]
        new_sentences.append(sent)
    return new_sentences

def get_stop_words(stopwords_file):
    stop_words = []
    for line in open(stopwords_file, 'r', encoding='utf-8'):
        stop_words.append(line.replace('\n', ''))
    return stop_words

def get_tokens(sent):
    #remove punctuation
    sent = re.findall(r'[\d|\w]+', sent)
    stopwords = get_stop_words("stopwords.txt")
    words = list(jieba.cut("".join(sent)))
    #remove stopwords
    words = list(set(words) - set(stopwords))
    return words

In [146]:
def document_freq(text):
    sentences = split_sentence(text) 
    
    word_lists = [list(jieba.cut("".join(re.findall(r'[\d|\w]+', sentence)))) for sentence in sentences]
    
    words = [word for word_list in word_lists for word in word_list]
    
    word_freq = {w: c for w, c in Counter(words).items()}
    
    return word_freq

In [147]:
def SVD_modify(sentences_vec, npc=1):
    
    sentences_vec = np.array(sentences_vec)
    #print(sentences_vec)
    #print(sentences_vec.shape)
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(sentences_vec)
    u = svd.components_
    sentences_vec -= u.dot(u.transpose()) * sentences_vec
    return sentences_vec

In [148]:
def sentence_embedding(sentence, frequency, word2vec):
    
    alpha = 1e-4
    
    sent = re.findall(r'[\d|\w]+', sentence)
    
    words =jieba.cut("".join(sent)) 
    
    token_freq = {w: c for w, c in Counter(words).items()}
    
    high_token_freq = max(token_freq.values())
    
    high_freq = max(frequency.values())
    
    tokens = get_tokens(sentence)
    
    sentence_vec = np.zeros_like(word2vec.wv["跳舞"])
    
    for word in tokens:
        
        weight = alpha*token_freq.get(word, high_token_freq) /(alpha+frequency.get(word, high_freq))
        
        word_vec = weight * word2vec.wv[word]
        
        sentence_vec += word_vec
        
    sentence_vec /= len(tokens)
    

    return sentence_vec
    

In [149]:
def sentences_distance(text, title):
    
    #get sentence vector list
    sentences = split_sentence(text)
    
    word_freq = document_freq(text)
    
    print("Get sentence embedding...")
    
    print("Load model...")
    
    word2vec = FastText.load("../fasttext.model")
    
    print("Model is Done!")
    
    sentences_vec = [sentence_embedding(sentence, word_freq, word2vec) for sentence in sentences]
    
    print("sentence embedding is Done!")
    
    if title:
        sentences_vec += sentence_embedding(title, word_freq, word2vec)
    
    sentences_vec = SVD_modify(sentences_vec)
    
    #the mean of sentences vector is text vector
    text_vec = np.mean(sentences_vec)
    
    
    
    if title:
        sentences_vec = sentences_vec[1:]
        
    #get cosine
    sentences_cos = {}
    for iid, sentence_vec in enumerate(sentences_vec):
        sentences_cos[iid] = cosine(sentence_vec, text_vec)
    return sentences_cos, sentences
    

In [150]:
def sentences_ranking(text, title):
    
    print("Get sentence cos distance...")
    
    sentences_cos, sentences = sentences_distance(text, title)

    print("sentence cos distance is done!")
    
    #first sentence with reduce distance to half
    sentences_cos[0] /= 2
    
    sentences_cos = sorted(sentences_cos.items(), key = lambda x:x[1])
    
    return sentences_cos, sentences

In [151]:
def get_summarization_by_w2v_weight(text, title, summary_ratio):
    
    print("Get ranking of sentences...")
    
    sentences_ranking_id, sentences = sentences_ranking(text, title)
    
    print("ranking of sentences is done!")
    
    sentences_candidate = [sentence_id[0] for sentence_id in sentences_ranking_id[:len(sentences) // summary_ratio]]
    
    return " ".join([sentences[iid] for iid in sorted(sentences_candidate)])

In [152]:
if __name__ == '__main__':
    text = '网易娱乐7月21日报道林肯公园主唱查斯特·贝宁顿 Chester Bennington于今天早上,在洛杉矶帕洛斯弗迪斯的一个私人庄园自缢身亡,年仅41岁。此消息已得到洛杉矶警方证实。洛杉矶警方透露, Chester的家人正在外地度假, Chester独自在家,上吊地点是家里的二楼。一说是一名音乐公司工作人员来家里找他时发现了尸体,也有人称是佣人最早发现其死亡。林肯公园另一位主唱麦克信田确认了 Chester Bennington自杀属实,并对此感到震惊和心痛,称稍后官方会发布声明。Chester昨天还在推特上转发了一条关于曼哈顿垃圾山的新闻。粉丝们纷纷在该推文下留言,不相信 Chester已经走了。外媒猜测,Chester选择在7月20日自杀的原因跟他极其要好的朋友Soundgarden(声音花园)乐队以及AudioslaveChris乐队主唱 Cornell有关,因为7月20日是 Chris CornellChris的诞辰。而 Cornell于今年5月17日上吊自杀,享年52岁。 Chris去世后, Chester还为他写下悼文。对于 Chester的自杀,亲友表示震惊但不意外,因为 Chester曾经透露过想自杀的念头,他曾表示自己童年时被虐待,导致他医生无法走出阴影,也导致他长期酗酒和嗑药来疗伤。目前,洛杉矶警方仍在调查Chester的死因。据悉, Chester与毒品和酒精斗争多年,年幼时期曾被成年男子性侵,导致常有轻生念头。 Chester生前有过2段婚姻,育有6个孩子。林肯公园在今年五月发行了新专辑《多一丝曙光OneMoreLight》,成为他们第五张登顶ilboard排行榜的专辑。而昨晚刚刚发布新单《 Talking To Myself》MV'
    #text = '123'
    title = None
    summary_ratio = 4
    summarization = get_summarization_by_w2v_weight(text, title, summary_ratio)
    print(summarization)

Get ranking of sentences...
Get sentence cos distance...
Get sentence embedding...
Load model...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Model is Done!
sentence embedding is Done!


  dist = 1.0 - uv / np.sqrt(uu * vv)


sentence cos distance is done!
ranking of sentences is done!
网易娱乐7月21日报道林肯公园主唱查斯特·贝宁顿 Chester Bennington于今天早上,在洛杉矶帕洛斯弗迪斯的一个私人庄园自缢身亡,年仅41岁。 此消息已得到洛杉矶警方证实。 林肯公园另一位主唱麦克信田确认了 Chester Bennington自杀属实,并对此感到震惊和心痛,称稍后官方会发布声明。
