In [2]:
import numpy as np
from scipy.spatial.distance import cosine
from gensim.models import FastText



In [3]:
import re
import jieba    

In [4]:

def split_sentence(text):
    """Split ``text`` into sentences, keeping each terminator attached.

    A sentence ends at any one of the characters ``。?!！？.``. Windows line
    breaks (``"\\r\\n"``) are removed before splitting.

    Args:
        text: The document to split, as a string.

    Returns:
        A list of sentence strings in document order. Each terminated
        sentence ends with its terminator. Text following the last
        terminator is kept as a final sentence when it contains anything
        other than whitespace, so an unterminated last sentence (or a text
        with no terminator at all) is not lost.
    """
    # remove Windows line breaks so they do not end up inside a sentence
    text = text.replace("\r\n", "")
    # Because the pattern is a capturing group, re.split returns
    # [body, terminator, body, terminator, ..., tail] (always odd length).
    pieces = re.split('([。?!！？.])', text)
    new_sentences = [body + mark for body, mark in zip(pieces[0::2], pieces[1::2])]
    # Keep whatever follows the last terminator unless it is blank.
    tail = pieces[-1]
    if tail.strip():
        new_sentences.append(tail)
    return new_sentences
def get_stop_words(stopwords_file):
    """Read a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        stopwords_file: Path of the file to read.

    Returns:
        A list with one string per line of the file, in file order, with
        the newline character removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(stopwords_file, 'r', encoding='utf-8') as handle:
        return [line.replace('\n', '') for line in handle]

#Chinese
def get_tokens(sent):
    """Tokenise a sentence with jieba and drop stop words.

    Runs of word characters are extracted from ``sent`` (which discards
    punctuation and whitespace), concatenated without separators, and passed
    to ``jieba.cut``. Tokens listed in ``stopwords.txt`` are removed.

    Args:
        sent: The sentence to tokenise, as a string.

    Returns:
        A list of distinct tokens, in order of first occurrence.
    """
    # remove punctuation: `\w` already covers digits; the previous class
    # `[\d|\w]` also matched a literal "|" and let pipes through.
    chunks = re.findall(r'\w+', sent)
    stopwords = set(get_stop_words("stopwords.txt"))
    segmented = jieba.cut("".join(chunks))
    # remove stopwords and duplicates; dict.fromkeys keeps a stable order
    # where a plain set difference would return tokens in arbitrary order
    words = list(dict.fromkeys(word for word in segmented if word not in stopwords))
    return words

#English
def get_token(sent):
    """Tokenise a whitespace-delimited sentence and drop stop words.

    Args:
        sent: The sentence to tokenise, as a string.

    Returns:
        A list of the distinct whitespace-separated tokens of ``sent`` that
        do not appear in ``stopwords.txt``. The order of the list is the
        iteration order of a set and is therefore unspecified.
    """
    # Load the stop words first, then split on any run of whitespace.
    excluded = set(get_stop_words("stopwords.txt"))
    candidates = set(sent.split())
    # remove stopwords
    return list(candidates - excluded)

In [5]:
def get_sentence_cos(text, title):
    """Score every sentence of ``text`` by cosine distance to a document vector.

    Each sentence vector is the sum of the FastText vectors of its tokens.
    The document vector is a weighted combination of the sentence vectors
    (the first sentence counts three times) and, when a title with at least
    one token is given, the mean title-token vector (also counted three
    times).

    Args:
        text: The document to score, as a string.
        title: Optional title string; a falsy value means "no title".

    Returns:
        A tuple ``(sentences_cos_id, sentences)``. ``sentences_cos_id`` maps
        each sentence index to its cosine distance from the document vector
        (smaller means closer). ``sentences`` is the list returned by
        ``split_sentence``. A sentence with an all-zero vector (for example
        one without tokens) gets ``float("inf")``, because the cosine
        distance is undefined for a zero vector.
    """
    # NOTE: the model is loaded from disk on every call.
    word2vec = FastText.load("../fasttext.model")
    # preprocess text
    sentences = split_sentence(text)
    tokens = [get_tokens(sentence) for sentence in sentences]

    # Zero vector with the same shape and dtype as the model's word vectors.
    zero_vec = np.zeros_like(word2vec.wv["测试"])

    # text vector and per-sentence vectors
    text_vec = zero_vec.copy()
    sents_vec = []
    for i, words in enumerate(tokens):
        sent_vec = zero_vec.copy()
        for word in words:
            sent_vec += word2vec.wv[word]
        # Exactly one vector per sentence, so that position i in sents_vec
        # always corresponds to sentences[i] (also for token-less sentences).
        sents_vec.append(sent_vec)
        # the first sentence is counted three times in total
        if i == 0:
            text_vec += sent_vec * 3
        else:
            text_vec += sent_vec

    # Total weight accumulated so far: one per sentence plus two extra for
    # the first sentence.
    total_weight = len(sentences) + 2

    # preprocess title
    title_tokens = get_tokens(title) if title else []
    if title_tokens:
        # Mean of the title token vectors; axis=0 keeps the vector shape.
        title_vec = np.sum([word2vec.wv[word] for word in title_tokens], axis=0) / len(title_tokens)
        # the title is counted three times
        text_vec += title_vec * 3
        total_weight += 3

    # Normalise by the total weight. Cosine distance is scale-invariant, so
    # this does not affect the scores below.
    text_vec = text_vec / total_weight

    # get cos between text and each sentence
    text_is_zero = not np.any(text_vec)
    sentences_cos_id = {}
    for i, sent in enumerate(sents_vec):
        if text_is_zero or not np.any(sent):
            # Undefined distance: rank after every real score instead of NaN.
            sentences_cos_id[i] = float("inf")
        else:
            sentences_cos_id[i] = cosine(sent, text_vec)
    return sentences_cos_id, sentences
        

In [6]:
def sentences_ranking(text, title):
    """Rank the sentences of ``text`` from closest to farthest.

    Args:
        text: The document to rank, passed to ``get_sentence_cos``.
        title: Optional title, passed to ``get_sentence_cos``.

    Returns:
        A tuple ``(sorted_sentences_cos_id, sentences)``.
        ``sorted_sentences_cos_id`` is a list of ``(sentence_index, distance)``
        pairs sorted by ascending distance; ``sentences`` is the sentence
        list returned by ``get_sentence_cos``.
    """
    sentences_cos_id, sentences = get_sentence_cos(text, title)
    # Halve the first sentence's distance so it ranks higher. Guarded so
    # that a text without any sentence does not raise KeyError.
    if 0 in sentences_cos_id:
        sentences_cos_id[0] /= 2
    sorted_sentences_cos_id = sorted(sentences_cos_id.items(), key=lambda x: x[1])

    return sorted_sentences_cos_id, sentences

In [7]:
def get_summarization_by_naive_sentvec(text, title, summary_ratio):
    """Build an extractive summary from the best-ranked sentences of ``text``.

    Args:
        text: The document to summarise. ``None`` or an empty string prints
            a hint and yields ``None``.
        title: Optional title, passed to ``sentences_ranking``.
        summary_ratio: Divisor applied to the sentence count; the summary
            keeps ``len(sentences) // summary_ratio`` sentences, but never
            fewer than one when the text has any sentence.

    Returns:
        The selected sentences in document order, joined by single spaces,
        or ``None`` when no text was given.
    """
    if not text:
        print("please input text")
        return None

    ranked, sentences = sentences_ranking(text, title)

    # int() keeps the slice bound valid for a float ratio; max() avoids an
    # empty summary when the text has fewer sentences than summary_ratio.
    keep = max(1, int(len(sentences) // summary_ratio))
    chosen_ids = sorted(sent_id for sent_id, _ in ranked[:keep])

    # Emit the chosen sentences in their original document order.
    return " ".join(sentences[sent_id] for sent_id in chosen_ids)

In [10]:
if __name__ == "__main__":
    # Demo run of the summarizer on a hard-coded sample document.
    # NOTE(review): this Chinese sample is never used -- `text` is reassigned
    # to the English sample below before the summarizer is called.
    text = "网易娱乐7月21日报道林肯公园主唱查斯特·贝宁顿 Chester Bennington于今天早上,在洛杉矶帕洛斯弗迪斯的一个私人庄园自缢身亡,年仅41岁。此消息已得到洛杉矶警方证实。洛杉矶警方透露, Chester的家人正在外地度假, Chester独自在家,上吊地点是家里的二楼。一说是一名音乐公司工作人员来家里找他时发现了尸体,也有人称是佣人最早发现其死亡。林肯公园另一位主唱麦克信田确认了 Chester Bennington自杀属实,并对此感到震惊和心痛,称稍后官方会发布声明。Chester昨天还在推特上转发了一条关于曼哈顿垃圾山的新闻。粉丝们纷纷在该推文下留言,不相信 Chester已经走了。外媒猜测,Chester选择在7月20日自杀的原因跟他极其要好的朋友Soundgarden(声音花园)乐队以及AudioslaveChris乐队主唱 Cornell有关,因为7月20日是 Chris CornellChris的诞辰。而 Cornell于今年5月17日上吊自杀,享年52岁。 Chris去世后, Chester还为他写下悼文。对于 Chester的自杀,亲友表示震惊但不意外,因为 Chester曾经透露过想自杀的念头,他曾表示自己童年时被虐待,导致他医生无法走出阴影,也导致他长期酗酒和嗑药来疗伤。目前,洛杉矶警方仍在调查Chester的死因。据悉, Chester与毒品和酒精斗争多年,年幼时期曾被成年男子性侵,导致常有轻生念头。 Chester生前有过2段婚姻,育有6个孩子。林肯公园在今年五月发行了新专辑《多一丝曙光OneMoreLight》,成为他们第五张登顶ilboard排行榜的专辑。而昨晚刚刚发布新单《 Talking To Myself》MV"
    # No title is supplied for this run.
    title = None
    # Divisor applied to the sentence count when choosing how many sentences
    # the summary keeps.
    summary_ratio = 2
    # English sample; this is the text that is actually summarized.
    text = "I took EECS 545 machine learning class last semester. I implement different kinds of supervised machine learning algorithm step by step in python. For example, kNN classifier, Regularized  Linear  Regression, Weighted Linear Regression, logistic regression, ridge regression, Support Vector Regression, Bayesian, Gradient and Coordinate Decent and Neural Nets. And some supervised machine learning algorithm, like LDA, QDA, K-means, pca. Thus, I am not only familiar how to use machine learning model from some packages, like sklearn, but also know about how to implement in python and mathematic  deduction behind different model. In addition, I have some deep learning experiences with machine learning framework, like PyTorch and Tensorflow. "
    summarization = get_summarization_by_naive_sentvec(text, title, summary_ratio)
    print(summarization)

I took EECS 545 machine learning class last semester.  I implement different kinds of supervised machine learning algorithm step by step in python.  In addition, I have some deep learning experiences with machine learning framework, like PyTorch and Tensorflow.
