In [1]:
import numpy as np
import networkx as nx

In [2]:
links = {
    'webpage-1': set(['webpage-2', 'webpage-4', 'webpage-5', 'webpage-6', 'webpage-8', 'webpage-9', 'webpage-10']),
    'webpage-2': set(['webpage-5', 'webpage-6']),
    'webpage-3': set(['webpage-10']),
    'webpage-4': set(['webpage-9']),
    'webpage-5': set(['webpage-2', 'webpage-4']),
    'webpage-6': set([]), # dangling page
    'webpage-7': set(['webpage-1', 'webpage-3', 'webpage-4']),
    'webpage-8': set(['webpage-1']),
    'webpage-9': set(['webpage-1', 'webpage-2', 'webpage-3', 'webpage-8', 'webpage-10']),
    'webpage-10': set(['webpage-2', 'webpage-3', 'webpage-8', 'webpage-9']),
}

In [3]:
def build_idx(links):
    return {web:str(idx) for idx, web in enumerate(list(links.keys()))}

In [4]:
build_idx(links)

{'webpage-1': '0',
 'webpage-2': '1',
 'webpage-3': '2',
 'webpage-4': '3',
 'webpage-5': '4',
 'webpage-6': '5',
 'webpage-7': '6',
 'webpage-8': '7',
 'webpage-9': '8',
 'webpage-10': '9'}

In [5]:
def build_graph(links):
    links_idx = build_idx(links)
    graph = {}
    
    for key, val in links.items():
        graph[links_idx[key]] = [links_idx[v] for v in val]
    
    return graph

In [6]:
graph = build_graph(links)
graph

{'0': ['8', '3', '1', '4', '7', '5', '9'],
 '1': ['5', '4'],
 '2': ['9'],
 '3': ['8'],
 '4': ['3', '1'],
 '5': [],
 '6': ['3', '2', '0'],
 '7': ['0'],
 '8': ['2', '1', '7', '9', '0'],
 '9': ['7', '8', '2', '1']}

In [7]:
def build_matrix(graph):
    n = len(graph)
    matrix = np.zeros((n, n))
    
    for i in range(n):
        if not graph[str(i)]:
            matrix[i,:] = np.ones((1, n)) * 1 / n
        
        for j in graph[str(i)]:
            matrix[i][int(j)] = 1 / len(graph[str(i)])
    
    return matrix

In [8]:
A = build_matrix(graph)
A

array([[0.        , 0.14285714, 0.        , 0.14285714, 0.14285714,
        0.14285714, 0.        , 0.14285714, 0.14285714, 0.14285714],
       [0.        , 0.        , 0.        , 0.        , 0.5       ,
        0.5       , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ],
       [0.        , 0.5       , 0.        , 0.5       , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.1       , 0.1       , 0.1       , 0.1       , 0.1       ,
        0.1       , 0.1       , 0.1       , 0.1       , 0.1       ],
       [0.33333333, 0.        , 0.33333333, 0.33333333, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [1.        , 0.        , 0.       

In [9]:
def pangrank(matrix, eps = 1.0e-8, d = 0.85):
    
    n = matrix.shape[0]
    R = np.ones((n, 1)) * 1 / n
    new_R = np.zeros((n, 1))
    E = np.ones((n, n))

    while True:
        new_R = ((1 - d) / n * E + d * matrix.T).dot(R)
        if np.linalg.norm(new_R - R) <= eps:
            break
        else:
            R = new_R
    
    return new_R

In [10]:
pangrank(A)

array([[0.13009588],
       [0.13050742],
       [0.08116303],
       [0.08539887],
       [0.09427651],
       [0.09427651],
       [0.0230135 ],
       [0.0904399 ],
       [0.13934097],
       [0.13148741]])

In [11]:
g = nx.DiGraph(graph)

In [17]:
#G = nx.DiGraph(nx.path_graph(4))
pr = nx.pagerank(g)
pr

{'0': 0.1300956900118335,
 '1': 0.13050776900162686,
 '2': 0.08116315448394983,
 '3': 0.08539910029620076,
 '4': 0.09427630133632156,
 '5': 0.09427630133632156,
 '6': 0.023013537818899983,
 '7': 0.09044007990712699,
 '8': 0.1393406839069631,
 '9': 0.13148738190075582}

## keyword extraction

In [117]:
import jieba
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [378]:
news = "中新网8月3日电 据美国《世界日报》报道，在杀害女儿章莹颖的凶手克里斯滕森判处终身监禁后，章父曾心痛且近乎绝望地表示，希望被告无条件地告知章莹颖的下落，“请帮助结束我们的煎熬，让我们带莹颖回家”，2日虽然传出章的遗体丢在伊利诺伊州一处垃圾掩埋场，然而事发至今已逾两年，要真正找到莹颖遗体，恐怕也非易事。\
\
位于伊州中部的沃米利安郡境内，共有两处由“共和服务公司”(Republic Services)经营的垃圾掩埋场，目前还未确定调查人员是否已经展开搜索，垃圾掩埋场发言人2日表示，如有需要，他们会全力配合调查。\
\
共和服务公司在该郡的掩埋场，根据2014年记录，分别是每年可处理约22.8万吨垃圾的Brickyard Disposal Landfill，及每年可处理约4.1万吨垃圾的Illinois Landfill。\
\
要在过去两年累积数十万吨的“垃圾海”中，挖到章的遗骇，难度可想而知。\
\
另一方面，根据贝克特声明，检察官在告诉家人此讯息时，也提到克里斯滕森的辩护团队是在获得豁免的情况下，向政府提供了上述信息。\
\
贝克特说，章家人并没有收到这项消息是否真实的保证，这种情形也增加了“搜索垃圾场”可能又是一场空的可能性。"

In [373]:
news = "中新网8月3日电 据美国《世界日报》报道，在杀害女儿章莹颖的凶手克里斯滕森判处终身监禁后，章父曾心痛且近乎绝望地表示，希望被告无条件地告知章莹颖的下落，“请帮助结束我们的煎熬，让我们带莹颖回家”，2日虽然传出章的遗体丢在伊利诺伊州一处垃圾掩埋场，然而事发至今已逾两年，要真正找到莹颖遗体，恐怕也非易事。"

In [272]:
a = [1,2,3]
a.append([4,5])
a

[1, 2, 3, [4, 5]]

In [375]:
class KeyWord:
    def __init__(self, win_size=10, words=20):
        self.news = None
        self.win_size = win_size
        self.words = words
        self.stopwords = []
        try:
            with open('stopword.txt', 'r') as f:
                self.stopwords = f.read().splitlines()
        except:
            print("stopword.txt not exist")
            self.stopwords = []
    
    def __build_graph(self, tokens):
        graph = {}
        _m = len(tokens)

        for i in range(_m):
            left = i - self.win_size
            right = i + self.win_size
            if left < 0: left = 0
            if right > _m: right = _m
            
            graph[tokens[i]] = tokens[left:i]
            graph[tokens[i]] += tokens[i:right]


        g = nx.DiGraph(graph, cmap = plt.get_cmap('jet'),)

        pr = nx.pagerank(g)
        
        return pr
    
    def build_tokens(self, news):
        clean_news = ' '.join(re.findall(re.compile('[\w|\d]+'), news))
        
        tokens = ' '.join(jieba.cut(clean_news)).split()
        
        tokens = [t for t in tokens if t not in self.stopwords]
        
        return tokens
    
    def extract(self, news):
        
        tokens = self.build_tokens(news)
        
        #print(tokens)
        
        pr = self.__build_graph(tokens)

        pr = sorted(pr.items(), key=lambda x: x[1], reverse=True)
        return pr
        return [_pr[0] for _pr in pr][:self.words]
    

In [376]:
keyword = KeyWord()

In [358]:
a = ' '.join(keyword.build_tokens(news))

In [379]:
keyword.extract(news)

[('章', 0.024803697655102607),
 ('垃圾', 0.022606805914234904),
 ('贝克特', 0.020298222867835152),
 ('家人', 0.019937787136035966),
 ('克里斯滕森', 0.015235677030760246),
 ('搜索', 0.014215865414227761),
 ('收到', 0.014175296427624936),
 ('说', 0.014087681994751942),
 ('信息', 0.01376277836279981),
 ('这项', 0.013610993224916518),
 ('提供', 0.013590630130711896),
 ('两年', 0.013489435051990337),
 ('政府', 0.013399630389942291),
 ('万吨', 0.013092281820151772),
 ('约', 0.01305141500761649),
 ('消息', 0.013037903543312822),
 ('掩埋场', 0.012945686276560632),
 ('下', 0.012812352715638266),
 ('情况', 0.012615872328493859),
 ('真实', 0.012438546473740719),
 ('豁免', 0.01241860469581851),
 ('团队', 0.012216373991244043),
 ('Landfill', 0.012155982238770324),
 ('保证', 0.01186104229641508),
 ('辩护', 0.011369146021500402),
 ('情形', 0.0112762240253577),
 ('8', 0.011172760760141657),
 ('服务公司', 0.010592528604897596),
 ('共和', 0.010527487180488721),
 ('提到', 0.010509967381367995),
 ('增加', 0.010413653567059308),
 ('数十万吨', 0.010345772604603282),
 ('1

In [317]:
from summa import keywords

In [302]:
text = "Compatibility of systems of linear constraints over the set of natural numbers.\
Criteria of compatibility of a system of linear Diophantine equations, strict\
inequations, and nonstrict inequations are considered. Upper bounds for\
components of a minimal set of solutions and algorithms of construction of\
minimal generating sets of solutions for all types of systems are given.\
These criteria and the corresponding algorithms for constructing a minimal\
supporting set of solutions can be used in solving all the considered types\
systems and systems of mixed types."


In [380]:
print(keywords.keywords(a,words=20))

垃圾 掩埋场
章 遗体
章莹颖
克里斯滕森
贝克特
搜索
家人
调查
日
两年
landfill
共和 服务公司
月
约
日电 美国
一场空


In [381]:
a = [1,2]
b = [[3,4]]

In [382]:
b = [a] + b
b

[[1, 2], [3, 4]]