## Search Tree -> Similar Words

In [1]:
csv_path = '/Users/mqgao/Workspace/Lecture/datasource/sqlResult_1558435.csv'

In [2]:
import pandas as pd

In [3]:
content = pd.read_csv(csv_path, encoding='gb18030')

In [4]:
content = content.fillna('')

In [5]:
news_content = content['content'].tolist()

In [6]:
import jieba

In [7]:
def cut(string):
    """Segment @string with jieba and return the pieces joined by single spaces."""
    segments = jieba.cut(string)
    return ' '.join(segments)

In [8]:
cut('这是一个测试')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/lx/xy106sq15v1c9hbk_hg9wbdh0000gn/T/jieba.cache
Loading model cost 0.765 seconds.
Prefix dict has been built succesfully.


'这是 一个 测试'

In [9]:
import re

In [10]:
def token(string):
    """
    Gets the runs of word characters (letters, digits, underscore) in @string.

    @string is the raw text
    @return is a list with one entry per run, in order of appearance
    """
    # The previous class [\d|\w] also accepted a literal '|' (inside brackets
    # '|' is not alternation) and \d is already covered by \w.
    return re.findall(r'\w+', string)

In [11]:
token('这是一个测试\n\n\n')

['这是一个测试']

In [12]:
news_content = [token(n) for n in news_content]

In [13]:
news_content = [' '.join(n) for n in news_content]

In [None]:
news_content = [cut(n) for n in news_content]

In [None]:
news_content[1]

In [None]:
# Write one segmented document per line. The encoding is pinned to UTF-8 so the
# file content does not depend on the platform's default text encoding.
with open('news-sentences-cut.txt', 'w', encoding='utf-8') as f:
    for n in news_content:
        f.write(n + '\n')

In [None]:
from gensim.models import Word2Vec

In [None]:
from gensim.models.word2vec import LineSentence

In [None]:
news_word2ve= Word2Vec(LineSentence('news-sentences-cut.txt'), size=35, workers=8)

In [None]:
news_word2ve.most_similar('葡萄牙', topn=20)

## More Data, Better Results

1. 分词的问题
2. **数据量**，数据越多，效果越好，维基百科加进来，那么同义词就要好很多。

In [None]:
news_word2ve.most_similar('捷克', topn=20)

In [None]:
news_word2ve.most_similar('说', topn=30)

In [None]:
news_word2ve.most_similar('认为', topn=30)

In [None]:
news_word2ve.most_similar('建议', topn=10)

In [None]:
from collections import defaultdict

In [None]:
def get_related_words(initial_words, model):
    """
    Expands a seed word list breadth-first through the model's nearest neighbours.

    @initial_words are initial words we already know (the list is not modified)
    @model is the word2vec model; ``most_similar(word, topn=...)`` is looked up
        on ``model.wv`` when that attribute exists, otherwise on @model itself
    @return is a dict mapping every visited word to the number of times it was
        visited; the search stops once 500 distinct words were visited or no
        unvisited word is left in the queue
    """
    from collections import deque

    # Works both for a full model (vectors under .wv) and for bare vectors.
    vectors = getattr(model, 'wv', model)

    # Copy into a deque: O(1) pops from the left and the caller's list stays intact.
    unseen = deque(initial_words)

    seen = defaultdict(int)

    # Queued words that have not been visited yet. When this becomes empty the
    # visited set is closed under most_similar, so no new word can ever appear;
    # without this check the loop would never end for small vocabularies.
    pending = set(unseen)

    # Neighbour lists already fetched, so each word is queried only once.
    neighbours = {}

    max_size = 500  # could be greater

    last_reported = -1

    while unseen and len(seen) < max_size:
        if len(seen) % 50 == 0 and len(seen) != last_reported:
            last_reported = len(seen)
            print('seen length : {}'.format(len(seen)))

        node = unseen.popleft()
        pending.discard(node)

        if node not in neighbours:
            neighbours[node] = [w for w, s in vectors.most_similar(node, topn=20)]
        new_expanding = neighbours[node]

        unseen.extend(new_expanding)

        seen[node] += 1

        pending.update(w for w in new_expanding if w not in seen)
        if not pending:
            break

        # optimal: 1. score function could be revised

    return seen

In [None]:
len(news_word2ve.wv.vocab)

In [None]:
related_words = get_related_words(['说', '表示'], news_word2ve)

In [None]:
sorted(related_words.items(), key=lambda x: x[1], reverse=True)

## TF-IDF Keywords

In [None]:
news_content[0]

In [None]:
def document_frequency(word, documents=None):
    """
    Gets the number of documents that contain @word as a whole token.

    @word is the token to look for
    @documents are space-separated token strings; defaults to news_content
    """
    if documents is None:
        documents = news_content

    # The substring test is only a cheap pre-filter; the split() membership test
    # is what decides, so '小米' is not counted for a document that only has
    # '小米粥'. This matches tf(), which also compares whole tokens.
    return sum(1 for n in documents if word in n and word in n.split())

In [None]:
document_frequency('的')

In [None]:
import math

In [None]:
def idf(word):
    """
    Gets the inversed document frequency of @word over news_content.

    @return is log10(number of documents / document frequency of @word)
    """
    frequency = document_frequency(word)
    # A word that occurs in no document would divide by zero; count it as if it
    # appeared in one document, which gives it the largest idf of the corpus.
    return math.log10(len(news_content) / max(frequency, 1))

In [None]:
idf('的') < idf('小米')

In [None]:
def tf(word, document):
    """
    Counts how many whitespace-separated tokens of @document are equal to @word.
    """
    return document.split().count(word)

In [None]:
content['content'][11]

In [None]:
tf('银行', news_content[11])

In [None]:
tf('创业板', news_content[11])

In [None]:
idf('创业板')

In [None]:
idf('银行')

In [None]:
idf('短期')

In [None]:
tf('短期', news_content[11])

In [None]:
def get_keywords_of_a_ducment(document):
    """
    Ranks the distinct tokens of @document by their tf * idf score.

    @document is a space-separated token string
    @return is a list of (word, score) tuples, highest score first; words with
        equal scores keep their order of first appearance in @document
    """
    from collections import Counter

    # One counting pass yields the same term frequencies as calling tf() for
    # every distinct word, without re-splitting the document each time.
    term_counts = Counter(document.split())

    tfidf = [
        (w, count * idf(w)) for w, count in term_counts.items()
    ]

    tfidf.sort(key=lambda pair: pair[1], reverse=True)

    return tfidf

In [None]:
news_content[0]

In [None]:
news_content[11]

In [None]:
%prun get_keywords_of_a_ducment(news_content[0])

In [None]:
machine_new_keywords = get_keywords_of_a_ducment(news_content[101])

In [None]:
news_content[101]

In [None]:
get_keywords_of_a_ducment(news_content[101])

## Wordcloud

In [None]:
import wordcloud

In [None]:
wc = wordcloud.WordCloud('/Users/mqgao/Downloads/SourceHanSerifSC-Regular.otf')
# we could download the font from https://github.com/Computing-Intelligence/datasource

In [None]:
news_content[4]

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
help(wc.generate_from_frequencies)

In [None]:
machine_new_keywords_dict = {w: score for w, score in machine_new_keywords}

In [None]:
plt.imshow(wc.generate_from_frequencies(machine_new_keywords_dict))

In [None]:
shenzhen_social_news = get_keywords_of_a_ducment(news_content[4])

In [None]:
shenzhen_social_news

In [None]:
from PIL import Image

In [None]:
import numpy as np

In [None]:
police_mask = np.array(Image.open('/Users/mqgao/Downloads/0034.png_860.png'))

In [None]:
wordcloud_with_mask = wordcloud.WordCloud(
font_path='/Users/mqgao/Downloads/SourceHanSerifSC-Regular.otf', 
mask=police_mask
)

In [None]:
plt.imshow(wc.generate_from_frequencies({w:s for w, s in shenzhen_social_news[:20]}))

In [None]:
plt.imshow(wordcloud_with_mask.generate_from_frequencies({w:s for w, s in shenzhen_social_news[:20]}))

## TF-IDF Vectorized

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorized = TfidfVectorizer(max_features=10000)

In [None]:
news_content[:10]

In [None]:
sample_num = 50000
sub_samples = news_content[:sample_num]

In [None]:
X = vectorized.fit_transform(sub_samples)

In [None]:
X.shape

In [None]:
vectorized.vocabulary_

In [None]:
np.where(X[0].toarray()) # get the positions which values are not zero

In [None]:
news_content[0]

In [None]:
vectorized.vocabulary_

In [None]:
import random 

In [None]:
document_id_1, document_id_2 = random.randint(0, 1000), random.randint(0, 1000)

In [None]:
document_id_1

In [None]:
document_id_2

In [None]:
news_content[document_id_1]

In [None]:
news_content[document_id_2]

In [None]:
vector_of_d_1 = X[document_id_1].toarray()[0]

In [None]:
vector_of_d_2 = X[document_id_2].toarray()[0]

In [None]:
random_choose = random.randint(0, 1000)

In [None]:
random_choose

In [None]:
news_content[random_choose]

In [None]:
from scipy.spatial.distance import cosine

In [None]:
def distance(v1, v2):
    """Gets the cosine distance between @v1 and @v2, as computed by scipy's cosine."""
    result = cosine(v1, v2)
    return result

In [None]:
distance([1, 1], [2, 2])

In [None]:
distance(X[random_choose].toarray()[0], X[document_id_1].toarray()[0])

In [None]:
distance(X[random_choose].toarray()[0], X[document_id_2].toarray()[0])

In [None]:
news_content[320]

In [None]:
news_content[72]

In [None]:
news_content[85]

In [None]:
news_content[8]

In [None]:
sorted(list(range(10000)), key=lambda i: distance(X[random_choose].toarray()[0], 
                                      X[i].toarray()[0]))

In [None]:
2**64

In [None]:
bin(19)

In [None]:
bin(49)

In [None]:
bin(38)

In [None]:
bin(49 & 38)

## Build Search Engine 

```
Input: Words
Output: Documents
```

In [None]:
def naive_search(keywords, documents=None):
    """
    Scans every document and keeps the ones that contain all of @keywords.

    @keywords is an iterable of strings; matching is by substring
    @documents are the texts to scan; defaults to news_content
    @return is the list of positions of the matching documents
    """
    if documents is None:
        documents = news_content

    # O(D * w): every document is checked against every keyword.
    news_ids = [i for i, n in enumerate(documents) if all(w in n for w in keywords)]

    # The ids used to be computed and then dropped, so the function returned None.
    return news_ids

In [None]:
%%timeit
naive_search('美军 司令 航母'.split())

In [None]:
len(news_content)

### Input word -> the documents which contain this word

In [None]:
X.shape

In [None]:
transposed_x = X.transpose().toarray()

In [None]:
word_2_id = vectorized.vocabulary_

In [None]:
word_2_id['今天']

In [None]:
id_2_word = {d: w for w, d in word_2_id.items()}

In [None]:
id_2_word[6195]

In [None]:
set(np.where(transposed_x[6195])[0])

In [None]:
'美军'

In [None]:
word_2_id['美军']

In [None]:
word_2_id['司令']

In [None]:
# Look the row up by word instead of a hard-coded id, which is only valid for
# one particular fitted vocabulary.
usa_force = set(np.where(transposed_x[word_2_id['美军']])[0])

In [None]:
# Look the row up by word instead of a hard-coded id, which is only valid for
# one particular fitted vocabulary.
commander = set(np.where(transposed_x[word_2_id['司令']])[0])

In [None]:
usa_force & commander

In [None]:
from functools import reduce

In [None]:
d1, d2, d3 = {1, 2, 3}, {4, 5, 6, 3, 2}, {1, 3, 4}

In [None]:
from operator import and_

In [None]:
reduce(and_, [d1, d2, d3])

In [None]:
def search_engine(query):
    """
    @query is the searched words, splited by space
    @return is the ids of the documents that contain every word of @query,
        ordered by cosine distance between the tf-idf vector of the query and
        the tf-idf vector of the document (closest first); an empty list when
        @query has no words or contains a word missing from word_2_id
    """
    words = query.split()

    # reduce() below has no initial value and would raise on an empty list.
    if not words:
        return []

    # A word without an entry in word_2_id has no row in transposed_x, so no
    # document can satisfy the "contains every word" condition.
    if any(w not in word_2_id for w in words):
        return []

    query_vec = vectorized.transform([' '.join(words)]).toarray()[0]

    candidates_ids = [word_2_id[w] for w in words]

    documents_ids = [
         set(np.where(transposed_x[_id])[0]) for _id in candidates_ids
    ]

    merged_documents = reduce(and_, documents_ids)
    # we could know the documents which contain these words

    # [0] turns the (1, n_features) row into a 1-D vector, as every other
    # distance() call in this notebook does.
    sorted_docuemtns_id = sorted(
        merged_documents,
        key=lambda i: distance(query_vec, X[i].toarray()[0])
    )

    return sorted_docuemtns_id

In [None]:
np.where(vectorized.transform(['美联储 加息 次数']).toarray()[0])

In [None]:
text = """新华社洛杉矶４月８日电（记者黄恒）美国第三舰队８日发布声明说，该舰队下属的“卡尔·文森”航母战斗群当天离开新加坡，改变原定驶往澳大利亚的任务计划，转而北上，前往西太平洋朝鲜半岛附近水域展开行动。\n　　该舰队网站主页发布的消息说，美军太平洋司令部司令哈里·哈里斯指示“卡尔·文森”航母战斗群向北航行。这一战斗群包括“卡尔·文森”号航空母舰、海军第二航空队、两艘“阿利·伯克”级导弹驱逐舰和一艘“泰孔德罗加”级导弹巡洋舰。\n　　“卡尔·文森”号航母的母港位于美国加利福尼亚州的圣迭戈，今年１月初前往西太平洋地区执行任务，并参与了日本及韩国的军事演习。\n　　美国有线电视新闻网援引美国军方官员的话说，“‘卡尔·文森’号此次行动是为了对近期朝鲜的挑衅行为作出回应”。（完）"""

In [None]:
print(text)

In [None]:
import re

In [None]:
text = """美国有线电视新闻网援引美国军方官员的话说"""

In [None]:
pat = r'(新闻|官员)'

In [None]:
# Raw string: "\g" is not a valid Python string escape, so the non-raw literal
# triggers an invalid-escape warning on recent interpreters.
re.compile(pat).sub(repl=r"**\g<1>**", string=text)

In [None]:
def get_query_pat(query):
    """
    Builds a compiled pattern with one capturing group that matches any word of @query.

    @query is the searched words, splited by space
    @return is a compiled regular expression; for a query without words it is a
        pattern that never matches
    """
    words = query.split()

    if not words:
        # '()' would match the empty string at every position. '((?!))' keeps
        # group 1 available for "\g<1>" replacements but can never match.
        return re.compile('((?!))')

    # Escape each word so characters such as '.', '+' or '(' are taken literally.
    return re.compile('({})'.format('|'.join(re.escape(w) for w in words)))

In [None]:
get_query_pat('美军 司令 航母')

In [None]:
def highlight_keywords(pat, document):
    """
    Wraps every match of @pat in @document with '**' on both sides.

    @pat is a compiled pattern whose first capturing group is the text to keep
    @document is the text to search
    @return is @document with each match replaced by '**' + group 1 + '**'
    """
    # Raw string: "\g" is not a valid Python string escape, so the non-raw
    # literal triggers an invalid-escape warning on recent interpreters.
    return pat.sub(repl=r"**\g<1>**", string=document)

In [None]:
highlight_keywords(get_query_pat('美军 司令 航母'), content['content'][22987])

In [None]:
from IPython.display import display, Markdown

In [None]:
def search_engine_with_pretty_print(query):
    """
    Runs search_engine on @query and displays every hit as Markdown.

    @query is the searched words, splited by space

    Each hit is shown under a '## Search Result <rank>' heading (rank starts at
    0) followed by the original text from content['content'], with the query
    words highlighted by highlight_keywords.
    """
    candidates_ids = search_engine(query)

    # The pattern depends only on the query, so build it once for all hits.
    query_pat = get_query_pat(query)

    for i, _id in enumerate(candidates_ids):
        title = '## Search Result {}'.format(i)
        c = highlight_keywords(query_pat, content['content'][_id])

        display(Markdown(title + '\n' + c))

In [None]:
search_engine_with_pretty_print('春节 假期')

In [None]:
# NOTE(review): search_engine(query) has a required positional parameter, so
# this bare call raises TypeError as written; pass a query string to run it.
search_engine()

In [None]:
#%%timeit
search_engine('美联储 加息 次数')

In [None]:
content['content'][2189]

## Why ?

```
preprocessing
```

## PageRank

In [None]:
import networkx as nx

In [None]:
import random

In [None]:
from string import ascii_uppercase

In [None]:
ascii_uppercase

In [None]:
def genearte_random_website():
    """Builds a random site name: 3 to 5 uppercase ASCII letters, a dot, then 'com', 'cn' or 'net'."""
    name_length = random.randint(3, 5)

    letters = []
    for _ in range(name_length):
        letters.append(random.choice(ascii_uppercase))

    suffix = random.choice(['com', 'cn', 'net'])

    return ''.join(letters) + '.' + suffix

In [None]:
genearte_random_website()

In [None]:
websites = [genearte_random_website() for _ in range(25)]

In [None]:
websites

In [None]:
random.sample(websites, 10)

In [None]:
website_connection = {
    websites[0]: random.sample(websites, 10),
    websites[1]: random.sample(websites, 5),
    websites[3]: random.sample(websites, 7),
    websites[4]: random.sample(websites, 2),
    websites[5]: random.sample(websites, 1),
}

In [None]:
# website_connection maps a site to the sites it links to, so the edges have a
# direction; a directed graph lets PageRank tell inbound from outbound links.
website_network = nx.DiGraph(website_connection)

In [None]:
plt.figure(3,figsize=(12,12))
nx.draw_networkx(website_network, font_size=10)

In [None]:
sorted(nx.pagerank(website_network).items(),key=lambda x: x[1], reverse=True)