# 文档similarity分析
### 参考
1. https://www.datasciencecentral.com/profiles/blogs/document-similarity-analysis-using-elasticsearch-and-python
2. https://stackoverflow.com/questions/30588528/creating-a-term-document-matrix-in-python-from-elasticsearch-index
3. https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-mlt-query.html

In [65]:
import sys
sys.path.append("./")
import ElasticSearchClass
import importlib
importlib.reload(ElasticSearchClass)

import urllib
from lxml import etree
import time
import csv
import math

In [66]:
def crawl_link_to_index(indexName, idx, link, esUtil):
    print(idx, link)
    try:
        response= urllib.request.urlopen(link)
                    
        while response.getcode()==502:
            time.sleep(60)
            response= urllib.request.urlopen(link)
        page_content = response.read()
        html_parser = etree.HTMLParser(remove_comments=True)
        tree = etree.HTML(page_content, parser=html_parser)
        etree.strip_elements(tree, 'script')
        etree.strip_tags(tree, 'script')
        text_data = "\n".join(filter(lambda chunk: chunk != '',[t.strip() for t in tree.itertext()]))
        page_title = tree.find(".//title").text           
        esUtil.indexDocument(indexName,"page",idx,{"url": link,
                         "title": page_title,
                         "page_text": text_data
                        })
        print("-" * 10)
    except Exception as e:
        print(e)

In [81]:
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys()) #过滤出vec1和vec2都有的词条。
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    
    #print(vec1, vec2)
    print(intersection)
    print(numerator,sum1, sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def get_tv_dict(tvjson):
    return dict([ (k, v['term_freq'])  
                  for k,v in tvjson\
                  .get('term_vectors')\
                  .get('page_text')\
                  .get('terms')\
                  .items()])

In [82]:
def generate_mlt_report(indexName, max_link_id, esUtil):
    print(indexName, max_link_id)
    fd = open("output.csv",'w')
    out_csv = csv.writer(fd)
    
    for doc_id in range(max_link_id):
        try:
            d = esUtil.get(indexName, "page", doc_id)
            
            dsrc = d.get("_source")
            url = dsrc.get("url")
            title = dsrc.get("title")
            
            mlts = esUtil.moreLikeThis(indexName, "page", doc_id, ["page_text"])
            hits = mlts.get('hits').get('hits')
            tvjson = esUtil.termVector(indexName, "page", doc_id)
            #print(tvjson)
            tv1 = get_tv_dict(tvjson)
            wc = sum(tv1.values())
            r = [doc_id, title.encode('ascii', 'ignore'), url, wc]
            tmp_li = []
            for h in hits:
                tmp_tvjson = esUtil.termVector(indexName,"page",h.get('_id'))
                tmp_tv = get_tv_dict(tmp_tvjson)
                tmp_li.extend([[h.get('_id'), h.get('_score'),
                          get_cosine(tv1, tmp_tv) * 100,
                          h.get('_source').get('title').encode('ascii', 'ignore'),
                          h.get('_source').get('url'), sum(tmp_tv.values())]])
            tmp_li = sorted(tmp_li,key=lambda a:a[2],reverse=True)  
            tmp_li2 = []
            
            for do in tmp_li:
                tmp_li2 = tmp_li2+do
            
            r = r+tmp_li2
            out_csv.writerow(r)
            fd.flush()
        except Exception as e:
            print(e)
    fd.close()

In [83]:
if __name__=='__main__':
    esUtil = ElasticSearchClass.ElasticSearchClass("192.168.18.187", 9201)
    indexName = 'kpsindex'
    '''
    esUtil.createIndex(indexName, {
                          "mappings": {
                              "page": {
                                  "_source": { "enabled": True },
                                  "properties": {
                                      "url": {
                                          "type": "text"
                                      },
                                      "page_text": {
                                          "type": "text",
                                          "term_vector": "yes"
                                      },
                                      "title": {
                                          "type": "text",
                                          "term_vector": "yes"
                                      }
                                  }
                              }
                          }})
    '''
    _tlinks = open('urls_file.txt').read().strip().split('\n')
    doc_count = len(_tlinks)
    links = enumerate(_tlinks)
    
    print(doc_count)
    if links:
        #for idx, link in enumerate(_tlinks):
        #    crawl_link_to_index(indexName, idx, link, esUtil)
        #time.sleep(5)
        print("generating  more like this report.......")
        generate_mlt_report(indexName, doc_count, esUtil)
    else:
        print("no links")

4
generating  more like this report.......
kpsindex 4
{'false', 'little', 'look', 'from', 'dictionary', 'in', 'api', 'twitter', 'python', 'powered', 'data', 'no', 'not', 'command', 'center', 'put', 'test', 'demonstrates', 'ways', 'repository', 'pip', 'import', 'or', 'code', 'github', 'contact', 'have', 'it', 'new', 'latest', 'build', 'cd', 'any', 'true', 'below', 'i', 'first', 'post', 'learn', 'the', 'number', 'common', 'between', 'search', 'because', 'line', 'usr', 'will', 'os', 'folder', 'classification', 'download', 'installed', 'at', 'use', 'do', 'o', 'quite', 'case', 'user', 'navigation', 'be', 'install', 'are', 'free', 'terms', 'facebook', 'if', 'machine', 'has', 'to', 'so', 'with', 'of', 'each', 'primary', 'contains', 'other', 'software', 'more', 'home', 'can', 'like', 'above', 'start', 's', '1', 'we', 'read', 'as', 'which', 'built', 'tools', 'range', 'es', 'type', 'your', 'request', 'for', 'part', 'is', 'password', 'different', 'username', 'series', 'using', 'this', 'means', 'l