## 類似記事をDoc2vecで出す
やること
- 全記事から本文抽出 : どこまで抽出するかがポイント ([ref](http://d.hatena.ne.jp/s-yata/20100619/1276961636))
- mecabを使って分かち書き
    - reference : [Ubuntu 14.04 に Mecab と mecab-python3 をインストール - Qiita](https://qiita.com/elm200/items/2c2aa2093e670036bb30)
- gensimをつかってdoc2vec
    - reference : [Cos類似度とDoc2Vecってどっちが良いの？](https://qiita.com/enta0701/items/87cbe783aeb44ddf41ce)
- 距離が出たら多次元尺度法でplotting
    - reference : [多次元尺度法で遊んでみる](http://d.hatena.ne.jp/download_takeshi/20100410/1270921957)

In [None]:
#coding: utf-8

from bs4 import BeautifulSoup
import urllib
from urllib import request
import time
import csv
import os
from argparse import ArgumentParser
import datetime
import json

def extract_urls(root_url):
    """Collect the URL of every article listed in a blog's archive pages.

    Requests ``{root_url}/archive?page=N`` for N = 1, 2, ... and gathers the
    ``href`` of each ``<a class="entry-title-link">`` element found. Crawling
    stops at the first page that lists no such link or that cannot be fetched.

    Args:
        root_url: Top-page URL of the blog; ``/archive?page=N`` is appended
            to it verbatim.

    Returns:
        A list of the collected ``href`` values in the order they were found.
        Fetch errors are printed rather than raised, and the URLs gathered
        before the error are still returned.
    """
    urls = []
    page = 1
    while True:
        try:
            response = request.urlopen("{}/archive?page={}".format(root_url, page))
        except urllib.error.URLError as e:
            # HTTPError (404, 403, 401, ...) is a subclass of URLError, so this
            # single handler covers bad status codes as well as invalid URLs.
            print(e.reason)
            break
        # Release the connection as soon as the page has been parsed.
        with response:
            soup = BeautifulSoup(response, "html.parser")
        articles = soup.find_all("a", class_="entry-title-link")
        if not articles:
            # An archive page without article links marks the end.
            break
        urls.extend(article.get("href") for article in articles)
        page += 1
    return urls

def extract_text(url):
    """Download an article page and return the text of its body.

    The body is the first ``<div class="entry-content">`` element; the text
    of every ``p``, ``h2``, ``h3`` and ``li`` element inside it is
    concatenated without a separator.

    Args:
        url: URL of the article page.

    Returns:
        The concatenated text, or an empty string when the page cannot be
        fetched (the error reason is printed) or contains no
        ``entry-content`` div.
    """
    try:
        response = request.urlopen(url)
    except urllib.error.URLError as e:
        # HTTPError (404, 403, 401, ...) is a subclass of URLError, so this
        # covers bad status codes as well as invalid URLs. Return early:
        # there is no document to parse.
        print(e.reason)
        return ""
    # Release the connection as soon as the page has been parsed.
    with response:
        soup = BeautifulSoup(response, "html.parser")
    entry = soup.find("div", class_="entry-content")
    if entry is None:
        # Page without an article body: nothing to extract.
        return ""
    return "".join(node.text for node in entry.find_all(["p", "h2", "h3", "li"]))


## mecabを使って単語抽出

In [None]:
import MeCab
def words(text):
    """Extract words from a text.

    Runs MeCab over ``text`` and returns the surface form of every node whose
    first comma-separated feature field is "名詞" (noun), in node order.
    """
    tagger = MeCab.Tagger('-Ochasen')
    # NOTE(review): presumably the usual mecab-python3 workaround that keeps
    # node.surface readable after parseToNode -- confirm before removing.
    tagger.parse('')

    def _iter_nodes(node):
        # Follow the linked list of nodes until it runs out.
        while node:
            yield node
            node = node.next

    return [
        node.surface
        for node in _iter_nodes(tagger.parseToNode(text))
        if node.feature.split(",")[0] == "名詞"
    ]

In [None]:
# url = urls[2]
# text = extract_text(url)
# w_list = words(text)
# print(w_list)

## Doc2Vecに突っ込むべ

In [None]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from collections import OrderedDict
# Blog whose archive pages are crawled to build the training corpus.
root_url = "http://www.procrasist.com"
urls = extract_urls(root_url)
num = len(urls)

# One TaggedDocument per article, tagged "article-<index>" where <index> is
# the article's position in `urls`.
training_docs = []
for i, url in enumerate(urls):
    # Progress line: "<current> / <total>  :  <url>".
    print(i + 1, "/", num, " : ", url)
    text = extract_text(url)
    w_list = words(text)
    training_docs.append(
        TaggedDocument(words=w_list, tags=["article-{}".format(i)])
    )


In [None]:
# Tag names follow the "article-<index>" labels given to the training documents.
tags = ["article-{}".format(idx) for idx, _ in enumerate(training_docs)]
# Train Doc2Vec on the corpus; dm=0 and min_count=1 are passed through as-is
# (see the gensim Doc2Vec documentation for their meaning).
model = Doc2Vec(documents=training_docs, min_count=1, dm=0)

In [None]:
import numpy as np

# Pairwise dissimilarity matrix (1 - similarity) between all articles.
# The MDS cell further down reads M, so it has to be built here.
# Only the upper triangle is computed and then mirrored, which keeps the
# matrix exactly symmetric with a zero diagonal.
M = np.zeros((len(tags), len(tags)))
for i, tag1 in enumerate(tags):
    for j in range(i + 1, len(tags)):
        M[i][j] = M[j][i] = 1 - model.docvecs.similarity(tag1, tags[j])

# For every article, print its URL followed by the articles the model reports
# as most similar, one "<score> : <url>" line each.
for i, tag in enumerate(tags):
    print(tag, urls[i])
    for similar_tag, score in model.docvecs.most_similar(tag):
        # Tags look like "article-<index>"; the index maps back into `urls`.
        index = int(similar_tag.split("-")[1])
        print("\t{} : {}".format(score, urls[index]))


## 多次元尺度法
2Dにプロット

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import manifold
# Project the articles onto a 2D plane with multidimensional scaling.
# M is handed over as a precomputed dissimilarity matrix and must already
# be defined when this cell runs.
mds = manifold.MDS(n_components=2, dissimilarity="precomputed")
pos = mds.fit_transform(M)
# Column 0 is the x coordinate and column 1 the y coordinate of each article.
xs, ys = pos[:, 0], pos[:, 1]
plt.scatter(xs, ys, marker="x", alpha=0.5)