In [1]:
#using Pkg
#Pkg.add("JSON")
#Pkg.add("Clustering")
#Pkg.add("Awabi")

In [2]:
using JSON

In [3]:
# Names of the regular files in the current directory that end in ".txt".
# A literal suffix test is used so that names which merely end in "txt"
# (without the dot) are not picked up.
# Note: "yahoo_merged.txt", written by a later cell, also matches on a re-run;
# its entries are de-duplicated like any other input.
files = filter(f -> isfile(f) && endswith(f, ".txt"), readdir("."))
# Merge the JSON data that was fetched over several runs.
urls = []                 # URLs of the kept articles, in first-seen order
seen_urls = Set{Any}()    # same URLs as a set, for constant-time duplicate checks
articles = []             # articles with duplicates removed
days = Dict{String, Int}()         # publication day => number of articles
categories = Dict{String, Int}()   # category => number of articles
for file in files
    for d in JSON.parsefile(file)
        url = d["url"]
        # An article is identified by its URL; skip it if already merged.
        url in seen_urls && continue
        # The first whitespace-separated token of "datetime" is the day.
        day = first(split(d["datetime"]))
        category = d["category"]
        push!(seen_urls, url)
        push!(urls, url)
        push!(articles, d)
        days[day] = get(days, day, 0) + 1
        categories[category] = get(categories, category, 0) + 1
    end
end

In [4]:
# Number of unique articles left after merging.
length(articles)

1562

In [5]:
# Number of articles per publication day
days

Dict{String, Int64} with 23 entries:
  "6/28(火… => 86
  "7/2(土)" => 71
  "7/6(水)" => 92
  "7/8(金)" => 98
  "7/5(火)" => 81
  "7/1(金)" => 83
  "7/4(月)" => 77
  "6/26(日… => 65
  "6/25(土… => 73
  "6/27(月… => 81
  "6/29(水… => 94
  "7/12(火… => 79
  "7/9(土)" => 64
  "6/22(水… => 10
  "7/13(水… => 2
  "7/10(日… => 67
  "6/24(金… => 72
  "7/11(月… => 109
  "6/21(火… => 2
  "6/30(木… => 90
  "7/3(日)" => 63
  "6/23(木… => 7
  "7/7(木)" => 96

In [6]:
# Number of articles per category
categories

Dict{String, Int64} with 8 entries:
  "local"         => 301
  "domestic"      => 317
  "sports"        => 204
  "entertainment" => 176
  "science"       => 79
  "it"            => 100
  "world"         => 192
  "business"      => 193

In [7]:
# Persist the merged article list as a single line of JSON text.
filename = "yahoo_merged.txt"
open(filename, "w") do io
    merged_json = json(articles)
    println(io, merged_json)
end

In [8]:
# Cleansing
"""
    getlines(article::AbstractString)

Split the text of an article into sentences that are ready for morphological
analysis.

The text is broken after every Japanese full stop `。` and at existing
newlines. Leading half-width and full-width spaces are stripped from each
piece, and pieces that end up empty are dropped.

Returns a `Vector{String}` with one entry per non-empty sentence, in order.
"""
function getlines(article::AbstractString)
    new_lines = String[]
    # Insert a line break after each full stop, then split on line breaks, so
    # the tokenizer is never handed one very long string.
    for raw in split(replace(article, r"。" => "。\n"), "\n")
        # Remove leading spaces (full-width or half-width).
        line = replace(raw, r"^[　 ]+" => "")
        # Skip blank lines so the tokenizer is not called needlessly.
        isempty(line) && continue
        push!(new_lines, line)
    end
    return new_lines
end

getlines (generic function with 1 method)

In [9]:
# Set up the morphological analyzer (tokenizer) used to split text into words.
using Awabi
# Tokenizer configuration: enable the variant that matches the environment.
## Linux / Mac
#tokenizer = Tokenizer()
## Windows:
#dic = Dict("dicdir" => "C:\\Program Files (x86)\\MeCab\\dic\\ipadic")
#tokenizer = Tokenizer(dic)
## SageMaker Studio Lab
rcfile = "/home/studio-lab-user/mecab/etc/mecabrc"
tokenizer = Tokenizer(rcfile)

"""
    countword(tokenizer, lines)

Tokenize every string in `lines` with `tokenizer` and count the content words.

Each token is indexed as `token[1]` (surface form) and `token[2]` (a
comma-separated feature string). The first feature field is the part of
speech and the seventh is the base form. Only nouns, verbs, adjectives and
adverbs are counted. A word is counted under its base form; the surface form
is used instead when the base form is `"*"` or the feature string has fewer
than seven fields.

Returns a `Dict{String, Int}` mapping each word to its number of occurrences.
"""
function countword(tokenizer, lines)
    # Parts of speech that are kept: noun, verb, adjective, adverb.
    content_pos = ("名詞", "動詞", "形容詞", "副詞")
    word_counts = Dict{String, Int}()

    for line in lines
        # Morphological analysis of one sentence.
        for token in tokenize(tokenizer, line)
            surface = token[1]
            attr = split(token[2], ",")
            attr[1] in content_pos || continue
            # Guard the index so a short feature string does not throw.
            has_basic = length(attr) >= 7 && attr[7] != "*"
            basic = has_basic ? attr[7] : surface
            word_counts[basic] = get(word_counts, basic, 0) + 1
        end
    end
    return word_counts
end

countword (generic function with 1 method)

In [10]:
"""
    mergeword(list_word_counts)

Combine a collection of word => frequency dictionaries into one
`Dict{String, Int}`, adding up the frequencies of words that occur in more
than one dictionary.
"""
function mergeword(list_word_counts)
    return foldl(list_word_counts; init=Dict{String, Int}()) do total, wc
        mergewith!(+, total, wc)  # values of shared keys are summed
    end
end

"""
    makevector(labels, list_word_counts)

Build Bag-of-Words vectors.

`labels` is the vocabulary; position `i` of every output vector holds the
count of `labels[i]`. `list_word_counts` is a collection of word => count
dictionaries, one per document.

Returns a `Vector{Vector{Int}}` with one count vector of length
`length(labels)` per dictionary, in the same order. Words that do not appear
in `labels` are ignored. If a label occurs more than once, its first position
is the one that is filled.
"""
function makevector(labels, list_word_counts)
    # Look-up table word => position, built once so that each word is found in
    # constant time instead of by a linear scan over the vocabulary.
    position = Dict{eltype(labels), Int}()
    for (i, label) in enumerate(labels)
        get!(position, label, i)  # keep the first position of a repeated label
    end

    list_vector = Vector{Int}[]
    for wc in list_word_counts
        counts = zeros(Int, length(labels))
        for (w, c) in wc
            i = get(position, w, 0)
            i == 0 && continue  # word outside the vocabulary
            counts[i] = c
        end
        push!(list_vector, counts)
    end
    return list_vector
end

makevector (generic function with 1 method)

In [11]:
# This step is slow, so measure how long it takes.
@elapsed begin
    # Word => frequency table of every article.
    list_word_counts = Dict{String, Int}[]
    for article in articles
        wc = countword(tokenizer, getlines(article["detail"]))
        push!(list_word_counts, wc)
        # Keep the table on the article as well.
        article["word_count"] = wc
    end
    # Word => frequency table over all articles.
    all_word_counts = mergeword(list_word_counts)
    # Sorted vocabulary; its order defines the rows of the matrix.
    labels = sort(collect(keys(all_word_counts)))
    # One word-count vector (Bag of Words) per article.
    list_vector = makevector(labels, list_word_counts)

    # Convert to a matrix with one column per article (Julia is column-major).
    # `reduce(hcat, ...)` on a concretely typed vector of vectors avoids
    # splatting one argument per article into `hcat`.
    mat = reduce(hcat, convert(Vector{Vector{Int}}, list_vector))
end

98.600457649

In [None]:
using Clustering

# Use k-means to partition the articles into 8 clusters, the same number as
# there are categories. `mat` holds one column per article.
# NOTE(review): the initial seeding is presumably randomized, so assignments may
# differ between runs; confirm against the Clustering.jl documentation.
n_clusters = 8 # the number of clusters
result = kmeans(mat, n_clusters; maxiter=200, display=:none)
@assert nclusters(result) == n_clusters # verify the number of clusters
clust_numbers = assignments(result) # get the assignments of points to clusters
#cluster_sizes = counts(result) # get the cluster sizes
#cluster_centers = result.centers # get the cluster centers

In [None]:
# Compare the original categories with the clustering result:
# for every category, count how many of its articles landed in each cluster.
check_table = Dict(name => zeros(Int, n_clusters) for name in keys(categories))
for (cluster, entry) in zip(clust_numbers, articles)
    per_cluster = check_table[entry["category"]]
    per_cluster[cluster] += 1
end
check_table