# 概要

Juliaとawabi.jlによる形態素解析の実験

## データの取得

In [27]:
using HTTP, JSON, CSV, TableOperations, DataFrames, LightXML, Plots

"""
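Fetch `url` with `HTTP.get` and return the response body as a `String`.

At most 10 requests are attempted; an error is raised when no response with
status 200 is obtained.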
"""
function api_get(url; max_attempts = 10)
  # Download `url` and return the response body as a String.
  #
  # url          : address passed unchanged to `HTTP.get`.
  # max_attempts : how many requests are made before giving up (default 10).
  #
  # Throws an ErrorException ("Error: bad url") once every attempt has failed.
  for attempt in 1:max_attempts
    try
      response = HTTP.get(url)
      response.status == 200 && return String(response.body)
    catch err
      # A user interrupt must abort the loop instead of being treated as a
      # failed request.
      err isa InterruptException && rethrow()
      # Without this catch the first exception from HTTP.get would leave the
      # function immediately and the remaining attempts would never run.
      @warn "api_get: request failed" url attempt exception=err
    end
  end
  return error("Error: bad url")
end

"""
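Download one Niconico ranking file and save it under a relative file name.

The URL is `endpoint/ranking_type/date/file_type`; the local file name is that
URL with every "/" replaced by "_". Returns the local file name.

Reference: https://dwango.github.io/niconico/genre_ranking/ranking_log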
"""
function collect_nico_data(ranking_type, date, file_type, endpoint="https://dcdn.cdn.nimg.jp/nicovideo/old-ranking")
  # Build the download URL from its four path components.
  source_url = join((endpoint, ranking_type, date, file_type), "/")
  # The local file name is the URL itself with the path separators flattened.
  local_name = replace(source_url, "/" => "_")
  # Fetch first so that a failed download never creates or truncates the file.
  body = api_get(source_url)
  write(local_name, body)
  return local_name
end

"""
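Look up `tag` in the JSON file `file_name` and return the "file" value of the
first entry whose "tag" equals it.

Raises an error when no entry has that tag.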
"""
function search_tag(tag, file_name)
  # Scan the parsed JSON entries in order and stop at the first one whose
  # "tag" field equals the requested tag.
  entries = JSON.parsefile(file_name)
  for entry in entries
    if entry["tag"] == tag
      return entry["file"]
    end
  end
  # No entry matched.
  return error("Error: $(tag) is not found")
end

"""
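Aggregate the view/like/comment/mylist counts of the ranking file
`ac6_result_file` per video tag.

The tags of each video are fetched from the getthumbinfo API. The per-tag totals
and the per-view ratios (like, comment and mylist totals multiplied by 1000 and
divided by the view total) are written to "aggregate_<file>" and
"aggregate_by_view_<file>"; the per-view Dict is returned.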
"""
function aggregate_by_tag(ac6_result_file)
  # Sum the counters of every ranking entry per video tag, derive per-view
  # ratios, write both tables as JSON next to the input name and return the
  # per-view table (Dict{String,Dict{String,Float64}}).

  # Return the tag names of video `id` as reported by the getthumbinfo API,
  # or an empty vector when the response has no <thumb>/<tags> element.
  function get_tags_by_id(id)
    xml_doc = "https://ext.nicovideo.jp/api/getthumbinfo/$(id)" |> api_get |> parse_string
    try
      xml_element = root(xml_doc)
      for child_name ∈ ["thumb", "tags"]
        xml_element = find_element(xml_element, child_name)
        #=
        A missing element probably means the video was deleted:
        <nicovideo_thumb_response status="fail">
          <error>
            <code>DELETED</code>
            <description>deleted</description>
          </error>
        </nicovideo_thumb_response>
        =#
        isnothing(xml_element) && return String[]
      end
      # `content` yields Julia strings, so the result stays valid after the
      # document is released below.
      return String[content(e) for e ∈ get_elements_by_tagname(xml_element, "tag")]
    finally
      # The parsed document is owned by libxml2 and is not garbage collected;
      # release it explicitly so one document per video is not leaked.
      LightXML.free(xml_doc)
    end
  end

  AGGREGATE_COLS   = ["view", "like", "comment", "mylist"]
  tag_data         = Dict{String,Dict{String,Int64}}()
  tag_data_by_view = Dict{String,Dict{String,Float64}}()

  # Pass 1: accumulate the raw counters of every video under each of its tags.
  for ac6_result ∈ JSON.parsefile(ac6_result_file)
    for tag ∈ get_tags_by_id(ac6_result["id"])
      totals = get!(tag_data, tag) do
        Dict{String,Int64}(col => 0 for col ∈ AGGREGATE_COLS)
      end
      for col ∈ AGGREGATE_COLS
        totals[col] += ac6_result["count"][col]
      end
    end
  end

  # Pass 2: reactions per 1000 views. The "view" entry itself is kept at 0.0,
  # as before, so the shape of the result is unchanged.
  for (tag, totals) ∈ tag_data
    ratios = Dict{String,Float64}(col => 0.0 for col ∈ AGGREGATE_COLS)
    views  = totals["view"]
    # A tag without any views would otherwise produce Inf/NaN ratios; leave
    # its ratios at 0.0 instead.
    if views > 0
      for col ∈ ["like", "comment", "mylist"]
        ratios[col] = totals[col] * 1000 / views
      end
    end
    tag_data_by_view[tag] = ratios
  end

  write("aggregate_$(ac6_result_file)", JSON.json(tag_data))
  write("aggregate_by_view_$(ac6_result_file)", JSON.json(tag_data_by_view))
  return tag_data_by_view
end

aggregate_by_tag

In [28]:
# Download the file list of the 2023-09-23 daily ranking.
file_name_list_name = collect_nico_data("daily", "2023-09-23", "file_name_list.json")
# Find the ranking file registered for the "アーマードコア6" tag in that list.
ac6_file_name       = search_tag("アーマードコア6", file_name_list_name)
# Download that ranking file and aggregate its entries per video tag.
ac6_result_file     = collect_nico_data("daily", "2023-09-23", ac6_file_name)
tag_data_by_view    = aggregate_by_tag(ac6_result_file)

Dict{String, Dict{String, Float64}} with 497 entries:
  "acvi対戦動画"           => Dict("view"=>0.0, "like"=>0.0227683, "comment"=>0…
  "集団幻覚"               => Dict("view"=>0.0, "like"=>0.0256461, "comment"=>0…
  "ストライダー"           => Dict("view"=>0.0, "like"=>0.0488431, "comment"=>0…
  "seirenvoice"            => Dict("view"=>0.0, "like"=>0.0527236, "comment"=>0…
  "坊やよいこだねんねしな" => Dict("view"=>0.0, "like"=>0.0618375, "comment"=>0…
  "人格の重量過多"         => Dict("view"=>0.0, "like"=>0.0216482, "comment"=>0…
  "Voiceroid実況プレイ"    => Dict("view"=>0.0, "like"=>0.179487, "comment"=>0.…
  "玉入れ"                 => Dict("view"=>0.0, "like"=>0.0164542, "comment"=>0…
  "初投稿動画"             => Dict("view"=>0.0, "like"=>0.102041, "comment"=>0.…
  "cevio_ai"               => Dict("view"=>0.0, "like"=>0.181818, "comment"=>0.…
  "大日本帝国版"           => Dict("view"=>0.0, "like"=>0.0728155, "comment"=>0…
  "谷村ブラックタンク"     => Dict("view"=>0.0, "like"=>0.0164542, "comment"=>0…
  "様子のおかしい人です"   => Dict("view"=>0

In [31]:
"""
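Plot a bar chart of the tags with the highest mylist-per-view values and save it
as "ac6_reactions_1k_view.svg".

`tag_data_by_view` is either a Dict mapping each tag to its per-view values or,
when `data_type` is "json_file", the name of a JSON file holding such a Dict.
data_type: "julia_dict" or "json_file"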
"""
function plot_data_by_view(tag_data_by_view, data_type = "julia_dict"; top_n = 11)
  # Draw the `top_n` tags with the largest "mylist" value as a bar chart and
  # save it as "ac6_reactions_1k_view.svg"; returns whatever `savefig` returns.
  #
  # tag_data_by_view : Dict tag => Dict("mylist"/"like"/"comment" => value), or
  #                    the name of a JSON file with that content.
  # data_type        : "julia_dict" (default) or "json_file".
  # top_n            : number of bars (default 11, the amount plotted before).

  if data_type == "json_file"
    # NOTE: this changes the process working directory to the directory
    # reported by @__DIR__ before the file is read.
    cd(@__DIR__)
    tag_data_by_view = JSON.parsefile(tag_data_by_view)
  end

  # Font used for the Japanese labels.
  gr(fontfamily="JuliaMono")

  # One row per tag with its three reaction ratios.
  df         = DataFrame()
  df.tag     = collect(keys(tag_data_by_view))
  df.mylist  = [tag_data_by_view[t]["mylist"]  for t in df.tag]
  df.like    = [tag_data_by_view[t]["like"]    for t in df.tag]
  df.comment = [tag_data_by_view[t]["comment"] for t in df.tag]
  df         = sort(df, :mylist, rev=false)
  # `last` returns all rows when fewer than `top_n` exist, whereas indexing
  # with `end-10:end` threw a BoundsError for small inputs.
  df         = last(df, top_n)

  # Bar chart of the selected tags, in ascending order of the mylist ratio.
  bar(df.tag, df.mylist, label=["マイリスト"])
  title!("アーマードコア6 タグ毎に集計した（マイリスト数/視聴回数）のランキング")
  plot!(size=(2400, 1800))
  return savefig("ac6_reactions_1k_view.svg")
end
# plot_data_by_view(tag_data_by_view)
# Plot from a JSON file instead of the in-memory Dict; the file name follows the
# "aggregate_by_view_<ranking file>" pattern written by aggregate_by_tag.
plot_data_by_view("aggregate_by_view_https:__dcdn.cdn.nimg.jp_nicovideo_old-ranking_daily_2023-09-23_game_04.json", "json_file")

# plot_data_by_view("aggregate_https:__dcdn.cdn.nimg.jp_nicovideo_old-ranking_daily_2023-09-23_game_04.json", "json_file")

"/home/izumi/git/analysis_tools/nicovideo_aggregate/experimental/ac6_reactions_1k_view.svg"

In [37]:
# Set the GKS_ENCODING environment variable to "utf8" (presumably so that the GR
# backend renders the Japanese labels correctly; confirm it takes effect when
# set after GR has already been used).
ENV["GKS_ENCODING"] = "utf8"


"utf8"

In [13]:
# Show the current working directory.
pwd()


"/home/izumi/git/analysis_tools/nicovideo_aggregate/experimental"

In [5]:

# Print the packages of the active project environment and their versions.
using Pkg
Pkg.status()

[32m[1mStatus[22m[39m `~/git/analysis_tools/Project.toml`
  [90m[b89ecf66] [39mAwabi v0.1.3
  [90m[336ed68f] [39mCSV v0.10.11
  [90m[a93c6f00] [39mDataFrames v1.6.1
[32m⌃[39m [90m[c43c736e] [39mGenie v5.2.2
[33m⌅[39m [90m[cd3eb016] [39mHTTP v0.9.17
  [90m[7073ff75] [39mIJulia v1.24.2
  [90m[682c06a0] [39mJSON v0.21.4
  [90m[9c8b4983] [39mLightXML v0.9.0
  [90m[58dd65bb] [39mPlotly v0.4.1
  [90m[91a5bcdd] [39mPlots v1.39.0
  [90m[8162dcfd] [39mPrettyPrint v0.2.0
  [90m[ab02a1b2] [39mTableOperations v1.2.0
[36m[1mInfo[22m[39m Packages marked with [32m⌃[39m and [33m⌅[39m have new versions available, but those with [33m⌅[39m are restricted by compatibility constraints from upgrading. To see why use `status --outdated`


In [2]:
# Move two directories up and show the resulting working directory.
cd("../..")
pwd()


"/home/izumi/git/analysis_tools"