In [None]:
from crawling import *
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from pymongo import MongoClient
# need dnspython, PyOpenSSL, requests and service_identity for srv and ocsp
from urllib import parse


In [None]:
# Connect to the Atlas cluster with X.509 client-certificate authentication.
# The certificate/key file is resolved relative to the notebook's working directory.
client = MongoClient(
    host="mongodb+srv://cluster0.c6ccx.mongodb.net",
    tls=True,
    tlsCertificateKeyFile="../../../mongodb.pem",
    authmechanism="MONGODB-X509",
    authsource="$external",
)

# Working collection plus the two snapshot collections used by the notebook.
db = client.get_database("webtoon")
collection = db.get_collection("data")
collection_backup = db.get_collection("backup")
collection_backup2 = db.get_collection("backup2")

In [None]:
# reset collection with backup
# Read the backup completely BEFORE dropping anything: insert_many() rejects an
# empty input, so dropping first would leave `collection` wiped with nothing
# written back whenever the backup turns out to be empty.
backup_documents = list(collection_backup.find())
if backup_documents:
    collection.drop()
    collection.insert_many(backup_documents)
else:
    print("backup collection is empty; `collection` was left untouched")

In [None]:
# first process: register every webtoon linked from the weekday overview page
root_url = "https://comic.naver.com/webtoon/weekday"
# Download and parse the page once; the same tag list sizes the progress bar
# and drives the loop (the page used to be requested twice).
thumbnail_links = html_parser(root_url, dynamic=False).select(".thumb>a")
bar = ProgressBar(len(thumbnail_links))
for tag in thumbnail_links:
    bar.next()
    title = tag.select_one("img").attrs["title"]
    url = "https://comic.naver.com" + tag.attrs["href"]
    url_query = get_url_query(url)
    weekday = url_query["weekday"]
    titleId = url_query["titleId"]
    # Single atomic upsert instead of find_one + insert/update:
    #  - first sighting of a title inserts {title, url, titleId, weekday: [weekday]}
    #  - later sightings only add the extra weekday to the existing array
    collection.update_one(
        {"title": title},
        {
            "$addToSet": {"weekday": weekday},
            "$setOnInsert": {"url": url, "titleId": titleId},
        },
        upsert=True,
    )
# Can be lower than len(thumbnail_links): links sharing a title are merged
# into a single document.
collection.count_documents({})

In [None]:
# add backup: snapshot the working collection into `collection_backup`
# The documents are read once and reused for both the printout and the copy.
current_documents = list(collection.find())
pprint(current_documents)
# Never drop the previous backup unless there is something to replace it with:
# insert_many() rejects an empty input, which would leave the backup wiped.
if current_documents:
    collection_backup.drop()
    collection_backup.insert_many(current_documents)
else:
    print("`collection` is empty; the existing backup was left untouched")

In [None]:
import sys

# Size check behind the choice of storing dates as datetime objects:
# for this sample the datetime instance reports fewer bytes than the
# equivalent "YYYY.MM.DD" string.
sample_date = datetime(2021, 8, 21)
sample_text = "2021.08.21"
print(sys.getsizeof(sample_date), sys.getsizeof(sample_text))

In [None]:
# second process with progressbar:
# for every registered webtoon store writer/genre/age and its episodes of TARGET_YEAR.
BASE_URL = "https://comic.naver.com"
TARGET_YEAR = 2021  # only episodes dated in this year are stored

first_data = list(collection.find())
bar = ProgressBar(len(first_data))
for document in first_data:
    bar.next()
    # The first list page provides both the series metadata and the first
    # batch of episode rows, so it is downloaded once and reused.
    page_html = html_parser(document["url"], dynamic=False)
    collection.update_one(
        {"title": document["title"]},
        {
            "$set": {
                "writer": get_text(page_html.select_one(".wrt_nm")).split("/"),
                "genre": get_text(page_html.select_one(".genre")).split(","),
                "age": get_text(page_html.select_one(".age")),
            }
        },
    )
    while True:
        reached_older_year = False
        for target_tr in page_html.select("table.viewList>tr"):
            # Only rows without a class attribute are parsed as episodes.
            if target_tr.get("class") is not None:
                continue
            date = datetime.strptime(get_text(target_tr.select_one(".num")), "%Y.%m.%d")
            # Range comparisons instead of `== 2020` / `== 2022`: the equality
            # checks let 2019-or-older and 2023-or-newer rows be stored.
            # NOTE(review): stopping at the first older row assumes the list is
            # ordered newest first, as the original early `break` did — confirm.
            if date.year < TARGET_YEAR:
                reached_older_year = True
                break
            if date.year > TARGET_YEAR:
                continue
            episode_link = target_tr.select_one("td.title>a")
            collection.update_one(
                {"title": document["title"]},
                {
                    "$addToSet": {
                        "episode": {
                            "no": int(get_url_query(episode_link.attrs["href"])["no"]),
                            "subno": get_text(episode_link),
                            "rating": float(get_text(target_tr.select_one(".rating_type>strong"))),
                            "date": date,
                            "url": episode_link.attrs["href"],
                        }
                    }
                },
            )
        # `next_link` rather than `next`, which would shadow the builtin.
        next_link = None if reached_older_year else page_html.select_one(".next")
        if next_link is None:
            break
        page_html = html_parser(BASE_URL + next_link.attrs["href"], dynamic=False)

In [None]:
# Replace every stored episode url with one rebuilt from the webtoon's titleId
# and the episode number.
root_episode_url = "https://comic.naver.com/webtoon"

# count_documents() lets the server count, instead of pulling every document
# into memory only to take len() of the list.
bar = ProgressBar(collection.count_documents({}))
for document in collection.find():
    bar.next()
    # Webtoons without any stored episode simply have no "episode" field.
    for episode in document.get("episode", []):
        episode_url = set_url_query(
            root_episode_url,
            {"titleId": document["titleId"], "no": episode["no"]},
        )
        # The positional operator `$` targets the array element matched by
        # the "episode.no" condition of the filter.
        collection.update_one(
            {"title": document["title"], "episode.no": episode["no"]},
            {"$set": {"episode.$.url": episode_url}},
        )

In [None]:
# Pretty-print every document currently stored in the working collection.
pprint([document for document in collection.find()])

In [None]:
import requests
from urllib import parse
import re
import json


# Removes everything before the first "{" and after the last "}" of a JSONP reply.
_JSONP_WRAPPER = re.compile("^[^{]+|[^}]+$")


def getJson(url, query=None, init_api_url="https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json?ticket=comic&pool=cbox3&lang=ko&objectId=183559_517&pageSize=100&page=1&sort=NEW"):
    """Fetch one page of the comment API for an episode and return it as a dict.

    Args:
        url: Episode url; its query string must contain ``titleId`` and ``no``.
            Any other query parameter (e.g. ``weekday``) is ignored.
        query: Optional mapping of API query parameters that override the ones
            in ``init_api_url`` (for example ``{"page": 3}``). Previously this
            argument was accepted but never applied, so every call returned
            the page hard-coded in ``init_api_url``.
        init_api_url: Template API url whose query string supplies the default
            parameters; ``objectId`` is always replaced.

    Returns:
        The decoded JSON payload of the JSONP response.

    Raises:
        ValueError: if ``url`` lacks ``titleId`` or ``no``.
    """
    # make valid header: the referer is the comment page of the same episode
    parsed_url = parse.urlparse(url)
    url_query = dict(parse.parse_qsl(parsed_url.query))
    try:
        episode_id = {"titleId": url_query["titleId"], "no": url_query["no"]}
    except KeyError as missing:
        raise ValueError(f"url must carry titleId and no query parameters: {url!r}") from missing
    comment_url = parse.urlunparse(
        parsed_url._replace(path="/comment/comment", query=parse.urlencode(episode_id))
    )
    header = {
        "referer": comment_url,
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
    }

    # get api url using query of init api url
    parsed_api_url = parse.urlparse(init_api_url)
    api_query = dict(parse.parse_qsl(parsed_api_url.query))
    # Built explicitly as "<titleId>_<no>" so the result does not depend on the
    # order in which the two parameters appear in `url`.
    api_query["objectId"] = f'{episode_id["titleId"]}_{episode_id["no"]}'
    api_query.update({key: str(value) for key, value in (query or {}).items()})
    api_url = parse.urlunparse(parsed_api_url._replace(query=parse.urlencode(api_query)))

    # removing useless prefix and suffix (the JSONP callback wrapper)
    # The timeout keeps a stalled connection from hanging the crawl forever.
    response = requests.get(api_url, headers=header, timeout=10)
    return json.loads(_JSONP_WRAPPER.sub("", response.text))

In [None]:
# Keep only webtoons whose stored episode count divided by their number of
# weekdays is at least 40.
# $ifNull substitutes an empty array for documents that have no "episode"
# field; $size errors out on a missing field and would fail the whole query.
preprocess_expr = {
    "$expr": {
        "$gte": [
            {"$divide": [{"$size": {"$ifNull": ["$episode", []]}}, {"$size": "$weekday"}]},
            40,
        ]
    }
}
preprocess_query = {"$match": preprocess_expr}

In [None]:
from datetime import datetime


dateFormat = "%Y-%m-%dT%H:%M:%S%z"
COMMENT_CUTOFF = datetime(2022, 1, 1)  # upper bound used for each webtoon's last episode

# (webtoon title, episode no) -> list of trimmed comment records
episode_comments = {}
for document in collection.find():
    # Sort by release date so that index i + 1 really is the episode released
    # next; the stored array order is whatever order the crawl appended in.
    episodes = sorted(document.get("episode", []), key=lambda item: item["date"])
    for i, episode in enumerate(episodes):
        target_url = episode["url"]  # was episode[""], which always raised KeyError
        # Comments are collected up to the release of the following episode.
        end_date = episodes[i + 1]["date"] if i + 1 < len(episodes) else COMMENT_CUTOFF
        pagenum = getJson(target_url)["result"]["pageModel"]["lastPage"]
        comment_data = []
        break_seq = False
        # Walk from the last page towards page 1, stopping at the first
        # comment registered on or after end_date.
        while pagenum > 0 and not break_seq:
            comment_json = getJson(target_url, {"page": pagenum})
            for comment in comment_json["result"]["commentList"][::-1]:
                # regTime is parsed with a UTC offset (%z) while the stored
                # episode dates are naive; drop the offset so the two can be
                # compared (aware vs naive comparison raises TypeError).
                # NOTE(review): this treats the stored dates as wall-clock
                # times in the API's own offset — confirm.
                registered_at = datetime.strptime(comment["regTime"], dateFormat).replace(tzinfo=None)
                if registered_at >= end_date:
                    break_seq = True
                    break
                comment_data.append(
                    {
                        "userIdNo": comment["userIdNo"],
                        "regTime": comment["regTime"],
                        "sympathyCount": comment["sympathyCount"],
                        "antipathyCount": comment["antipathyCount"],
                    }
                )
            pagenum -= 1  # was never decremented, so the loop could not end
        episode_comments[(document["title"], episode["no"])] = comment_data
        # TODO: persist comment_data to the collection (this step was left
        # unimplemented in the original cell).

In [None]:
# preprocess: select webtoons whose stored episode count divided by their
# number of weekdays is at least 40.
# $ifNull substitutes an empty array for documents that have no "episode"
# field; $size errors out on a missing field and would fail the whole query.
preprocess_expr = {
    "$expr": {
        "$gte": [
            {"$divide": [{"$size": {"$ifNull": ["$episode", []]}}, {"$size": "$weekday"}]},
            40,
        ]
    }
}
preprocess_query = {"$match": preprocess_expr}
# Reuse the expression instead of spelling the same literal out a second time.
pre_data = collection.find(preprocess_expr)

In [None]:
# A find() result is a lazy cursor: pprint() on it shows only the cursor
# object, not the documents. Materialize it first; rebinding keeps the
# documents available for later cells as well.
pre_data = list(pre_data)
pprint(pre_data)

In [None]:
# Project every webtoon to its mean rating, then take the ten highest-rated
# (descending sort) and the ten lowest-rated (ascending sort).
avg_rating_query = {
    "$project": {
        "_id": 0,
        "title": 1,
        "url": 1,
        "avg_rating": {"$avg": "$rating_info.rating"},
    }
}
max_rating_webtoon, min_rating_webtoon = (
    list(collection.aggregate([avg_rating_query, {"$sort": {"avg_rating": direction}}, {"$limit": 10}]))
    for direction in (-1, 1)
)

In [None]:
# Print the highest-rated list first, then the lowest-rated one.
for ranking in (max_rating_webtoon, min_rating_webtoon):
    pprint(ranking)

In [None]:
import warnings
from pathlib import Path
from matplotlib import font_manager, rc

# Silences every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# NOTE(review): presumably chosen so non-Latin webtoon titles render in the
# plots — confirm.
font_path = "C:/Windows/Fonts/NGULIM.TTF"
if Path(font_path).is_file():
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc("font", family=font)
else:
    # The path only exists on Windows machines with this font installed;
    # skip the override there instead of crashing the cell.
    print(f"font file not found: {font_path}; keeping matplotlib's default font")

In [None]:
# One bar chart per ranking: the top list uses a zoomed y-range (9.9 to 10),
# the bottom list uses the full 0 to 10 range.
for ranking, y_limits in ((max_rating_webtoon, [9.9, 10]), (min_rating_webtoon, [0, 10])):
    bar_labels = [entry["title"] for entry in ranking]
    bar_heights = [entry["avg_rating"] for entry in ranking]
    plt.bar(bar_labels, bar_heights)
    plt.xticks(rotation=90)
    ax = plt.gca()
    ax.set_ylim(y_limits)
    plt.show()


In [None]:
# Rank webtoons by the population standard deviation ($stdDevPop) of their
# ratings and keep the ten with the largest spread.
std_rating_query = {
    "$project": {
        "_id": 0,
        "title": 1,
        "url": 1,
        "std_rating": {"$stdDevPop": "$rating_info.rating"},
    }
}
std_rating_pipeline = [std_rating_query, {"$sort": {"std_rating": -1}}, {"$limit": 10}]
max_std_rating_webtoon = list(collection.aggregate(std_rating_pipeline))



In [None]:
# Show the ten webtoons with the largest rating standard deviation.
pprint(max_std_rating_webtoon)


In [None]:
# Rating over time for the first five entries of the standard-deviation ranking.
most_volatile = max_std_rating_webtoon[:5]
for webtoon in most_volatile:
    data = collection.find_one({"title": webtoon["title"]})
    rating_info = data["rating_info"]
    dates, ratings = rating_info["date"], rating_info["rating"]
    plt.plot(dates, ratings)
    plt.xticks(rotation=90)
    plt.title(data["title"])
    plt.show()


In [None]:
from scipy import stats

# Fit rating ~ time for every webtoon and store the slope and r^2 on the document.
for data in collection.find():
    # x axis: timestamps converted from seconds to days, so the slope reads
    # as rating change per day.
    time_data = [moment.timestamp() / 3600 / 24 for moment in data["rating_info"]["date"]]
    rating_data = data["rating_info"]["rating"]
    # A line cannot be fitted through fewer than two distinct x values, so
    # such a series would abort the whole loop; skip it and keep going.
    if len(set(time_data)) < 2:
        print(f"skipping {data['title']!r}: not enough distinct dates for a regression")
        continue
    result = stats.linregress(time_data, rating_data)
    slope, r_squared = result.slope, result.rvalue ** 2
    collection.update_one(
        {"title": data["title"]},
        {"$set": {"rating_info.slope": slope, "rating_info.r_squared": r_squared}},
    )



In [None]:
def get_total_sum_of_square(x, y):
    """Return the total sum of squares of ``y`` about its mean.

    Computes ``sum((y_i - mean(y)) ** 2)``. The previous body was an
    unfinished stub: it ignored both parameters, called ``linregress`` on a
    notebook-global with an invalid signature, and returned nothing.

    Args:
        x: Accepted for signature compatibility; the total sum of squares
            does not depend on the independent variable.
        y: Iterable of numeric observations.

    Returns:
        The total sum of squares as a float; ``0.0`` for an empty ``y``.
    """
    values = [float(value) for value in y]
    if not values:
        return 0.0
    mean = sum(values) / len(values)
    return sum((value - mean) ** 2 for value in values)


In [None]:
# Rank webtoons by the stored regression slope: the ten largest slopes first,
# then the ten smallest.
lin_rating_query = {
    "$project": {"_id": 0, "title": 1, "url": 1, "slope": "$rating_info.slope"},
}
steepest_rise_pipeline = [lin_rating_query, {"$sort": {"slope": -1}}, {"$limit": 10}]
steepest_fall_pipeline = [lin_rating_query, {"$sort": {"slope": 1}}, {"$limit": 10}]
inc_rating_webtoon = list(collection.aggregate(steepest_rise_pipeline))
dec_rating_webtoon = list(collection.aggregate(steepest_fall_pipeline))



In [None]:
# Show the ten webtoons with the smallest (most negative) rating slope.
pprint(dec_rating_webtoon)

In [None]:
# Rating over time for the first five entries of the ascending-slope ranking.
steepest_decline = dec_rating_webtoon[:5]
for webtoon in steepest_decline:
    data = collection.find_one({"title": webtoon["title"]})
    rating_info = data["rating_info"]
    dates, ratings = rating_info["date"], rating_info["rating"]
    plt.plot(dates, ratings)
    plt.xticks(rotation=90)
    plt.title(data["title"])
    plt.show()


In [None]:
# Rank webtoons by the stored r^2 of their rating regression and keep the ten
# lowest values.
r_squared_rating_query = {
    "$project": {
        "_id": 0,
        "title": 1,
        "url": 1,
        "r_squared": "$rating_info.r_squared",
    }
}
lowest_r_squared_pipeline = [r_squared_rating_query, {"$sort": {"r_squared": 1}}, {"$limit": 10}]
min_r_squared_rating_webtoon = list(collection.aggregate(lowest_r_squared_pipeline))



In [None]:
# Full listing (no limit) of every webtoon ordered by ascending r^2.
all_r_squared_pipeline = [r_squared_rating_query, {"$sort": {"r_squared": 1}}]
list(collection.aggregate(all_r_squared_pipeline))



In [None]:
# Display the ten webtoons with the lowest r^2 as the cell's output.
min_r_squared_rating_webtoon


In [None]:
# Rating over time for the first five entries of the lowest-r^2 ranking.
weakest_fit = min_r_squared_rating_webtoon[:5]
for webtoon in weakest_fit:
    data = collection.find_one({"title": webtoon["title"]})
    rating_info = data["rating_info"]
    dates, ratings = rating_info["date"], rating_info["rating"]
    plt.plot(dates, ratings)
    plt.xticks(rotation=90)
    plt.title(data["title"])
    plt.show()
