In [None]:
from crawling import *
import datetime
import matplotlib.pyplot as plt
from pprint import pprint
from pymongo import MongoClient
# NOTE: "mongodb+srv://" connection strings require the dnspython package to be installed.

In [None]:
# Connect with MONGODB-X509 authentication over TLS; the certificate/key file
# is read from a path relative to the notebook's working directory.
connection_options = {
    "authsource": "$external",
    "authmechanism": "MONGODB-X509",
    "tls": True,
    "tlsCertificateKeyFile": "../../../mongodb.pem",
}
client = MongoClient("mongodb+srv://cluster0.c6ccx.mongodb.net", **connection_options)

# Handles to the database and to each collection used by this notebook.
db = client.get_database("project")
collection = db.get_collection("webtoon")
wordcloud = db.get_collection("wordcloud")
collection_backup = db.get_collection("backup")
collection_backup2 = db.get_collection("backup2")

In [None]:
# First pass: register every webtoon linked from the weekday overview page.
root_url = "https://comic.naver.com/webtoon/weekday"
# Download and parse the overview page once; the same tag list sizes the
# progress bar and drives the loop.
webtoon_tags = html_parser(root_url, dynamic=False).select(".thumb>a")
# Project-local progress bar
bar = ProgressBar(len(webtoon_tags))
# Read title, weekday and titleId from each thumbnail link via CSS selectors
for tag in webtoon_tags:
    bar.next()
    title = tag.select_one("img").attrs["title"]
    url = "https://comic.naver.com" + tag.attrs["href"]
    url_query = get_url_query(url)
    weekday = url_query["weekday"]
    titleId = url_query["titleId"]
    # One upsert per link: a new title is created with its url/titleId, and an
    # existing title only gains the additional weekday. This replaces the
    # separate find_one + insert/update round trips.
    collection.update_one(
        {"title": title},
        {
            "$setOnInsert": {"url": url, "titleId": titleId},
            "$addToSet": {"weekday": weekday},
        },
        upsert=True,
    )
# The count can be lower than the number of links, because links that share a
# title are merged into a single document. Counted server-side.
collection.count_documents({})

In [None]:
import sys

# Print the object size of the same calendar date held as a datetime and as a
# "YYYY.MM.DD" string (the author's observation: the datetime is smaller).
date_representations = (datetime.datetime(2021, 8, 21), "2021.08.21")
print(*[sys.getsizeof(value) for value in date_representations])

In [None]:
# Second pass (with progress bar): add metadata and the episodes of the target
# year to every webtoon stored by the first pass.
first_data = list(collection.find())
bar = ProgressBar(len(first_data))
# Scraped dates carry no timezone; they are tagged as UTC+09:00 before storage.
KST = datetime.timezone(datetime.timedelta(seconds=32400))
# Only episodes published in this year are stored.
TARGET_YEAR = 2021
for document in first_data:
    bar.next()
    # Fetch writer(s), genre(s) and age rating for this webtoon
    html = html_parser(document["url"], dynamic=False)
    collection.update_one(
        {"title": document["title"]},
        {
            "$set": {
                "writer": get_text(html.select_one(".wrt_nm")).split("/"),
                "genre": get_text(html.select_one(".genre")).split(","),
                "age": get_text(html.select_one(".age"))
            }
        },
    )

    target_url = document["url"]
    # The first page of the episode list is the page parsed above, so reuse it
    # instead of downloading the same URL a second time.
    target_html = html
    # Walk the paginated episode list
    while True:
        target_trs = target_html.select("table.viewList>tr")
        for target_tr in target_trs:
            # Only rows without a class attribute are read as episodes.
            if target_tr.get("class") is not None:
                continue
            # MongoDB does not keep the timezone: datetimes are stored as UTC+00:00.
            # Publication date of the episode
            date = datetime.datetime.strptime(get_text(target_tr.select_one(".num")), "%Y.%m.%d").replace(tzinfo=KST)
            # Keep only the target year. Range comparisons (rather than testing
            # for exactly 2020 / 2022) also reject older and newer years.
            # The break assumes rows are ordered newest first, as the original
            # 2020/2022 checks implied.
            if date.year < TARGET_YEAR:
                break
            if date.year > TARGET_YEAR:
                continue
            title_link = target_tr.select_one("td.title>a")
            # Store the episode number, episode title, rating, date and link
            collection.update_one(
                {"title": document["title"]},
                {
                    "$addToSet": {
                        "episode": {
                            "no": int(get_url_query(title_link.attrs["href"])["no"]),
                            "subno": get_text(title_link),
                            "rating": float(get_text(target_tr.select_one(".rating_type>strong"))),
                            "date": date,
                            "url": "https://comic.naver.com" + title_link.attrs["href"]
                        }
                    }
                },
            )
        else:
            # No row ended the scan early: move on to the next page, if any.
            # (Named next_link so the builtin next() is not shadowed.)
            next_link = target_html.select_one(".next")
            if next_link is not None:
                target_url = "https://comic.naver.com" + next_link.attrs["href"]
                target_html = html_parser(target_url, dynamic=False)
                continue
        break

In [None]:
# Preview the first ten documents. limit(10) makes the server return only
# those ten instead of loading the whole collection and slicing it locally.
pprint(list(collection.find().limit(10)))

In [None]:
# Filter for webtoons with at least 40 weeks of episodes: the number of stored
# episodes divided by the number of weekdays it is published on must be >= 40.
# $ifNull treats a missing "episode" array as empty.
preprocess_expr = {
    "$expr": {
        "$gte": [
            {
                "$divide": [
                    {"$size": {"$ifNull": ["$episode", []]}},
                    {"$size": "$weekday"},
                ]
            },
            40,
        ]
    }
}
# The same filter wrapped as an aggregation stage for the pipelines below.
preprocess_query = {"$match": preprocess_expr}
# Count on the server instead of downloading every matching document.
collection.count_documents(preprocess_expr)

In [None]:
# Mean episode rating per webtoon: unwind the episode array into one document
# per episode, then group back by title.
avg_rating_query = [
    {"$unwind": "$episode"},
    {
        "$group": {
            "_id": "$title",
            "title": {"$first": "$title"},
            "url": {"$first": "$url"},
            "avg_rating": {"$avg": "$episode.rating"},
        }
    },
]
# The same pipeline is run twice, sorted descending and ascending.
avg_rating_desc_pipeline = [preprocess_query] + avg_rating_query + [{"$sort": {"avg_rating": -1}}]
avg_rating_asc_pipeline = [preprocess_query] + avg_rating_query + [{"$sort": {"avg_rating": 1}}]
max_rating_webtoon = list(collection.aggregate(avg_rating_desc_pipeline))
min_rating_webtoon = list(collection.aggregate(avg_rating_asc_pipeline))

In [None]:
# Print the ten highest, then the ten lowest, average ratings.
for avg_ranking in (max_rating_webtoon, min_rating_webtoon):
    pprint(avg_ranking[:10])

In [None]:
import os
import warnings
from matplotlib import font_manager, rc

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")
# Use a font with Hangul glyphs so that Korean titles render in the plots.
font_path = "C:/Windows/Fonts/NGULIM.TTF"
if os.path.exists(font_path):
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc("font", family=font)
else:
    # The hard-coded path only exists on machines with this font installed.
    # Keep matplotlib's default font instead of failing the whole cell.
    print(f"Hangul font not found at {font_path}; keeping matplotlib's default font")

In [None]:
# Bar charts of the ten highest and the ten lowest average ratings.
# Each ranking is paired with the y-axis window used for its chart.
for avg_ranking, y_limits in (
    (max_rating_webtoon, [9.9, 10]),
    (min_rating_webtoon, [0, 10]),
):
    first_ten = avg_ranking[:10]
    bar_labels = [entry["title"] for entry in first_ten]
    bar_heights = [entry["avg_rating"] for entry in first_ten]
    plt.bar(bar_labels, bar_heights)
    plt.xticks(rotation=90)
    ax = plt.gca()
    ax.set_ylim(y_limits)
    plt.show()

In [None]:
# Population standard deviation of the episode ratings per webtoon: unwind the
# episode array, then group back by title.
std_rating_query = [
    {"$unwind": "$episode"},
    {
        "$group": {
            "_id": "$title",
            "title": {"$first": "$title"},
            "url": {"$first": "$url"},
            "std_rating": {"$stdDevPop": "$episode.rating"},
        }
    },
]
# The same pipeline is run twice, sorted descending and ascending.
std_rating_desc_pipeline = [preprocess_query] + std_rating_query + [{"$sort": {"std_rating": -1}}]
std_rating_asc_pipeline = [preprocess_query] + std_rating_query + [{"$sort": {"std_rating": 1}}]
max_std_rating_webtoon = list(collection.aggregate(std_rating_desc_pipeline))
min_std_rating_webtoon = list(collection.aggregate(std_rating_asc_pipeline))

In [None]:
# Print the ten largest, then the ten smallest, rating standard deviations.
for std_ranking in (max_std_rating_webtoon, min_std_rating_webtoon):
    pprint(std_ranking[:10])

In [None]:
# Rating-over-time line plot for the five webtoons with the largest rating
# spread, followed by the five with the smallest spread.
for webtoon in max_std_rating_webtoon[:5] + min_std_rating_webtoon[:5]:
    data = collection.find_one({"title": webtoon["title"]})
    date_data = []
    rating_data = []
    for episode in data["episode"]:
        date_data.append(episode["date"])
        rating_data.append(episode["rating"])
    plt.plot(date_data, rating_data)
    plt.xticks(rotation=90)
    plt.title(data["title"])
    plt.show()

In [None]:
from scipy import stats

# Fit a straight line (rating against publication date) for every webtoon that
# has episodes, and store the fit statistics under "analysis".
for data in collection.find():
    episodes = data.get("episode")
    if episodes is None:
        continue
    date = [episode["date"] for episode in episodes]
    rating_data = [episode["rating"] for episode in episodes]
    # Express each date in days since the epoch, so the slope reads as rating
    # change per day.
    # NOTE(review): timestamp() on a naive datetime uses the local timezone -
    # confirm how the client returns dates if absolute values ever matter.
    second_data = [moment.timestamp() / 3600 / 24 for moment in date]
    # A line cannot be fitted through fewer than two distinct x values.
    # Depending on the SciPy version, linregress then raises ValueError or
    # returns NaN statistics, so such webtoons are skipped.
    if len(set(second_data)) < 2:
        continue
    result = stats.linregress(second_data, rating_data)._asdict()
    collection.update_one(
        {"title": data["title"]}, {"$set": {"analysis": result}}
    )

In [None]:
# Preview the first ten documents. limit(10) makes the server return only
# those ten instead of loading the whole collection and slicing it locally.
pprint(list(collection.find().limit(10)))

In [None]:
# Keep title, url and the fitted slope of each webtoon.
lin_rating_query = {
    "$project": {
        "_id": 0,
        "title": 1,
        "url": 1,
        "slope": "$analysis.slope",
    }
}
# Steepest rising ratings first, then steepest falling ratings first.
slope_desc_pipeline = [preprocess_query, lin_rating_query, {"$sort": {"slope": -1}}]
slope_asc_pipeline = [preprocess_query, lin_rating_query, {"$sort": {"slope": 1}}]
inc_rating_webtoon = list(collection.aggregate(slope_desc_pipeline))
dec_rating_webtoon = list(collection.aggregate(slope_asc_pipeline))

In [None]:
# Print the ten largest, then the ten smallest, regression slopes.
for slope_ranking in (inc_rating_webtoon, dec_rating_webtoon):
    pprint(slope_ranking[:10])

In [None]:
# Rating-over-time line plot for the five webtoons with the largest slope,
# followed by the five with the smallest slope.
for webtoon in inc_rating_webtoon[:5] + dec_rating_webtoon[:5]:
    data = collection.find_one({"title": webtoon["title"]})
    date_data = []
    rating_data = []
    for episode in data["episode"]:
        date_data.append(episode["date"])
        rating_data.append(episode["rating"])
    plt.plot(date_data, rating_data)
    plt.xticks(rotation=90)
    plt.title(data["title"])
    plt.show()

In [None]:
# r-squared is the square of the stored correlation coefficient (rvalue).
r_squared_rating_query = {
    "$project": {
        "_id": 0,
        "title": 1,
        "url": 1,
        "r_squared": {"$pow": ["$analysis.rvalue", 2]},
    }
}
# The ten webtoons with the lowest r-squared, ascending.
r_squared_pipeline = [
    preprocess_query,
    r_squared_rating_query,
    {"$sort": {"r_squared": 1}},
    {"$limit": 10},
]
min_r_squared_rating_webtoon = list(collection.aggregate(r_squared_pipeline))

In [None]:
# Print up to ten entries of the lowest-r-squared ranking.
pprint(min_r_squared_rating_webtoon[:10])

In [None]:
# Rating-over-time line plot for the five webtoons with the lowest r-squared.
for webtoon in min_r_squared_rating_webtoon[:5]:
    data = collection.find_one({"title": webtoon["title"]})
    episodes = data["episode"]
    date_data = [episode["date"] for episode in episodes]
    rating_data = [episode["rating"] for episode in episodes]
    plt.plot(date_data, rating_data)
    plt.xticks(rotation=90)
    plt.title(data["title"])
    plt.show()

In [None]:
# Store the first five entries of every ranking as one summary document in the
# "wordcloud" collection.
ranking_summary = {
    "max_mean_rating": max_rating_webtoon[:5],
    "min_mean_rating": min_rating_webtoon[:5],
    "max_std_rating": max_std_rating_webtoon[:5],
    "min_std_rating": min_std_rating_webtoon[:5],
    "inc_rating": inc_rating_webtoon[:5],
    "dec_rating": dec_rating_webtoon[:5],
    "min_r_squared_rating": min_r_squared_rating_webtoon[:5],
}
# Upsert instead of insert_one: on an empty collection this creates the
# document, and re-running the cell refreshes it rather than adding a copy.
wordcloud.update_one({}, {"$set": ranking_summary}, upsert=True)

In [None]:
# Overwrite the ranking fields of the first document matched by the empty
# filter with the first five entries of each ranking.
top_five_by_field = {
    field_name: ranking[:5]
    for field_name, ranking in (
        ("max_mean_rating", max_rating_webtoon),
        ("min_mean_rating", min_rating_webtoon),
        ("max_std_rating", max_std_rating_webtoon),
        ("min_std_rating", min_std_rating_webtoon),
        ("inc_rating", inc_rating_webtoon),
        ("dec_rating", dec_rating_webtoon),
        ("min_r_squared_rating", min_r_squared_rating_webtoon),
    )
}
wordcloud.update_one({}, {"$set": top_five_by_field})

In [None]:
# Read back every document in the "wordcloud" collection and print it.
pprint(list(wordcloud.find()))