In [2]:
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
import json
from tqdm import tqdm

In [2]:
with open("data/href2dramapage.json", encoding="utf-8") as f:
    href2dramapage = json.load(f)

# Add details

In [3]:
with open("data/num2dramapage.json", encoding="utf-8") as f:
    num2dramapage = json.load(f)

In [4]:
def parse_details(k2dramapage: dict):

    k2details = defaultdict(dict)

    for k, page in tqdm(k2dramapage.items()):
        if k not in k2details:

            soup = BeautifulSoup(page)
            details = soup.find_all("ul", {"class": "list m-b-0"})
            if details:

                for details_block in details:
                    separate_details = details_block.find_all("li", {"class": "list-item p-a-0"})

                    for separate_detail in separate_details:
                        
                        try:
                            detail_prefix = separate_detail.find_all("b", {"class": "inline"})
                            assert len(detail_prefix) == 1

                            detail_key = detail_prefix[0].get_text()
                            detail_value = separate_detail.get_text()
                            assert detail_value.startswith(detail_key)

                            assert detail_key not in k2details[k]
                            k2details[k][detail_key] = detail_value
                            
                        except AssertionError:
                            print(k)
                        
    return k2details

In [27]:
href2details = parse_details(href2dramapage)

100%|████████████████████████████████████████████████████████████████████████████| 19462/19462 [30:23<00:00, 10.67it/s]


In [None]:
num2details = parse_details(num2dramapage)

  1%|▉                                                                             | 145/12079 [00:11<19:53, 10.00it/s]

In [28]:
# %%time
# with open("data/href2details.json", "w", encoding="utf-8") as fw:
#     json.dump(href2details, fw, ensure_ascii=False)

CPU times: total: 922 ms
Wall time: 956 ms


In [30]:
len(href2dramapage)

19462

In [32]:
Counter([k for d in href2details.values() for k in d.keys()]).most_common()

[('Country:', 19393),
 ('Score:', 19393),
 ('Ranked:', 19393),
 ('Popularity:', 19393),
 ('Watchers:', 19393),
 ('Favorites:', 19393),
 ('Duration:', 18055),
 ('Episodes:', 13028),
 ('Aired:', 12258),
 ('Original Network:', 10738),
 ('Aired On:', 10189),
 ('Drama:', 9735),
 ('Movie:', 6365),
 ('Release Date:', 6365),
 ('TV Show:', 2101),
 ('Special:', 1192),
 ('Airs:', 770),
 ('Airs On:', 47)]

In [3]:
with open("data/href2details.json", encoding="utf-8") as f:
    href2details = json.load(f)

In [4]:
import pandas as pd

df = pd.DataFrame({"details": href2details.values()})
df = pd.json_normalize(df["details"])
df["href"] = href2details.keys()

In [5]:
for col in df.columns:
    if col.endswith(":"):
        df[col] = df[col].apply(
            lambda x: (x[len(col):].strip() if (type(x) == str) else x)
        )

In [6]:
df["Watchers:"] = df["Watchers:"].apply(
    lambda x: int(x.replace(",", ""))
)

In [7]:
df["scorers"] = df["Score:"].apply(
    lambda x: int(x.split()[-2].replace(",", ""))
)
df["Score"] = df["Score:"].apply(
    lambda x: x.split()[0]
)

df["Score"] = df["Score"].apply(
    lambda x: None if (x == 'N/A') else float(x)
)

In [8]:
cols = ["Country:"]
df.groupby(cols).agg({
    "Watchers:": ["median", "mean"]
    , "Original Network:": ["nunique"]
    , "Score": ["mean", "median"]
}).sort_values(("Watchers:", "median"))

Unnamed: 0_level_0,Watchers:,Watchers:,Original Network:,Score,Score
Unnamed: 0_level_1,median,mean,nunique,mean,median
Country:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Philippines,114.5,556.15,9,7.44424,7.5
Hong Kong,202.0,390.447059,16,7.365303,7.5
Japan,350.0,1232.555932,80,7.424339,7.5
China,464.0,1940.104172,171,7.610948,7.6
Thailand,611.0,2523.874016,40,7.482297,7.5
Taiwan,621.0,2339.784257,59,7.394562,7.4
South Korea,771.0,5113.669519,137,7.731071,7.7


In [9]:
def duration_to_minutes(s: str) -> int:
    if type(s) != str:
        return s
    lst = s.split()
    if len(lst) == 2:
        return int(lst[0])
    if len(lst) == 4:
        return (int(lst[0]) * 60) + int(lst[2])
    return s

df["Duration"] = df["Duration:"].apply(duration_to_minutes)

In [10]:
df[
    df["Drama:"].notna()
].groupby("Country:").agg({"Duration": ["mean", "median"]}).sort_values(("Duration", "mean"))

Unnamed: 0_level_0,Duration,Duration
Unnamed: 0_level_1,mean,median
Country:,Unnamed: 1_level_2,Unnamed: 2_level_2
Philippines,28.649533,30.0
China,37.559842,45.0
Japan,39.161027,45.0
Hong Kong,44.080508,45.0
South Korea,47.269392,60.0
Taiwan,59.995327,60.0
Thailand,67.752657,60.0


In [11]:
df["Episodes"] = df["Episodes:"].apply(
    lambda x: int(x) if (type(x) == str) else x
)

In [12]:
df["total_duration"] = [
    t.Duration * t.Episodes
    if (t.Duration and t.Episodes)
    else None
    for t in df.itertuples()
]

In [13]:
df["Popularity"] = df["Popularity:"].apply(
    lambda x: int(x[1:]) if (x != "#99999") else None
)

In [14]:
df["Ranked"] = df["Ranked:"].apply(
    lambda x: int(x[1:]) if (x != "#99999") else None
)

In [15]:
df[["Score", "Duration", "Episodes", "total_duration", "scorers", "Popularity", "Ranked"]].corr()

Unnamed: 0,Score,Duration,Episodes,total_duration,scorers,Popularity,Ranked
Score,1.0,-0.078422,0.03844,0.025866,0.173525,-0.231818,-0.35268
Duration,-0.078422,1.0,0.023807,0.070518,-0.006726,0.020296,-0.017381
Episodes,0.03844,0.023807,1.0,0.886674,-0.01425,0.027139,0.024601
total_duration,0.025866,0.070518,0.886674,1.0,-0.005888,0.005096,0.010465
scorers,0.173525,-0.006726,-0.01425,-0.005888,1.0,-0.42235,-0.296139
Popularity,-0.231818,0.020296,0.027139,0.005096,-0.42235,1.0,0.749527
Ranked,-0.35268,-0.017381,0.024601,0.010465,-0.296139,0.749527,1.0


In [16]:
tmp = df.groupby("Country:").agg({
    col: "median" for col 
    in ["Score", "Duration", "Episodes", "total_duration", "scorers", "Popularity", "Ranked"]
})

tmp

Unnamed: 0_level_0,Score,Duration,Episodes,total_duration,scorers,Popularity,Ranked
Country:,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
China,7.6,45.0,26.0,1200.0,70.0,6606.5,34464.0
Hong Kong,7.5,90.0,25.0,1125.0,85.0,10568.5,44142.5
Japan,7.5,55.0,10.0,360.0,110.0,7885.0,40286.0
Philippines,7.5,45.0,10.0,280.0,32.5,8984.5,40810.5
South Korea,7.7,65.0,12.0,560.0,227.0,5273.0,7526.0
Taiwan,7.4,70.0,15.0,1163.0,168.0,6711.0,8522.0
Thailand,7.5,60.0,12.0,782.0,131.0,6321.0,34008.0


In [18]:
df["Aired On"] = df["Aired On:"].apply(
    lambda x: ", ".join(sorted(x.split(", "))) if (type(x) == str) else x
)

In [21]:
def aired_on_monday(s: str) -> int:
    if type(s) == str:
        return int("Monday" in s)
    return s

def aired_on_tuesday(s: str) -> int:
    if type(s) == str:
        return int("Tuesday" in s)
    return s

def aired_on_each_weekday(s: str) -> int:
    if type(s) == str:
        weekdays = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
        return all([wd in s for wd in weekdays])
    return s

df["aired_on_monday"] = df["Aired On:"].apply(aired_on_monday)
df["aired_on_tuesday"] = df["Aired On:"].apply(aired_on_tuesday)
df["aired_on_each_weekday"] = df["Aired On:"].apply(aired_on_each_weekday)

df.groupby("Country:").agg({
    "Aired On:": "count"
    , "aired_on_monday": ["mean",]
    , "aired_on_tuesday": ["mean"]
    , "aired_on_each_weekday": ["mean"]
}).round(2)

Unnamed: 0_level_0,Aired On:,aired_on_monday,aired_on_tuesday,aired_on_each_weekday
Unnamed: 0_level_1,count,mean,mean,mean
Country:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
China,1990,0.53,0.57,0.28
Hong Kong,223,0.87,0.85,0.05
Japan,2857,0.13,0.14,0.0
Philippines,187,0.44,0.38,0.01
South Korea,3356,0.26,0.28,0.01
Taiwan,413,0.21,0.22,0.02
Thailand,1163,0.29,0.29,0.01


In [24]:
tmp_set = set(df["Aired On"].value_counts().head(11).index)

df[df["Aired On"].apply(
    lambda x: x in tmp_set
) & df["Country:"].apply(
    lambda x: x in {"China", "South Korea"}
)].groupby(["Aired On", "Country:"]).agg({
     "Score": ["mean", "count"]
}).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Score,Score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
Aired On,Country:,Unnamed: 2_level_2,Unnamed: 3_level_2
Friday,China,7.94,126
Friday,South Korea,8.01,471
"Friday, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday",China,7.68,548
"Friday, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday",South Korea,8.41,27
"Friday, Monday, Thursday, Tuesday, Wednesday",China,7.6,74
"Friday, Monday, Thursday, Tuesday, Wednesday",South Korea,7.37,186
Monday,China,7.53,45
Monday,South Korea,8.04,130
"Monday, Tuesday",China,7.68,58
"Monday, Tuesday",South Korea,7.66,415
