In [1]:
import requests
import re
import pandas as pd
import time
import glob
from bs4 import BeautifulSoup

In [2]:
def get_ratings(main_panel):
    try:
        gen_info = main_panel.find(attrs={"itempropx": "aggregateRating"}).parent.find_all("div",class_="hfs")
    except:
        gen_info = main_panel.find(attrs={"itempropx": "ratingValue"}).parent.parent.find_all("div",class_="hfs")
    
    # ratings: 0.0/10 from 0000 users
    rating_info = gen_info[0].getText()
    rating_info = rating_info.split(" ")
    
    rating = rating_info[1].split("/")[0]
    num_ratings = int(rating_info[3].replace(",",""))
    
    # # of watchers: 0000
    num_watchers = gen_info[1].getText()
    num_watchers = int(num_watchers.split(":")[1].strip().replace(",",""))
    
    return (rating, num_ratings, num_watchers)

In [3]:
def get_genres(main_panel):
    # genres: xxx, xxx, xxx
    genres_rough = main_panel.find("li",class_="show-genres").find_all("a")
    genres = {rough.getText() for rough in genres_rough}
    
    return genres

In [4]:
def get_tags(main_panel):
    # tags: xxx, xxx, xxx
    tags_rough = main_panel.find("li",class_="show-tags").find_all("a")
    tags = {rough.getText() for rough in tags_rough[:-1]}
    
    return tags

In [5]:
def get_cast(drama_url):
    cast_url = drama_url + "/cast"
    cast_page = requests.get(cast_url)
    cast_soup = BeautifulSoup(cast_page.content,"html.parser")
    
    cast_info = cast_soup.find("h3",class_="header").parent
    
    main_cast = set()
    for cast_memb in cast_info.find_all("small",class_="text-muted"):
        if cast_memb.getText() == "Main Role":
            main_cast.add(cast_memb.parent.find("a").get("href"))
    
    crew = set()
    for crew_memb in cast_info.find_all("a",class_="text-primary text-ellipsis"):
        crew.add(crew_memb.get("href"))
            
    return (main_cast,crew)

In [6]:
def get_reviews(drama_url):
    review_url = drama_url + "/reviews?xlang=en-US"
    review_page = requests.get(review_url)
    review_soup = BeautifulSoup(review_page.content,"html.parser")
    
    rv_bodies = []
    
    while True:
        reviews = review_soup.find_all("div",class_="review-body")

        for review in reviews:
            rv_body = [str(a.string).strip() for a in review.children][2:-4]
            rv_body = " ".join(rv_body).strip()
            rv_bodies.append(rv_body)
        
        try:
            next_rv_url = "https://mydramalist.com" + review_soup.find("li",class_="page-item next").find("a").get("href")
            review_page = requests.get(next_rv_url)
            review_soup = BeautifulSoup(review_page.content,"html.parser")
            print("fetching",next_rv_url[-6:])
        except: break
    
    return rv_bodies

In [7]:
def get_dramainfo(slug):
    drama_url = "https://mydramalist.com" + slug
    page = requests.get(drama_url)
    soup = BeautifulSoup(page.content,"html.parser")
    
    print("FETCHING FOR ..", drama_url)
    
    # TITLE, YEAR INFO
    title_info = soup.find("h1", class_="film-title").getText()
    title_info = title_info.split("(")
    title = title_info[0]
    year = title_info[1][:-1]
    
    # MAIN BOX
    main_panel = soup.find("div", id="show-detailsxx")

    # SYNOPSIS TEXT
    synopsis = main_panel.find("div", class_="show-synopsis").find("span").getText()
    
    # FROM MAIN PANEL INFO
    (rating, num_ratings, num_watchers) = get_ratings(main_panel)
    try: genres = get_genres(main_panel)
    except: genres = set()
    try: tags = get_tags(main_panel)
    except: tags = set()
    
    # AIR INFO FROM SIDEBOX
    air_info = soup.find("div",class_="box-body light-b").find_all("li")
    air_info = [str(i.getText()) for i in air_info]
    air_dict = {"Drama":None,"Country":None,"Episodes":None,"Aired":None,
                "Aired On":None,"Original Network":set(),"Duration":None,
                "Content Rating":None}
    for item in air_info:
        sep = item.split(":")
        (key,val) = (sep[0].strip(),sep[1].strip())
        air_dict[key]=val
    try:
        air_dict["Original Network"] = set(ntwk.strip() for ntwk in air_dict["Original Network"].split(","))
    except: pass
    
    # CAST & CREW, REVIEWS
    try: (main_cast,crew) = get_cast(drama_url)
    except: (main_cast,crew) = (set(),set())
    reviews = get_reviews(drama_url)
    
    drama_dict = {"id":slug,"title":title,"year":year,"synopsis":synopsis,"rating":rating,
                  "num_ratings":num_ratings,"num_watchers":num_watchers,"genres":genres,
                  "tags":tags,"crew":crew,"country":air_dict["Country"],
                  "episodes":air_dict["Episodes"],"orig_network":air_dict["Original Network"],
                  "ep_duration":air_dict["Duration"],"main_cast":main_cast,"reviews":reviews}
    return drama_dict

In [8]:
def run_by_year(filename):
    print("* GETTING INFO FOR FILE ..",filename,"*\n")
    
    with open(filename, "r") as f:
        slugs = f.read().splitlines()
    
    dramainfo = pd.DataFrame()

    for slug in slugs:
        drama_dict = get_dramainfo(slug)
        dramainfo = dramainfo.append(drama_dict,ignore_index=True)
    
    return dramainfo

In [218]:
start_time = time.perf_counter()
dramainfo_2020 = run_by_year('2020_slugs.txt')
end_time = time.perf_counter()

print("\nRUNTIME: {:n} min {:.2f} s".format(int((end_time-start_time)/60),
                                          (end_time-start_time)%60))

* GETTING INFO FOR FILE .. 2020_slugs.txt *

FETCHING FOR .. https://mydramalist.com/54563-breaking-point-1950
FETCHING FOR .. https://mydramalist.com/50027-rebirth-of-shopping-addict
FETCHING FOR .. https://mydramalist.com/49239-detective-chinatown
FETCHING FOR .. https://mydramalist.com/60529-cometesting
FETCHING FOR .. https://mydramalist.com/60371-seua-chanee-gayng-season-5
FETCHING FOR .. https://mydramalist.com/53205-dim-light-in-the-shadow
FETCHING FOR .. https://mydramalist.com/56937-lovers-at-the-palace
FETCHING FOR .. https://mydramalist.com/54283-sink-or-swim
FETCHING FOR .. https://mydramalist.com/39743-touch
fetching page=2
FETCHING FOR .. https://mydramalist.com/52651-being-a-fake-man-is-tough
FETCHING FOR .. https://mydramalist.com/52617-sedai-wars
FETCHING FOR .. https://mydramalist.com/36261-love-me-do-you-dare
FETCHING FOR .. https://mydramalist.com/52619-tonight-is-u-shaped
FETCHING FOR .. https://mydramalist.com/52029-zettai-reido-4
FETCHING FOR .. https://mydramali

In [221]:
# 2020
# ~1200 objects, ~15 minutes
dramainfo_2020.to_json("2020_dramainfo.json")
dramainfo_2020.head()

Unnamed: 0,country,crew,ep_duration,episodes,genres,id,main_cast,num_ratings,num_watchers,orig_network,rating,reviews,synopsis,tags,title,year
0,China,{},45 min.,46,{Thriller},/54563-breaking-point-1950,"{/people/17397-zhao-si-mei, /people/17079-pu-m...",2.0,50.0,"{iQiyi, Tencent Video, CCTV}",10.0,[],"During the Korean War in the 1950's, Communist...","{Suspense, Spy, Chinese Republican Era}",Breaking Point 1950,2020
1,China,{/people/55679-peng-xue-jun},45 min.,44,"{Comedy, Drama, Business, Romance}",/50027-rebirth-of-shopping-addict,"{/people/18860-li-shen, /people/14044-meng-zoe...",260.0,1202.0,"{Mango TV, Hunan TV}",7.6,"[This was an enjoyable, low-stress modern dram...",A story about a shopaholic who goes on the cou...,"{Workaholic, Shopaholic, Online Shopping, Jeal...",Rebirth of Shopping Addict,2020
2,China,"{/people/14287-dai-mo, /people/54197-shang-na,...",50 min.,12,"{Action, Comedy, Thriller, Mystery}",/49239-detective-chinatown,"{/people/653-chang-janine, /people/14103-zhang...",353.0,1287.0,{iQiyi},7.8,[This review may contain spoilers Mystery or ...,Strange crimes occur in Thailand as the rankin...,"{Eccentric Male Lead, Smart Male Lead, Investi...",Detective Chinatown,2020
3,Taiwan,{},18 min.,8,{Romance},/60529-cometesting,"{/people/37831-dora-hsieh, /people/42681-harry...",7.0,27.0,{},7.6,[],*There are 2 versions of the finale (Episode 8...,{Miniseries},Cometesting,2020
4,Thailand,{},45 min.,46,"{Sitcom, Comedy, Romance}",/60371-seua-chanee-gayng-season-5,"{/people/8846-chaiyaat-tanatat, /people/9903-r...",3.0,21.0,{GMM One},8.5,[],,{},Seua Chanee Gayng 5,2020


In [239]:
# assuming it takes about 15 minutes to run each year
# the next part should take ~2.5hr

In [9]:
filedump = sorted(glob.glob("*_slugs.txt"))

In [244]:
# 2010-2019 inclusive
yearset = filedump[10:-1]

In [250]:
for filename in yearset:
    dramainfo = run_by_year(filename)
    dramainfo.to_json("{}_dramainfo.json".format(filename[:4]))

* GETTING INFO FOR FILE .. 2010_slugs.txt *

FETCHING FOR .. https://mydramalist.com/65787-no-1-landmark
FETCHING FOR .. https://mydramalist.com/61351-home-with-aliens-2
FETCHING FOR .. https://mydramalist.com/51069-west-gate-love-song
FETCHING FOR .. https://mydramalist.com/39877-the-legend-of-lian-shi
FETCHING FOR .. https://mydramalist.com/15147-romantic-movement-in-seoul
FETCHING FOR .. https://mydramalist.com/698381-going-south
FETCHING FOR .. https://mydramalist.com/3672-the-myth-2010
FETCHING FOR .. https://mydramalist.com/3491-pretty-maid
FETCHING FOR .. https://mydramalist.com/1220-the-reputable-family
FETCHING FOR .. https://mydramalist.com/532-ryoma-den
FETCHING FOR .. https://mydramalist.com/700517-cat-taxi
FETCHING FOR .. https://mydramalist.com/796-jejoongwon
FETCHING FOR .. https://mydramalist.com/90-stars-falling-from-the-sky
fetching page=2
FETCHING FOR .. https://mydramalist.com/55-master-of-study
FETCHING FOR .. https://mydramalist.com/46-pasta
fetching page=2
fetchi

In [17]:
# 2010-2019 actually only took a over 1 hr

In [12]:
for filename in filedump[:10]:
    dramainfo = run_by_year(filename)
    dramainfo.to_json("{}_dramainfo.json".format(filename[:4]))

* GETTING INFO FOR FILE .. 2000_slugs.txt *

FETCHING FOR .. https://mydramalist.com/703791-hua-jai-yang-yahk-mee-rak
FETCHING FOR .. https://mydramalist.com/69627-fai-ruk-fai-pitsawat
FETCHING FOR .. https://mydramalist.com/56393-the-chinse-hero
FETCHING FOR .. https://mydramalist.com/52199-deception
FETCHING FOR .. https://mydramalist.com/51823-king-of-beggars-su-can
FETCHING FOR .. https://mydramalist.com/51331-legendary-li-cui-lian
FETCHING FOR .. https://mydramalist.com/39213-sunny-piggy
FETCHING FOR .. https://mydramalist.com/39185-return-of-justice-bao
FETCHING FOR .. https://mydramalist.com/32088-plerng-ruk-fai-kaen
FETCHING FOR .. https://mydramalist.com/31061-ruk-chun-nun-per-tur
FETCHING FOR .. https://mydramalist.com/30326-mon-jantra
FETCHING FOR .. https://mydramalist.com/30192-payong
FETCHING FOR .. https://mydramalist.com/29746-wela-nai-kued-kaew
FETCHING FOR .. https://mydramalist.com/25239-the-peach-blossom-fan-legend
FETCHING FOR .. https://mydramalist.com/25192-xin-w

In [2]:
import json

In [3]:
jsonfiles = sorted(glob.glob("*.json"))

json_dfs = []

for file in jsonfiles:
    json_dfs.append(pd.read_json(file))

In [4]:
info_df = pd.concat(json_dfs, ignore_index=True)

In [44]:
print(info_df.shape)
info_df.head()

(12221, 16)


Unnamed: 0,country,crew,ep_duration,episodes,genres,id,main_cast,num_ratings,num_watchers,orig_network,rating,reviews,synopsis,tags,title,year
0,Thailand,[/people/16060-sukramongkol-krit],,1,"[Drama, Romance]",/703791-hua-jai-yang-yahk-mee-rak,"[/people/9404-plengpanich-chatchai, /people/10...",1,3,[Channel 3],6.0,[],Pink is a recent divorced woman from Sara's fa...,[],Hua Jai Yang Yahk Mee Rak,2000
1,Thailand,[],60 min.,1,"[Drama, Romance]",/69627-fai-ruk-fai-pitsawat,"[/people/17978-chadaporn-rattanakorn, /people/...",1,3,[Channel 5],5.0,[],,"[Betrayal, Obsession]",Fai Ruk Fai Pitsawat,2000
2,China,[],46 min.,28,[Action],/56393-the-chinse-hero,[/people/3117-shao-bing],1,5,"[Tencent Video, iQiyi, Sohu TV, LeTV]",8.0,[],,[],The Chinese Hero,2000
3,China,[],40 min.,32,[Drama],/52199-deception,"[/people/13200-li-zong-han, /people/5918-liu-r...",1,3,[],4.0,[],,[Deception],Deception,2000
4,Taiwan,[],47 min.,20,"[Comedy, Martial Arts]",/51823-king-of-beggars-su-can,"[/people/16558-timmy-ho, /people/6946-tse-gard...",2,6,[Youku],6.0,[],,[],King of Beggars: Su Can,2000


In [43]:
info_df[info_df["reviews"].apply(lambda x: len(x)>0)]

Unnamed: 0,country,crew,ep_duration,episodes,genres,id,main_cast,num_ratings,num_watchers,orig_network,rating,reviews,synopsis,tags,title,year
26,South Korea,"[/people/16179-kim-in-young, /people/67395-par...",1 hr. 10 min.,16,"[Drama, Romance, Family]",/2868-honesty,"[/people/557-choi-ji-woo, /people/1462-son-ji-...",117,344,[MBC],7.5,[I really liked this drama when I watched it. ...,Lee Ja Young comes from a poor family who live...,"[Poor Female Lead, Rich Female Lead]",Truth,2000
31,Japan,"[/people/15966-ozaki-masaya, /people/16318-tae...",45 min.,11,"[Drama, Romance, Crime]",/2695-nisennen-no-koi,"[/people/1426-nakayama-miho, /people/518-kanes...",83,278,[Fuji TV],7.0,[This is a somewhat old Japanese drama that ma...,Yuri Maroev is a secret agent and assassin fro...,"[Strong Male Lead, Unhealthy Mains' Relationsh...",Nisennen no Koi,2000
42,Japan,"[/people/61933-ueda-hiroki, /people/15981-kita...",54 min.,11,"[Drama, Romance]",/782-beautiful-life,"[/people/1133-tokiwa-takako, /people/237-kimur...",1329,2812,[TBS],7.9,[To be loved like this! I'm trying to imagine ...,"Kyoko, a young woman with an unhindered spirit...","[Secondary Couple, Personal Growth, Terminal I...",Beautiful Life,2000
45,Thailand,"[/people/76819-phanom-thian, /people/15733-yam...",2 hr. 0 min.,11,"[Drama, Romance]",/13492-massaya,"[/people/7331-gregson-andrew, /people/9902-for...",71,174,[Channel 7],7.4,[A lot of people are leaving comments regardin...,Massaya's father was kicked out (along with hi...,"[Army Officer, Birth Secret, Tomboy, Older Man...",Massaya,2000
47,Japan,"[/people/30253-nagaishi-takao, /people/16473-o...",24 min.,49,"[Martial Arts, Action, Sci-Fi, Tokusatsu]",/3409-kamen-rider-kuuga,[/people/626-odagiri-joe],623,1333,[TV Asahi],7.9,[So This is my first official review for a Dra...,"Long ago, the Gurongi Tribe terrorized the Lin...","[Superhero, Sibling Relationship, Death Game, ...",Kamen Rider Kuuga,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12216,China,[],12 min.,12,"[Comedy, Fantasy, Romance]",/684457-irresistible-love,"[/people/59919-nai-yi-wen, /people/36999-cui-s...",56,534,[Mango TV],7.3,[Short and sweet If you're someone like me str...,As soon as Xia Tian Tian turned school bully G...,"[Web Series, Body Swap, Single Mother, Mother-...",Irresistible Love,2020
12217,China,"[/people/46465-fang-fang, /people/52077-zhang-...",30 min.,24,"[Historical, Wuxia, Mystery]",/684465-wu-lin-mi-tan-zhi-mei-ren-tu-jian,"[/people/14267-zhang-nan, /people/7700-cai-jun...",13,238,[Tencent Video],6.7,[Worst Wuxia Drama of The Year The production ...,Young noble Chu Yun Xiao crosses paths with fe...,"[Lying, Smart Male Lead, Investigation, Family...",Unsolved Cases of Kung Fu: Portrait of Beauty,2020
12218,South Korea,[],16 min.,6,"[Comedy, Romance]",/683713-life-boys,"[/people/4067-park-so-young, /people/56217-kim...",138,396,[Naver TV Cast],7.5,[charming but lacking This webdrama stood out ...,"Jin Nam Joo gets dumped by her boyfriend, whom...","[Fortune Telling, Reverse-Harem, Web Series, F...",The Man of My Life,2020
12219,Philippines,"[/people/59789-perci-intalan, /people/20217-ju...",30 min.,13,"[Drama, Romance]",/683583-gameboys-level-up-edition,"[/people/43303-elijah-canlas, /people/42395-ko...",692,1557,[Netflix],8.4,[Most Accurate BL Show I've Seen After having ...,Teenage streamer Cairo is caught off guard whe...,"[Gay Character, COVID-19, Fluffy, Internet Rom...",Gameboys Level-Up Edition,2020
