In [1]:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import numpy as np
import pandas as pd

## Data Scraping

In [2]:
def get_rating(link):
    """Fetch a webtoon page and return the text of its rating element.

    Parameters
    ----------
    link : str
        URL of the webtoon's page.

    Returns
    -------
    The first child of the first ``<em class="cnt">`` element on the page
    (presumably the score as text, e.g. ``'9.77'`` -- confirm against the
    live markup).

    Raises
    ------
    ValueError
        If the page contains no ``<em class="cnt">`` element.
    """
    # The context manager closes the HTTP response once the document has been
    # parsed, instead of leaking one open connection per scraped webtoon.
    with urlopen(link) as page:
        soup = BeautifulSoup(page, 'html.parser')
    score = soup.find("em", class_='cnt')
    if score is None:
        # Fail with the offending URL rather than an AttributeError on None.
        raise ValueError("no <em class='cnt'> rating element found at " + str(link))
    return score.contents[0]

In [3]:
def get_chapter(link):
    """Fetch a webtoon page and return the number shown in its episode label.

    Parameters
    ----------
    link : str
        URL of the webtoon's page.

    Returns
    -------
    int
        The text of the first ``<span class="tx">`` element with every ``#``
        removed, converted to an integer (presumably the latest episode
        number -- confirm against the live markup).

    Raises
    ------
    ValueError
        If the page contains no ``<span class="tx">`` element, or its text is
        not an integer once ``#`` is stripped.
    """
    # The context manager closes the HTTP response once the document has been
    # parsed, instead of leaking one open connection per scraped webtoon.
    with urlopen(link) as page:
        soup = BeautifulSoup(page, 'html.parser')
    chapter = soup.find("span", class_='tx')
    if chapter is None:
        # Fail with the offending URL rather than an AttributeError on None.
        raise ValueError("no <span class='tx'> episode element found at " + str(link))
    chapter_text = chapter.contents[0]
    return int(chapter_text.replace("#", ''))

In [None]:
quote_page = 'https://www.webtoons.com/en/dailySchedule'

# Query the website and parse the returned HTML. The context manager closes
# the HTTP response as soon as the document has been read.
with urlopen(quote_page) as page:
    soup = BeautifulSoup(page, 'html.parser')

# Schedule sections to scrape. Kept as a module-level array because a later
# cell indexes into it.
everyday = np.array(["MONDAY","TUESDAY","WEDNESDAY","THURSDAY","FRIDAY","SATURDAY","SUNDAY","COMPLETED"])

# Write the CSV inside a `with` block so the file is flushed and closed before
# pandas reads it back. newline='' is what the csv module requires to avoid
# blank rows on Windows; UTF-8 matches the default encoding of pd.read_csv.
with open('webtoons_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    webtoon_data = csv.writer(csv_file)
    webtoon_data.writerow(['Name', 'Heart', 'Score', 'Genre', 'Status', "Episode"])

    for day in everyday:
        # The completed series live in a differently named container than the
        # per-weekday sections.
        if day == "COMPLETED":
            update_find = "daily_lst comp"
        else:
            update_find = "daily_section _list_" + day
        d_day = soup.find(class_=update_find)
        name = d_day.find_all('p', class_='subj')
        likes = d_day.find_all('em')
        genre = d_day.find_all('p', class_='genre')
        # Anchors whose href is "#" are skipped; every other href is treated
        # as the link to a webtoon's own page.
        webtoon_link = [
            anchor.get('href')
            for anchor in d_day.find_all('a')
            if anchor.get('href') != "#"
        ]
        # zip stops at the shortest sequence, so only entries that have a
        # title, a like count, a genre and a link are written.
        for n, l, g, link in zip(name, likes, genre, webtoon_link):
            N = n.contents[0]
            L = l.contents[0]
            G = g.contents[0]
            # Each helper performs its own request to the webtoon's page.
            scores = get_rating(link)
            chapters = get_chapter(link)
            webtoon_data.writerow([N, L, scores, G, day, chapters])
        

## Using pandas

In [32]:
# Load the scraped data and keep a single row per webtoon name: a series that
# updates more than once a week appears once per update day in the CSV.
df = pd.read_csv('webtoons_data.csv').drop_duplicates(subset="Name")

In [29]:
# Preview the first ten rows instead of rendering the whole table
df.head(10)

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
0,Edith,1.5M,9.77,Romance,MONDAY,25
1,SubZero,2.3M,9.82,Romance,MONDAY,28
2,Cursed Princess Club,1.2M,9.74,Comedy,MONDAY,25
3,Sweet Home,5.8M,9.84,Thriller,MONDAY,82
4,My Giant Nerd Boyfriend,20.7M,9.73,Slice of life,MONDAY,327
5,Ghost Wife,3.9M,9.67,Romance,MONDAY,78
6,Tower of God,13.2M,9.83,Fantasy,MONDAY,435
7,Code Adam,154586,9.83,Action,MONDAY,6
8,Love Advice from the Great Duke of Hell,3.5M,9.78,Comedy,MONDAY,60
9,The Four of Them,857800,9.60,Drama,MONDAY,27


What are the webtoons updated on Monday? How many of them have a score below the average score of all webtoons? What are they?

In [13]:
# Webtoons that update on Monday. The day is written out literally so this
# cell does not depend on `everyday`, which only exists after the (slow)
# scraping cell has been run in the current kernel.
monday = df[df.Status == "MONDAY"]
monday.head()

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
0,Edith,1.5M,9.77,Romance,MONDAY,25
1,SubZero,2.3M,9.82,Romance,MONDAY,28
2,Cursed Princess Club,1.2M,9.74,Comedy,MONDAY,25
3,Sweet Home,5.8M,9.84,Thriller,MONDAY,82
4,My Giant Nerd Boyfriend,20.7M,9.73,Slice of life,MONDAY,327


In [14]:
# Mean of the Score column over every row of df
avg_score = df.loc[:, 'Score'].mean()
avg_score

9.34351612903226

In [15]:
# Monday webtoons whose score is strictly below the overall average
is_below_avg = monday['Score'].lt(avg_score)
below_avg = monday[is_below_avg]
below_avg

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
23,Messenger,376973,9.18,Fantasy,MONDAY,53


Sorting the dataset by Heart

In [16]:
# Change the column from a string to a float
def value_to_float(x):
    """Convert a count such as ``'1.5M'`` or ``'376,973'`` to a float.

    Parameters
    ----------
    x : str, int or float
        Text using an optional ``K``/``M``/``B`` suffix and/or thousands
        separators, or a value that is already numeric.

    Returns
    -------
    float
        The numeric value; NaN for an empty string. Numeric input (including
        NaN) is passed through, so re-applying the conversion is harmless.

    Raises
    ------
    ValueError
        If the text is not a number once separators and suffix are removed.
    """
    # Already converted (or a missing value read by pandas as NaN): pass it
    # through, so re-running the conversion on a numeric column is a no-op.
    if isinstance(x, (int, float)):
        return float(x)

    multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}
    text = str(x).strip().replace(',', '')
    if not text:
        return float('nan')

    suffix = text[-1].upper()
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]
    # Plain digits with neither suffix nor separator
    return float(text)
    
# Run every entry of the Heart column through value_to_float and store the
# converted values back under the same column name
heart_as_float = df['Heart'].map(value_to_float)
df['Heart'] = heart_as_float

In [17]:
# Order the table from the most Hearts to the fewest
sort_by_heart = df.sort_values(by='Heart', ascending=False)
sort_by_heart.head(10)  # the ten webtoons with the most Hearts

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
4,My Giant Nerd Boyfriend,20700000.0,9.73,Slice of life,MONDAY,327
62,Bluechair,19600000.0,9.73,Comedy,WEDNESDAY,692
86,unOrdinary,16800000.0,9.83,Fantasy,THURSDAY,151
174,Lookism,15800000.0,9.83,Drama,SUNDAY,239
30,Winter Moon,14100000.0,9.44,Fantasy,TUESDAY,270
137,I Love Yoo,13500000.0,9.83,Romance,FRIDAY,96
6,Tower of God,13200000.0,9.83,Fantasy,MONDAY,435
29,Boyfriend of the Dead,13100000.0,9.78,Comedy,TUESDAY,200
140,Siren's Lament,11800000.0,9.76,Romance,SATURDAY,157
54,Let's Play,11100000.0,9.66,Romance,TUESDAY,80


In [18]:
sort_by_heart.tail(10)  # the ten webtoons with the fewest Hearts

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
77,Bloodless Wars,31343.0,9.25,Action,WEDNESDAY,10
338,War Cry,30343.0,7.18,Superhero,COMPLETED,26
221,The Brooklynite,23594.0,9.04,Superhero,COMPLETED,26
294,Just Ask Yuli,21004.0,7.56,Slice of life,COMPLETED,25
134,Acursian,18854.0,7.37,Fantasy,FRIDAY,9
136,FINALITY,15288.0,8.34,Thriller,FRIDAY,8
350,City of Walls,14042.0,7.44,Action,COMPLETED,59
263,Epic V,13603.0,5.67,Comedy,COMPLETED,27
81,STARCROSS,7955.0,7.56,Superhero,WEDNESDAY,17
225,Cyko-KO,5279.0,7.58,Comedy,COMPLETED,26


Which webtoons have fewer than 100k hearts? Which genre is most likely to get fewer than 100k hearts?

In [35]:
# Rows of the Heart-sorted table with fewer than 100,000 Hearts
under_100k = sort_by_heart['Heart'].lt(100000)
low_heart = sort_by_heart.loc[under_100k]
low_heart

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
165,Metaphorical HER,96236.0,8.78,Drama,SATURDAY,32
206,SHADOW,94956.0,9.41,Fantasy,COMPLETED,12
326,Cyberbunk,94628.0,8.56,Sci-fi,COMPLETED,194
309,Tickle Town,94207.0,6.6,Comedy,COMPLETED,180
130,Brothers Bond,92098.0,9.25,Action,FRIDAY,39
91,Brass & Sass,89524.0,9.53,Romance,THURSDAY,5
303,Inarime,86667.0,8.82,Fantasy,COMPLETED,71
346,Cherry Blossoms,85695.0,9.2,Romance,COMPLETED,7
232,Spirits,85400.0,9.06,Fantasy,COMPLETED,35
278,Midnight Rhapsody,85250.0,9.15,Slice of life,COMPLETED,38


In [36]:
genre_column = df['Genre']
genres = genre_column.unique()    # the distinct genres of webtoon
count_g = genre_column.nunique()  # how many distinct genres there are
print(f"There are {count_g} genres of webtoons:")
print(genres)

There are 12 genres of webtoons:
['Romance' 'Comedy' 'Thriller' 'Slice of life' 'Fantasy' 'Action' 'Drama'
 'Sci-fi' 'Sports' 'Superhero' 'Informative' 'Horror']


In [38]:
# Number of low-Heart webtoons in each genre
heart_genre = low_heart.loc[:, 'Genre'].value_counts()
heart_genre

Fantasy          9
Comedy           7
Drama            7
Superhero        6
Action           5
Slice of life    5
Thriller         4
Sci-fi           4
Romance          2
Name: Genre, dtype: int64

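Fantasy has the most webtoons with fewer than 100k hearts (9). Note that this is a raw count: to say which genre is most *likely* to fall below 100k hearts, each count should be divided by the total number of webtoons in that genre.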

## Using SQL