In [1]:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import numpy as np
import pandas as pd

## Data Scraping

In [2]:
def get_rating(link):
    """Fetch a webtoon page and return the text of its rating element.

    Parameters
    ----------
    link : str
        URL of the webtoon page to open.

    Returns
    -------
    The first child of the first ``<em class="cnt">`` element on the page.

    Raises
    ------
    ValueError
        If the page has no ``<em class="cnt">`` element or it is empty.
    """
    # Close the HTTP response as soon as it has been parsed.
    with urlopen(link) as page:
        soup = BeautifulSoup(page, 'html.parser')
    score = soup.find("em", class_='cnt')
    if score is None or not score.contents:
        raise ValueError("no rating element (em.cnt) found at " + str(link))
    return score.contents[0]

In [3]:
def get_chapter(link):
    """Fetch a webtoon page and return the number shown in its ``span.tx`` element.

    Parameters
    ----------
    link : str
        URL of the webtoon page to open.

    Returns
    -------
    int
        The text of the first ``<span class="tx">`` element with every "#"
        removed, converted to an integer.

    Raises
    ------
    ValueError
        If the page has no ``<span class="tx">`` element, it is empty, or its
        text is not an integer once "#" is removed.
    """
    # Close the HTTP response as soon as it has been parsed.
    with urlopen(link) as page:
        soup = BeautifulSoup(page, 'html.parser')
    chapter = soup.find("span", class_='tx')
    if chapter is None or not chapter.contents:
        raise ValueError("no episode element (span.tx) found at " + str(link))
    label = chapter.contents[0]
    return int(label.replace("#", ''))

In [4]:
quote_page = 'https://www.webtoons.com/en/dailySchedule'

# Query the website and parse the returned HTML; the response is closed once parsed.
with urlopen(quote_page) as page:
    soup = BeautifulSoup(page, 'html.parser')

everyday = np.array(["MONDAY","TUESDAY","WEDNESDAY","THURSDAY","FRIDAY","SATURDAY","SUNDAY","COMPLETED"])

# Create the csv. The `with` block guarantees the file is flushed and closed
# before any later cell reads it; newline='' is what the csv module expects,
# and an explicit encoding keeps the file readable by pd.read_csv everywhere.
with open('webtoons_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    webtoon_data = csv.writer(csv_file)
    webtoon_data.writerow(['Name', 'Heart', 'Score', 'Genre', 'Status', "Episode"])

    for day in everyday:
        # The completed list uses its own class name; the other sections are
        # looked up by a class name that ends with the day label.
        if day == "COMPLETED":
            update_find = "daily_lst comp"
        else:
            update_find = "daily_section _list_" + day
        d_day = soup.find(class_=update_find)
        if d_day is None:
            raise RuntimeError("schedule section not found for " + str(day))

        name = d_day.find_all('p', class_='subj')
        likes = d_day.find_all('em')
        genre = d_day.find_all('p', class_='genre')
        # Keep every anchor target in the section except those whose href is "#".
        webtoon_link = [
            anchor.get('href')
            for anchor in d_day.find_all('a')
            if anchor.get('href') != "#"
        ]

        # zip stops at the shortest list, so only complete rows are written.
        for n, l, g, url in zip(name, likes, genre, webtoon_link):
            title = n.contents[0]
            hearts = l.contents[0]
            genre_name = g.contents[0]
            # Each title needs two extra page fetches: one for the score, one for the episode count.
            scores = get_rating(url)
            chapters = get_chapter(url)
            webtoon_data.writerow([title, hearts, scores, genre_name, day, chapters])
        

## Using pandas

In [23]:
# Load the scraped table and show every row for one title
df = pd.read_csv('webtoons_data.csv')
df[df['Name'].eq("Yumi's Cells")]

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
69,Yumi's Cells,7.2M,9.74,Romance,WEDNESDAY,392
154,Yumi's Cells,7.2M,9.74,Romance,SATURDAY,392


In [20]:
# Number of rows currently in the table
len(df)

340

In [24]:
# Preview the first rows instead of dumping the whole table
df.head(10)

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
0,SubZero,2.4M,9.82,Romance,MONDAY,29
1,Edith,1.6M,9.76,Romance,MONDAY,26
2,Cursed Princess Club,1.3M,9.74,Comedy,MONDAY,26
3,Sweet Home,5.9M,9.84,Thriller,MONDAY,83
4,My Giant Nerd Boyfriend,20.9M,9.73,Slice of life,MONDAY,329
5,Ghost Wife,4M,9.67,Romance,MONDAY,80
6,Code Adam,183377,9.83,Action,MONDAY,7
7,Love Advice from the Great Duke of Hell,3.6M,9.78,Comedy,MONDAY,61
8,Tower of God,13.3M,9.83,Fantasy,MONDAY,436
9,The Four of Them,865630,9.60,Drama,MONDAY,27


In [27]:
# Some webtoons update more than once a week and so appear under several days;
# keep only the first row for each name.
df = df.drop_duplicates(subset="Name")

# Save the de-duplicated table over the csv file, then load it back
df.to_csv('webtoons_data.csv', index=False)
df = pd.read_csv('webtoons_data.csv')

#### What are the webtoons updated on Monday? How many of them have a score below the average score of all webtoons? What are they?

In [30]:
# Select the rows whose Status equals the first schedule label (Monday)
everyday = np.array(["MONDAY","TUESDAY","WEDNESDAY","THURSDAY","FRIDAY","SATURDAY","SUNDAY","COMPLETED"])
monday = df.loc[df['Status'] == everyday[0]]
monday.head()

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
0,SubZero,2.4M,9.82,Romance,MONDAY,29
1,Edith,1.6M,9.76,Romance,MONDAY,26
2,Cursed Princess Club,1.3M,9.74,Comedy,MONDAY,26
3,Sweet Home,5.9M,9.84,Thriller,MONDAY,83
4,My Giant Nerd Boyfriend,20.9M,9.73,Slice of life,MONDAY,329


In [31]:
# Mean Score over all webtoons in the table, not only Monday's
avg_score = df.loc[:, 'Score'].mean()
avg_score

9.357269624573377

In [32]:
# Monday webtoons whose Score is strictly below the overall average
below_avg = monday.loc[monday['Score'].lt(avg_score)]
below_avg

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
23,Messenger,383271,9.17,Fantasy,MONDAY,54


Sorting the dataset by Heart

In [33]:
# Convert a Heart entry from its display string to a float
def value_to_float(x):
    """Convert a Heart count such as ``"2.4M"`` or ``"183,377"`` to a float.

    Parameters
    ----------
    x : str or number
        A count written with thousands separators (``"183,377"``), with a
        magnitude suffix (``"2.4M"``), as plain digits (``"5340"``), or a
        value that is already numeric.

    Returns
    -------
    float
        The numeric value. A blank string gives NaN; a numeric input is
        returned as a float so re-running the conversion is harmless.

    Raises
    ------
    ValueError
        If the text is not a number once separators and suffix are removed.
    """
    # Already numeric (for example the column was converted earlier): pass through.
    if not isinstance(x, str):
        return float(x)

    text = x.strip().replace(',', '')
    if not text:
        return float('nan')

    # "M" is the suffix seen in the scraped data; "K" and "B" are accepted as well.
    multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}
    suffix = text[-1].upper()
    if suffix in multipliers:
        return float(text[:-1]) * multipliers[suffix]

    # Plain digits previously fell through and produced None.
    return float(text)
    
# Run value_to_float over every Heart entry and store the result back in the column
df['Heart'] = df['Heart'].apply(value_to_float)

In [34]:
# Order the table from most to fewest Hearts
sort_by_heart = df.sort_values(by='Heart', ascending=False)
sort_by_heart.head(10)  # Top ten webtoons with the most Hearts

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
4,My Giant Nerd Boyfriend,20900000.0,9.73,Slice of life,MONDAY,329
61,Bluechair,19700000.0,9.73,Comedy,WEDNESDAY,694
82,unOrdinary,17100000.0,9.83,Fantasy,THURSDAY,152
140,Lookism,15900000.0,9.83,Drama,SUNDAY,240
30,Winter Moon,14300000.0,9.44,Fantasy,TUESDAY,273
114,I Love Yoo,13600000.0,9.83,Romance,FRIDAY,96
8,Tower of God,13300000.0,9.83,Fantasy,MONDAY,436
29,Boyfriend of the Dead,13200000.0,9.78,Comedy,TUESDAY,202
116,Siren's Lament,11900000.0,9.76,Romance,SATURDAY,158
54,Let's Play,11100000.0,9.66,Romance,TUESDAY,80


In [35]:
sort_by_heart.tail(10)  # The ten webtoons with the fewest Hearts

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
284,The Purple Heart,34197.0,9.09,Superhero,COMPLETED,26
74,Bloodless Wars,32321.0,9.24,Action,WEDNESDAY,10
177,The Brooklynite,23644.0,9.04,Superhero,COMPLETED,26
250,Just Ask Yuli,21067.0,7.56,Slice of life,COMPLETED,25
111,Acursian,18896.0,7.36,Fantasy,FRIDAY,9
126,Skate!!! Fire 100,17035.0,9.43,Fantasy,SATURDAY,3
113,FINALITY,15327.0,8.34,Thriller,FRIDAY,8
219,Epic V,13616.0,5.66,Comedy,COMPLETED,27
77,STARCROSS,8170.0,7.54,Superhero,WEDNESDAY,17
181,Cyko-KO,5340.0,7.56,Comedy,COMPLETED,26


#### Which webtoons have fewer than 100k hearts? Which genres are most likely to have fewer than 100k hearts?

In [36]:
# Rows with fewer than 100k Hearts, still in descending Heart order
low_heart = sort_by_heart.loc[sort_by_heart['Heart'].lt(100000)]
low_heart.head()

Unnamed: 0,Name,Heart,Score,Genre,Status,Episode
132,Metaphorical HER,96584.0,8.78,Drama,SATURDAY,32
162,SHADOW,95367.0,9.41,Fantasy,COMPLETED,12
282,Cyberbunk,95110.0,8.56,Sci-fi,COMPLETED,194
265,Tickle Town,94340.0,6.61,Comedy,COMPLETED,180
109,Brothers Bond,93901.0,9.26,Action,FRIDAY,40


In [37]:
# Distinct genre labels, in order of first appearance, and how many there are
genres = df['Genre'].unique()
count_g = df['Genre'].nunique()
print(f"There are {count_g} genres of webtoons:")
print(genres)

There are 12 genres of webtoons:
['Romance' 'Comedy' 'Thriller' 'Slice of life' 'Action' 'Fantasy' 'Drama'
 'Sci-fi' 'Sports' 'Superhero' 'Informative' 'Horror']


In [38]:
# Number of low-Heart webtoons per genre, most frequent first
heart_genre = low_heart.loc[:, 'Genre'].value_counts()
heart_genre

Fantasy          8
Comedy           7
Drama            7
Slice of life    5
Superhero        5
Action           4
Sci-fi           4
Thriller         4
Name: Genre, dtype: int64

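Fantasy is the genre with the most webtoons under 100k hearts (8 of them). Note that these are raw counts rather than per-genre proportions, so they also reflect how many webtoons each genre has overall.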