In [1]:
import os
import random
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Every publication exposes its daily archive under the same path layout;
# the placeholders are filled with (year, month, day) via str.format.
_ARCHIVE_PATH = '/archive/{0}/{1:02d}/{2:02d}'

# Publication name -> root URL. Insertion order is the crawl order.
_PUBLICATION_ROOTS = [
    ('Towards Data Science', 'https://towardsdatascience.com'),
    ('UX Collective', 'https://uxdesign.cc'),
    ('The Startup', 'https://medium.com/swlh'),
    ('The Writing Cooperative', 'https://writingcooperative.com'),
    ('Data Driven Investor', 'https://medium.com/datadriveninvestor'),
    ('Better Humans', 'https://medium.com/better-humans'),
    ('Better Marketing', 'https://medium.com/better-marketing'),
]

# Publication name -> archive URL template.
urls = {name: root + _ARCHIVE_PATH for name, root in _PUBLICATION_ROOTS}

In [3]:
def is_leap(year):
    """Return True if `year` is a leap year under the Gregorian rules.

    A year is a leap year when it is divisible by 4, except for century
    years (divisible by 100), which must also be divisible by 400.
    """
    divisible_by_4 = year % 4 == 0
    is_century = year % 100 == 0
    divisible_by_400 = year % 400 == 0
    return divisible_by_4 and (not is_century or divisible_by_400)
    
def convert_day(day, year):
    """Convert a 1-based day-of-year into a ``(month, day_of_month)`` tuple.

    Parameters:
        day: ordinal day within `year` (1 = January 1st).
        year: calendar year; only used to decide the length of February.

    Returns:
        ``(month, day_of_month)`` with a 1-based month. A non-positive `day`
        yields ``(0, 0)``.

    Raises:
        IndexError: if `day` exceeds the number of days in `year`.
    """
    if not day > 0:
        return (0, 0)

    february = 29 if is_leap(year) else 28
    lengths = [31, february, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

    remaining = day
    for month, length in enumerate(lengths, start=1):
        # The date falls in the first month that can still hold `remaining`.
        if remaining <= length:
            return (month, remaining)
        remaining -= length

    # Ran past December: same failure the list lookup would produce.
    raise IndexError('list index out of range')

def get_claps(claps_str):
    """Parse a clap counter such as ``'850'``, ``'1.1K'`` or ``'2M'`` into an int.

    Parameters:
        claps_str: the counter text. Anything that is not a string (``None``,
            or a non-text node such as a nested element) is treated as
            "no claps".

    Returns:
        The number of claps as an int; 0 for missing, empty or blank input.

    Raises:
        ValueError: if the text is a non-empty string that is not a number
            with an optional ``K``/``M`` suffix.
    """
    # Explicit type check instead of probing for a `.split` attribute:
    # only real text can carry a counter.
    if not isinstance(claps_str, str):
        return 0

    text = claps_str.strip()
    if not text:
        return 0

    multipliers = {'K': 1000, 'M': 1000000}
    suffix = text[-1].upper()
    if suffix in multipliers:
        # round(), not int(): the float product can land just below the
        # intended integer and truncation would then lose one clap.
        return round(float(text[:-1]) * multipliers[suffix])
    return int(float(text))

def get_img(img_url, dest_folder, dest_filename, timeout=30):
    """Download an image and save it as ``dest_folder/dest_filename.<ext>``.

    Parameters:
        img_url: URL of the image to fetch.
        dest_folder: directory to write into; created if it does not exist.
        dest_filename: file name without extension.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The file name (with extension, without the folder) that was written.

    Raises:
        requests.exceptions.RequestException: if the download fails or times out.
        OSError: if the file cannot be written.
    """
    # Read the extension from the URL path only, so query strings and host
    # names cannot leak into the file name.
    url_path = urlparse(img_url).path
    ext = os.path.splitext(url_path)[1].lstrip('.')
    if not ext or len(ext) > 4:
        ext = 'jpg'
    dest_filename = f'{dest_filename}.{ext}'

    # Download before opening the file so a failed request leaves no empty file.
    # NOTE(review): redirects are not followed and the status code is not
    # checked, so a redirect/error body is saved as-is -- confirm this is intended.
    response = requests.get(img_url, allow_redirects=False, timeout=timeout)

    os.makedirs(dest_folder, exist_ok=True)
    with open(os.path.join(dest_folder, dest_filename), 'wb') as f:
        f.write(response.content)
    return dest_filename

In [4]:
year = 2019
N_DAYS = 50  # how many distinct days of the year to scrape
SEED = 42    # fixed so that re-running the notebook scrapes the same days

days_in_year = 366 if is_leap(year) else 365
# A dedicated generator keeps the draw reproducible without touching the
# state of the global `random` module.
selected_days = random.Random(SEED).sample(range(1, days_in_year + 1), N_DAYS)

In [5]:
# Crawl the archive page of every publication for each selected day and
# collect one row per article in `data`.
REQUEST_TIMEOUT = 30  # seconds; without it one stalled connection blocks the whole crawl

data = []
article_id = 0
n = len(selected_days)
for i, d in enumerate(selected_days, start=1):
    month, day = convert_day(d, year)
    date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
    print(f'{i} / {n} ; {date}')
    for publication, url in urls.items():
        archive_url = url.format(year, month, day)
        response = requests.get(archive_url, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        # Skip when the request ended up somewhere other than the requested day
        # (presumably a redirect to a coarser archive page when that day has
        # no posts -- verify against the site).
        if not response.url.startswith(archive_url):
            continue
        page = response.content
        soup = BeautifulSoup(page, 'html.parser')
        articles = soup.find_all("div", class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")
        for article in articles:
            title = article.find("h3", class_="graf--title")
            if title is None:
                # Articles without a title heading are not recorded.
                continue
            title = title.contents[0]
            article_id += 1
            subtitle = article.find("h4", class_="graf--subtitle")
            subtitle = subtitle.contents[0] if subtitle is not None else ''
            image = article.find("img", class_="graf-image")
            # The image is stored under the article id, so rows and files line up.
            image = '' if image is None else get_img(image['src'], 'images', f'{article_id}')
            links = article.find_all("a")
            # The 4th link of the card is the article link; drop its query string.
            article_url = links[3]['href'].split('?')[0]
            claps = get_claps(article.find_all("button")[1].contents[0])
            reading_time = article.find("span", class_="readingTime")
            reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])
            # A card with exactly 7 links carries a responses link in last
            # position, whose text starts with the count. Always store an int
            # so the column has a single type.
            responses = 0
            if len(links) == 7:
                count_token = links[6].contents[0].split(' ')[0]
                try:
                    responses = get_claps(count_token)
                except ValueError:
                    # The 7th link did not start with a number: no response count.
                    responses = 0

            data.append([article_id, article_url, title, subtitle, image, claps, responses, reading_time, publication, date])

1 / 50 ; 2019-05-30
2 / 50 ; 2019-07-25
3 / 50 ; 2019-03-30
4 / 50 ; 2019-07-18
5 / 50 ; 2019-03-17
6 / 50 ; 2019-04-07
7 / 50 ; 2019-12-30
8 / 50 ; 2019-04-22
9 / 50 ; 2019-10-14
10 / 50 ; 2019-09-11
11 / 50 ; 2019-08-11
12 / 50 ; 2019-11-09
13 / 50 ; 2019-05-02
14 / 50 ; 2019-10-28
15 / 50 ; 2019-11-01
16 / 50 ; 2019-07-06
17 / 50 ; 2019-09-17
18 / 50 ; 2019-12-27
19 / 50 ; 2019-02-23
20 / 50 ; 2019-10-11
21 / 50 ; 2019-07-29
22 / 50 ; 2019-05-27
23 / 50 ; 2019-11-08
24 / 50 ; 2019-09-18
25 / 50 ; 2019-04-25
26 / 50 ; 2019-03-25
27 / 50 ; 2019-05-09
28 / 50 ; 2019-05-12
29 / 50 ; 2019-07-21
30 / 50 ; 2019-08-08
31 / 50 ; 2019-05-24
32 / 50 ; 2019-04-30
33 / 50 ; 2019-01-26
34 / 50 ; 2019-10-23
35 / 50 ; 2019-05-22
36 / 50 ; 2019-05-20
37 / 50 ; 2019-12-20
38 / 50 ; 2019-05-26
39 / 50 ; 2019-03-11
40 / 50 ; 2019-06-04
41 / 50 ; 2019-03-14
42 / 50 ; 2019-02-18
43 / 50 ; 2019-04-01
44 / 50 ; 2019-02-09
45 / 50 ; 2019-07-28
46 / 50 ; 2019-06-08
47 / 50 ; 2019-03-18
48 / 50 ; 2019-10-21
4

In [6]:
# Column labels, in the same order as the values of each row appended to `data`.
column_names = [
    'id',
    'url',
    'title',
    'subtitle',
    'image',
    'claps',
    'responses',
    'reading_time',
    'publication',
    'date',
]
medium_df = pd.DataFrame(data, columns=column_names)

In [7]:
# Show the assembled table; the rendered output elides the middle rows.
medium_df

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30
...,...,...,...,...,...,...,...,...,...,...
6503,6504,https://medium.com/better-marketing/we-vs-i-ho...,“We” vs “I” — How Should You Talk About Yourse...,Basic copywriting choices with a big…,6504.jpg,661,6,6,Better Marketing,2019-12-05
6504,6505,https://medium.com/better-marketing/how-donald...,How Donald Trump Markets Himself,Lessons from who might be the most popular bra...,6505.jpeg,189,1,5,Better Marketing,2019-12-05
6505,6506,https://medium.com/better-marketing/content-an...,Content and Marketing Beyond Mass Consumption,How to acquire customers without wasting money...,6506.jpg,207,1,8,Better Marketing,2019-12-05
6506,6507,https://medium.com/better-marketing/5-question...,5 Questions All Copywriters Should Ask Clients...,Save time and effort by…,6507.jpg,253,2,5,Better Marketing,2019-12-05


In [8]:
# Persist the scraped table; the DataFrame index is not written to the file.
output_path = 'medium_data.csv'
medium_df.to_csv(output_path, index=False)