# Medium Articles

This notebook scrapes Medium articles for key statistics. It can pull data for a specific year and for selected publications. The results are collected in a DataFrame with the columns below.

- id -- unique ID number for each article
- date -- the date the article was published
- url -- link to the article's webpage
- title -- the main title for the article (aka. Header 1)
- subtitle -- the sub header for the article (aka. Header 2)
- author_username -- the author's Medium username, without the `@` prefix
- claps -- the number of claps received for the article
- responses -- the number of responses received for the article
- reading_time -- the number of minutes it takes to read the entire article
- publication -- the main publication used by the writer for the article


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random

### Function to convert a day of the year into a month and day

In [None]:
def convert_day(day, year=None):
    """
    Convert a day of the year into a (month, day of month) pair.

    Arguments:
    day -- the day of the year, starting at 1 for 1 January.
    year -- optional year; when it is a leap year February has 29 days.
            Without it a 365-day year is assumed.

    Returns a tuple (month, day_of_month), both counted from 1.
    Raises ValueError when `day` does not fall inside the year.
    """
    import calendar

    month_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    if year is not None and calendar.isleap(year):
        month_days[1] = 29

    days_in_year = sum(month_days)
    if not 1 <= day <= days_in_year:
        # Fail loudly instead of returning the invalid date (0, 0) or
        # running off the end of the month table.
        raise ValueError(
            f'day must be between 1 and {days_in_year}, got {day!r}')

    # Peel off whole months until the remainder fits inside the current one.
    remaining = day
    for month, length in enumerate(month_days, start=1):
        if remaining <= length:
            return (month, remaining)
        remaining -= length

### Function to obtain and convert claps

In [None]:
def get_claps(claps_str):
    """
    Convert the clap count text pulled from Medium into an integer.

    Medium abbreviates thousands with a K (for example '1.5K').

    Arguments:
    claps_str -- the clap count text. None, blank text and values that are
                 not strings all count as zero claps.

    Returns the number of claps as an int.
    Raises ValueError when the text before the optional K is not a number.
    """
    # Anything that is not text carries no readable count. This replaces the
    # earlier `claps_str.split is None` duck-type test, which presumably
    # targeted parsed-page elements handed over instead of a string.
    if not isinstance(claps_str, str):
        return 0

    text = claps_str.strip()
    if not text:
        return 0

    number, has_k, _ = text.partition('K')
    claps = float(number)
    if has_k:
        # Round instead of truncating: binary floats can land just below the
        # intended value (1.005 * 1000 is slightly less than 1005).
        return int(round(claps * 1000))
    return int(claps)

### Function to scrape article for information

In [None]:
def get_article(year, days, urls):
    """
    Scrape Medium archive pages and save the statistics of their articles.

    A random sample of day-of-year numbers is converted to calendar dates.
    For every date the archive page of each publication is downloaded and
    each article listed on it becomes one row. The rows are stored in the
    global DataFrame `medium_df` (indexed by `id`) and written to
    'medium_articles.csv'.

    Arguments:
    year -- the year you would like to pull articles from.
    days -- the number of random days you would like articles pulled from.
            Days are sampled without repetition from the numbers 1..365, so
            a larger value makes random.sample raise ValueError.
    urls -- {dictionary} mapping the name of each publication to the address
            of its archive, with format fields for the year ({0}), the
            month ({1}) and the day ({2}).

    NOTE(review): day numbers are passed to convert_day without the year, so
    with a 365-day month table 29 February is never sampled in a leap year.
    """
    global medium_df

    # Seconds to wait for a server before giving up on one archive page.
    request_timeout = 30

    data = []
    article_id = 0
    selected_days = random.sample(range(1, 366), days)
    n = len(selected_days)

    for i, d in enumerate(selected_days, start=1):

        # formatting each url passing through
        month, day = convert_day(d)
        date = '{0}-{1:02d}-{2:02d}'.format(year, month, day)
        print(f'Downloading {i} / {n} : {date}')
        for publication, url in urls.items():
            archive_url = url.format(year, month, day)
            try:
                # Without a timeout a silent server would stall the whole run.
                response = requests.get(archive_url, allow_redirects=True,
                                        timeout=request_timeout)
            except requests.RequestException as error:
                # One failed download must not throw away the rows collected
                # so far; report it and move on to the next archive page.
                print(f'Skipping {archive_url} : {error}')
                continue

            # The request was redirected away from the requested archive page
            # (presumably no archive exists for that day), so skip it.
            if not response.url.startswith(archive_url):
                continue

            # setting soup variables
            page = response.content
            soup = BeautifulSoup(page, 'html.parser')
            articles = soup.find_all(
                "div",
                class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls")

            # saving variables from articles
            for article in articles:
                title = article.find("h3", class_="graf--title")
                if title is None:
                    continue

                title = title.contents[0]
                article_id += 1
                author_box = article.find('div', class_='postMetaInline u-floatLeft u-sm-maxWidthFullWidth')
                author_url = author_box.find('a')['href']
                author_username = author_url.split('@')[1].strip().lower()
                subtitle = article.find("h4", class_="graf--subtitle")
                subtitle = subtitle.contents[0] if subtitle is not None else ''

                # The positions used below (4th link, 2nd button, 7th link)
                # assume the layout of an archive card -- TODO confirm
                # against the current Medium markup.
                links = article.find_all("a")
                article_url = links[3]['href'].split('?')[0]
                claps = get_claps(article.find_all("button")[1].contents[0])
                reading_time = article.find("span", class_="readingTime")
                reading_time = 0 if reading_time is None else int(reading_time['title'].split(' ')[0])

                # A 7th link is taken to be the responses link, with the
                # count as its first word. The column is always an int.
                responses = 0
                if len(links) == 7:
                    first_word = links[6].contents[0].split(' ')[0]
                    if first_word.isdigit():
                        responses = int(first_word)

                data.append([article_id, date, article_url, title,
                             subtitle, author_username, claps, responses,
                             reading_time, publication])

    medium_df = pd.DataFrame(data, columns=[
                                            'id', 'date', 'url', 'title', 'subtitle',
                                            'author_username', 'claps', 'responses',
                                            'reading_time', 'publication']).set_index('id')

    medium_df.to_csv('medium_articles.csv')
    

### Inputs for the Function

You can enter a dictionary that maps each publication's name to the formatted link of its article archive.

**Note:** Entering a large number for days will result in a long run time.

In [None]:
# Publications to parse, keyed by name. Every archive address is the root of
# the publication followed by the same date path: year, then month and day
# zero-padded to two digits.
urls = {
    name: root + '/archive/{0}/{1:02d}/{2:02d}'
    for name, root in (
        ('Towards Data Science', 'https://towardsdatascience.com'),
        ('Analytics Vidhya', 'https://medium.com/analytics-vidhya'),
        ('Data Insight Action', 'https://medium.com/data-insights-action'),
        ('DataSeries', 'https://medium.com/dataseries'),
        ('Ocean Protocol', 'https://blog.oceanprotocol.com'),
        ('Hacking Analyics', 'https://medium.com/analytics-and-data'),
        ('Nightingale', 'https://medium.com/nightingale'),
        ('The Startup', 'https://medium.com/swlh'),
        ('Data Driven Investor', 'https://medium.com/datadriveninvestor'),
        ('IBM Data and AI', 'https://medium.com/ibm-data-ai'),
        ('Data Science Brigade', 'https://medium.com/data-science-brigade'),
        ('Kaggle Blog', 'https://medium.com/kaggle-blog'),
        ('Data & Society: Points', 'https://points.datasociety.net'),
    )
}

# Year whose archives are scraped.
year = 2020
# How many randomly chosen days of that year to download.
days = 300

In [None]:
# Run the scraper: it fills the global `medium_df` and writes 'medium_articles.csv'.
get_article(year, days, urls)