# Medium scraper

Script to scrape Medium articles based on tag and date range.  

## Initialization

In [8]:
import re
from datetime import date,timedelta
import time
import csv

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import random

import nltk
from nltk.tokenize import word_tokenize

from selenium import webdriver
from selenium.webdriver.common.by import By


In [14]:
# ---- Scraper settings: adjust these before running the notebook ----

# Medium tags whose daily archive pages are scraped
TAGS = ['data-science']

# First and last day of the archive range (the scraping loop includes both ends)
DATE_FROM = date(year=2022, month=1, day=1)
DATE_TO = date(year=2022, month=1, day=31)

# Seconds to pause after each page load
INTERVAL = 3

# Cap on articles scraped for a single day; larger days are randomly sampled
MAX_ARTICLES_PER_DAY = 30
# Retry budget (NOTE: not referenced by the cells visible in this notebook)
MAX_RETRIES = 3

# Email address used to sign in to Medium (needed to scrape member-only articles)
MEDIUM_EMAIL = 'hiroakiroa@gmail.com'


### Initialize Selenium

In [10]:
# If you're running this through colab, you can use the following script instead of the next one. 

# %%shell
# sudo apt -y update
# sudo apt install -y wget curl unzip
# wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb
# dpkg -i libu2f-udev_1.1.4-1_all.deb
# wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
# dpkg -i google-chrome-stable_current_amd64.deb

# wget -N https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/118.0.5993.70/linux64/chromedriver-linux64.zip -P /tmp/
# unzip -o /tmp/chromedriver-linux64.zip -d /tmp/
# chmod +x /tmp/chromedriver-linux64/chromedriver
# mv /tmp/chromedriver-linux64/chromedriver /usr/local/bin/chromedriver
# pip install selenium chromedriver_autoinstaller

# import sys
# sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

# from selenium import webdriver
# import chromedriver_autoinstaller

# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless') # this is a must
# chrome_options.add_argument('--no-sandbox')
# chrome_options.add_argument('--disable-dev-shm-usage')
# chromedriver_autoinstaller.install()

# driver = webdriver.Chrome(options=chrome_options)


In [11]:
# Local (non-Colab) setup: this launches a visible Chrome window unless the
# '--headless' flag below is enabled.

chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    #'--headless',  # enable this if you don't want to see the chrome browser
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)

## Scrape article list

In [12]:
# Collect article URLs from Medium's per-day tag archive pages and store them in
# data/article_list.csv with the columns url, tag, date.
delta = timedelta(days=1)

# utf-8 keeps non-ASCII URLs writable on every platform; newline="" hands line-ending
# control to the csv writer, as the csv module requires.
with open("data/article_list.csv", "w", encoding="utf-8", newline="") as file:
    file.write("url,tag,date\n")
    # QUOTE_ALL reproduces the "url","tag","date" row layout while correctly escaping
    # any double quote that appears inside a URL.
    writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n")
    for tag in TAGS:
        # Restart the date range for every tag; otherwise only the first tag is scraped
        # because the cursor is already past DATE_TO for the following ones.
        curr_date = DATE_FROM
        while curr_date <= DATE_TO:
            print(curr_date, end=" ")
            driver.get('https://medium.com/tag/' + tag + "/archive/" + curr_date.strftime("%Y/%m/%d"))
            # Give the page time to render before reading its source.
            time.sleep(INTERVAL)
            soup = BeautifulSoup(driver.page_source, "html.parser")

            for anchor in soup.find_all('a'):
                # As of Mar 2024 it seems the article titles (which are hyperlinks) are set as h3.
                # Using this as the tag to find the article urls
                if anchor.find('h3') is None:
                    continue
                # Drop the query string; anchors with a missing or empty href are skipped
                # instead of raising.
                href = (anchor.get('href') or "").split("?", 1)[0]
                if not href:
                    continue
                writer.writerow([href, tag, curr_date.strftime("%Y-%m-%d")])
            curr_date += delta


2022-01-01 2022-01-02 2022-01-03 2022-01-04 2022-01-05 2022-01-06 2022-01-07 2022-01-08 2022-01-09 2022-01-10 2022-01-11 2022-01-12 2022-01-13 2022-01-14 2022-01-15 2022-01-16 2022-01-17 2022-01-18 2022-01-19 2022-01-20 2022-01-21 2022-01-22 2022-01-23 2022-01-24 2022-01-25 2022-01-26 2022-01-27 2022-01-28 2022-01-29 2022-01-30 2022-01-31 

## Scrape articles

### Medium sign in
This step is required to scrape member-only articles. Note that the script interacts with the browser window using Selenium, which makes it more vulnerable to changes on the Medium site.

__Make sure you have updated the MEDIUM_EMAIL parameter__

In [21]:
# Request a sign-in email from Medium by driving the login dialog in the browser.

# implicitly_wait() is not a pause: it sets the session-wide maximum time that
# find_element() keeps polling for an element. Set it once, generously, so each
# step waits for the next dialog element to appear instead of failing early.
ELEMENT_TIMEOUT = 10  # seconds
driver.implicitly_wait(ELEMENT_TIMEOUT)

driver.get("https://medium.com")

# Open the sign-in dialog.
element = driver.find_element(By.XPATH,"//button[text()='Sign in']")
element.click()

# Choose the email sign-in option.
sign_in_with_email = driver.find_element(By.XPATH,'//div[text()="Sign in with email"]')
sign_in_with_email.click()

# Fill in the email address configured in MEDIUM_EMAIL.
email = driver.find_element(By.XPATH,"//input[@aria-label='email']")
email.click()
email.send_keys(MEDIUM_EMAIL)

# Submit the form.
cont = driver.find_element(By.XPATH,'//button[text()="Continue"]')
cont.click()

The script above should send a login verification email to your email address. Once you receive the email, copy the sign-in URL from it and paste it into the _signin_url_ field below.

In [22]:
# The sign-in URL embeds a login token, so it is requested at run time rather than
# stored in the notebook, where it would end up saved and shared with the file.
from getpass import getpass  # local import: only this cell needs it

signin_url = getpass("Paste the Medium sign-in URL from the verification email: ").strip()
if not signin_url.startswith("https://medium.com/"):
    raise ValueError("Expected a sign-in URL starting with https://medium.com/")
driver.get(signin_url)


### Scrape articles

In [23]:
# Visit every URL in data/article_list.csv and store the article text in one CSV file
# per (year-month, tag) pair: data/articles_<YYYY-mm>_<tag>.csv with the columns
# url, tag, date, content.
article_list_df = pd.read_csv('data/article_list.csv')

# If you don't want to scrape all the articles in the article_list, filter it here.
# Note that the date field is stored as a string in the format of YYYY-mm-DD
# article_list_df = article_list_df.loc[(article_list_df['date'] >= ('2023-09'))]

# YYYY-mm prefix of each row's date; used to split the output by month.
year_month = article_list_df['date'].str.slice(0, 7).rename('year-month')

# sort=False keeps the groups in order of first appearance in the article list.
for (month, month_tag), month_article_df in article_list_df.groupby([year_month, 'tag'], sort=False):
    # newline="" is required by the csv module; without it rows get an extra blank
    # line on Windows and newlines inside the content field are translated.
    with open(f"data/articles_{month}_{month_tag}.csv", "w", encoding="utf-8", newline="") as file:
        writer = csv.writer(file, dialect='excel')
        writer.writerow(['url','tag','date','content'])

        for day, day_article_df in month_article_df.groupby('date', sort=False):
            print(day, end=" ")
            # If number of articles is more than the MAX_ARTICLES_PER_DAY, sample
            # (fixed random_state so the same articles are picked on every run).
            if len(day_article_df) > MAX_ARTICLES_PER_DAY:
                day_article_df = day_article_df.sample(n=MAX_ARTICLES_PER_DAY, replace=False, random_state=0)

            for _, row in day_article_df.iterrows():
                url = row['url']
                try:
                    driver.get(url)
                    # Give the page time to render before reading its source.
                    time.sleep(INTERVAL)
                except Exception as e:
                    # Best effort: a page that fails to load is skipped, not fatal.
                    print("An error occurred, moving to next article", e)
                    continue

                soup = BeautifulSoup(driver.page_source, "html.parser")
                article = soup.find('article')
                if article is None:
                    # Pages without an <article> element produce no output row.
                    continue

                # The script assumes all the texts with <p> tag are the article's main text.
                # This can definitely be improved.
                content = "".join(para.get_text() + "\n" for para in article.find_all("p"))
                writer.writerow([url, row['tag'], row['date'], content])
            
    

2022-01-01 2022-01-02 2022-01-03 2022-01-04 2022-01-05 2022-01-06 2022-01-07 2022-01-08 2022-01-09 2022-01-10 2022-01-11 2022-01-12 2022-01-13 2022-01-14 2022-01-15 2022-01-16 2022-01-17 2022-01-18 2022-01-19 2022-01-20 2022-01-21 2022-01-22 2022-01-23 2022-01-24 2022-01-25 2022-01-26 2022-01-27 2022-01-28 2022-01-29 2022-01-30 2022-01-31 