# Medium scraper

Script to scrape Medium articles based on tag and date range.  

## Initialization

In [4]:
import csv
import os
import random
import re
import time
from datetime import date,timedelta
from getpass import getpass

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from selenium import webdriver
from selenium.webdriver.common.by import By


In [23]:
#PARAMETERS
# Everything a reader may want to tune lives in this cell; the cells below only read these names.

# Tags to scrape from (each one is scraped over the full date range)
TAGS = [
        'data-science'
]

# Date range to scrape data (both ends inclusive)
DATE_FROM = date(2016,1,1)
DATE_TO = date(2023,12,31)

# Pause, in seconds, inserted after every page request
INTERVAL = 3

# Maximum number of articles to scrape per day
MAX_ARTICLES_PER_DAY = 30
# Maximum number of attempts made to download one archive page
MAX_RETRIES = 3

# Email address used to sign in to Medium (needed to scrape member-only articles).
# Set the MEDIUM_EMAIL environment variable to use another account without editing
# the notebook; the literal below is only the fallback.
MEDIUM_EMAIL = os.environ.get('MEDIUM_EMAIL', 'hiroakiroa@gmail.com')


## Scrape article list

In [5]:
#Initialize
delta = timedelta(days=1)
df = pd.DataFrame(columns=["url","tag","date"])

# Seconds to wait for Medium to answer before a request counts as a failed attempt
REQUEST_TIMEOUT = 30


def _fetch_archive_page(tag, day):
    """Download the Medium tag-archive page of `tag` for the date `day`.

    Makes at most MAX_RETRIES attempts and sleeps INTERVAL seconds after each one.
    Returns the response of the first attempt answered with HTTP 200, or None when
    every attempt failed (network error, timeout or non-200 status).
    """
    archive_url = 'https://medium.com/tag/' + tag + "/archive/" + day.strftime("%Y/%m/%d")
    for _ in range(MAX_RETRIES):
        try:
            response = requests.get(archive_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # A network error is treated like a bad status: report it and retry
            print("Request failed:", e)
            response = None
        # Pause after every request, successful or not, to keep the request rate low
        time.sleep(INTERVAL)
        if response is not None and response.status_code == 200:
            return response
    return None


def _extract_article_urls(html):
    """Return the article URLs, without their query string, found in an archive page."""
    soup = BeautifulSoup(html, "html.parser")
    urls = []
    for anchor in soup.find_all('a'):
        # As of Mar 2024 it seems the article titles (which are hyperlinks) are set as h3.
        # Using this as the tag to find the article urls
        if anchor.find('h3') is None:
            continue
        href = anchor.get('href')
        if not href:
            # Anchor without a target: nothing to record
            continue
        # Drop the tracking query string, keeping only the part before the first "?"
        href = href.split("?", 1)[0]
        if href:
            urls.append(href)
    return urls


# newline="" lets the csv writer control line endings; utf-8 matches what pd.read_csv expects by default
with open("article_list.csv", "w", encoding="utf-8", newline="") as file:
    file.write("url,tag,date\n")
    # QUOTE_ALL keeps the "url","tag","date" row layout and escapes quotes inside a URL
    writer = csv.writer(file, quoting=csv.QUOTE_ALL, lineterminator="\n")

    for tag in TAGS:
        # Restart from the first day for every tag; otherwise only the first tag is scraped
        curr_date = DATE_FROM
        while curr_date <= DATE_TO:
            #Scrape list of articles through the tag archive site in Medium
            response = _fetch_archive_page(tag, curr_date)
            if response is not None:
                for href in _extract_article_urls(response.content):
                    writer.writerow([href, tag, curr_date.strftime("%Y-%m-%d")])
            else:
                print("Failed to retrieve the webpage.")
            # Progress log: the date that has just been processed
            print(curr_date, end=" ")
            curr_date += delta


KeyboardInterrupt: 

## Scrape articles

### Initialization for scraping

In [None]:
# Colab-only setup: installs Chrome and a chromedriver, then starts a headless driver.
# When running locally, skip this whole cell and run the next one instead
# (uncomment its --headless line if you do not want to see the browser window).
#
# Each shell command is prefixed with `!` so the shell steps and the Python code can share
# one cell: a `%%shell` cell magic is only recognised on the first line of a cell and hands
# the entire cell body, Python lines included, to the shell.
!sudo apt -y update
!sudo apt install -y wget curl unzip
!wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb
!dpkg -i libu2f-udev_1.1.4-1_all.deb
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb

# NOTE(review): this chromedriver version is pinned while Chrome above is "current";
# presumably chromedriver_autoinstaller.install() below covers a mismatch - confirm.
!wget -N https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/118.0.5993.70/linux64/chromedriver-linux64.zip -P /tmp/
!unzip -o /tmp/chromedriver-linux64.zip -d /tmp/
!chmod +x /tmp/chromedriver-linux64/chromedriver
!mv /tmp/chromedriver-linux64/chromedriver /usr/local/bin/chromedriver
# %pip installs into the environment of the running kernel
%pip install selenium chromedriver_autoinstaller

import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
# Imported here rather than in the top import cell because it is only installed by the line above
import chromedriver_autoinstaller

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') # this is must
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chromedriver_autoinstaller.install()

driver = webdriver.Chrome(options=chrome_options)

In [26]:
# Start a Chrome session for local runs; `driver` is the browser handle the later cells use.
chrome_options = webdriver.ChromeOptions()
# Add '--headless' to the tuple below if you don't want to see the chrome browser
for chrome_flag in ('--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)

### Medium sign in
This step is required to scrape member-only articles. Note that the script uses Selenium and browser interactions, which makes it more vulnerable to changes in the Medium site.

In [27]:
# Drives the Medium sign-in dialog: Sign in -> Sign in with email -> type MEDIUM_EMAIL -> Continue.

# Longest time, in seconds, that find_element() keeps looking for an element before failing
SIGN_IN_WAIT_SECONDS = 10

# implicitly_wait() does not pause the script. It sets how long every later find_element()
# call keeps polling for a match, so it is configured once, before the first lookup.
# Calling it again with a small value between steps would only shorten that timeout.
driver.implicitly_wait(SIGN_IN_WAIT_SECONDS)
driver.get("https://medium.com")

element = driver.find_element(By.XPATH,"//a[text()='Sign in']")
element.click()

sign_in_with_email = driver.find_element(By.XPATH,'//div[text()="Sign in with email"]')
sign_in_with_email.click()

email = driver.find_element(By.XPATH,"//input[@aria-label='email']")
email.click()
email.send_keys(MEDIUM_EMAIL)

cont = driver.find_element(By.XPATH,'//button[text()="Continue"]')
cont.click()

The script above should send a login verification email to your email address. Once you receive the email, copy the sign-in URL it contains and paste it into the _signin_url_ field below.

In [28]:
# Paste the sign-in URL from the verification email between the quotes, or leave the field
# empty to be prompted for it when the cell runs. The URL embeds a login token, so do not
# save or share the notebook with a real URL filled in.
signin_url = ''
if not signin_url:
    # getpass keeps the pasted URL out of the saved cell output
    signin_url = getpass("Paste the Medium sign-in URL from the verification email: ")
signin_url = signin_url.strip()
if not signin_url:
    raise ValueError("No sign-in URL was provided.")
driver.get(signin_url)


### Scrape articles

In [29]:
article_list_df = pd.read_csv('article_list.csv')
# If you don't want to scrape all the articles in the article_list, filter the days field below.
# Note that the date field is stored as a string in the format of  YYYY-mm-DD
article_list_df = article_list_df.loc[(article_list_df['date'] >= ('2023-09'))]

# One row per distinct (year-month, tag) pair; each pair gets its own output file
date_tag_df = pd.concat([article_list_df['date'].str.slice(0, 7),article_list_df[['tag']]],axis=1).drop_duplicates().reset_index(drop=True)
date_tag_df = date_tag_df.rename(columns={'date':'year-month'})

#Creating separate file per year-month + tag pair
for _, date_tag_row in date_tag_df.iterrows():
    year_month = date_tag_row['year-month']
    month_tag = date_tag_row['tag']

    #Get list of scrapable articles filtered by year-month + tag
    in_month = article_list_df['date'].str.startswith(year_month)
    has_tag = article_list_df['tag'] == month_tag
    month_article_df = article_list_df.loc[in_month & has_tag,:]

    # newline="" is what the csv module expects; it also keeps the line breaks inside
    # the article text from being rewritten on platforms that translate newlines
    with open(f"articles_{year_month}_{month_tag}.csv", "w",encoding="utf-8",newline="") as file:
        writer = csv.writer(file, dialect='excel')
        writer.writerow(['url','tag','date','content'])

        for day in month_article_df['date'].unique():
            print(day, end=" ")
            day_article_df = month_article_df.loc[month_article_df['date']==day,:]
            # If number of articles is more than the MAX_ARTICLES_PER_DAY, sample
            # (fixed random_state so a re-run picks the same articles)
            if day_article_df.shape[0] > MAX_ARTICLES_PER_DAY:
                sampled_urls = day_article_df.sample(n=MAX_ARTICLES_PER_DAY,replace=False,random_state=0)
            else:
                sampled_urls = day_article_df

            for _, row in sampled_urls.iterrows():
                url = row['url']
                tag = row['tag']
                article_date = row['date']
                try:
                    driver.get(url)
                    time.sleep(INTERVAL)
                    html = driver.page_source
                except Exception as e:
                    # Best effort: report the failure and move on to the next article
                    print("An error occurred:", e)
                    continue

                article = BeautifulSoup(html, "html.parser").find('article')
                if article is None:
                    # Page without an <article> element: nothing is written for this url
                    continue

                # The script assumes all the texts with <p> tag are the article's main text. This can definitely be improved.
                content = "".join(para.get_text() + chr(10) for para in article.find_all("p"))
                writer.writerow([url,tag,article_date,content])
            
    

2023-09-01 2023-09-02 2023-09-03 2023-09-04 2023-09-05 2023-09-06 2023-09-07 2023-09-08 2023-09-09 2023-09-10 2023-09-11 2023-09-12 2023-09-13 2023-09-14 2023-09-15 2023-09-16 2023-09-17 2023-09-18 2023-09-19 2023-09-20 2023-09-21 2023-09-22 2023-09-23 2023-09-24 2023-09-25 2023-09-26 2023-09-27 2023-09-28 2023-09-29 2023-09-30 2023-10-01 2023-10-02 2023-10-03 2023-10-04 2023-10-05 2023-10-06 2023-10-07 2023-10-08 2023-10-09 2023-10-10 2023-10-11 2023-10-12 2023-10-13 2023-10-14 2023-10-15 2023-10-16 2023-10-17 2023-10-18 2023-10-19 2023-10-20 2023-10-21 2023-10-22 2023-10-23 2023-10-24 2023-10-25 2023-10-26 2023-10-27 2023-10-28 2023-10-29 2023-10-30 2023-10-31 2023-11-01 2023-11-02 2023-11-03 2023-11-04 2023-11-05 2023-11-06 2023-11-07 2023-11-08 2023-11-09 2023-11-10 2023-11-11 2023-11-12 2023-11-13 2023-11-14 2023-11-15 2023-11-16 2023-11-17 2023-11-18 2023-11-19 2023-11-20 2023-11-21 2023-11-22 2023-11-23 2023-11-24 2023-11-25 2023-11-26 2023-11-27 2023-11-28 2023-11-29 2023-11-30