# Coding for Economists - Advanced Session 1

## 1. Setup Environment

In [None]:
%pip install selenium

In [180]:
import pandas as pd
import re
import time

## 2. HTTP Requests

### 2.1 Request HTML from URL

In [33]:
from urllib.request import urlopen

url = 'https://www.ft.com/'
page = urlopen(url)
type(page)

http.client.HTTPResponse

In [34]:
page_bytes = page.read()
type(page_bytes)

bytes

__Why Use Bytes__:
1. Everything is 0s and 1s
2. Read, write, cache or stream data without having to interpret it
3. Send large files in chunks, resume interrupted downloads
4. Avoid encoding problems

In [43]:
page_bytes[:1000]

b'<!DOCTYPE html><html lang="en-GB" class="no-js core o-typography--loading-sans o-typography--loading-sans-bold o-typography--loading-display o-typography--loading-display-bold" data-o-component="o-typography" style="overflow-x:hidden;background-color:#fff1e5;color:#33302e"><head><meta charSet="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><meta name="viewport" content="width=device-width, initial-scale=1"/><title>Financial Times</title><meta name="description" content="News, analysis and opinion from the Financial Times on the latest in markets, economics and politics"/><meta name="robots" content="index,follow,max-snippet:200,max-image-preview:large"/><meta name="google-site-verification" content="4-t8sFaPvpO5FH_Gnw1dkM28CQepjzo8UjjAkdDflTw"/><script type="application/ld+json">{"@context":"http://schema.org","@type":"WebSite","name":"Financial Times","alternateName":"FT.com","url":"https://www.ft.com/"}</script><meta property="fb:pages" content="8860325749"/><meta pr

In [37]:
# Decode the bytes file into a str containing the html
html = page_bytes.decode('utf-8')
type(html)

str

In [153]:
html[:5000]

'<!DOCTYPE html><html lang="en-GB" class="no-js core o-typography--loading-sans o-typography--loading-sans-bold o-typography--loading-display o-typography--loading-display-bold" data-o-component="o-typography" style="overflow-x:hidden;background-color:#fff1e5;color:#33302e"><head><meta charSet="utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><meta name="viewport" content="width=device-width, initial-scale=1"/><title>Financial Times</title><meta name="description" content="News, analysis and opinion from the Financial Times on the latest in markets, economics and politics"/><meta name="robots" content="index,follow,max-snippet:200,max-image-preview:large"/><meta name="google-site-verification" content="4-t8sFaPvpO5FH_Gnw1dkM28CQepjzo8UjjAkdDflTw"/><script type="application/ld+json">{"@context":"http://schema.org","@type":"WebSite","name":"Financial Times","alternateName":"FT.com","url":"https://www.ft.com/"}</script><meta property="fb:pages" content="8860325749"/><meta pro

### 2.2 HTML (HyperText Markup Language) Structure

__HTML Tutorial__: https://www.w3schools.com/html/html_intro.asp

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>Simple HTML Example</title>
</head>
<body>

  <h3>My Simple Page</h3>

  <p>This is a simple paragraph of text on my page.</p>

  <img src="tarrif.avif" alt="Placeholder image">

  <table>
    <tr>
      <th>Name</th>
      <th>Age</th>
    </tr>
    <tr>
      <td>Alice</td>
      <td>30</td>
    </tr>
    <tr>
      <td>Bob</td>
      <td>25</td>
    </tr>
  </table>

</body>
</html>

In [45]:
len(html)

360420

## 3. Parse HTML Using `BeautifulSoup`
Beautiful Soup transforms a complex HTML document into a tree of Python objects.

__Beautiful Soup Tutorial__: https://beautiful-soup-4.readthedocs.io/en/latest/#

### 3.1 Make the Soup

In [47]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
type(soup)

bs4.BeautifulSoup

In [58]:
soup.title

<title>Financial Times</title>

In [70]:
soup.img

<img alt="Spain declares state of emergency in the wake of huge power outage" class="image image--width-280" src="https://www.ft.com/__origami/service/image/v2/images/raw/https%3A%2F%2Fd1e00ek4ebabms.cloudfront.net%2Fproduction%2F69f1fc45-c419-4011-888e-9fcc887f18b9.jpg?source=next-home-page&amp;dpr=2&amp;width=280&amp;fit=scale-down"/>

In [73]:
soup.p

<p class="standfirst"><a aria-hidden="false" class="link" data-trackable="standfirst-link" data-trackable-context-story-link="standfirst-link" href="/content/d790af03-8681-432d-88c8-ed7e980fcf85" target="_self"><span class="text text--color-black-60 text-sans--scale-0 text--style--no-active-state" id="">Many face night without electricity after train network, traffic lights and mobile connections shut down by mystery blackout</span></a></p>

### 3.2 Search the Tree `.find_all()`

In [79]:
# Collect every distinct tag name in the document.
# find_all(True) matches all tags, whatever their name.
unique_tags = set()
for tag in soup.find_all(True):
    unique_tags.add(tag.name)
print(unique_tags)

{'source', 'meta', 'small', 'form', 'input', 'title', 'br', 'span', 'picture', 'noscript', 'p', 'html', 'path', 'ul', 'li', 'section', 'time', 'label', 'div', 'a', 'h2', 'pg-slot', 'button', 'video', 'ol', 'link', 'header', 'nav', 'img', 'svg', 'footer', 'head', 'script', 'abbr', 'h1', 'iframe', 'main', 'body', 'h3'}


- __Text Tags__: [`'p'`, `'a'`, `'h1'`, `'h2'`, `'h3'`]
- __Table Tags__: [`'table'`, `'thead'`, `'tbody'`, `'tr'`, `'th'`, `'td'`]
- __Image Tags__: [`'img'`, `'picture'`, `'source'`, `'svg'`]
- __Video Tags__: [`'video'`, `'source'`, `'iframe'`]

#### Find Text from Headers

In [192]:
header_objs = soup.find_all(
    'a',
    class_='o-header__mega-link',
    href=re.compile('content'),
    attrs={'data-trackable': 'link'}
)
headers = [obj.get_text(strip=True) for obj in header_objs]
headers = list(set(headers))
headers[:20]

['Brands target AI chatbots as users switch from Google search',
 'Free edibles: foraging in London with a top chef',
 'China stockpiles oil as Trump tariff shock hits crude prices',
 'LNG companies say they cannot comply with Trump rules on Chinese ships',
 'Give me fractal fronds over Kindleberger’s Spiral any day',
 'Tech groups pivot to defence in neutral Ireland as EU re-arms',
 'In charts: How Britain’s banking revolution failed',
 'China and South Korea extend battery battle from EVs to grid storage',
 'China says it can live without US farm and energy goods',
 'Demand slump fuelled by Trump tariffs hits US ports and air freight',
 'Lashing out at staff is bad for business',
 'Five gadgets to cut the stress out of parenting',
 'The S&P 500 is still significantly overpriced',
 'In tough times, good policy becomes even more important',
 'Channel 4 chief executive Alex Mahon to step down',
 'Goldman Sachs-backed start-up to buy UK sound studio in bet on AI music-making',
 'Gen Z is

In [204]:
print(header_objs[0].prettify())

<a class="o-header__mega-link" data-trackable="link" href="/content/bef8f66e-b377-41b9-8617-5ffe4bb98af7">
 Trump tells Canada to vote for ‘51st state’ in election day intervention
</a>



In [195]:
header_objs = soup.find_all(
    'a',
    class_='o-header__mega-link',
    href=re.compile('content'),
    attrs={'data-trackable': 'link'},
    string=re.compile('Trump')
)
headers = [obj.get_text(strip=True) for obj in header_objs]
headers = list(set(headers))
headers[:20]

['Demand slump fuelled by Trump tariffs hits US ports and air freight',
 'American students turn to UK as Trump takes aim at US universities',
 'China stockpiles oil as Trump tariff shock hits crude prices',
 'LNG companies say they cannot comply with Trump rules on Chinese ships',
 'Trump tells Canada to vote for ‘51st state’ in election day intervention']

#### Find Text from Paragraphs

In [181]:
para_objs = soup.find_all('p')
texts = [obj.get_text(strip=True) for obj in para_objs]
texts[:20]

['Many face night without electricity after train network, traffic lights and mobile connections shut down by mystery blackout',
 'Top policymakers seek to reassure public that economy can withstand worst of trade war',
 'Officials are worried US president will use minor progress in talks as ‘excuse’ to disengage',
 'Searches for British courses surge following US president’s assault on Ivy League institutions, data shows',
 'US president weighs in as battle between Mark Carney and Pierre Poilievre goes down to the wire',
 'US capital in upheaval as government lay-offs pummel local economy and Maga allies embed in social scene',
 'Data shows younger staff — more than baby boomers — crave the connection and routine of in-person work',
 'Board’s vote is only a first step to ending tyre company’s links with Chinese state-owned group',
 'Recommendation comes as another difficult vote on CEO’s remuneration package looms',
 'News show criticises parent group seeking Trump administration appr

In [205]:
print(para_objs[0].prettify())

<p class="standfirst">
 <a aria-hidden="false" class="link" data-trackable="standfirst-link" data-trackable-context-story-link="standfirst-link" href="/content/d790af03-8681-432d-88c8-ed7e980fcf85" target="_self">
  <span class="text text--color-black-60 text-sans--scale-0 text--style--no-active-state" id="">
   Many face night without electricity after train network, traffic lights and mobile connections shut down by mystery blackout
  </span>
 </a>
</p>



#### Find Images

In [258]:
img_objs = soup.find_all('img')
img_title = [obj.get('alt') for obj in img_objs]
img_title[:20]

['Mark Carney’s Liberals win pivotal Canadian election',
 'The 10 charts that define Trump’s tumultuous first 100 days',
 'How Trump’s honeymoon turned sour so quickly ',
 'Spain and Portugal race to restore vital infrastructure after massive blackout',
 'China’s copper supplies set to run out as US tariffs bite, says Mercuria',
 'Malta ‘golden passport’ scheme breaks law, EU’s top court rules',
 'KKR and Capital Group seek to lure investors to private markets with new funds',
 'Iranians face a sinking feeling as ground under Tehran cracks open',
 'A&O Shearman wrestles with Trump and culture one year post-merger ',
 'Chinese carmakers reset European ambitions as EU tariffs bite',
 'HSBC’s Elhedery backs US dollar despite worsening economic outlook due to tariffs',
 'How Apollo escaped Germany’s private equity trap',
 'Europe fears Trump preparing to walk away from Ukraine talks',
 'Europe’s battery makers seek a different path after Northvolt’s collapse ',
 'BP profits halve as oil ma

In [198]:
img_url = [obj.get('src') for obj in img_objs]
img_url[:20]

['https://www.ft.com/__origami/service/image/v2/images/raw/https%3A%2F%2Fd1e00ek4ebabms.cloudfront.net%2Fproduction%2F69f1fc45-c419-4011-888e-9fcc887f18b9.jpg?source=next-home-page&dpr=2&width=280&fit=scale-down',
 'https://www.ft.com/__origami/service/image/v2/images/raw/https%3A%2F%2Fd1e00ek4ebabms.cloudfront.net%2Fproduction%2F6bffbc17-cbae-4fc5-aae9-b1f410c325ea.jpg?source=next-home-page&dpr=2&width=580&fit=scale-down',
 'https://www.ft.com/__origami/service/image/v2/images/raw/https%3A%2F%2Fd1e00ek4ebabms.cloudfront.net%2Fproduction%2Fae4d89c9-685c-4e76-9457-7ed9b8bafaad.jpg?source=next-home-page&dpr=2&width=180&fit=scale-down',
 'https://www.ft.com/__origami/service/image/v2/images/raw/https%3A%2F%2Fd1e00ek4ebabms.cloudfront.net%2Fproduction%2Fuploaded-files%2Fgideon_rachman_-0dfc38c0-901b-44b6-8ce3-31ab97af4d41.png?source=next-home-page&dpr=2&width=40&height=40&fit=cover&gravity=poi',
 'https://www.ft.com/__origami/service/image/v2/images/raw/https%3A%2F%2Fd1e00ek4ebabms.cloudfr

In [206]:
print(img_objs[0].prettify())

<img alt="Spain declares state of emergency in the wake of huge power outage" class="image image--width-280" src="https://www.ft.com/__origami/service/image/v2/images/raw/https%3A%2F%2Fd1e00ek4ebabms.cloudfront.net%2Fproduction%2F69f1fc45-c419-4011-888e-9fcc887f18b9.jpg?source=next-home-page&amp;dpr=2&amp;width=280&amp;fit=scale-down"/>



## 4. Control Webpage Using `selenium`

### 4.1 Connect Website
#### Initiate Browser

In [320]:
import selenium.webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# `By` is imported here because the cells that follow locate elements with it
# (By.TAG_NAME, By.XPATH); without it a fresh-kernel run stops with a NameError.

# Start a Chrome session and open the FT home page.
driver = selenium.webdriver.Chrome()
driver.get('http://www.ft.com')

#### Accept Cookies

In [321]:
iframes = driver.find_elements(By.TAG_NAME, 'iframe')
print(f"Found {len(iframes)} iframes.")

for idx, iframe in enumerate(iframes):
    print(f"Iframe {idx}: {iframe.get_attribute('outerHTML')[:200]}...")

Found 3 iframes.
Iframe 0: <iframe name="__tcfapiLocator" title="__tcfapiLocator" style="display: none;"></iframe>...
Iframe 1: <iframe name="__gppLocator" style="display: none;"></iframe>...
Iframe 2: <iframe src="https://consent-manager.ft.com/index.html?hasCsp=true&amp;message_id=1274423&amp;consentUUID=null&amp;consent_origin=https%3A%2F%2Fconsent-manager.ft.com%2Fconsent%2Ftcfv2&amp;preload_mes...


In [322]:
# Pick the consent iframe by its source URL rather than by a fixed position:
# the listing above shows its src pointing at consent-manager.ft.com, and the
# number/order of iframes is not guaranteed. Fall back to the last iframe.
consent_frame = next(
    (frame for frame in iframes
     if 'consent-manager' in (frame.get_attribute('src') or '')),
    iframes[-1],
)

driver.switch_to.frame(consent_frame)
try:
    accept_btn = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept Cookies')]"))
    )
    accept_btn.click()
finally:
    # Always return to the main document; otherwise every later lookup
    # (page_source, find_element, ...) is evaluated inside the iframe.
    driver.switch_to.default_content()

#### Scroll Down to Show the Entire Page

In [270]:
# def ScrollPage(ScrollNumber = 5, ScrollSleep = 1):
#     for i in range(1,ScrollNumber):
#         driver.execute_script("window.scrollTo(1,50000)")
#         time.sleep(ScrollSleep)

# ScrollPage()

### 4.2 Navigate the Page

#### Click on A Link

In [286]:
# List the First 20 Headers
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
header_objs = soup.find_all(
    'a',
    class_='o-header__mega-link',
    href=re.compile('content'),
    attrs={"data-trackable": "link"},
    string=re.compile('Trump')
)
headers = [obj.get_text(strip=True) for obj in header_objs]
headers = list(set(headers))
headers[:20]

['How Trump’s honeymoon turned sour so quickly',
 'Europe fears Trump preparing to walk away from Ukraine talks',
 'The 10 charts that define Trump’s tumultuous first 100 days',
 'Amazon pressures suppliers to cut prices to limit Trump tariff shock']

In [287]:
# Click on the First Header
from selenium.webdriver.common.by import By

# if you’ve already navigated to the page…
link = driver.find_element(By.LINK_TEXT, headers[0])
link.click()

In [288]:
url_save = driver.current_url

### 4.3 Fill in Forms

In [290]:
# Sign in FT.com Account
url_login = 'https://accounts.ft.com/login'
driver.get(url_login)

In [294]:
# Enter email address
wait = WebDriverWait(driver, 15)
email_field = wait.until(EC.element_to_be_clickable((By.ID, 'enter-email')))
email_field.clear()
email_field.send_keys('USERNAME')

In [295]:
# Click Next
next_btn = driver.find_element(By.ID, 'enter-email-next')
next_btn.click()

In [296]:
# Enter password
# Prompt for the password at run time so that it is never stored in the
# notebook source or in its saved outputs.
from getpass import getpass

password_field = wait.until(EC.visibility_of_element_located((By.ID, 'enter-password')))
password_field.clear()
password_field.send_keys(getpass('FT.com password: '))

In [297]:
# click the submit button
login_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']")))
login_btn.click()

### 4.4 Scrape Current Page

In [298]:
# Go back to the article whose address was stored in `url_save` before logging in.
driver.get(url_save)

In [300]:
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
para_objs = soup.find_all('p')
texts = [obj.get_text(strip=True) for obj in para_objs]
texts

['',
 'James PolitiandMyles McCormickin Washington',
 'Published5 hours ago',
 'Donald Trump launched his second presidency in the Capitol rotunda in January riding the political high of his 2024 election victory with a promise to deliver a new “golden age” to Americans.',
 'But 100 days later, after a blizzard of actions to gut the federal government and remake the global economy through sweeping tariffs, Trump is back to being the unpopular and polarising president that he was during most of his first term in office.',
 'At the 100-day mark, he has the lowest approval rating of any president in the past seven decades, according toone poll.',
 'A dissatisfied US public has rapidly soured onTrump’s performance, according to recent polls, amid scepticism of his trade policies, aggressive spending cuts and even some concern that his crackdown on immigration is going too far.',
 'According to the RealClearPoliticspolling average,Trump had an approval rating of 50.5 per cent at the end of 

### 4.5 Polite Requests

In [None]:
# Human-like Pauses
import random, time

def human_pause(mean=1.5, std=0.5):
    """Sleep for a random, human-looking interval.

    The delay is drawn from a normal distribution with the given ``mean`` and
    ``std`` (in seconds); negative draws are clamped to zero.
    """
    delay = random.gauss(mean, std)
    time.sleep(max(0, delay))

# …after each navigation or click…
human_pause()

In [None]:
# Use exponential back-off on failures
from selenium.common.exceptions import TimeoutException

MAX_ATTEMPTS = 5   # give up eventually instead of retrying forever
backoff = 1        # seconds to wait before the next attempt

for attempt in range(1, MAX_ATTEMPTS + 1):
    try:
        driver.get(url)
        break
    except TimeoutException:
        if attempt == MAX_ATTEMPTS:
            raise  # out of attempts: surface the timeout to the reader
        time.sleep(backoff)
        backoff = min(backoff * 2, 30)  # double the wait, capped at 30 seconds

In [None]:
# Identify yourself
# Only `import selenium.webdriver` has run at this point, so the options class
# must be reached through the fully qualified name.
opts = selenium.webdriver.ChromeOptions()
opts.add_argument('user-agent=MyBot/1.0 (+https://mydomain.com/bot-info)')
# The options take effect only when handed to a new browser at start-up, e.g.
#     driver = selenium.webdriver.Chrome(options=opts)

In [304]:
# Close browser when finished
driver.quit()

## 5. Scrape Online Tables
__Tutorial__: https://oxylabs.io/blog/python-scrape-tables

In [307]:
# Request HTML
from urllib.request import Request, urlopen

url = 'https://www.worldometers.info/world-population/population-by-country/'

# Set a fake browser user agent
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}

# Build a request with headers
req = Request(url, headers=headers)

page = urlopen(req)
page_bytes = page.read()
html = page_bytes.decode('utf-8')

In [308]:
# Make the soup
soup = BeautifulSoup(html, 'html.parser')

# Find the table (soup.find returns the first <table> in the document)
table = soup.find("table")

In [313]:
# Extract headers: one label per <th> cell, with surrounding whitespace stripped
headers = [th.get_text(strip=True) for th in table.find_all('th')]
print(headers)

['#', 'Country (ordependency)', 'Population(2025)', 'YearlyChange', 'NetChange', 'Density(P/Km²)', 'Land Area(Km²)', 'Migrants(net)', 'Fert.Rate', 'MedianAge', 'UrbanPop %', 'WorldShare']


In [316]:
# Extract all rows: read the <td> texts of every <tr>, then keep only the
# rows that actually contain data cells (header rows have no <td>)
candidate_rows = (
    [cell.get_text(strip=True) for cell in tr.find_all('td')]
    for tr in table.find_all('tr')
)
rows = [row for row in candidate_rows if row]
print(rows[:2])

[['1', 'India', '1,463,865,525', '0.89%', '12,929,734', '492', '2,973,190', '−495,753', '1.94', '28.8', '37.1%', '17.78%'], ['2', 'China', '1,416,096,094', '−0.23%', '−3,225,184', '151', '9,388,211', '−268,126', '1.02', '40.1', '67.5%', '17.20%']]


In [319]:
# Build pandas DataFrame
df = pd.DataFrame(rows, columns=headers)
df.head()

Unnamed: 0,#,Country (ordependency),Population(2025),YearlyChange,NetChange,Density(P/Km²),Land Area(Km²),Migrants(net),Fert.Rate,MedianAge,UrbanPop %,WorldShare
0,1,India,1463865525,0.89%,12929734,492,2973190,"−495,753",1.94,28.8,37.1%,17.78%
1,2,China,1416096094,−0.23%,"−3,225,184",151,9388211,"−268,126",1.02,40.1,67.5%,17.20%
2,3,United States,347275807,0.54%,1849236,38,9147420,1230663,1.62,38.5,82.8%,4.22%
3,4,Indonesia,285721236,0.79%,2233305,158,1811570,"−39,509",2.1,30.4,59.6%,3.47%
4,5,Pakistan,255219554,1.57%,3950390,331,770880,"−1,235,336",3.5,20.6,34.4%,3.10%


## 6. Scrape Multi-page Websites

In [324]:
# Request HTML
from urllib.request import Request, urlopen

url = 'https://scholar.google.com/citations?hl=en&vq=bus_economics&view_op=list_hcore&venue=6OFMzPxOGXUJ.2024'

# Set a fake browser user agent
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}

# Build a request with headers
req = Request(url, headers=headers)

page = urlopen(req)
page_bytes = page.read()
html = page_bytes.decode('utf-8')

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv

In [395]:
import selenium.webdriver

url = 'https://scholar.google.com/citations?hl=en&vq=bus_economics&view_op=list_hcore&venue=6OFMzPxOGXUJ.2024'
driver = selenium.webdriver.Chrome()
driver.get(url)

In [396]:
from selenium.common.exceptions import NoSuchElementException

all_data = []

while True:
    # Scrape data from the page currently shown
    titles = driver.find_elements(By.CSS_SELECTOR, ".gsc_mpat_ttl a")
    info = driver.find_elements(By.CSS_SELECTOR, ".gs_gray")
    # NOTE(review): the last .gs_gray match is dropped and the rest are read as
    # alternating author / journal lines; confirm against the page markup.
    info = info[:-1]
    authors = info[0::2]
    journals = info[1::2]
    citations = driver.find_elements(By.CSS_SELECTOR, ".gsc_mpat_c a")
    years = driver.find_elements(By.CSS_SELECTOR, ".gsc_mpat_y span")
    # NOTE(review): the first match is skipped, presumably a column header; confirm.
    years = years[1:]

    # zip() stops at the shortest list, so a mismatch in the selectors above
    # silently drops records instead of raising.
    for title, author, journal, citation, year in zip(titles, authors, journals, citations, years):
        all_data.append({
            'title': title.text,
            'author': author.text,
            'journal': journal.text,
            'citation': citation.text,
            'year': year.text,
            'link': title.get_attribute('href')
        })

    # Try to go to the next page. Only a missing button means "no more pages";
    # any other failure (or Ctrl-C) is allowed to surface instead of being
    # swallowed and misreported.
    try:
        button = driver.find_element(By.CSS_SELECTOR, "button.gs_btnPR.gs_in_ib.gs_btn_half.gs_btn_lsb.gs_btn_srt.gsc_pgn_pnx")
    except NoSuchElementException:
        print("No more pages.")
        break
    if not button.is_enabled():
        print('Reached last page.')
        break
    button.click()
    time.sleep(2)  # Wait for next page to load

Reached last page.


In [397]:
citations[0].text

'164'

In [406]:
# create a DataFrame
df = pd.DataFrame(all_data)
print(df.shape)
df.head()

(153, 6)


Unnamed: 0,title,author,journal,citation,year,link
0,Two-Way Fixed Effects Estimators with Heteroge...,"C de Chaisemartin, X D’Haultfœuille","American Economic Review 110 (9), 2964-2996",3639,2020,https://scholar.google.com/scholar?oi=bibs&clu...
1,"Bartik Instruments: What, When, Why, and How","P Goldsmith-Pinkham, I Sorkin, H Swift","American Economic Review 110 (8), 2586-2624",2027,2020,https://scholar.google.com/scholar?oi=bibs&clu...
2,Macroeconomic Implications of COVID-19: Can Ne...,"V Guerrieri, G Lorenzoni, L Straub, I Werning","American Economic Review 112 (5), 1437-1474",1594,2022,https://scholar.google.com/scholar?oi=bibs&clu...
3,Importing Political Polarization? The Electora...,"D Autor, D Dorn, G Hanson, K Majlesi","American Economic Review 110 (10), 3139-3183",1515,2020,https://scholar.google.com/scholar?oi=bibs&clu...
4,Are Ideas Getting Harder to Find?,"N Bloom, CI Jones, J Van Reenen, M Webb","American Economic Review 110 (4), 1104-1144",1318,2020,https://scholar.google.com/scholar?oi=bibs&clu...


In [419]:
# Save the scraped records, then close the browser.
# index must be the boolean False: the string 'False' is truthy, so pandas
# would still write the row index as an extra unnamed column.
df.to_csv('AER.csv', index=False)
driver.quit()

## 7. Advanced Data I/O

### 7.1 Batch Read/Write

In [420]:
# Read csv in batches
for i, chunk in enumerate(pd.read_csv('AER.csv', chunksize=20)):
    # chunk is a DataFrame of up to chunksize rows
    print(f"Processing chunk {i}, rows {len(chunk)}")
    # … your processing logic here …
    # e.g. transform, filter, write to DB, etc.

Processing chunk 0, rows 20
Processing chunk 1, rows 20
Processing chunk 2, rows 20
Processing chunk 3, rows 20
Processing chunk 4, rows 20
Processing chunk 5, rows 20
Processing chunk 6, rows 20
Processing chunk 7, rows 13


In [421]:
# Write csv in batches: one numbered file per slice of `rows_per_file` rows
rows_per_file = 50
output_prefix = 'AER_'

for file_idx, start in enumerate(range(0, len(df), rows_per_file)):
    chunk = df.iloc[start : start + rows_per_file]
    filename = f"{output_prefix}{file_idx:02d}.csv"
    chunk.to_csv(filename, index=False)
    print(f"Wrote {len(chunk)} rows to {filename}")

Wrote 50 rows to AER_00.csv
Wrote 50 rows to AER_01.csv
Wrote 50 rows to AER_02.csv
Wrote 3 rows to AER_03.csv


### 7.2 Compression

In [437]:
# Compress data into gzip format
df.to_csv('AER.csv.gz', index=False, compression='gzip')

# Compress data into zip format
df.to_csv('AER.csv.zip', index=False, compression='zip')

In [438]:
# Compare sizes
import os

raw_size = os.path.getsize('AER.csv')
gzip_size = os.path.getsize('AER.csv.gz')
zip_size = os.path.getsize('AER.csv.zip')
print(f"Raw file size: {raw_size / (1024**2):.2f} MB")
print(f"Gzip file size: {gzip_size / (1024**2):.2f} MB")
print(f"zip file size: {zip_size / (1024**2):.2f} MB")

Raw file size: 0.03 MB
Gzip file size: 0.01 MB
zip file size: 0.01 MB


In [446]:
# Read compressed file using pandas
df_gz = pd.read_csv('AER.csv.gz', compression='gzip')
df_gz.head()

Unnamed: 0,title,author,journal,citation,year,link
0,Two-Way Fixed Effects Estimators with Heteroge...,"C de Chaisemartin, X D’Haultfœuille","American Economic Review 110 (9), 2964-2996",3639,2020,https://scholar.google.com/scholar?oi=bibs&clu...
1,"Bartik Instruments: What, When, Why, and How","P Goldsmith-Pinkham, I Sorkin, H Swift","American Economic Review 110 (8), 2586-2624",2027,2020,https://scholar.google.com/scholar?oi=bibs&clu...
2,Macroeconomic Implications of COVID-19: Can Ne...,"V Guerrieri, G Lorenzoni, L Straub, I Werning","American Economic Review 112 (5), 1437-1474",1594,2022,https://scholar.google.com/scholar?oi=bibs&clu...
3,Importing Political Polarization? The Electora...,"D Autor, D Dorn, G Hanson, K Majlesi","American Economic Review 110 (10), 3139-3183",1515,2020,https://scholar.google.com/scholar?oi=bibs&clu...
4,Are Ideas Getting Harder to Find?,"N Bloom, CI Jones, J Van Reenen, M Webb","American Economic Review 110 (4), 1104-1144",1318,2020,https://scholar.google.com/scholar?oi=bibs&clu...


### 7.3 Pickle

In [447]:
df.dtypes

title       object
author      object
journal     object
citation    object
year        object
link        object
dtype: object

In [448]:
df_gz.dtypes

title       object
author      object
journal     object
citation     int64
year         int64
link        object
dtype: object

In [449]:
df.equals(df_gz)

False

In [450]:
df.to_pickle('AER.pkl')
df_pickle = pd.read_pickle('AER.pkl')
df.equals(df_pickle)

True

# ------------------------------------------------------

In [None]:
# advanced_session1.py
# Advanced Session 1: Web Scraping & Advanced File Handling
import time
import requests
from urllib.parse import urljoin, urlparse


# 1. HTTP Requests and Scraping Ethics

BASE_URL = "https://example.com"

# Check robots.txt
robots_txt = requests.get(urljoin(BASE_URL, "/robots.txt")).text
print("robots.txt:", robots_txt[:200], "…")

# Polite request with rate-limiting
def polite_get(url, session=None, pause=1.0, **kwargs):
    """Fetch ``url`` with GET, then wait ``pause`` seconds before returning.

    Args:
        url: Address to request.
        session: Optional ``requests.Session`` to reuse; a new one is created
            when omitted.
        pause: Seconds to sleep after the request has been sent.
        **kwargs: Passed through unchanged to ``session.get``.

    Returns:
        The response object.

    Raises:
        requests.HTTPError: If the response carries an error status. The pause
            is still observed before the error is raised.
    """
    http = requests.Session() if session is None else session
    response = http.get(url, **kwargs)
    time.sleep(pause)
    response.raise_for_status()
    return response

# 2. HTML Parsing with BeautifulSoup
from bs4 import BeautifulSoup

def parse_html(html):
    """Extract the cell texts of the table with id ``data-table``.

    Args:
        html: HTML document as a string.

    Returns:
        A list of rows, each a list of stripped ``<td>`` texts. Rows without
        any ``<td>`` (for example header rows) are skipped. An empty list is
        returned when the document contains no such table.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"id": "data-table"})
    if table is None:
        # Nothing to parse: report "no rows" instead of failing on None.
        return []

    data = []
    for tr in table.find_all("tr"):
        cols = [td.get_text(strip=True) for td in tr.find_all("td")]
        if cols:
            data.append(cols)
    return data

# 3. Advanced Scraping Techniques
# 3a. JavaScript-rendered content via requests_html
try:
    from requests_html import HTMLSession

    def scrape_js_page(url):
        """Return the text of every ``h2.article-title`` element after JS rendering.

        The session is closed even when the request or the rendering step
        fails, so a failed call does not leave it open.
        """
        session = HTMLSession()
        try:
            r = session.get(url)
            r.html.render(timeout=20)         # renders JS
            return [h.text for h in r.html.find("h2.article-title")]
        finally:
            session.close()
except ImportError:
    print("requests_html not installed; skip JS rendering demo")

# 3b. Sessions, login, cookies
LOGIN_URL = urljoin(BASE_URL, "/login")
DATA_URL  = urljoin(BASE_URL, "/protected/data")

def login_and_scrape(username, password):
    """Log in with a form POST and return the parsed protected table.

    Args:
        username: Account name sent in the ``username`` form field.
        password: Password sent in the ``password`` form field.

    Returns:
        Whatever ``parse_html`` returns for the page at ``DATA_URL``.

    Raises:
        requests.HTTPError: If the login POST or the protected page returns an
            error status.
    """
    # The context manager closes the session (and its connections) on exit.
    with requests.Session() as sess:
        # first get login page to grab csrf token if needed
        login_page = sess.get(LOGIN_URL)
        soup = BeautifulSoup(login_page.text, "html.parser")

        payload = {"username": username, "password": password}
        csrf_input = soup.find("input", {"name": "csrf_token"})
        if csrf_input is not None and csrf_input.get("value") is not None:
            # Only forms that actually carry a token need it echoed back.
            payload["csrf_token"] = csrf_input["value"]

        resp = sess.post(LOGIN_URL, data=payload)
        resp.raise_for_status()

        # now scrape protected page; fail loudly rather than parse an error page
        data_page = sess.get(DATA_URL)
        data_page.raise_for_status()
        return parse_html(data_page.text)

# 4. Cleaning and Structuring Scraped Data
import pandas as pd
def to_dataframe(raw_rows, columns):
    """Build a DataFrame and convert its numeric-looking text columns to Float64.

    Each text column has thousands separators (",") removed and empty strings
    treated as missing. A column is converted to the nullable ``Float64``
    dtype only when every remaining value parses as a number; columns holding
    real text (names, labels, "12%", ...) are left untouched instead of
    raising.

    Args:
        raw_rows: Row data accepted by ``pd.DataFrame``.
        columns: Column labels for the resulting frame.

    Returns:
        The cleaned DataFrame.
    """
    df = pd.DataFrame(raw_rows, columns=columns)
    for col in df.columns:
        dtype = df[col].dtype
        is_text = pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        if not is_text:
            continue
        text = df[col].astype(object).str.replace(",", "", regex=False)
        text = text.mask(text == "")          # empty string -> missing
        numeric = pd.to_numeric(text, errors="coerce")
        # Coercion turns unparseable values into NaN; convert only when that
        # did not lose any value that was present.
        if numeric.notna().sum() == text.notna().sum():
            df[col] = numeric.astype("Float64")
    return df

# 5. Advanced File I/O Strategies
# 5a. Writing CSV, Excel, Stata and JSON Lines
def demo_file_io(df, base_path="output/data"):
    """Write ``df`` next to ``base_path`` in four text/tabular formats.

    Produces ``<base_path>.csv``, ``.xlsx``, ``.dta`` and ``.json`` (one JSON
    record per line). The parent directory of ``base_path`` is created first,
    so the default ``output/`` folder does not have to exist beforehand.
    """
    from pathlib import Path

    Path(f"{base_path}.csv").parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(f"{base_path}.csv", index=False)
    df.to_excel(f"{base_path}.xlsx", index=False)
    df.to_stata(f"{base_path}.dta", write_index=False)
    df.to_json(f"{base_path}.json", orient="records", lines=True)

# 5b. Binary formats: Parquet, Feather
def save_binary(df, base_path="output/data"):
    """Write ``df`` as ``<base_path>.parquet`` and ``<base_path>.feather``.

    The parent directory of ``base_path`` is created first, so the default
    ``output/`` folder does not have to exist beforehand.
    """
    from pathlib import Path

    Path(f"{base_path}.parquet").parent.mkdir(parents=True, exist_ok=True)

    df.to_parquet(f"{base_path}.parquet", index=False)
    df.to_feather(f"{base_path}.feather")

# 5c. Chunked processing for large CSVs
def process_large_csv(path, chunksize=100_000):
    """Stream a CSV in pieces of ``chunksize`` rows and transform each piece.

    Every piece gets a ``log_value`` column computed from its ``value``
    column. The transformed pieces are not stored or returned; the function
    is a template showing where per-chunk work belongs.
    """
    for part in pd.read_csv(path, chunksize=chunksize):
        # example transform
        part["log_value"] = np.log(part["value"] + 1)
        # append to a master file, or aggregate
        # part.to_parquet("output/aggregated.parquet", mode="append")

# 6. Performance and Memory Management
import numpy as np

def optimize_df(df):
    """Shrink a DataFrame's memory footprint in place and return it.

    * ``int64`` columns are downcast to the smallest integer type that holds
      their values: unsigned when all values are non-negative, signed
      otherwise (an unsigned downcast leaves columns with negative values
      unchanged).
    * ``object`` columns whose share of distinct values is below 50% are
      converted to ``category``.

    Note that the frame passed in is modified; the same object is returned.
    """
    # downcast numeric types
    for col in df.select_dtypes(include=["int64"]).columns:
        series = df[col]
        target = "unsigned" if (series >= 0).all() else "integer"
        df[col] = pd.to_numeric(series, downcast=target)

    # convert low-cardinality object columns to categorical
    n_rows = len(df)
    for col in df.select_dtypes(include=["object"]).columns:
        # n_rows == 0 would make the ratio a division by zero; an empty
        # column has nothing to gain from conversion anyway.
        if n_rows and df[col].nunique() / n_rows < 0.5:
            df[col] = df[col].astype("category")
    return df

# 7. Mini-Project: Scrape CPI table, clean, save CSV & Parquet
def scrape_cpi_and_save():
    """Download the example CPI page, extract its table and save it twice.

    The table matched by ``table.cpi-data`` is read (first row = headers,
    remaining rows = data), cleaned with ``to_dataframe`` and written to
    ``cpi_data.csv`` and ``cpi_data.parquet`` in the working directory.

    Raises:
        ValueError: If the page has no matching table or the table has no
            rows. The URL and selector are examples and will need adjusting
            for the real page.
    """
    url = "https://www.bls.gov/cpi/tables/home.htm"  # example link
    resp = polite_get(url, pause=1.5)
    soup = BeautifulSoup(resp.text, "html.parser")

    # find the CPI table—this selector will vary by site
    table = soup.select_one("table.cpi-data")
    if table is None:
        raise ValueError(f"No element matching 'table.cpi-data' found at {url}")
    table_rows = table.find_all("tr")
    if not table_rows:
        raise ValueError(f"The table found at {url} contains no rows")

    headers = [th.get_text(strip=True) for th in table_rows[0].find_all("th")]
    rows = []
    for tr in table_rows[1:]:
        cells = [td.get_text(strip=True) for td in tr.find_all("td")]
        if cells:  # skip rows without data cells (e.g. nested header rows)
            rows.append(cells)

    # build the frame and clean numeric columns in one step
    cpi_df = to_dataframe(rows, headers)

    # save out
    cpi_df.to_csv("cpi_data.csv", index=False)
    cpi_df.to_parquet("cpi_data.parquet", index=False)
    print("Saved CPI data to CSV and Parquet.")

if __name__ == "__main__":
    # Quick demo run of the mini-project
    scrape_cpi_and_save()
