In [3]:
import requests
from bs4 import BeautifulSoup

## Downloading the HTML

Requests allows us to send HTTP requests. We'll use `requests.get()` to retrieve a response from a server.

In [4]:
# Listing page whose paper entries we want to scrape.
url = 'http://www.fieldexperiments.com/papers/'
# Without a timeout, requests will wait indefinitely on an unresponsive server.
page = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error page further down.
page.raise_for_status()

In [5]:
# Displaying the Response object shows its HTTP status code.
page

<Response [200]>

In [6]:
# Same expression as the previous cell: shows the Response object again.
page

<Response [200]>

In [None]:
# Raw body of the response. Only a short preview is shown: the full page is
# large, and the parsed/prettified view in the next section is easier to read.
page.content[:500]

## Parse the response content

In [None]:
# Parse the downloaded body with the 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')
# Print the parsed document via prettify() so its structure can be inspected.
# NOTE(review): this prints the whole page, so the output will be long.
print(soup.prettify())

## Navigating the HTML

In [10]:
# Attribute access by tag name: the document's <title> tag, markup included.
soup.title

<title>Field Experiments</title>

In [11]:
# .string yields just the text held inside the <title> tag.
soup.title.string

'Field Experiments'

In [12]:
# .text returns the same text for this tag.
soup.title.text

'Field Experiments'

We can create a list of all tags of a given type using `find_all` (e.g. 'p', 'a', 'div').

In [None]:
# A notebook cell only displays its last expression, so the first find_all
# result would otherwise be computed and silently dropped. Show both results
# explicitly, sliced to the first few matches to keep the output short.
# (`display` is provided by the IPython/Jupyter kernel.)
display(soup.find_all('a')[:5])
display(soup.find_all('div')[:5])

## Find element by 'id'

In [None]:
# Look up the element whose id attribute is 'accordion'.
# NOTE(review): find() presumably returns None when nothing matches — confirm,
# since the following cells call methods on `container`.
container = soup.find(id='accordion')
container

Let's get a list of all of the elements with CSS class name 'panel'

In [None]:
# Collect every <div> with the CSS class 'panel' inside the container.
# (`class_` has a trailing underscore because `class` is a Python keyword.)
paperList = container.find_all('div', class_='panel')
paperList

In [None]:
# Take the first panel as a sample for working out how to extract each field.
first = paperList[0]
first

In [23]:
# The text of the first <a> tag in the panel is used as the paper title.
title = first.find('a').text
title

"2020: A Summary Of Artefactual Field Experiments On Fieldexperiments.Com: The Who's, What's, Where's, And When's"

In [31]:
# Every tag in the panel whose name attribute is 'citation_author'.
authorsList = first.find_all(attrs={'name':'citation_author'})
# The author's name is read from the first such tag's content attribute.
first_author = authorsList[0]['content']
# End the cell with the value so it is displayed, like the title and year cells.
first_author

In [35]:
# Find the tag whose name attribute is 'citation_publication_date'
# and read the year from its content attribute.
yearMeta = first.find(attrs={'name':'citation_publication_date'})
year = yearMeta['content']
year

'2020'

In [36]:
# Build one record (title, first author, year) for every paper panel on the page.
# Metadata is not guaranteed to be complete for every panel (at least one paper
# has an empty year), so each field falls back to '' instead of raising when a
# tag or attribute is missing, and surrounding whitespace is stripped.
d = []
for paper in paperList:
    # The first link in the panel carries the paper title.
    titleLink = paper.find('a')
    title = titleLink.text.strip() if titleLink is not None else ''

    # An empty result list would make authorsList[0] raise IndexError.
    authorsList = paper.find_all(attrs={'name': 'citation_author'})
    first_author = authorsList[0].get('content', '').strip() if authorsList else ''

    # find() gives None when the tag is absent; indexing None would raise TypeError.
    yearMeta = paper.find(attrs={'name': 'citation_publication_date'})
    year = yearMeta.get('content', '').strip() if yearMeta is not None else ''

    tempDict = dict(
        title=title,
        first_author=first_author,
        year=year
    )

    d.append(tempDict)

d

[{'title': "2020: A Summary Of Artefactual Field Experiments On Fieldexperiments.Com: The Who's, What's, Where's, And When's",
  'first_author': 'List John A',
  'year': '2020'},
 {'title': "2020: A Summary Of Framed Field Experiments On Fieldexperiments.Com: The Who's, What's Where's, And When's",
  'first_author': 'List John A',
  'year': '2020'},
 {'title': '2020 Summary Data Of Natural Field Experiments Published On Fieldexperiments.Com',
  'first_author': 'List John A',
  'year': '2020'},
 {'title': '2021 Summary Data Of Artefactual Field Experiments Published On Fieldexperiments.Com',
  'first_author': 'List John A',
  'year': '2022'},
 {'title': '2021 Summary Data Of Natural Field Experiments Published On Fieldexperiments.Com',
  'first_author': 'List John A',
  'year': ''},
 {'title': 'Academic Economists Behaving Badly? A Survey On Three Areas Of Unethical Behavior',
  'first_author': 'Bailey Charles ',
  'year': '2001'},
 {'title': 'Achievement Awards For High School Matricul

## Export to CSV

In [37]:
import pandas as pd

# One row per scraped paper; the dict keys become the column names.
df = pd.DataFrame(d)
# Preview the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,title,first_author,year
0,2020: A Summary Of Artefactual Field Experimen...,List John A,2020.0
1,2020: A Summary Of Framed Field Experiments On...,List John A,2020.0
2,2020 Summary Data Of Natural Field Experiments...,List John A,2020.0
3,2021 Summary Data Of Artefactual Field Experim...,List John A,2022.0
4,2021 Summary Data Of Natural Field Experiments...,List John A,
5,Academic Economists Behaving Badly? A Survey O...,Bailey Charles,2001.0
6,Achievement Awards For High School Matriculati...,Angrist Joshua D,2003.0
7,Actions And Beliefs: Estimating Distribution-B...,Bellemare Charles,2005.0
8,Active Decisions And Pro-Social Behavior: A Fi...,Goette Lorenz,2007.0
9,A Dollar For Your Thoughts: Feedback-Condition...,Cabral Luis,2015.0


In [38]:
import os

# Write the scraped table to 'fe_scrape.csv' in the current working directory,
# leaving out the DataFrame's index column.
output_dir = os.getcwd()
csvFilePath = os.path.join(output_dir, 'fe_scrape.csv')
df.to_csv(path_or_buf=csvFilePath, index=False)

In [40]:
def scrape_page(session, page_url):
    """Download one listing page and return its papers as a list of dicts.

    Parameters
    ----------
    session : requests.Session
        Session used for the HTTP request (reused across pages).
    page_url : str
        URL of the listing page to download.

    Returns
    -------
    list of dict
        One dict per paper panel with the keys 'title', 'first_author' and
        'year'. Fields whose tag or attribute is missing are set to ''.
        An empty list is returned when the page has no 'accordion' container.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = session.get(page_url, timeout=30)
    response.raise_for_status()
    page_soup = BeautifulSoup(response.content, 'html.parser')

    listing = page_soup.find(id='accordion')
    if listing is None:
        return []

    records = []
    for panel in listing.find_all('div', class_='panel'):
        link = panel.find('a')
        authors = panel.find_all(attrs={'name': 'citation_author'})
        year_meta = panel.find(attrs={'name': 'citation_publication_date'})
        records.append(dict(
            title=link.text.strip() if link is not None else '',
            first_author=authors[0].get('content', '').strip() if authors else '',
            year=year_meta.get('content', '').strip() if year_meta is not None else '',
        ))
    return records


# Number of listing pages, carried over from the original range(1, 71).
# TODO confirm against the site before re-running.
LAST_PAGE = 70

d = []
# A single Session reuses the underlying connection across all page requests.
with requests.Session() as session:
    for i in range(1, LAST_PAGE + 1):
        url = f'http://www.fieldexperiments.com/papers/?page={i}'
        print(url)

        # Parse this page and add its papers to the combined list.
        d.extend(scrape_page(session, url))

# Create the combined dataframe and export it to CSV. Separate names are used
# so the single-page `df` and 'fe_scrape.csv' from the earlier cells stay intact.
df_all_pages = pd.DataFrame(d)
df_all_pages.to_csv(os.path.join(os.getcwd(), 'fe_scrape_all_pages.csv'), index=False)


http://www.fieldexperiments.com/papers/?page=1
http://www.fieldexperiments.com/papers/?page=2
http://www.fieldexperiments.com/papers/?page=3
http://www.fieldexperiments.com/papers/?page=4
http://www.fieldexperiments.com/papers/?page=5
http://www.fieldexperiments.com/papers/?page=6
http://www.fieldexperiments.com/papers/?page=7
http://www.fieldexperiments.com/papers/?page=8
http://www.fieldexperiments.com/papers/?page=9
http://www.fieldexperiments.com/papers/?page=10
http://www.fieldexperiments.com/papers/?page=11
http://www.fieldexperiments.com/papers/?page=12
http://www.fieldexperiments.com/papers/?page=13
http://www.fieldexperiments.com/papers/?page=14
http://www.fieldexperiments.com/papers/?page=15
http://www.fieldexperiments.com/papers/?page=16
http://www.fieldexperiments.com/papers/?page=17
http://www.fieldexperiments.com/papers/?page=18
http://www.fieldexperiments.com/papers/?page=19
http://www.fieldexperiments.com/papers/?page=20
http://www.fieldexperiments.com/papers/?page=21
h