# First step : Data web scraping & data collecting

### Import used librairies

In [248]:
from bs4 import BeautifulSoup as bs 
import requests
import pandas as pd
import re
import numpy as np

In [3]:
# scraping of the nb citations table

In [4]:
url_nb_cit = 'https://ideas.repec.org/top/top.person.nbcites.html'

r = requests.get(url_nb_cit)
webpage = bs(r.content)

In [5]:
table = webpage.select('table.shorttop')[0]

In [6]:
#print(table.prettify())

In [7]:
headers = table.select('thead th')

In [8]:
col_names = [header.get_text() for header in headers]

In [9]:
col_names

['Rank', 'Author', 'Score']

In [10]:
t_rows = table.select('tr')[1:]

In [11]:
l = []
for tr in t_rows:
    tds = tr.find_all('td')
    row = [str(td.get_text()).strip() for td in tds]
    l.append(row)

In [12]:
df_nb_cit = pd.DataFrame(l, columns=col_names)

In [13]:
df_nb_cit.tail()

Unnamed: 0,Rank,Author,Score
2939,2938,Ulrich Schmidt Institut für Volkswirtschaftsl...,921
2940,2941,"Peter Orazem Department of Economics, Iowa St...",920
2941,2941,"Vivian Zhanwei Yue Department of Economics, Em...",920
2942,2941,"Augustin Kwasi Fosu Institute of Statistical, ...",920
2943,2941,Ali Acaravci İktisadi ve İdari Bilimler Fakül...,920


In [14]:
df_nb_cit['Author'][10]

'Ross  Levine National Bureau of Economic Research (NBER), Cambridge, Massachusetts (USA)'

In [15]:
# scraping of the full data table 

variables I am interested in :
* **nbWorks**
* **NbCities**
* **NbPages** (Number of Journal Pages)
* **Downloads** (Number of Downloads through RePEc serices over the past 12 months)
* **Students** (Record of graduates)
* **h index** (un h-index de 6 signifie que 6 publications de l’auteur ont chacune été citées au moins 6 fois.)

In [16]:
# scraping the detailed data table seems complicated
# TODO: better scrappe individual tables for each variables

In [17]:
base_url = 'https://ideas.repec.org/top/'

In [18]:
r_detail = requests.get('https://ideas.repec.org/top/top.person.alldetail.html')
wp_detail = bs(r_detail.content)

In [24]:
table_detail = wp_detail.select('table.newbigtop#right-table a')
my_links = [detail['href'] for detail in table_detail]
my_links = set(my_links)
my_links
#lists of the links i am only interested in
my_wanted_links = ['top.person.downloads.html','top.person.nbworks.html','top.person.students.html',
                'top.person.nbpages.html', 'top.person.hindex.html']
#my_wanted_links = [link for link in my_links if link in links_wanted]
#my_wanted_links

In [25]:
table_titles = []
table_rows = []
for link in my_wanted_links:
    bs_col_names = []
    # getting bs element for each page
    full_url = base_url + link
    #print(full_url)
    #break
    page = requests.get(full_url)
    bs_page = bs(page.content)
    # capture the title of the table so we can rename correctly the columns of our dataframe
    bs_title = bs_page.select('head title')[0].get_text()
    table_titles.append(bs_title)
    bs_table = bs_page.select('table.shorttop')[0]
    bs_headers = bs_table.select('thead th')
    bs_col_names = [bs_header.get_text() for bs_header in bs_headers]
    bs_t_rows = bs_table.select(' tr')[1:]
    y = []
    for tr in bs_t_rows:
        bs_tds = tr.find_all('td')
        row =[str(td.get_text()).strip() for td in bs_tds]
        y.append(row)
    # i append each table scraped in a list to create the dataframes
    table_rows.append(y)

In [26]:
# small verification
print(table_rows[0][0])
print(len(table_rows))

['1', 'Christopher F Baum Department of Economics, Boston College, Chestnut Hill, Massachusetts (USA)', '7515']
5


In [27]:
bs_col_students = bs_col_names[:]
bs_col_students.append('Students')

df_nbpages = pd.DataFrame(table_rows[3], columns = bs_col_names)
df_students = pd.DataFrame(table_rows[2], columns = bs_col_students)
df_nbworks = pd.DataFrame(table_rows[1], columns = bs_col_names)
df_downloads = pd.DataFrame(table_rows[0], columns = bs_col_names)
df_hindex = pd.DataFrame(table_rows[4], columns = bs_col_names)

In [28]:
df_students.head()

Unnamed: 0,Rank,Author,Score,Students
0,1,"Eric S. Maskin Department of Economics, Harvar...",4.38,45.5
1,2,Olivier J Blanchard Peter G. Peterson Institut...,5.76,35.0
2,3,"Lawrence F. Katz Department of Economics, Harv...",6.42,43.0
3,4,Rudiger Dornbusch †,7.4,15.0
4,5,"Robert M. Solow Economics Department, Massachu...",8.61,11.0


In [29]:
df_nbpages.head()

Unnamed: 0,Rank,Author,Score
0,1,"Peter Nijkamp Afdeling Ruimtelijke Economie, ...",6967
1,2,Peter C. B. Phillips Cowles Foundation for Res...,6617
2,3,Hans-Werner Sinn ifo Institut - Leibniz-Insti...,6365
3,4,"Rangan Gupta Department of Economics, Faculty...",6337
4,5,"Mohsen Bahmani-Oskooee Economics Department, ...",6142


In [30]:
df_downloads.head()

Unnamed: 0,Rank,Author,Score
0,1,"Christopher F Baum Department of Economics, Bo...",7515
1,2,"Jeffrey Marc Wooldridge Economics Department, ...",7462
2,3,Ben Jann,6565
3,4,"Daron Acemoglu Economics Department, Massachu...",6537
4,5,Nicholas Cox,6424


In [31]:
df_hindex.tail()

Unnamed: 0,Rank,Author,Score
3421,2906,"Alessandro Pavan Department of Economics, Nor...",14
3422,2906,"Zheng Liu Economic Research, Federal Reserve ...",14
3423,2906,Gerhard Sorger Institut für Volkswirtschaftsl...,14
3424,2906,Tatsuyoshi Saijo School of Economics and Mana...,14
3425,2906,"Giordano Mion Department of Economics, Sussex...",14


In [32]:
# scraping of economists ranked by twitter followers

In [33]:
url_twitter = 'https://ideas.repec.org/top/top.person.twitter.html'

r_twitter = requests.get(url_twitter)
wp_twitter = bs(r_twitter.content)

In [34]:
table_twitter = wp_twitter.select('table.shorttop')[0]

In [35]:
headers_twitter = table_twitter.select('thead th')

tw_col_names = [header.get_text() for header in headers_twitter]

In [36]:
tw_rows = table_twitter.select('tr')[1:]

j = []
for tr in tw_rows:
    tds = tr.find_all('td')
    row = [str(td.get_text()).strip() for td in tds]
    j.append(row)

In [37]:
df_twitter = pd.DataFrame(j, columns = tw_col_names)

In [38]:
df_twitter

Unnamed: 0,Rank,Author,Followers
0,1.,"Krugman, Paul R.",4624769
1,2.,"Sala-i-Martin, Xavier",501086
2,3.,"Stiglitz, Joseph E.",316413
3,4.,"Gaviria, Alejandro",302906
4,5.,"Roser, Max",209855
...,...,...,...
470,471.,"Stephenson, Judy Z.",3192
471,472.,"Doerner, William M.",3189
472,473.,"Overman, Henry G.",3178
473,474.,"Feigenbaum, James J.",3172


In [39]:
krug = df_nb_cit[df_nb_cit.Author.str.contains('Krugman')]
krug

Unnamed: 0,Rank,Author,Score
31,32,Paul R. Krugman Woodrow Wilson School of Publi...,16124


In [40]:
df_nb_cit.head()

Unnamed: 0,Rank,Author,Score
0,1,"Andrei Shleifer Department of Economics, Harv...",48826
1,2,"James J. Heckman Department of Economics, Univ...",34146
2,3,"Robert J. Barro Department of Economics, Harva...",31421
3,4,"Robert F. Engle IIIFinance Department, Stern S...",28370
4,5,"Eugene F. Fama Sr.Booth School of Business, Un...",27760


In [41]:
# checking how to names are differently displayed in each data frame so we can then murge them
heck = df_twitter[df_twitter.Author.str.contains('Heckman')]
heck

Unnamed: 0,Rank,Author,Followers
115,116.0,"Heckman, James J.",21639


### Small recap of the data scrapped

In [42]:
df_nb_cit.head()

Unnamed: 0,Rank,Author,Score
0,1,"Andrei Shleifer Department of Economics, Harv...",48826
1,2,"James J. Heckman Department of Economics, Univ...",34146
2,3,"Robert J. Barro Department of Economics, Harva...",31421
3,4,"Robert F. Engle IIIFinance Department, Stern S...",28370
4,5,"Eugene F. Fama Sr.Booth School of Business, Un...",27760


In [43]:
df_twitter.head()

Unnamed: 0,Rank,Author,Followers
0,1.0,"Krugman, Paul R.",4624769
1,2.0,"Sala-i-Martin, Xavier",501086
2,3.0,"Stiglitz, Joseph E.",316413
3,4.0,"Gaviria, Alejandro",302906
4,5.0,"Roser, Max",209855


In [44]:
# TODO: have a list of all the nobel prize winners so we can label our economists with the right category
# make sure the name are spelled identically

### Scraping a list of Nobel prize winners

In [45]:
url_nobel = 'https://ideas.repec.org/nobel.html'
page_nobel = requests.get(url_nobel)
bs_page_nobel = bs(page_nobel.content)

In [46]:
bs_nobel_winners = bs_page_nobel.select('div#content-block a')[4:]
nobel_winners_raw = [econ.get_text() for econ in bs_nobel_winners]
# get the '\n' charachters out of the names
nobel_winners = [x.replace('\n',' ') for x in nobel_winners_raw]
nobel_winners = [x.title() for x in nobel_winners]

### Checking how many nobel prize winners in data

* checking in the twitter data

In [47]:
df_twitter.head()

Unnamed: 0,Rank,Author,Followers
0,1.0,"Krugman, Paul R.",4624769
1,2.0,"Sala-i-Martin, Xavier",501086
2,3.0,"Stiglitz, Joseph E.",316413
3,4.0,"Gaviria, Alejandro",302906
4,5.0,"Roser, Max",209855


In [48]:
sum(df_twitter['Author'].isin(nobel_winners))
# we need to remove the commas from the author names

0

In [49]:
df_tw_test = df_twitter.copy()

In [50]:
df_tw_test['Author'] = df_tw_test['Author'].str.replace(',', '')

In [51]:
sum(df_tw_test['Author'].isin(nobel_winners))
# need to put the first name in front in the twitter dataframe
tw_author = list(df_tw_test['Author'].str.split(' ', 1))
tw_author_reversed = [list(reversed(x)) for x in tw_author]
tw_list_authors = [authors[0] +' ' + authors[1] for authors in tw_author_reversed]


In [52]:
df_tw_test['Author'] = tw_list_authors

In [53]:
# finally we test how many of nobel prize winner we have in our twitter table
sum(df_tw_test['Author'].isin(nobel_winners))
# for the moment we only find 6 ones, but it is likely that we still have some name-unmatching


6

In [54]:
nobel_winners_test = [x.replace('  ', ' ') for x in nobel_winners]

In [55]:
recon_nobel_winners = list(set(nobel_winners_test).intersection(tw_list_authors))

In [56]:
recon_nobel_winners # the nobel prize that are well recongnized
# 6 seems to less to keep this variable ('twitter followers')

['A. Michael Spence',
 'Robert J. Shiller',
 'Lars Peter Hansen',
 'Joseph E. Stiglitz',
 'Jean Tirole',
 'James J. Heckman']

In [61]:
print(df_hindex.shape)
print(df_downloads.shape)
print(df_nb_cit.shape)
print(df_nbpages.shape)
print(df_nbworks.shape)
print(df_students.shape)
# check number of nobel prize winners in df_nb_cit

(3426, 3)
(2954, 3)
(2944, 3)
(2958, 3)
(2978, 3)
(2944, 4)


* checking number of nobel prize winners in my nb_cit data
    * first step : clean the Author column of df_nb_cit

In [62]:
df_nb_cit['Author']

0       Andrei  Shleifer Department of Economics, Harv...
1       James J. Heckman Department of Economics, Univ...
2       Robert J. Barro Department of Economics, Harva...
3       Robert F. Engle IIIFinance Department, Stern S...
4       Eugene F. Fama Sr.Booth School of Business, Un...
                              ...                        
2939    Ulrich  Schmidt Institut für Volkswirtschaftsl...
2940    Peter  Orazem Department of Economics, Iowa St...
2941    Vivian Zhanwei Yue Department of Economics, Em...
2942    Augustin Kwasi Fosu Institute of Statistical, ...
2943    Ali  Acaravci İktisadi ve İdari Bilimler Fakül...
Name: Author, Length: 2944, dtype: object

In [82]:
df_nb_cit_cleaning = df_nb_cit.copy()

In [236]:
print(df_nb_cit_cleaning['Author'].iloc[1])
print(df_nb_cit_cleaning['Author'].iloc[1611])

James J. Heckman Department of Economics, University of Chicago, Chicago, Illinois (USA)
Jan Potters https://www.tilburguniversity.edu/research/economics-and-management/graduate-school, School of Economics and Management, Universiteit van Tilburg, Tilburg, Netherlands


In [85]:
df_nb_cit_cleaning['Author'] = df_nb_cit_cleaning['Author'].apply(lambda x: x.replace('  ', ' '))
df_nb_cit_cleaning['Author_clean'] = df_nb_cit_cleaning['Author'].apply(lambda x: x.split(' ',3)[0:3] if '.' in x else x.split(' ',2)[0:2])

In [93]:
df_nb_cit_cleaning['Author_clean'] = df_nb_cit_cleaning['Author_clean'].apply(lambda x: ' '.join(x))

In [94]:
df_nb_cit_cleaning

Unnamed: 0,Rank,Author,Score,Author_clean
0,1,"Andrei Shleifer Department of Economics, Harva...",48826,Andrei Shleifer
1,2,"James J. Heckman Department of Economics, Univ...",34146,James J. Heckman
2,3,"Robert J. Barro Department of Economics, Harva...",31421,Robert J. Barro
3,4,"Robert F. Engle IIIFinance Department, Stern S...",28370,Robert F. Engle
4,5,"Eugene F. Fama Sr.Booth School of Business, Un...",27760,Eugene F. Fama
...,...,...,...,...
2939,2938,Ulrich Schmidt Institut für Volkswirtschaftsle...,921,Ulrich Schmidt
2940,2941,"Peter Orazem Department of Economics, Iowa Sta...",920,Peter Orazem
2941,2941,"Vivian Zhanwei Yue Department of Economics, Em...",920,Vivian Zhanwei
2942,2941,"Augustin Kwasi Fosu Institute of Statistical, ...",920,Augustin Kwasi


   * checking how many nobel prize winners are in df_nb_cit

In [95]:
sum(df_nb_cit_cleaning['Author_clean'].isin(nobel_winners))

46

In [96]:
df_nb_cit_cleaning[df_nb_cit_cleaning['Author_clean'] == 'Paul R. Krugman']

Unnamed: 0,Rank,Author,Score,Author_clean
31,32,Paul R. Krugman Woodrow Wilson School of Publi...,16124,Paul R. Krugman


In [97]:
# correct Krugman's name in the nobel_winners list
nobel_winners.index('Paul Krugman')

22

In [145]:
nobel_winners_test[22] = 'Paul R. Krugman'

In [149]:
sum(df_nb_cit_cleaning['Author_clean'].isin(nobel_winners_test))

49

In [150]:
nb_cit_recon_nobel_winners = list(set(nobel_winners_test).intersection(df_nb_cit_cleaning['Author_clean'][:2000]))

In [151]:
len(nb_cit_recon_nobel_winners)

48

In [141]:
# there is 48 out of 49 nobel prize winners placed in top 2000 economists

In [143]:
# TODO: get the list of John Bates Clark Medal's winners + John Voneumann's Award winners

#### web scraping of John Bates Clark Medal's winners

In [152]:
url_clark = 'https://en.wikipedia.org/wiki/John_Bates_Clark_Medal'
page_clark = requests.get(url_clark)
bs_p_clark = bs(page_clark.content)

In [153]:
table_clark = bs_p_clark.select('table')[0]

In [176]:
header_clark = table_clark.select('th')#[0].get_text().replace('[1]\n', '')
header_clark = [h.get_text().replace('[1]\n', '').replace('\n', '') for h in header_clark]
header_clark

['Year',
 'Medalists',
 'Institution (at time of receipt)',
 'Alma mater (PhD)',
 'Nationality',
 'Nobel Prize']

In [178]:
tw_rows = table_clark.select('tr')[1:]

w = []
for tr in tw_rows:
    tds = tr.find_all('td')
    row = [str(td.get_text()).strip() for td in tds]
    w.append(row)

In [180]:
df_clark = pd.DataFrame(w, columns=header_clark)

In [187]:
# rectify the name of James J. Heckman so it matches when we will concatenate our dataframes
df_clark['Medalists'][df_clark['Medalists'] == 'James Heckman'] = 'James J. Heckman'

In [188]:
df_clark.head()

Unnamed: 0,Year,Medalists,Institution (at time of receipt),Alma mater (PhD),Nationality,Nobel Prize
0,1947,Paul Samuelson,Massachusetts Institute of Technology,Harvard University,United States,1970.0
1,1949,Kenneth E. Boulding,University of Michigan,University of Oxford,United States,
2,1951,Milton Friedman,University of Chicago,Columbia University,United States,1976.0
3,1955,James Tobin,Yale University,Harvard University,United States,1981.0
4,1957,Kenneth Arrow,Stanford University,Columbia University,United States,1972.0


#### web scraping of Jon von Neumann Award winners

In [190]:
url_vn = 'https://en.wikipedia.org/wiki/John_von_Neumann_Award'
page_vn = requests.get(url_vn)
bs_vn = bs(page_vn.content)

In [209]:
table_vn = bs_vn.find_all('table')[2]

In [218]:
header_vn = table_vn.select('th')
header_vn = [h.get_text().replace('\n', '') for h in header_vn]
header_vn

['Year', 'Recipient', 'Institution', 'Nationality']

In [224]:
t_vn_rw = table_vn.select('tr')[1:]
b = []
for tr in t_vn_rw:
    tds = tr.find_all('td')
    row = [str(td.get_text()).strip() for td in tds]
    b.append(row)

In [226]:
df_vn = pd.DataFrame(b, columns=header_vn)

In [227]:
df_vn.head()

Unnamed: 0,Year,Recipient,Institution,Nationality
0,1995,John Harsanyi,UC Berkeley,United States
1,1996,Hal Varian,University of Michigan,United States
2,1997,János Kornai,Harvard University; Collegium Budapest,Hungary
3,1998,Jean Tirole,Toulouse School of Economics,France
4,1999,Oliver E. Williamson [1],UC Berkeley,United States


End point: I've scraped all the data I needed. I am not going the keep the 'twitter-follower' variable (since it captures to few nobel prize winners) but I intend to keep all the other variables. That is to say:
* nb_citations
* nb_work
* nb_pages
* nb_downloads
* nb_students
* h_index
* ivy_league_yn
* american_yn
* von_neumann_yn
* clark_medal_yn
* nobel_winner_yn

#### download in csv files of the data scrapped

In [240]:
# saving nb_cit data
df_nb_cit_cleaning.to_csv('data/nb_cit_cleaning.csv', index=False)

In [239]:
# saving nb_work
df_nbworks.to_csv('data/nb_works.csv', index=False)

In [241]:
# saving nb_pages 
df_nbpages.to_csv('data/nb_pages.csv', index=False)

In [245]:
# saving nb_downloads, nb_students, h_index, von_neumann, clark_medal
my_dfs = [df_downloads, df_students, df_hindex, df_vn, df_clark]
file_names = ['nb_downloads', 'nb_students', 'h_index', 'von_neumann', 'clark_medal']
for df, file in zip(my_dfs, file_names):
    df.to_csv('data/' + file + '.csv', index= False)
    


In [257]:
# creation of a dataframe containing all the nobel_prize winners & saving it
series_nobel_winners = pd.Series(nobel_winners_test)

In [258]:
df_nobel_winners = series_nobel_winners.to_frame()

In [259]:
df_nobel_winners.columns = ['Author']

In [260]:
df_nobel_winners.head()

Unnamed: 0,Author
0,Abhijit Banerjee
1,Esther Duflo
2,Michael Kremer
3,William Nordhaus
4,Paul Romer


In [262]:
 df_nobel_winners.to_csv('data/nobel_prize_winners.csv', index=False)