search_jobregister_2022.py
from bs4 import BeautifulSoup
import urllib.request
import re
import numpy as np
import pandas as pd
import pickle
import glob
# Major changes from 2019 script (8/6/22)
# - Save everything to new .csv and .pkl files with "2022" in file name; refers to
# this new search done in 2022, but job ad data is from June 2019 through May 2022.
# - This illustrates that my coding is weaksauce and someone could easily make
# the script more general.
# - Uh I finally updated my python from 2.x to 3.x, which means:
# - changed calls from e.g. print year, month to print(year,month)
# - Likewise changed cPickle to pickle.
# - urllib.urlopen to urllib.request.urlopen
# - Getting HTTP Error 403: Forbidden error. The URL string is still correct.
# Fix is based on: https://stackoverflow.com/questions/16627227/problem-http-error-403-in-python-3-web-scraping
# - pickle error (TypeError: write() argument must be str, not bytes): changed file mode "w" to "wb".
# - Encoding character change for the Country field.
#
# Additional changes 8/18/22
# - Add any fields having to do with Pay Compensation.
# They are: salary min, salary max, hourly rate min, hourly rate max, stipend, compensation notes
# Also Benefits
# - Create two versions of the resulting .csv file:
#   One with the job text, pay compensation, and benefits fields.
#   One without (the usual).
# Example with hourly: https://jobregister.aas.org/ad/8b8e13de
# Example with salary: https://jobregister.aas.org/ad/3af94175
# Example with stipend: view-source:https://jobregister.aas.org/ad/27fc0021
#
# Changes 9/20/23
# Include all of academic year 2022 - through May 2023.
# Save as _2023.csv files
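# Work around the HTTP 403 noted above by sending a browser-like User-Agent
# ("Mozilla/5.0") instead of the default Python one.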
class AppURLopener(urllib.request.FancyURLopener):
    version = "Mozilla/5.0"
opener=AppURLopener()
#-------------
# Select if you'd like to do the crawling (get all relevant URLs)
# and/or the scraping (get the contents of those HTML files)
# The crawling produces jobregister_urls_2023.txt, which is needed for scraping.
# If you already have jobregister_urls_2023.txt, there's no need to crawl; the script
# will just read the contents of that file instead. This file is included in the
# github repo but of course can be updated.
docrawl=True
# The scraping loops through all the URLs in the aforementioned file and
# grabs the relevant field information. These are saved in .pkl files in
# batches of 100 job postings, because the connection is sometimes lost.
# This allows you to restart at a given point in the list without having to
# redo the entire thing.
doscrape=True
#------
######### GET A LIST OF ALL JOB POSTINGS
if docrawl:
    # Modifications from previous: manually input the years, months to collect.
    # Though only 12 months appear on https://jobregister.aas.org/jobs/archive,
    # the previous links are available. Individual job links seem to work back
    # through January 2017; January 2016's links don't work. But June 2016 works,
    # and I left off in May 2016, so perfect timing!
    yearlist=['2019','2020','2021','2022','2023']
    #archive='https://jobregister.aas.org/jobs/archive'
    #r=urllib.urlopen(archive).read()
    #archivesoup = BeautifulSoup(r)
    # To get all links
    #monthlist=archivesoup.find_all('a')
    #monthlist=[x.get('href') for x in monthlist]
    #monthlist=[x for x in monthlist if 'year=' in x]
    alllist=[]
    for year in yearlist:
        if year=='2019':
            monthlist=['06','07','08','09','10','11','12']
        elif year=='2023':
            monthlist=['01','02','03','04','05']
        else:
            monthlist=['01','02','03','04','05','06','07','08','09','10','11','12']
        for month in monthlist:
            print(year, month)
            print('https://jobregister.aas.org/jobs/archive/'+year+'/'+month)
            r=opener.open('https://jobregister.aas.org/jobs/archive/'+year+'/'+month).read()
            #r=urllib.request.urlopen('https://jobregister.aas.org/jobs/archive/'+year+'/'+month).read()
            monthsoup = BeautifulSoup(r)
            jlist=monthsoup.find_all('a')
            jlist=[x.get('href') for x in jlist]
            #jlist=[x for x in jlist if 'JobID=' in x]
            jlist=[x for x in filter(None,jlist) if '/ad/' in x]
            alllist.append(jlist)
    # Flatten List
    alllist=[item for sublist in alllist for item in sublist]
    # Remove Duplicates in List
    newlist=list(set(alllist))
    # Print to a file
    with open('jobregister_urls_2023.txt','w') as f:
        for item in newlist:
            f.write("%s\n" % item)
else:
    with open('jobregister_urls_2023.txt') as f:
        newlist=f.read().splitlines()
#-------------
# Scraping Setup
# The fields we want from the HTML
fields=['field-name-field-publish-date',
'field-name-field-archive-date',
'field-name-field-application-deadline',
'field-name-field-job-category',
'field-name-field-institution-company',
'field-name-field-institution-classification',
'field-type-text-with-summary', # The main text
'title',
'field-name-field-location-country',
'field-name-field-salary-min',# Add more 8/16/22
'field-name-field-salary-max',
'field-name-field-hourly-rate-min',
'field-name-field-hourly-rate-max',
'field-name-field-stipend',
'field-name-field-benefits',
'field-name-field-location-zip-postal'] # Added 11/27/22
# Shortened dictionary names for the above.
fields_dict=['post','archive','deadline','category','inst','instclass','text','title','country',
'salary-min','salary-max','hourly-min','hourly-max','stipend','benefits','postalcode']
# Start and end strings to find the contents of those fields above.
start=[r'datatype="xsd:dateTime" property="dc:date">',
'datatype="xsd:dateTime" property="dc:date">',
'datatype="xsd:dateTime" property="dc:date">',
'Job Category: </div><div class="field-items"><div class="field-item even">',
'Institution/Company: </div><div class="field-items"><div class="field-item even">',
'Institution Classification/Type: </div><div class="field-items"><div class="field-item even">',
'Job Announcement Text: </div><div class="field-items"><div class="field-item even" property="content:encoded">',
'<h1 class="title" id="page-title">',
#'Country:\xc2\xa0</div><div class="field-items"><div class="field-item even">']
'Country:\xa0</div><div class="field-items"><div class="field-item even">',
'Salary Min:\xa0</div><div class="field-items"><div class="field-item even">',
'Salary Max:\xa0</div><div class="field-items"><div class="field-item even">',
'Hourly Rate Min:\xa0</div><div class="field-items"><div class="field-item even">',
'Hourly Rate Max:\xa0</div><div class="field-items"><div class="field-item even">',
'Stipend:\xa0</div><div class="field-items"><div class="field-item even">',
'Included Benefits:\xa0</div><div class="field-items"><div class="field-item even">',
'Zip/Postal:\xa0</div><div class="field-items"><div class="field-item even">']
end=['</span>','</span>','</span>','</div></div></div>','</div></div></div>',
'</div></div></div>','</div></div></div>','</h1>','</div></div></div>', # Ends w/ country
'</div></div></div>','</div></div></div>','</div></div></div>','</div></div></div>',
'</div></div></div>','</div></div></div>','</div></div></div>']
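# How these markers are used: each field's value is just the text between its start[i]
# and end[i] strings in the ad's HTML. For example, with a made-up country snippet
# (hypothetical, for illustration only):
#   snippet='Country:\xa0</div><div class="field-items"><div class="field-item even">USA</div></div></div>'
#   re.search('%s(.*)%s' % (start[8],end[8]), snippet).group(1).strip()  # -> 'USA'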
######### SCRAPE ALL JOB ADS FOR INFO, SAVE PERIODICALLY IN GROUPS OF 100
# Sometimes the connection is lost; you can restart using startind as a multiple of 100
# For example, if the last .pkl file you have in your folder is 'alljobs_04500to04599.pkl',
# that means you should use startind=4600 and rerun to start from there.
if doscrape:
    data=[]
    startind=0 # Make it a multiple of 100.
    for u, url in enumerate(newlist[startind:]): # For each listing in the job register...
        r=opener.open('http://jobregister.aas.org/'+url[1:]).read() # open contents
        soup = BeautifulSoup(r)
        # Create dictionary to fill in
        d={}
        d['url']=url
        d['i']=u+startind # Index corresponding to our list of URLs
        for i,f in enumerate(fields):
            if f=='field-type-text-with-summary':
                try: # I am not sure why this is as such.
                    s=str(soup.find_all("div",class_=f)[0])
                    result=s
                except:
                    result=''
            elif f=='field-name-field-benefits':
                try: # I am not sure why this is as such.
                    s=str(soup.find_all("div",class_=f)[0])
                    result=s.partition(start[i])[2].partition(end[i])[0]
                except:
                    result=''
            elif f=='title':
                s=str(soup.find_all("h1",class_="title")[0])
                result=s.split('>')[1].split('<')[0]
            else:
                try:
                    s=str(soup.find_all("div",class_=f)[0])
                    result=re.search('%s(.*)%s' % (start[i],end[i]), s).group(1).strip()
                except:
                    result=''
            d[fields_dict[i]]=result
        # Add dictionary to our list
        data.append(d)
        # Output progress and save as we go, because sometimes we lose connection
        if u>0 and np.mod(u+startind+1,100)==0 or (u+startind==len(newlist)-1):
            print(u+startind+1,'of',len(newlist))
            # We're either in a group of 100, or at the end of our list.
            if (u+startind!=len(newlist)-1):
                ind1=u-99
            else:
                ind1=u-np.mod(len(newlist)-1,100) # Don't go back a full 99 if just at end of list.
            pfile='alljobs2023_'+str(data[ind1]['i']).zfill(5)+'to'+str(u+startind).zfill(5)+'.pkl'
            print(pfile)
            if (u+startind!=len(newlist)-1): # In retrospect, this is not necessary to split up.
                pickle.dump(data[ind1:u+1],open(pfile,"wb"))
            else:
                pickle.dump(data[ind1:],open(pfile,"wb"))
# Read in and combine all our pickles.
pickles=glob.glob('./alljobs2023_*to*.pkl')
# Put them in order
pickles.sort()
data=[]
for i, f in enumerate(pickles):
    tmp=pickle.load(open(f,"rb"))
    inds=[x['i'] for x in tmp]
    data.append(tmp)
    print(i,f,np.min(inds),np.max(inds),len(tmp))
    #print len(tmp),len(data),[len(sublist) for sublist in data]
# Flatten
data=[item for sublist in data for item in sublist]
# Make this a pandas dataframe
df=pd.DataFrame(data)
print(len(df),len(set(df.url)),len(set(newlist)))
# Convert posting date from string to datetime.
df.post=pd.to_datetime(df.post)
df['year']=df.post.dt.year
df['month']=df.post.dt.month
# But, what we really want is academic year, to be comparable to e.g. Metcalfe 2007.
# For academic year X, we want June X through May X+1:
# academic year 2003 is June 2003-May 2004, so January-May (inclusive) map to the previous year.
df['acyear']=df['year']
df.loc[df['month']<=5,'acyear']=df['year']-1
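# e.g., a posting from March 2020 (year=2020, month=3) gets acyear 2019,
# i.e. the June 2019 - May 2020 academic year.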
# Save the contents of this dataframe, except the job text, which is large (file size is ~ 10x as large)
# Also don't save the deadline or archive date. Just the posting year and month.
df.to_csv('./jobregister_table_2023.csv',
columns=['i','year','month','acyear','title','category','inst','instclass','url','country','postalcode'])
# Added 8/16/22 with all the rest of the fields.
df.to_csv('./jobregister_table_2023_extrafields.csv')