# Census for Academic Jobs

## Setup

In [112]:
# need to install wikia api
!pip install wikia tqdm pandas bs4 -q

In [1]:
import os,sys
import pandas as pd
import wikia
from wikia import WikiaError
from tqdm import tqdm
from datetime import datetime
import bs4

In [2]:
# some constants
WIKI_NAME = 'academicjobs'
MAIN_PAGE_NAME = 'Academic_Jobs_Wiki'

## Find relevant pages

### Get the top level discipline pages

In [3]:
# what year is it
now=datetime.now()
the_year = now.year
the_year

2020

In [4]:
# earliest year on the main page is 2007, but the formatting is standardized around 2011
years = list(range(2011, the_year + 1))
years

[2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

In [5]:
# Change for history?
disc_name = 'Rhetoric/Composition'

In [6]:
# get pages for discipline 
disc_pages=[f'{disc_name} {year}' for year in years]
print(disc_pages,'\n')

for dp in disc_pages:
    print(f'[{dp.split()[-1].split("-")[0]}](https://academicjobs.wikia.org/wiki/{dp.replace(" ","_")})', end=' | ')

['Rhetoric/Composition 2011', 'Rhetoric/Composition 2012', 'Rhetoric/Composition 2013', 'Rhetoric/Composition 2014', 'Rhetoric/Composition 2015', 'Rhetoric/Composition 2016', 'Rhetoric/Composition 2017', 'Rhetoric/Composition 2018', 'Rhetoric/Composition 2019', 'Rhetoric/Composition 2020'] 

[2011](https://academicjobs.wikia.org/wiki/Rhetoric/Composition_2011) | [2012](https://academicjobs.wikia.org/wiki/Rhetoric/Composition_2012) | [2013](https://academicjobs.wikia.org/wiki/Rhetoric/Composition_2013) | [2014](https://academicjobs.wikia.org/wiki/Rhetoric/Composition_2014) | [2015](https://academicjobs.wikia.org/wiki/Rhetoric/Composition_2015) | [2016](https://academicjobs.wikia.org/wiki/Rhetoric/Composition_2016) | [2017](https://academicjobs.wikia.org/wiki/Rhetoric/Composition_2017) | [2018](https://academicjobs.wikia.org/wiki/Rhetoric/Composition_2018) | [2019](https://academicjobs.wikia.org/wiki/Rhetoric/Composition_2019) | [2020](https://academicjobs.wikia.org/wiki/Rhetoric/Composi

In [9]:
disc_pages

['Rhetoric/Composition 2011',
 'Rhetoric/Composition 2012',
 'Rhetoric/Composition 2013',
 'Rhetoric/Composition 2014',
 'Rhetoric/Composition 2015',
 'Rhetoric/Composition 2016',
 'Rhetoric/Composition 2017',
 'Rhetoric/Composition 2018',
 'Rhetoric/Composition 2019',
 'Rhetoric/Composition 2020']

In [10]:
years

[2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

In [11]:
list(zip(disc_pages,years))

[('Rhetoric/Composition 2011', 2011),
 ('Rhetoric/Composition 2012', 2012),
 ('Rhetoric/Composition 2013', 2013),
 ('Rhetoric/Composition 2014', 2014),
 ('Rhetoric/Composition 2015', 2015),
 ('Rhetoric/Composition 2016', 2016),
 ('Rhetoric/Composition 2017', 2017),
 ('Rhetoric/Composition 2018', 2018),
 ('Rhetoric/Composition 2019', 2019),
 ('Rhetoric/Composition 2020', 2020)]

### Get links from discipline pages

In [12]:
def get_links_from_disc_page(disc_page_name, year):
    # get page from wikia
    page=wikia.page(WIKI_NAME, disc_page_name)
    
    # get html
    html = page.html()
    
    # read html
    dom = bs4.BeautifulSoup(html)
    
    # get links
    links=[]
    
    for link in dom('a'):
        try:
            href=link['href']
        except KeyError:
            continue
        
        #If not a wiki page, skip it
        if not '/wiki/' in href:
            continue
        
        wikilink = href.split('/wiki/')[1]
        
        #If special page queries present, skip it
        if ':' in wikilink or '?' in wikilink:
            continue
        
        if wikilink == disc_page_name.replace(' ','_'):
            continue
        
        if year and (not str(year) in wikilink and not str(year+1) in wikilink) or (str(int(year)-1) in wikilink):
            continue
            
        if not wikilink in links:
            links+=[wikilink]
    
    return links

In [15]:
# Get all links

def strip_year_from_page(page_name):
    pdat=page_name.split('_')
    return ' '.join(pdat[:-1])

def get_all_links():
    link_ld=[]
    for dpage,dyear in tqdm(list(zip(disc_pages,years))):
        links = get_links_from_disc_page(dpage, year=dyear)
        for link in links:
            # Fix year label issue with R/C pages: 2011 year being labeled as 2012, etc.
            if 'Salaries' not in link and 'New_Media' not in link and 'Queer' not in link and 'Environmental' not in link and 'Rhetoric' in link:
                link = link[0:21]+str(dyear)                
                link_d={'disc_page':dpage, 'year':dyear, 'page':link, 'page_group':strip_year_from_page(link)}
                link_ld.append(link_d)
                
            link_d={'disc_page':dpage, 'year':dyear, 'page':link, 'page_group':strip_year_from_page(link)}
            link_ld.append(link_d)
            
    # Manual fix for r/c 2020
    link_ld.append({
        'disc_page': 'Rhetoric/Composition 2020', 
        'year': 2020, 
        'page': 'Rhetoric/Composition_2020', 
        'page_group': 'Rhetoric/Composition'
    })
        
    return link_ld

In [16]:
LINK_LD = get_all_links()
len(LINK_LD)

100%|██████████| 10/10 [00:09<00:00,  1.10it/s]


126

### Clean links

In [26]:
df_pages=pd.DataFrame(LINK_LD)
df_pages

Unnamed: 0,disc_page,year,page,page_group
0,Rhetoric/Composition 2011,2011,Rhetoric/Composition_Positions_and_Salaries_20...,Rhetoric/Composition Positions and Salaries
1,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition
2,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition
3,Rhetoric/Composition 2011,2011,CreativeWriting_2011,CreativeWriting
4,Rhetoric/Composition 2011,2011,Children%27s_2011,Children%27s
...,...,...,...,...
121,Rhetoric/Composition 2020,2020,Comparative_Literature_2020-2021,Comparative Literature
122,Rhetoric/Composition 2020,2020,Humanities_and_Social_Sciences_Postdocs_2020-2021,Humanities and Social Sciences Postdocs
123,Rhetoric/Composition 2020,2020,German_2020-2021,German
124,Rhetoric/Composition 2020,2020,New_Media_and_Digital_Humanities_2020,New Media and Digital Humanities


## Check different years for discrepancies in pages/groups across years

In [27]:
df_pages[df_pages.year == 2019]

Unnamed: 0,disc_page,year,page,page_group
103,Rhetoric/Composition 2019,2019,Spanish_and_Portuguese_2020-2021,Spanish and Portuguese
104,Rhetoric/Composition 2019,2019,Communication_and_Media_Studies_2020-2021,Communication and Media Studies
105,Rhetoric/Composition 2019,2019,Social_Work_2020-2021,Social Work
106,Rhetoric/Composition 2019,2019,Humanities_and_Social_Sciences_Postdocs_2020-2021,Humanities and Social Sciences Postdocs
107,Rhetoric/Composition 2019,2019,German_2020-2021,German
108,Rhetoric/Composition 2019,2019,Philosophy_2020-2021,Philosophy
109,Rhetoric/Composition 2019,2019,Rhetoric/Composition_2019,Rhetoric/Composition
110,Rhetoric/Composition 2019,2019,Rhetoric/Composition_2019,Rhetoric/Composition
111,Rhetoric/Composition 2019,2019,New_Media_and_Digital_Humanities_2019,New Media and Digital Humanities
112,Rhetoric/Composition 2019,2019,Community_Colleges_2019,Community Colleges


## Drop any duplicate pages

In [28]:
len(df_pages)

126

In [29]:
df_pages = df_pages.drop_duplicates('page',keep='last')
print(len(df_pages))

112


In [30]:
# Testing
df_pages[df_pages.page.str.contains('Rhetoric')]

Unnamed: 0,disc_page,year,page,page_group
0,Rhetoric/Composition 2011,2011,Rhetoric/Composition_Positions_and_Salaries_20...,Rhetoric/Composition Positions and Salaries
2,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition
11,Rhetoric/Composition 2012,2012,Rhetoric/Composition_2012,Rhetoric/Composition
12,Rhetoric/Composition 2012,2012,Rhetoric/Composition_Salaries_2012-13,Rhetoric/Composition Salaries
23,Rhetoric/Composition 2013,2013,Rhetoric/Composition_2013,Rhetoric/Composition
26,Rhetoric/Composition 2013,2013,Rhetoric/Composition_Salaries_for_AY_2013-14,Rhetoric/Composition Salaries for AY
33,Rhetoric/Composition 2014,2014,Rhetoric/Composition_2014,Rhetoric/Composition
36,Rhetoric/Composition 2014,2014,Rhetoric/Composition_Salaries_for_AY_2014-15,Rhetoric/Composition Salaries for AY
42,Rhetoric/Composition 2014,2014,Environmental_2014#Christopher_Newport_Univers...,Environmental 2014#Christopher Newport Univers...
43,Rhetoric/Composition 2014,2014,New_Media_and_Digital_Humanities_2014#Clemson_...,New Media and Digital Humanities 2014#Clemson ...


## Print pages for readme

In [31]:
for pg,pgdf in sorted(df_pages.groupby('page_group')):
    print(f'''* {pg.replace("%26",'&').replace('%27',"'")}:''',end=' ')
    yrs=[]
    for dp in sorted(pgdf.page):
        yrs+=[f'''[{dp.split("_")[-1].split("-")[0]}](https://academicjobs.wikia.org/wiki/{dp.replace(" ","_")})''']
    print(' | '.join(yrs))

* 20-21 c. American: [2018](https://academicjobs.wikia.org/wiki/20-21_c._American_2018)
* African American: [2018](https://academicjobs.wikia.org/wiki/African_American_2018)
* American Open: [2015](https://academicjobs.wikia.org/wiki/American_Open_2015) | [2017](https://academicjobs.wikia.org/wiki/American_Open_2017) | [2018](https://academicjobs.wikia.org/wiki/American_Open_2018)
* American Studies: [2012](https://academicjobs.wikia.org/wiki/American_Studies_2012) | [2013](https://academicjobs.wikia.org/wiki/American_Studies_2013) | [2017](https://academicjobs.wikia.org/wiki/American_Studies_2017)
* Anglophone: [2016](https://academicjobs.wikia.org/wiki/Anglophone_2016) | [2017](https://academicjobs.wikia.org/wiki/Anglophone_2017) | [2019](https://academicjobs.wikia.org/wiki/Anglophone_2019)
* British Open: [2016](https://academicjobs.wikia.org/wiki/British_Open_2016)
* Children's: [2011](https://academicjobs.wikia.org/wiki/Children%27s_2011) | [2012](https://academicjobs.wikia.org/wi

## Step 3: Processing pages

In [32]:
def decide_if_school(title,content):
    title=str(title) + ' ' + str(content)
    #if title in not_unis: return 'n'
    if 'College' in title: return 'y'
    if 'Universit' in title: return 'y'
    if 'UC ' in title: return 'y'
    if 'Department ' in title: return 'y'
    if 'Faculty ' in title: return 'y'
    if 'Demographics' in title: return 'n'
    if 'State' in title: return 'y'
    if any([ (w.startswith('(') and w.endswith(')') and w.upper()==w)   for w in title.split()]): return 'y'
    if '<b>' in title: return 'n'
    return ''

In [33]:
def decide_if_tt(title,ad,nowtt):
    if ' TT ' in title: return 'y'
    if 'Lecturer' in title.split() or 'Visiting Assistant Professor' in title: return 'n'
    if nowtt is not None: return 'y' if nowtt else 'n'

    if "Visiting Assistant Professor" in ad: return 'n'
    if "tenure-track" in ad.lower() or "tenure Track" in ad.lower(): return "y"
    if "Assistant Professor" in ad or "Associate Professor" in ad or "Full Professor" in ad: return "y"
    
    return ''

In [34]:
def decide_job_type(IsTT,page_name):
    if IsTT=='y': return 'TT'
    if 'Postdoc' in page_name: return 'Postdoc'
    if IsTT=='n': return 'Non-TT'
    return 'Unknown'

In [35]:
bad_domains = ['bit.ly','fandom','youtube']

def parse_section(section_dom,section_title,now_isTT,page_name):
    section_content=section_dom.text.replace('Edit\n','')
    #print('\n'*5)
    
    from urllib.parse import urlsplit
    
    links = []
    for a in section_dom('a'):
        try:
            href=a['href']
        except KeyError:
            continue
            
        if '/wiki/' in href: continue
        urldat=urlsplit(href)
        if not urldat.path: continue
        #link=urldat.netloc + urldat.path
        link=href
        
        if any([domain in link for domain in bad_domains]): continue
        links+=[link]
    
    if not section_content: return
    
    # save data for this job
    row = {}
    row['section_content'] = section_content.replace('[edit | edit source]','').strip().replace('\n\n','\n').replace('\n\n','\n').replace('\n\n','\n')
    row['section_links'] = ' | '.join(links)
    row['section_title'] = bs4.BeautifulSoup(section_title).text
    row['IsTT'] = decide_if_tt(row['section_title'], row['section_content'], now_isTT)
    row['IsUni'] = decide_if_school(section_title,row['section_content']) # if row['IsTT']!='y' else 'y'
    row['JobType'] = decide_job_type(row['IsTT'], page_name)
    row['JobID'] = row['section_links'] if row['section_links'] else row['section_title']
    return row


def process_page(page_name):
    # get page
    page_name_q = page_name.replace('%26','&').replace('%27',"'")
    page_name_safe = page_name_q.replace('/','_')
    cachefn=f'cache/{page_name_safe}.html'
    if not os.path.exists(cachefn):
        page = wikia.page(WIKI_NAME, page_name_q)
        # get html
        html=page.html()
    else:
#         print('Using cache')
        with open(cachefn) as f: html=f.read()
    
    
    # start data
    data = []
    now_isTT=None
    for section in html.split('mw-headline')[1:]:
        section_title=section.split('</span>')[0].split('">')[-1].strip()
        if 'RECENT ACTIVITY' in section_title: continue
        if 'Demographics' in section_title: continue
        if 'Instructions' in section_title: continue
        if 'Word on the Street' in section_title: continue
        if 'Tenure-Track Positions' in section_title:
            now_isTT=True
            continue
        if 'Visiting Positions' in section_title:
            now_isTT=False
            continue
        if section_title.startswith('Humanities and Social Sciences Postdocs'):
            #now_isTT=False
            continue
        
        if page_name in {'Spanish_and_Portuguese_2020-2021','Film_Studies_2020-2021','French_%26_Francophone_2020-2021'}:
            # these use a different format for no reason!
            for ol in bs4.BeautifulSoup(section)('ol'):
                for p in ol('li'):
                    if '<b>' in str(p):
                        section_title_p=list(p('b'))[0].text
                        row=parse_section(p,section_title_p,None,page_name)
                        data.append(row)
        elif section_title.startswith('Jobs with 2020') or section_title.startswith('Jobs with 2021'):   # 2020 changed format!?
            for p in bs4.BeautifulSoup(section)('p'):
                if '<b>' in str(p):
                    section_title_p=list(p('b'))[0].text
                    row=parse_section(p,section_title_p,None,page_name)
                    data.append(row)
        else:
            section_dom=bs4.BeautifulSoup(section.split('</span>',1)[-1])
            row=parse_section(section_dom,section_title,now_isTT,page_name)
            data.append(row)
    return [d for d in data if d]

## Step 4: Gathering all pages' data

In [36]:
# Get all pages' data!!!!
def get_all_data():
    data_ld=[]
    all_pages=list(df_pages.page)
    for i,page in enumerate(tqdm(sorted(all_pages))):
        try:
            page_data = process_page(page)
        except WikiaError as e:
            continue
        if not page_data: continue
        #datadx={**page_data, **{'page':page}}
        for dx in page_data:
            if not dx: continue
            dx['page']=page
            data_ld.append(dx)
    return pd.DataFrame(data_ld)

In [37]:
# Big data crunching step!
df_data=get_all_data()
df_data

100%|██████████| 112/112 [02:13<00:00,  1.19s/it]


Unnamed: 0,section_content,section_links,section_title,IsTT,IsUni,JobType,JobID,page
0,Adelphi University is seeking applicants for a...,http://www.adelphi.edu/positions/faculty,Adelphi University (NY) - TT Assistant Profess...,y,y,TT,http://www.adelphi.edu/positions/faculty,20-21_c._American_2018
1,Two Tenure-Track Assistant Professor Positions...,https://apply.interfolio.com/45917,Beloit College (WI) - TT Asst. Professor - 20t...,y,y,TT,https://apply.interfolio.com/45917,20-21_c._American_2018
2,Bucknell University’s Department of Women’s an...,http://careers.bucknell.edu/cw/en-us/job/49379...,Bucknell University (PA) - TT Asst. Professor ...,y,y,TT,http://careers.bucknell.edu/cw/en-us/job/49379...,20-21_c._American_2018
3,"The English Department, within Wilkinson Colle...",https://www.indeed.com/viewjob?jk=0988b9d4faed...,Chapman University (CA) - TT Asst. Professor i...,y,y,TT,https://www.indeed.com/viewjob?jk=0988b9d4faed...,20-21_c._American_2018
4,Posting ID: 16027\nCategory: Tenure-track\nDep...,http://www.claremontmckenna.edu/lit/ | https:/...,Claremont McKenna College (CA) - TT Assistant ...,y,y,TT,http://www.claremontmckenna.edu/lit/ | https:/...,20-21_c._American_2018
...,...,...,...,...,...,...,...,...
9309,The Department of Bicultural-Bilingual Studies...,,University of Texas San Antonio,,y,Unknown,University of Texas San Antonio,TESOL_/_Applied_Linguistics_2013
9310,The Department of Teacher Education and Higher...,,University of North Carolina at Greensboro,y,y,TT,University of North Carolina at Greensboro,TESOL_/_Applied_Linguistics_2013
9311,The Department of Language Studies at the Univ...,http://uoft.me/how-to-apply | mailto:langstudi...,University of Toronto Mississauga,y,y,TT,http://uoft.me/how-to-apply | mailto:langstudi...,TESOL_/_Applied_Linguistics_2013
9312,See full post at American Studies 2014\n<span ...,,College of William and Mary - Associate Prof. ...,,y,Unknown,College of William and Mary - Associate Prof. ...,Theory_2014


## Step 5: Postprocessing

In [38]:
df = df_pages.merge(df_data,on='page') #.merge(df_aliases,on='page_group')
df[:5]

Unnamed: 0,disc_page,year,page,page_group,section_content,section_links,section_title,IsTT,IsUni,JobType,JobID
0,Rhetoric/Composition 2011,2011,Rhetoric/Composition_Positions_and_Salaries_20...,Rhetoric/Composition Positions and Salaries,"Assistant Professor, no tenure institution, So...",,Rhet/Comp Positions and Salaries 2011-12,y,,TT,Rhet/Comp Positions and Salaries 2011-12
1,Rhetoric/Composition 2011,2011,Rhetoric/Composition_Positions_and_Salaries_20...,Rhetoric/Composition Positions and Salaries,No one seems to be including gender in his/her...,https://www.wikia.org/ | https://www.wikia.org...,Comments and Questions,,,Unknown,https://www.wikia.org/ | https://www.wikia.org...
2,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition,"<span class=""",,Rhetoric/Composition Jobs 2011,,n,Unknown,Rhetoric/Composition Jobs 2011
3,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition,Link - first posting on page\nSubfield/descrip...,http://www.acu.edu/academics/provost/positions...,"Abilene Christian University (Abilene, TX)",y,y,TT,http://www.acu.edu/academics/provost/positions...
4,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition,"LINK\n""Alfred State College invites applicatio...",http://www.higheredjobs.com/faculty/details.cf...,Alfred State College (NY),y,y,TT,http://www.higheredjobs.com/faculty/details.cf...


### Sanity checks

In [39]:
# df[(df.page_group=='Rhetoric/Composition') & (df.year==2020)]
df.page_group

0       Rhetoric/Composition Positions and Salaries
1       Rhetoric/Composition Positions and Salaries
2                              Rhetoric/Composition
3                              Rhetoric/Composition
4                              Rhetoric/Composition
                           ...                     
9309                           Rhetoric/Composition
9310                           Rhetoric/Composition
9311                           Rhetoric/Composition
9312                           Rhetoric/Composition
9313                           Rhetoric/Composition
Name: page_group, Length: 9314, dtype: object

In [40]:
df = df[~df.page_group.str.contains('Salaries')]
df

Unnamed: 0,disc_page,year,page,page_group,section_content,section_links,section_title,IsTT,IsUni,JobType,JobID
2,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition,"<span class=""",,Rhetoric/Composition Jobs 2011,,n,Unknown,Rhetoric/Composition Jobs 2011
3,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition,Link - first posting on page\nSubfield/descrip...,http://www.acu.edu/academics/provost/positions...,"Abilene Christian University (Abilene, TX)",y,y,TT,http://www.acu.edu/academics/provost/positions...
4,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition,"LINK\n""Alfred State College invites applicatio...",http://www.higheredjobs.com/faculty/details.cf...,Alfred State College (NY),y,y,TT,http://www.higheredjobs.com/faculty/details.cf...
5,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition,From MLA JIL (11/05)\nThe Department of Englis...,mailto:as_dean@aub.edu.lb | http://www.aub.edu...,American University of Beirut,,y,Unknown,mailto:as_dean@aub.edu.lb | http://www.aub.edu...
6,Rhetoric/Composition 2011,2011,Rhetoric/Composition_2011,Rhetoric/Composition,Link\nSubfield/description: 'The Department of...,http://www.higheredjobs.com/search/details.cfm...,"Angelo State University (San Angelo, TX)",y,y,TT,http://www.higheredjobs.com/search/details.cfm...
...,...,...,...,...,...,...,...,...,...,...,...
9309,Rhetoric/Composition 2020,2020,Rhetoric/Composition_2020,Rhetoric/Composition,The Department of English invites applications...,https://www.pacificu.edu/academics/academic-re...,Pacific University (USA:OR) - Visiting Assista...,n,y,Non-TT,https://www.pacificu.edu/academics/academic-re...
9310,Rhetoric/Composition 2020,2020,Rhetoric/Composition_2020,Rhetoric/Composition,The Department of English and Writing within t...,,University of Tampa (USA:FL) - Professor of In...,n,y,Non-TT,University of Tampa (USA:FL) - Professor of In...
9311,Rhetoric/Composition 2020,2020,Rhetoric/Composition_2020,Rhetoric/Composition,Location: Multiple Countries\nType: SeasonalMi...,,"Field Instructor, Rhetoric And Composition",n,y,Non-TT,"Field Instructor, Rhetoric And Composition"
9312,Rhetoric/Composition 2020,2020,Rhetoric/Composition_2020,Rhetoric/Composition,The Department of English Language and Literat...,https://jobs.weber.edu/postings/9890,Weber State University (USA:UT) - Visiting Ass...,n,y,Non-TT,https://jobs.weber.edu/postings/9890


In [41]:
df.IsUni.value_counts()

y    8876
      277
n     135
Name: IsUni, dtype: int64

In [42]:
df[df.IsUni==''].section_title

76                                                 M.I.T.
220                                  Job Listings 2010-11
221                                      Poetry Positions
405                                 AWP Interviews - Open
408                             Creative Writing Salaries
                              ...                        
9193                             Other Germanic Languages
9227                                               Other:
9260    Preliminary interview scheduled (please specif...
9262                                       Qualifications
9264                                       Qualifications
Name: section_title, Length: 277, dtype: object

In [43]:
df.IsTT.value_counts()

y    6929
     1197
n    1162
Name: IsTT, dtype: int64

In [44]:
df.JobType.value_counts()

TT         6929
Non-TT     1150
Unknown    1088
Postdoc     121
Name: JobType, dtype: int64

In [45]:
date=f'{now.year}-{now.month}-{now.day}'
ofn=f'data.jobcensus.rc.wiki.{date}.csv'
df.to_csv(ofn,index=False)