# clean rankings
This code converts the downloaded HTML pages of the subject-specific QS World University Rankings into clean, tabular data.

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import os
from tqdm import tqdm

In [None]:
# List everything in the 'rankings' directory. os.listdir returns entries in
# arbitrary order, so sort them to make the processing order reproducible
# from run to run.
files = sorted(os.listdir('rankings'))
# Last expression of the cell: shows how many directory entries were found.
len(files)

In [None]:
# Parse every saved ranking page into a list of dicts, one dict per table row.
data = []
for f in tqdm(files):
    # only the saved .html pages are parsed; skip anything else in the folder
    if not f.endswith('.html'):
        continue

    # Read the raw bytes and let BeautifulSoup detect the page's encoding,
    # rather than decoding with the platform's default text encoding.
    with open(os.path.join('rankings', f), 'rb') as o:
        soup = BeautifulSoup(o.read(), 'html.parser')

    # get field name: the second '_'-separated part of the file name
    # (assumes names look like <prefix>_<field>_... -- TODO confirm)
    field = f.split('_')[1]

    # select the right table; find() returns None when the element is absent,
    # which is treated the same as a table without rows
    table = soup.find(id='indicator-tab')
    rows = [] if table is None else table.find_all("div", class_="row ind-row")

    # check for pages that didn't load before they were saved. You'll want to redownload these
    if not rows:
        print(field, "has no rows for one page")

    # iterate through the rows of the table
    for r in rows:
        tmp = dict()
        try:
            tmp['name'] = r.find('a', class_='uni-link').text.strip()
            tmp['field'] = field
            tmp['rank'] = r.find('div', class_='_univ-rank').text.strip()
            tmp['overall'] = r.find('div', class_='overall-score-span-ind overall').text.strip()
            tmp['h_index'] = r.find('div', class_='overall-score-span-ind ind_69').text.strip()
            tmp['cites_per_paper'] = r.find('div', class_='overall-score-span-ind ind_70').text.strip()
            tmp['academic_rep'] = r.find('div', class_='overall-score-span-ind ind_76').text.strip()
            tmp['employer_rep'] = r.find('div', class_='overall-score-span-ind ind_77').text.strip()
        except AttributeError:
            # find() returned None for an expected element, i.e. the row's HTML
            # is garbled or incomplete; report which file it came from
            print('row failed in', f)

        # the row is kept even when it failed: it holds whatever values were
        # read before the missing element
        data.append(tmp)

In [None]:
# make a dataframe
# Turn the collected rows into a DataFrame. Note that this rebinds `data`
# from the list built above to the resulting DataFrame.
data = pd.DataFrame(data)
# Last expression of the cell: the (rows, columns) shape of the table.
data.shape

In [None]:
# Preview the first rows to sanity-check the parsed columns.
data.head()

In [None]:
# make the numbers into numbers
# Turn the score columns from scraped text into numbers.
ncols = ['overall', 'h_index', 'cites_per_paper', 'academic_rep', 'employer_rep']
# Convert all score columns in one step; every other column is left untouched
# and the column order stays the same.
data = data.assign(
    **{col: pd.to_numeric(data[col], downcast='float') for col in ncols}
)
# Last expression of the cell: the resulting dtype of every column.
data.dtypes

In [None]:
# Save the result as a tab-separated file (spreadsheet tools such as Excel can open these).
# index=False leaves the DataFrame's row index out of the file.
data.to_csv('uni_rank_by_field.tsv', sep='\t', index=False)