# Homework 2 - Data from the web

### Useful Imports

In [4]:
# Import libraries
%matplotlib inline
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os.path
import sys
# Seaborn: use the 'notebook' scaling context for every figure drawn below.
sns.set_context(context='notebook')

# Mute pandas' chained-assignment warning (default is 'warn'), which fires when
# assigning into a copy of a DataFrame slice.
pd.set_option('mode.chained_assignment', None)

## 1 - Top Universities

In particular, extract the following fields for each university: name, rank, country and region, number of faculty members (international and total) and number of students (international and total)

In [5]:
# Base of the site; the per-school 'url' suffixes are appended to it when
# fetching the details pages.
root_url = "https://www.topuniversities.com/"

# Ranking feed behind the topuniversities.com table. The response body is
# decoded as JSON later on (r.json()['data']).
r = requests.get(
    'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1508057903494'
)

In [6]:
def progress(count, total, suffix=''):
    """ Shows the progress of a given action as a one-line text bar on stdout.

    The line is terminated with a carriage return (no newline), so each call
    overwrites the bar drawn by the previous one.

    @params:
    - count : the current count of done operations
    - total : the total number of operations to do
    - suffix : a message printed after the progress bar
    """

    bar_len = 60

    if total <= 0:
        # Nothing to do: show a complete bar instead of dividing by zero.
        count, total = 1, 1
    else:
        # Keep the bar inside its 60 cells even if the caller overshoots
        # (count > total) or passes a negative count.
        count = min(max(count, 0), total)

    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '#' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ... %s\r' % (bar, percents, '%', suffix))
    sys.stdout.flush()  # force the partial line out, since no newline is written

In [7]:
def get_number(enclosing_class_name,url,det_soup):
    """Extracts the raw text of the number stored under a given div of a details page.

    Returns the string 'NAN' (and prints a message) when the expected
    div structure is not present on the page.

    @params:
    - enclosing_class_name : the class of the div into which the number is contained.
    - url : the suffix url of the school (only used in the message printed on failure)
    - det_soup : the soup of the details page for the given school

    """
    try:
        # A missing div makes find() return None, so the next attribute access
        # raises AttributeError and we fall through to the placeholder.
        enclosing_div = det_soup.find("div", class_=enclosing_class_name)
        number_div = enclosing_div.find("div", class_="number")
        return number_div.text.strip('\n')
    except AttributeError:
        print("Couldn't find '{}' for {}".format(enclosing_class_name, url))
        return 'NAN'

def get_details(url):
    """Fetches the details page of a university and reads its staff/student numbers.

    @params:
    - url : the suffix url of the university of interest (appended to root_url)

    @returns:
    - a tuple (staff_total, staff_inter, student_total, student_inter) holding
      the values returned by get_number for each field

    """
    page = requests.get(root_url + url)
    det_soup = BeautifulSoup(page.text, 'lxml')

    # Classes of the enclosing divs, listed in the order of the returned tuple.
    wanted_classes = ('total faculty', 'inter faculty', 'total student', 'total inter')

    return tuple(get_number(css_class, url, det_soup) for css_class in wanted_classes)

In [8]:
def get_ranking_dataframe(path_to_dump, limit=200):
    """Builds the ranking DataFrame, or loads it from a pickle if one exists.

    When no file exists at path_to_dump, the first `limit` entries of the
    ranking feed (the module-level response `r`) are read, each school's
    details page is scraped through get_details, and the resulting frame
    is pickled to path_to_dump before being returned.

    @params:
    - path_to_dump : path of the pickle file used as a cache
    - limit : how many universities to keep from the top of the ranking (default 200)

    @returns:
    - a DataFrame indexed by 'Rank' (tie prefix '=' removed) with the columns
      School Name, Total Staff, International Staff, Total Student,
      International Student, Country, Region. The four count columns are
      numeric; values that cannot be parsed become NaN.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load a dump
    # that this notebook produced itself.
    if os.path.isfile(path_to_dump):
        return pd.read_pickle(path_to_dump)

    # Slice once instead of walking the whole feed and skipping every entry
    # past the first `limit`.
    entries = r.json()['data'][:limit]
    n_entries = len(entries)

    records = []
    for done, uni_dict in enumerate(entries, start=1):
        title = uni_dict['title']
        # In case of tie, the rank is prefixed with an '=', we get rid of it.
        rank = uni_dict['rank_display'].strip('=')
        country = uni_dict['country']
        region = uni_dict['region']
        url = uni_dict['url']

        # We then get the details from the details page
        staff_total, staff_inter, student_total, student_inter = get_details(url)
        progress(done, n_entries, 'Scraping the info')

        records.append({'School Name': title,
                        'Rank': rank,
                        'Country': country,
                        'Region': region,
                        'Total Staff': staff_total,
                        'International Staff': staff_inter,
                        'Total Student': student_total,
                        'International Student': student_inter
                       })

    # One row per school, with Rank as the index.
    df = pd.DataFrame(records).set_index('Rank')

    # The scraped counts are strings such as '2,982' (or a placeholder when
    # missing): drop the thousands separator and coerce anything unparseable to NaN.
    cols = ['Total Staff', 'International Staff', 'Total Student', 'International Student']
    df[cols] = df[cols].apply(lambda x: pd.to_numeric(x.astype(str)
                                                       .str.replace(',', ''), errors='coerce', downcast='integer'))

    # We return the dataframe ordered as we please
    df = df[['School Name', 'Total Staff', 'International Staff', 'Total Student', 'International Student', 'Country', 'Region']]

    # We serialize it using pickle so that we do not have to download it again
    df.to_pickle(path_to_dump)
    return df

In [9]:
# Load the ranking from the local pickle if it exists; otherwise scrape it
# and write the pickle for next time.
df = get_ranking_dataframe(path_to_dump='topuniversities_backup')
df.head(n=5)

Unnamed: 0_level_0,School Name,Total Staff,International Staff,Total Student,International Student,Country,Region
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Massachusetts Institute of Technology (MIT),2982,1679.0,11067,3717,United States,North America
2,Stanford University,4285,2042.0,15878,3611,United States,North America
3,Harvard University,4350,1311.0,22429,5266,United States,North America
4,California Institute of Technology (Caltech),953,350.0,2255,647,United States,North America
5,University of Cambridge,5490,2278.0,18770,6699,United Kingdom,Europe


### 1.a - Best university in terms of ratio faculty members over students

To determine which university is best in terms of the ratio of faculty members to students, we create a new column called ```Fac/stud ratio``` and sort the rows by it in descending order. (Since we were asked for the best universities, we display the top of this ranking.)

In [85]:
# Faculty members per student, computed row-wise for every university.
df['Fac/stud ratio'] = df['Total Staff'].div(df['Total Student'])

# Highest ratio first.
sorted_by_facStudRatio = df.sort_values(by='Fac/stud ratio', ascending=False)
sorted_by_facStudRatio.head(n=5)

Unnamed: 0_level_0,School Name,Total Staff,International Staff,Total Student,International Student,Country,Region,International ratio,Fac/stud ratio
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4,California Institute of Technology (Caltech),953,350.0,2255,647,United States,North America,0.286918,0.422616
16,Yale University,4940,1708.0,12402,2469,United States,North America,0.199081,0.398323
6,University of Oxford,6750,2964.0,19720,7353,United Kingdom,Europe,0.37287,0.342292
5,University of Cambridge,5490,2278.0,18770,6699,United Kingdom,Europe,0.356899,0.292488
17,Johns Hopkins University,4462,1061.0,16146,4105,United States,North America,0.254243,0.276353


### 1.b - Best university in terms of ratio of international students

We use exactly the same strategy as in 1.a.

In [87]:
# Share of international students among all students, per university.
df['International ratio'] = df['International Student'].div(df['Total Student'])

# Highest share first.
sorted_by_internationalRatio = df.sort_values(by='International ratio', ascending=False)
sorted_by_internationalRatio.head(n=5)

Unnamed: 0_level_0,School Name,Total Staff,International Staff,Total Student,International Student,Country,Region,International ratio,Fac/stud ratio
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
35,London School of Economics and Political Scien...,1088,687.0,9760,6748,United Kingdom,Europe,0.691393,0.111475
12,Ecole Polytechnique Fédérale de Lausanne (EPFL),1695,1300.0,10343,5896,Switzerland,Europe,0.570047,0.163879
8,Imperial College London,3930,2071.0,16090,8746,United Kingdom,Europe,0.543567,0.244251
200,Maastricht University,1277,502.0,16385,8234,Netherlands,Europe,0.502533,0.077937
47,Carnegie Mellon University,1342,425.0,13356,6385,United States,North America,0.478062,0.100479


### 1.c - Aggregated by country

Because it was not entirely clear how the data should be aggregated, we chose ```mean()``` as the aggregation function: each country is scored by the average ratio of its universities. This gives us a ranking of countries according to:
* ratio of faculty staff over students
* ratio of international students

In [95]:
# Average every numeric column per country, then rank countries by their mean
# faculty/student ratio. numeric_only=True is required on pandas >= 2.0, where
# mean() raises a TypeError on the string columns instead of dropping them.
best_countries_by_facStudRatio = (
    df.groupby('Country')
      .mean(numeric_only=True)
      .sort_values('Fac/stud ratio', ascending=False)
)
best_countries_by_facStudRatio[['Fac/stud ratio']].head()

Unnamed: 0_level_0,Fac/stud ratio
Country,Unnamed: 1_level_1
Russia,0.22191
Denmark,0.18658
Saudi Arabia,0.175828
Singapore,0.162279
Japan,0.15584


In [96]:
# Average every numeric column per country, then rank countries by their mean
# international-student ratio. numeric_only=True is required on pandas >= 2.0,
# where mean() raises a TypeError on the string columns instead of dropping them.
best_countries_by_internationalRatio = (
    df.groupby('Country')
      .mean(numeric_only=True)
      .sort_values('International ratio', ascending=False)
)
best_countries_by_internationalRatio[['International ratio']].head()

Unnamed: 0_level_0,International ratio
Country,Unnamed: 1_level_1
United Kingdom,0.351308
Australia,0.346878
Switzerland,0.313816
Hong Kong,0.312148
Austria,0.306095


### 1.d - Aggregated by region

We use a strategy very similar to the one used in 1.c.

In [97]:
# Average every numeric column per region, then rank regions by their mean
# faculty/student ratio. numeric_only=True is required on pandas >= 2.0, where
# mean() raises a TypeError on the string columns instead of dropping them.
best_regions_by_facStudRatio = (
    df.groupby('Region')
      .mean(numeric_only=True)
      .sort_values('Fac/stud ratio', ascending=False)
)
best_regions_by_facStudRatio[['Fac/stud ratio']].head()

Unnamed: 0_level_0,Fac/stud ratio
Region,Unnamed: 1_level_1
North America,0.145407
Asia,0.134673
Europe,0.120003
Latin America,0.096779
Africa,0.08845


In [99]:
# Average every numeric column per region, then rank regions by their mean
# international-student ratio. numeric_only=True is required on pandas >= 2.0,
# where mean() raises a TypeError on the string columns instead of dropping them.
best_regions_by_internationalRatio = (
    df.groupby('Region')
      .mean(numeric_only=True)
      .sort_values('International ratio', ascending=False)
)
best_regions_by_internationalRatio[['International ratio']].head()

Unnamed: 0_level_0,International ratio
Region,Unnamed: 1_level_1
Oceania,0.329077
Europe,0.245932
North America,0.203583
Africa,0.169703
Asia,0.132394


# todo : bar plots for the results

## 2 - Times Higher Education