# Python for Webscraping
* SOC 590: Big Data and Population Processes
* 17th October 2016

## Tutorial 2: Webscraping with a function

In [1]:
import os
import urllib
import urllib.request

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup

%matplotlib inline

In [2]:
def extract_page_data(table_rows):
    """
    Extract and return the text of the td elements within the table rows.

    Parameters
    ----------
    table_rows : iterable
        The ``tr`` elements of one page, e.g. ``soup.find_all('tr')``. Each
        element only needs a ``find_all('td')`` method whose results expose a
        ``.text`` attribute. A collection of ``td`` elements is still accepted
        for older call sites: the rows are then looked up from the document
        those cells belong to.

    Returns
    -------
    list of list
        One list of cell texts per row, in the order given, with the first
        row left out.
    """
    row_tags = list(table_rows)

    # Older call sites hand over td cells instead of rows and used to rely on
    # a global `soup` being read here. Recover the same rows from the cells'
    # own document (the outermost ancestor) so no global state is needed.
    if row_tags and getattr(row_tags[0], 'name', None) == 'td':
        ancestors = list(row_tags[0].parents)
        row_tags = ancestors[-1].find_all('tr') if ancestors else []

    rows = [[cell.text for cell in row.find_all('td')] for row in row_tags]

    # The first row is skipped, as before (presumably the header row, whose
    # cells are th rather than td -- confirm against the page markup).
    return rows[1:]


In [3]:
# containers filled by the scraping loop: one DataFrame per page that was
# scraped successfully, and one [url, exception] pair per page that failed
us_news_rankings, errors_list = [], []

In [4]:
url_template = 'http://grad-schools.usnews.rankingsandreviews.com/best-graduate-schools/top-humanities-schools/sociology-rankings/page+{page_number}'

# Start from empty containers so that re-running this cell does not append
# the same pages (or errors) a second time.
us_news_rankings = []
errors_list = []

# for each page from 1 to (and including) 4
for page_number in range(1, 5):

    # build the url before the try block so it is always defined when an
    # error is recorded below
    url = url_template.format(page_number=page_number)

    # Use try/except block to catch and inspect any urls that cause an error
    try:
        # get the html; the context manager closes the connection once the
        # page has been read, and the timeout keeps a stalled server from
        # blocking the notebook indefinitely
        with urllib.request.urlopen(url, timeout=30) as response:
            html = response.read()

        # create the BeautifulSoup object
        soup = BeautifulSoup(html, "lxml")

        # start extracting rows: hand the page's tr elements to the helper
        table_rows = soup.find_all('tr')
        school_data = extract_page_data(table_rows)

        # create the dataframe for the current page; columns are left
        # unnamed here because they are labelled after all pages are combined
        school_df = pd.DataFrame(school_data)

        # append the current dataframe to the list of dataframes
        us_news_rankings.append(school_df)

    except Exception as e:
        # store the url and the error it caused so failed pages can be
        # inspected afterwards instead of stopping the whole run
        errors_list.append([url, e])
    

In [5]:
# display the list of per-page DataFrames collected by the scraping loop
us_news_rankings

[                                                0  \
 0    \n\n#1Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 1    \n\n#1Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 2    \n\n#1Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 3                 \n\ndblclick('rankingsEmbed')\n   
 4    \n\n#4Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 5    \n\n#4Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 6    \n\n#6Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 7    \n\n#6Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 8    \n\n#6Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 9       \n\n#9\n\n\nOverall Score: \n\n\n\n\n\n\n   
 10  \n\n#10Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 11  \n\n#10Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 12  \n\n#12Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 13  \n\n#12Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 14  \n\n#14Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 15  \n\n#14Tie\n\n\nOverall Score: \n\n\n\n\n\n\n   
 16     \n\n#16\n\n\nOverall Score: \n\n\n\n\n\n\n   
 17  \n\n#17Tie\n\n\nOverall

In [6]:
# pd.concat raises ValueError on an empty list; fail with the same exception
# type but a message that says where to look
if not us_news_rankings:
    raise ValueError("No pages were scraped successfully; inspect errors_list for the failures.")

# stack the per-page frames into one table; ignore_index=True gives the rows
# a fresh 0..n-1 index instead of repeating each page's own 0-based labels
us_news_df_raw = pd.concat(us_news_rankings, axis=0, ignore_index=True)

# label the three scraped columns by position
column_headers = ["rank", "school", "score"]
us_news_df_raw.columns = column_headers

In [7]:
# preview the first ten rows of the combined, still uncleaned table
us_news_df_raw.head(10)

Unnamed: 0,rank,school,score
0,\n\n#1Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,"\nPrinceton University \n\nPrinceton, NJ\n",\n\n \n\n \n\n ...
1,\n\n#1Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,\nUniversity of California—​Berkeley \n\nBerke...,\n\n \n\n \n\n ...
2,\n\n#1Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,\nUniversity of Wisconsin—​Madison \n\nMadison...,\n\n \n\n \n\n ...
3,\n\ndblclick('rankingsEmbed')\n,,
4,\n\n#4Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,"\nStanford University \n\nStanford, CA\n",\n\n \n\n \n\n ...
5,\n\n#4Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,\nUniversity of Michigan—​Ann Arbor \n\nAnn Ar...,\n\n \n\n \n\n ...
6,\n\n#6Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,"\nHarvard University \n\nCambridge, MA\n",\n\n \n\n \n\n ...
7,\n\n#6Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,"\nUniversity of Chicago \n\nChicago, IL\n",\n\n \n\n \n\n ...
8,\n\n#6Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,\nUniversity of North Carolina—​Chapel Hill \n...,\n\n \n\n \n\n ...
9,\n\n#9\n\n\nOverall Score: \n\n\n\n\n\n\n,\nUniversity of California—​Los Angeles \n\nLo...,\n\n \n\n \n\n ...


In [8]:
# make sure the output folders exist; exist_ok=True makes the call a no-op
# when a folder is already there, so no separate existence check is needed
for data_dir in ('../data/raw_data', '../data/clean_data'):
    os.makedirs(data_dir, exist_ok=True)
    

In [9]:
# Write out the raw rankings data to the raw_data folder in the data folder;
# index=False leaves the DataFrame's row index out of the file
us_news_df_raw.to_csv("../data/raw_data/us_news_rankings_RAW.csv", index=False)

In [10]:
# reload the raw rankings from the saved file so the cleaning steps start
# from what is on disk (the former [0:len(...)] slice selected every row and
# has been removed: it changed nothing except turning the frame into a slice)
us_news_df_raw = pd.read_csv("../data/raw_data/us_news_rankings_RAW.csv")
us_news_df_raw.head()

Unnamed: 0,rank,school,score
0,\n\n#1Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,"\nPrinceton University \n\nPrinceton, NJ\n",\n\n \n\n \n\n ...
1,\n\n#1Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,\nUniversity of California—​Berkeley \n\nBerke...,\n\n \n\n \n\n ...
2,\n\n#1Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,\nUniversity of Wisconsin—​Madison \n\nMadison...,\n\n \n\n \n\n ...
3,\n\ndblclick('rankingsEmbed')\n,,
4,\n\n#4Tie\n\n\nOverall Score: \n\n\n\n\n\n\n,"\nStanford University \n\nStanford, CA\n",\n\n \n\n \n\n ...


In [11]:
# placeholder columns; the string "NaN" is the sentinel value that the later
# filtering step compares against
us_news_df_raw["school_location"] = "NaN"
us_news_df_raw["school_name"] = "NaN"

# keep only the digits of the rank text (every non-digit character is removed)
us_news_df_raw["rank"] = us_news_df_raw["rank"].replace(r"\D", "", regex=True)

# pull out the first "digit.digit" value; the pattern is a raw string and the
# dot is escaped so that it matches a literal decimal point only
us_news_df_raw["score"] = us_news_df_raw["score"].str.extract(r"(\d\.\d)", expand=False)

In [12]:
# preview the first ten rows after the rank and score columns were cleaned
us_news_df_raw.head(10)

Unnamed: 0,rank,school,score,school_location,school_name
0,1.0,"\nPrinceton University \n\nPrinceton, NJ\n",4.7,,
1,1.0,\nUniversity of California—​Berkeley \n\nBerke...,4.7,,
2,1.0,\nUniversity of Wisconsin—​Madison \n\nMadison...,4.7,,
3,,,,,
4,4.0,"\nStanford University \n\nStanford, CA\n",4.6,,
5,4.0,\nUniversity of Michigan—​Ann Arbor \n\nAnn Ar...,4.6,,
6,6.0,"\nHarvard University \n\nCambridge, MA\n",4.5,,
7,6.0,"\nUniversity of Chicago \n\nChicago, IL\n",4.5,,
8,6.0,\nUniversity of North Carolina—​Chapel Hill \n...,4.5,,
9,9.0,\nUniversity of California—​Los Angeles \n\nLo...,4.4,,


In [13]:
# Split every "school" entry on the blank line that separates the school name
# from its location. This is done once for the whole column instead of row by
# row, which also avoids chained assignment and indexing one past the last row.
# astype(object) keeps the .str accessor usable when the column holds only
# missing values.
school_parts = us_news_df_raw["school"].astype(object).str.split("\n\n")

# Rows that do not yield both a name and a location (including missing
# entries) get the "NaN" placeholder in both columns.
has_both_parts = school_parts.str.len() >= 2

for column, position in (("school_name", 0), ("school_location", 1)):
    us_news_df_raw[column] = (
        school_parts.str[position]
        .where(has_both_parts, "NaN")
        # drop the newline characters left over around each part
        .str.replace("\n", "", regex=False)
    )

In [14]:
# columns retained from here on, in display order
cols = ["rank", "school_name", "school_location"]
us_news_df_raw = us_news_df_raw.loc[:, cols]
us_news_df_raw.head()

Unnamed: 0,rank,school_name,school_location
0,1.0,Princeton University,"Princeton, NJ"
1,1.0,University of California—​Berkeley,"Berkeley, CA"
2,1.0,University of Wisconsin—​Madison,"Madison, WI"
3,,,
4,4.0,Stanford University,"Stanford, CA"


In [15]:
# keep only the rows whose school_name is not the "NaN" placeholder string
keep_row = us_news_df_raw['school_name'].ne("NaN")
us_news_df_clean = us_news_df_raw[keep_row]

In [16]:
# preview the first rows of the filtered table
us_news_df_clean.head()

Unnamed: 0,rank,school_name,school_location
0,1,Princeton University,"Princeton, NJ"
1,1,University of California—​Berkeley,"Berkeley, CA"
2,1,University of Wisconsin—​Madison,"Madison, WI"
4,4,Stanford University,"Stanford, CA"
5,4,University of Michigan—​Ann Arbor,"Ann Arbor, MI"


In [17]:
# Write the filtered rankings to the clean_data folder.
# NOTE(review): unlike the raw export, this call does not pass index=False,
# so the row index is written as an extra first column -- confirm whether
# readers of this file expect that column.
us_news_df_clean.to_csv("../data/clean_data/us_news_rankings_clean.csv")