## MATH-GA 2047 Project 1 Wiki Scraping
## Author: Haonan Tian 
## Date: 09/22/2018

In [96]:
# Initialization
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver 
import requests
import string 
import time

In [31]:
# Url for web scraping
url_2000 = 'https://en.wikipedia.org/wiki/United_States_presidential_election,_2000'
url_2004 = 'https://en.wikipedia.org/wiki/United_States_presidential_election,_2004'

### Create Fake User Agent

In [None]:
ua = UserAgent()

# Check the Chrome user agent 
ua.Chrome

### Set Up the Selenium Web Driver

In [83]:
#driver = webdriver.Chrome('~/User/haonantian/Applications/Google Chrome.app')
driver = webdriver.Safari()

### Create a Fake Header

In [100]:
header = {'User-Agent': str(ua.Chrome)}
header

{'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36'}

### Load Website

In [858]:
content_2000 = requests.get(url_2000, headers = header)
content_2000.text[:1000]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>United States presidential election, 2000 - Wikipedia</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"United_States_presidential_election,_2000","wgTitle":"United States presidential election, 2000","wgCurRevisionId":861291720,"wgRevisionId":861291720,"wgArticleId":32009,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages using citations with accessdate and no URL","CS1 maint: Unfit url","Use mdy dates from December 2017","Pages using deprecated image syntax","Elections using electoral votes","Articles needing additional references from January 201

In [859]:
soup_2000 = BeautifulSoup(content_2000.text, 'html.parser')

# Write the prettified web source code to a html file for recrods
fout = open('content_2000.html','w')
fout.write(soup_2000.prettify())
fout.close()

### Start Parsing Data for Year 2000 

In [860]:
# Load the table of election result to soup object
table_2000 = soup_2000.find('table', {'class': 'wikitable sortable', 'style': 'text-align:right'})
table_2000.prettify()

'<table class="wikitable sortable" style="text-align:right">\n <tbody>\n  <tr>\n   <th colspan="2">\n   </th>\n   <th colspan="3" style="text-align:center;">\n    George W. Bush\n    <br/>\n    Republican\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Al Gore\n    <br/>\n    Democratic\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Ralph Nader\n    <br/>\n    Green\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Pat Buchanan\n    <br/>\n    Reform\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Harry Browne\n    <br/>\n    Libertarian\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Howard Phillips\n    <br/>\n    Constitution\n   </th>\n   <th colspan="3" style="text-align:center;">\n    John Hagelin\n    <br/>\n    Natural Law\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Others\n   </th>\n   <th colspan="2" style="text-align:center;">\n    Margin\n   </th>\n   <th colspan="2" style="text-ali

In [861]:
# Find all state info by their rows and background colors
state_2000_dem = table_2000.find_all('tr', {'style': 'background:#B0CEFF'}) 
state_2000_gop = table_2000.find_all('tr', {'style': 'background:#FFB6B6'})
print(len(state_2000_dem))
print(len(state_2000_gop))

21
30


In [102]:
def set_printable(inputStr):  # Helper function to remove special characters from state names
    printable = set(string.printable)
    results = filter(lambda x: x in printable, inputStr)
    outStr = ''
    for letter in results:
        outStr += letter
    return outStr

In [862]:
state_names_gop = []
for row in state_2000_gop:
    state = row.find('td', {'scope': 'row'}).get_text().lstrip().strip()
    state_names_gop.append(set_printable(state))
print(len(state_names_gop))

state_names_dem = []
for row in state_2000_dem:
    state = row.find('td', {'scope': 'row'}).get_text().lstrip().strip()
    state_names_dem.append(set_printable(state))
print(len(state_names_dem))
state_names_all = state_names_gop + state_names_dem
print(len(state_names_all))
for name in state_names_all:
    print(name)

30
21
51
Alabama
Alaska
Arizona
Arkansas
Colorado
Florida
Georgia
Idaho
Indiana
Kansas
Kentucky
Louisiana
Mississippi
Missouri
Montana
Nebraska
Nevada
New Hampshire
North Carolina
North Dakota
Ohio
Oklahoma
South Carolina
South Dakota
Tennessee
Texas
Utah
Virginia
West Virginia
Wyoming
California
Connecticut
Delaware
D.C.
Hawaii
Illinois
Iowa
Maine
Maryland
Massachusetts
Michigan
Minnesota
New Jersey
New Mexico
New York
Oregon
Pennsylvania
Rhode Island
Vermont
Washington
Wisconsin


In [863]:
def convert_list_for_path(inputList):
    result = []
    for item in inputList:
        if item == 'D.C.':
            result.append('the District of Columbia')
        elif item == 'District of Columbia':
            result.append('the District of Columbia')
        else:
            result.append(item)
    return result

state_name_path = convert_list_for_path(state_names_all)
state_urls = []
driver = webdriver.Safari()
driver.get(url_2000)
for state_name in state_name_path:
    driver.find_element_by_xpath("//a[@title='United States presidential election in " + state_name + ", 2000']").click()
    time.sleep(1)
    state_urls.append(driver.current_url)
    print(driver.current_url)
    time.sleep(1)
    #link.click()
    driver.back()
print(len(state_urls))  

https://en.wikipedia.org/wiki/United_States_presidential_election_in_Alabama,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Alaska,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Arizona,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Arkansas,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Colorado,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Florida,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Georgia,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Idaho,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Indiana,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Kansas,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Kentucky,_2000
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Louisiana,_2000
htt

### Parse Election Data for Each State

In [865]:
# Save all html by state
for i in range(len(state_urls)):
    state_name = state_names_all[i]
    content_state = requests.get(state_urls[i], headers = header)
    soup_state = BeautifulSoup(content_state.text, 'html.parser')
    fileName = state_name + '_2000.html'
    fout = open(fileName,'w')
    fout.write(soup_state.prettify())
    fout.close()
    print('Finish state ' + str(state_name))

Finish state Alabama
Finish state Alaska
Finish state Arizona
Finish state Arkansas
Finish state Colorado
Finish state Florida
Finish state Georgia
Finish state Idaho
Finish state Indiana
Finish state Kansas
Finish state Kentucky
Finish state Louisiana
Finish state Mississippi
Finish state Missouri
Finish state Montana
Finish state Nebraska
Finish state Nevada
Finish state New Hampshire
Finish state North Carolina
Finish state North Dakota
Finish state Ohio
Finish state Oklahoma
Finish state South Carolina
Finish state South Dakota
Finish state Tennessee
Finish state Texas
Finish state Utah
Finish state Virginia
Finish state West Virginia
Finish state Wyoming
Finish state California
Finish state Connecticut
Finish state Delaware
Finish state D.C.
Finish state Hawaii
Finish state Illinois
Finish state Iowa
Finish state Maine
Finish state Maryland
Finish state Massachusetts
Finish state Michigan
Finish state Minnesota
Finish state New Jersey
Finish state New Mexico
Finish state New York


In [879]:
# module 1 of helper functions to set up bench marks for finding parties
def makeup_header(state_name):
    if state_name == 'North Carolina':
        return 'United States presidential election in NC, 2000'
    elif state_name == 'D.C.':
        return 'United States presidential election in the District of Columbia, 2000'
    elif state_name == 'Massachusetts':
        return 'United States presidential election in Massachusetts, 2004'
    elif state_name == 'Alabama':
        return 'United States presidential election in Alabama,'
    else:
        return 'United States presidential election in '+ state_name +', 2000'

def find_bench_table(tables):
    count = 1
    for item in tables:
        head = ''
        try:
            head = item.tbody.tr.th.get_text().lstrip().strip().split('[')[0]
        except:
            head = ''
        if head == makeup_header(state_name):
            print('Table found')
            return item
        #print('Finished paersing table ' + str(count))
        count += 1
    return 'Table not found'

def makeup_bench_mark(bench_table):
    result = {}
    count = 2
    rows = bench_table.find_all('tr')
    while count < len(rows)-1:
        tds = rows[count].find_all('td')
        party = tds[0].get_text().lstrip().strip()
        name = tds[1].get_text().lstrip().strip()
        result[name] = party
        count += 1
    return result

def find_bench_for_Missori(soup_state):
    result = {}
    table = soup_state.find('table', {'class': 'wikitable'})
    trs = table.find_all('tr')
    count = 1
    while count < len(trs)-1:
        tds = trs[count].find_all('td')
        name = tds[0].get_text().lstrip().strip()
        party = tds[2].get_text().lstrip().strip()
        result[name] = party
        count += 1
    return result

def find_bench_Alabama(soup_state):
    result = {}
    table = soup_state.find('table',{'border':1})
    trs = table.find_all('tr')
    count = 2
    while count < len(trs)-2:
        tds = trs[count].find_all('td')
        name = tds[1].get_text().lstrip().strip()
        party = tds[0].get_text().lstrip().strip()
        result[name] = party
        count += 1
    return result        

In [867]:
# Module 2 of helper functions to find the 'Total' column and total number of columns
def find_format(table):
    total_cols = {}
    rows = table.find_all('tr')
    col_num = 0
    ths = rows[0].find_all('th')
    total_cols['Cols'] = len(ths)
    col_names = []
    for item in ths:
        if item.get_text().lstrip().strip() == 'Total' or item.get_text().lstrip().strip() == 'Total#' or item.get_text().lstrip().strip() == 'Total #':
            total_cols['Total'] = col_num
            col_num += 1
        elif item.get_text().lstrip().strip().split('[')[0] != 'County' and item.get_text().lstrip().strip().split('[')[0] != 'Parish' and item.get_text().lstrip().strip().split('[')[0] != 'County/City' and item.get_text().lstrip().strip().split('[')[0] != 'County or City':
            col_names.append(item.get_text().lstrip().strip().split('[')[0])
            col_num += 1
        else:
            col_num += 1
    if 'Total' not in list(total_cols.keys()):
        total_cols['Total'] = -1
    return total_cols, col_names

def secure_col_name(col_name, county_temp):
    count = 0
    #print(county_temp)
            
    if type(county_temp[1]) == str:
        while count < len(col_name):
            if count % 2 == 0 and col_name[count][-1] != '%':
                col_name[count] += '%'
                count += 1
            elif count % 2 == 1 and col_name[count] == 'Votes':
                col_name[count] = col_name[count-1].strip('%') + '#'
            elif count % 2 == 1 and col_name[count][-1] != '#':
                col_name[count] += '#'
                count += 1
            else:
                count += 1
    else:
        while count < len(col_name):
            if count % 2 == 0 and col_name[count] == 'Votes':
                col_name[count] = col_name[count + 1].strip('%') + '#'
            elif count % 2 == 0 and col_name[count][-1] != '#':
                col_name[count] += '#'
                count += 1
            elif count % 2 == 1 and col_name[count][-1] != '%':
                col_name[count] += '%'
                count += 1
            else:
                count += 1

In [868]:
# Module 3 of helper functions to set up list of [state_name, county_name, {data_dictionary}]
# Note: data_dictionary is in the form: {Name#: value, Name%: percentages}
def makeup_state_list(state_name, county_temp, col_names):
    result = [state_name, county_temp[0]] # Create a list with state name and county name 
    data_dict = {}
    counter = 1
    for item in col_names:
        data_dict[item] = county_temp[counter]
        counter += 1
    result.append(data_dict)
    return result
        
def convert_string_to_number(inputStr):
    if inputStr == "'4,353'":
        return 4353
    
    if inputStr[-1] != '%':
        try:
            return int(inputStr.replace(',',''))
        except:
            return inputStr + '%'
    else:
        return inputStr

In [869]:
# Module 4 of helper functions to make up columns and set up party strings according to candidates
def sort_dict(input_dict):
    try:
        return sorted(input_dict.items(), key = lambda kv:(kv[1], kv[0]), reverse = True)
    except:
        for key, value in input_dict.items():
            try:
                value = float(value)
            except:
                temp = float(value.strip('%'))
                input_dict[key] = temp
        return sorted(input_dict.items(), key = lambda kv:(kv[1], kv[0]), reverse = True)

def make_up_columns(data_dict):
    votes = {}
    pct = {}
    #if state_name == 'Illinois':
        #print(data_dict)
    for item, value in data_dict.items():
        if item[-1] == '#':
            votes[item.strip('#')] = value
        else:
            pct[item.strip('%')] = value
    candidate_str = ''
    votes_str = ''
    pct_str = ''
    sorted_candidate = sort_dict(votes)
    count = 0
    while count < 3:
        candidate_str += sorted_candidate[count][0] + ' / '
        votes_str += str(votes[sorted_candidate[count][0]]) + ' / '
        pct_str += str(pct[sorted_candidate[count][0]]) + ' / '
        count += 1
    return candidate_str.strip(' / '), votes_str.strip(' / '), pct_str.strip(' / ')

def check_in_bench_mark(bench_mark, item):
    for key in list(bench_mark.keys()):
        if item in key:
            return bench_mark[key]
    return 'None'

def makeup_party(candidates, bench_mark):
    result = ''
    candidate_list = candidates.split(' / ')
    for item in candidate_list:
        if item == 'Others' or item == 'Other':
            result += 'Others' + ' / '
        else:
            result += check_in_bench_mark(bench_mark, item) + ' / '
    return result.strip(' / ')

In [893]:
def find_bench_florida(soup_state):
    result = {}
    bench_table = soup_state.find('table', {'class': 'wikitable'})
    trs = bench_table.find_all('tr')
    count = 1
    while count < len(trs) - 2:
        tds = trs[count].find_all('td')
        name = tds[0]
        party = tds[3]
        result[name] = party
        count += 1
    return result

def operate_for_florida(soup_state, state_name): # Multiple Headers
    state_result = []
    len_to_stop = 11
    table = soup_state.find_all('table', {'class': 'wikitable sortable'})[0]
    col_names = ['Bush#','Bush%','Gore#','Gore%','Nader#','Nader%','Buchanan#','Buchanan%','Other#','Other%']
    if state_name == 'New Mexico':
        col_names = ['Bush%','Bush#','Gore%','Gore#','Nader%','Nader#','Other%','Other#']
        len_to_stop = 9
    trs = table.find_all('tr')
    count = 2
    while count < len(trs) - 1:
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(len_to_stop):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result

def operate_for_Idaho(soup_state, state_name): # Multiple Headers
    state_result = []
    table = soup_state.find_all('table', {'class': 'wikitable sortable'})[0]
    col_names = ['Bush#','Bush%','Gore#','Gore%','Nader#','Nader%','Other#','Other%']
    trs = table.find_all('tr')
    count = 2
    while count < len(trs) - 1:
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(11):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    #print(state_result)
    return state_result

def operate_for_Indiana(soup_state, state_name):
    state_result = []
    table = soup_state.find('table', {'class': 'wikitable sortable'})
    col_names = ['Gore%','Gore#','Bush%','Bush#','Others%','Others#']
    trs = table.find_all('tr')
    count = 1
    while count < 9:
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(len(tds)):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result

In [871]:
def operate_for_Delaware(soup_state, state_name):
    state_result = []
    table = soup_state.find_all('table', {'class': 'wikitable'})[0]
    col_names = ['Gore%','Gore#','Bush%','Bush#','Others%','Others#']
    trs = table.find_all('tr')
    count = 2
    while count < len(trs):
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(len(tds)):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result

In [872]:
def operate_for_Hawaii(soup_state, state_name):
    state_result = []
    table = soup_state.find_all('table',{'class':'wikitable'})[0]
    col_names = ['Gore%','Gore#','Bush%','Bush#','Others%','Others#']
    trs = table.find_all('tr')
    count = 1
    while count < len(trs):
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(len(tds)):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            elif i == len(tds) - 1:
                county_temp.append(int(county_temp[i-2] / (float(county_temp[i-3].strip('%'))/100) * (float(county_temp[i-1].strip('%'))/100)))
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result

In [873]:
def find_bench_RhodeIsland():
    return {'Gore': 'Democratic', 'Bush': 'Republican'}

def operate_for_RhodeIsland(soup_state, state_name):
    state_result = []
    table = soup_state.find_all('table', {'class':'wikitable'})[2]
    col_names = ['Gore%','Gore#','Bush%','Bush#','Others%','Others#']
    trs = table.find_all('tr')
    count = 1
    while count < len(trs):
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(len(tds)):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result   

In [874]:
def operate_for_Washington(soup_state, state_name):
    state_result = []
    table = soup_state.find('table', {'class': 'sortable'})
    col_names = ['Gore#','Gore%','Bush#','Bush%','Others#','Others%']
    trs = table.find_all('tr')
    count = 1
    while count < len(trs):
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(len(tds)):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            elif i == 1:
                continue
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result

In [884]:
def operate_for_Alabama(soup_state, state_name):
    state_result = []
    table = soup_state.find('table', {'class':'wikitable sortable'})
    col_names = ['Gore#','Gore%','Bush#','Bush%','Browne#','Browne%']
    trs = table.find_all('tr')
    count = 1
    while count < len(trs):
        tds = trs[count].find_all('td')
        for i in range(8):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result

In [894]:
national_info_2000 = []
for i in range(len(state_names_all)):
    state_name = state_names_all[i]
    print('Start state ' + state_name)
    if state_name == 'Alaska':
        print('Finished State Alaska\n')
        continue
    if state_name == 'Kentucky':
        print('Finished State Kentucky\n')
        continue 
    if state_name == 'South Dakota':
        print('Finished State South Dakota\n')
        continue
    if state_name == 'D.C.':
        print('Finished State South the District of Columbia\n')
        continue
    if state_name == 'Maryland':
        print('Finished State South Maryland\n')
        continue
    content_state = requests.get(state_urls[i], headers = header)
    soup_state = BeautifulSoup(content_state.text, 'html.parser')
    print('good for initialization')
    
    # Set up bench mark
    bench_mark = {}
    if state_name == 'Missouri':
        bench_mark = find_bench_for_Missori(soup_state)
    elif state_name == 'Wisconsin':
        table_bench = soup_state.find('table',{'border':1})
        bench_mark = makeup_bench_mark(table_bench)
    elif state_name == 'Florida':
        bench_mark = find_bench_florida(soup_state)
    elif state_name == 'Rhode Island':
        bench_mark = find_bench_RhodeIsland()
    elif state_name == 'Alabama':
        bench_mark = find_bench_Alabama(soup_state)
    else:
        tables = soup_state.find_all('table')
        bench_table = find_bench_table(tables)
        bench_mark = makeup_bench_mark(bench_table)
    print('good for setting bench mark')
    
    # Run main Program
    state_result = []
    if state_name == 'Florida' or state_name == 'Utah' or state_name == 'Oregon' or state_name == 'New Mexico':
        state_result = operate_for_florida(soup_state, state_name)
    elif state_name == 'Idaho':
        state_result = operate_for_Idaho(soup_state, state_name)
    elif state_name == 'Indiana':
        state_result = operate_for_Indiana(soup_state, state_name)
    elif state_name == 'Delaware':
        state_result = operate_for_Delaware(soup_state, state_name)
    elif state_name == 'Hawaii':
        state_result = operate_for_Hawaii(soup_state, state_name)
    elif state_name == 'Rhode Island':
        state_result = operate_for_RhodeIsland(soup_state, state_name)
    elif state_name == 'Washington':
        state_result = operate_for_Washington(soup_state, state_name)
    elif state_name == 'Alabama':
        state_result = operate_for_Alabama(soup_state, state_name)
    else:
        table = soup_state.find('table', {'class': 'wikitable sortable'})
        indicator, col_names = find_format(table) # find (indicator)1.total number of columns 2.the No. of 'Total' (col_names) list of candidates
        total_count = 0
        print('good for finding format')
    
        for row in table.find_all('tr'):
            if total_count != 0: # Skip the first row of table
                county_temp = []
                tds = row.find_all('td')
                td_count = 0
                while td_count < len(tds):
                    if td_count == 0:
                        county_temp.append(tds[td_count].get_text().lstrip().strip())
                        td_count += 1
                    elif td_count != indicator['Total']:
                        county_temp.append(convert_string_to_number(tds[td_count].get_text().lstrip().strip().replace('\xa0','').replace(' ','')))
                        td_count += 1
                    else:
                        td_count += 1
                secure_col_name(col_names, county_temp)
                for name in col_names:
                    name = name.replace(' ', '')
                    name = name.replace('\xa0', '')
                    name = name.strip(' #')
                if state_name == 'Alabama':
                    print(county_temp)
                    print(col_names)
                if state_name == 'New Jersey':
                    col_names = ['Gore#', 'Gore%','Bush#','Bush%','Other#','Other%']
                if state_name == 'Nevada':
                    col_names = ['Gore%','Gore#','Bush%','Bush#','Others%','Others#']
                state_result.append(makeup_state_list(state_name, county_temp, col_names))
                total_count += 1
            else:
                total_count += 1
        #print(state_result)
        #print('good for getting info from table')
    
    # convert state_result to list for further creation of data frame
    state_result_list = []
    count1 = 0
    #print(state_result[0])
    for county in state_result:
        state_temp = []
        state_county = county[0] + ' - ' + county[1]
        candidates_col, votes_col, pct_col = make_up_columns(county[2])
        party_col = makeup_party(candidates_col, bench_mark)
        state_temp = [state_county, candidates_col, votes_col, pct_col, party_col]
        state_result_list.append(state_temp)
        count1+=1
    print('good for setting up list')
    national_info_2000 = national_info_2000 + state_result_list
    print('Finished state ' + state_name + '\n')
print('Done')

Start state Alabama
good for initialization
good for setting bench mark
good for setting up list
Finished state Alabama

Start state Alaska
Finished State Alaska

Start state Arizona
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Arizona

Start state Arkansas
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Arkansas

Start state Colorado
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Colorado

Start state Florida
good for initialization
good for setting bench mark
good for setting up list
Finished state Florida

Start state Georgia
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Georgia

Start state Idaho
good for initialization
Table found
good for setting bench mark
go

In [898]:
national_df_2000 = pd.DataFrame(national_info_2000)
national_df_2000 = national_df_2000.rename(columns = {0:'State-County', 1:'1st, 2nd, 3rd', 2:'votes1, votes2, votes3', 3:'pct1, pct2, pct3', 4:'party1, party2, party3'}) 
national_df_2000.head(500)

Unnamed: 0,State-County,"1st, 2nd, 3rd","votes1, votes2, votes3","pct1, pct2, pct3","party1, party2, party3"
0,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent
1,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent
2,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent
3,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent
4,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent
5,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent
6,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent
7,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent
8,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent
9,Alabama - Warren,Bush / Gore / Browne,22172 / 16543 / 2086,54.3% / 40.6% / 5.1%,Republican / Democratic / Independent


In [899]:
national_df_2000.to_csv('election_2000.csv')

### Start Parsing Data for Year 2004

In [702]:
content_2004 = requests.get(url_2004, headers = header)
soup_2004 = BeautifulSoup(content_2004.text, 'html.parser')

# Write the prettified web source code to a html file for recrods
fout = open('content_2004.html','w')
fout.write(soup_2004.prettify())
fout.close()

In [703]:
# Load the table of election result to soup object
table_2004 = soup_2004.find('table', {'class': 'wikitable sortable', 'style': 'text-align:right'})
table_2004.prettify()

'<table class="wikitable sortable" style="text-align:right">\n <tbody>\n  <tr>\n   <th colspan="2">\n   </th>\n   <th colspan="3" style="text-align:center;">\n    George W. Bush\n    <br/>\n    Republican\n   </th>\n   <th colspan="3" style="text-align:center;">\n    John Kerry\n    <br/>\n    Democratic\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Ralph Nader\n    <br/>\n    Independent / Reform\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Michael Badnarik\n    <br/>\n    Libertarian\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Michael Peroutka\n    <br/>\n    Constitution\n   </th>\n   <th colspan="3" style="text-align:center;">\n    David Cobb\n    <br/>\n    Green\n   </th>\n   <th colspan="3" style="text-align:center;">\n    Others\n   </th>\n   <th colspan="2" style="text-align:center;">\n    Margin\n   </th>\n   <th colspan="2" style="text-align:center;">\n    State Total\n   </th>\n  </tr>\n  <tr>\n   <th align="center">\n  

In [706]:
# Find all state info by their rows and background colors
state_2004_dem = table_2004.find_all('tr', {'style': 'background:#B0CEFF'}) 
state_2004_gop = table_2004.find_all('tr', {'style': 'background:#FFB6B6'})
print(len(state_2004_dem))
print(len(state_2004_gop))
print(state_2004_dem)

20
31
[<tr style="background:#B0CEFF">
<td style="text-align:left;"><a href="/wiki/United_States_presidential_election_in_California,_2004" title="United States presidential election in California, 2004">California</a></td>
<td>55</td>
<td>5,509,826</td>
<td>44.36%</td>
<td>–</td>
<td>6,745,485</td>
<td>54.31%</td>
<td>55</td>
<td>20,714</td>
<td>0.17%</td>
<td>–</td>
<td>50,165</td>
<td>0.40%</td>
<td>–</td>
<td>26,645</td>
<td>0.21%</td>
<td>–</td>
<td>40,771</td>
<td>0.33%</td>
<td>–</td>
<td>27,747</td>
<td>0.22%</td>
<td>–</td>
<td>−1,235,659</td>
<td>−9.95%</td>
<td>12,421,353</td>
<td>CA
</td></tr>, <tr style="background:#B0CEFF">
<td style="text-align:left;"><a href="/wiki/United_States_presidential_election_in_Connecticut,_2004" title="United States presidential election in Connecticut, 2004">Connecticut</a></td>
<td>7</td>
<td>693,826</td>
<td>43.95%</td>
<td>–</td>
<td>857,488</td>
<td>54.31%</td>
<td>7</td>
<td>12,969</td>
<td>0.82%</td>
<td>–</td>
<td>3,367</td>
<td>0.21%<

In [707]:
state_names_gop = []
for row in state_2004_gop:
    state = row.find_all('td')[0].get_text().lstrip().strip()
    state_names_gop.append(set_printable(state))
print(len(state_names_gop))

state_names_dem = []
for row in state_2004_dem:
    state = row.find_all('td')[0].get_text().lstrip().strip()
    state_names_dem.append(set_printable(state))
print(len(state_names_dem))
state_names_all = state_names_gop + state_names_dem
print(len(state_names_all))
for name in state_names_all:
    print(name)

31
20
51
Alabama
Alaska
Arizona
Arkansas
Colorado
Florida
Georgia
Idaho
Indiana
Iowa
Kansas
Kentucky
Louisiana
Mississippi
Missouri
Montana
Nebraska
Nevada
New Mexico
North Carolina
North Dakota
Ohio
Oklahoma
South Carolina
South Dakota
Tennessee
Texas
Utah
Virginia
West Virginia
Wyoming
California
Connecticut
Delaware
District of Columbia
Hawaii
Illinois
Maine
Maryland
Massachusetts
Michigan
Minnesota
New Hampshire
New Jersey
New York
Oregon
Pennsylvania
Rhode Island
Vermont
Washington
Wisconsin


In [713]:
state_name_path = convert_list_for_path(state_names_all)
state_urls = []
driver = webdriver.Safari()
driver.get(url_2004)
for state_name in state_name_path:
    driver.find_element_by_xpath("//a[@title='United States presidential election in " + state_name + ", 2004']").click()
    time.sleep(1)
    state_urls.append(driver.current_url)
    print(driver.current_url)
    time.sleep(1)
    #link.click()
    driver.back()
print(len(state_urls))   

https://en.wikipedia.org/wiki/United_States_presidential_election_in_Alabama,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Alaska,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Arizona,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Arkansas,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Colorado,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Florida,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Georgia,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Idaho,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Indiana,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Iowa,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Kansas,_2004
https://en.wikipedia.org/wiki/United_States_presidential_election_in_Kentucky,_2004
https://

In [None]:
# Save all html by state
for i in range(len(state_urls)):
    state_name = state_names_all[i]
    content_state = requests.get(state_urls[i], headers = header)
    soup_state = BeautifulSoup(content_state.text, 'html.parser')
    fileName = state_name + '_2004.html'
    fout = open(fileName,'w')
    fout.write(soup_state.prettify())
    fout.close()
    print('Finish state ' + str(state_name))

In [853]:
def makeup_header_2004(state_name):
    if state_name == 'Indiana':
        return '2004 United States presidential election in Indiana'
    elif state_name == 'New Mexico':
        return 'United States presidential election in NM, 2004'
    elif state_name == 'North Dakota':
        return '2004 United States presidential election in ND'
    elif state_name == 'Oklahoma':
        return 'United States presidential election in Oklahoma, 2008'
    elif state_name == 'Rhode Island':
        return 'United States presidential election in RI, 2004'
    else:
        return 'United States presidential election in '+ state_name +', 2004'

def find_bench_table_2004(tables):
    if state_name == 'Indiana' or state_name == 'North Dakota':
        for item in tables:
            head = ''
            try:
                head = item.caption.get_text().lstrip().strip().split('[')[0]
            except:
                head = ''
            if head == makeup_header_2004(state_name):
                print('Table found')
                return item
    else:
        for item in tables:
            head = ''
            try:
                head = item.tbody.tr.th.get_text().lstrip().strip().split('[')[0]
            except:
                head = ''
            if head == makeup_header_2004(state_name):
                print('Table found')
                return item
    return 'Table not found'

def makeup_bench_mark_2004(bench_table):
    result = {}
    count = 2
    rows = bench_table.find_all('tr')
    while count < len(rows)-1:
        tds = rows[count].find_all('td')
        party = tds[0].get_text().lstrip().strip()
        name = tds[1].get_text().lstrip().strip()
        result[name] = party
        count += 1
    return result

def make_bench_south_dakota(soup_state, state_name):
    result = {}
    table = soup_state.find('table',{'border': 1})
    trs = table.find_all('tr')
    count = 1
    while count < len(trs):
        tds = trs[count].find_all('td')
        name = tds[0].get_text().lstrip().strip()
        party = tds[2].get_text().lstrip().strip()
        result[name] = party
        count += 1
    return result

def make_bench_Nebraska(soup_state, state_name):
    result = {}
    table = soup_state.find_all('table',{'class':'wikitable'})[0]
    trs = table.find_all('tr')
    count = 1
    while count < len(trs)-1:
        tds = trs[count].find_all('td')
        name = tds[0].get_text().lstrip().strip()
        party = tds[2].get_text().lstrip().strip()
        result[name] = party
        count += 1
    return result

def make_bench_Vermont(soup_state, state_name):
    result = {}
    table = soup_state.find_all('table',{'border':1})[1]
    trs = table.find_all('tr')
    count = 1
    while count < len(trs):
        tds = trs[count].find_all('td')
        name = tds[0].get_text().lstrip().strip()
        party = tds[1].get_text().lstrip().strip()
        result[name] = party
        count += 1
    return result

def make_bench_DC(soup_state, state_name):
    result = {}
    table = soup_state.find_all('table', {'border':1})[0]
    trs = table.find_all('tr')
    count = 2
    while count < len(trs)-2:
        tds = trs[count].find_all('td')
        name = tds[1].get_text().lstrip().strip()
        party = tds[0].get_text().lstrip().strip()
        result[name] = party
        count += 1
    return result

def make_bench_Indiana(soup_state, state_name):
    result = {'Bush':'Republican', 'Kerry':'Democratic', 'Badnarik': 'Libertarian'}
    '''table = soup_state.find_all('table', {'class': 'wikitable'})[0]
    trs = table.find_all('tr')
    count = 2
    while count < len(trs)-2:
        tds = trs[count].find_all('td')
        name = tds[2].get_text().lstrip().strip()
        party = tds[1].get_text().lstrip().strip()
        result[name] = party
        count += 1'''
    return result

In [757]:
def convert_string_to_number_2004(inputStr):
    if inputStr == "2,785'":
        return 2785
    if inputStr[-1] != '%':
        try:
            return int(inputStr.replace(',',''))
        except:
            return inputStr + '%'
    else:
        return inputStr

In [832]:
def operate_for_florida_2004(soup_state, state_name):
    state_result = []
    table = soup_state.find_all('table', {'class': 'wikitable sortable'})[0]
    col_names = ['Kerry%','Kerry#','Bush%','Bush#','Others%','Others#']
    trs = table.find_all('tr')
    count = 2
    while count < len(trs):
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(len(tds)):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result

def operate_for_oklahoma_2004(soup_state, state_name):
    state_result = []
    table = soup_state.find_all('table', {'class': 'wikitable sortable'})[0]
    col_names = ['Bush#','Bush%','Kerry#','Kerry%','Others#','Others%']
    trs = table.find_all('tr')
    count = 1
    while count < len(trs):
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(len(tds)):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        county_temp.append(0)
        county_temp.append('0%')
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result

def operate_for_Michigan_2004(soup_state, state_name):
    state_result = []
    table = soup_state.find_all('table',{'class': 'wikitable sortable'})[0]
    col_names = ['Bush#','Bush%','Kerry#','Kerry%','Others#','Others%']
    trs = table.find_all('tr')
    count = 1
    while count < len(trs):
        tds = trs[count].find_all('td')
        county_temp = [tds[0].get_text().lstrip().strip(),convert_string_to_number(tds[1].get_text().lstrip().strip()),'NA%',convert_string_to_number(tds[2].get_text().lstrip().strip()),'NA%',0,'NA%']
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result  

def operate_for_DC_2004(soup_state, state_name):
    state_result = []
    table = ''
    table = soup_state.find_all('table', {'border':1})[1]
    col_names = ['Kerry%','Kerry#','Bush%','Bush#','Nedar%','Nedar#','Cobb%','Cobb#','Badnarik%','Badnarik#','Harris%','Harris#']
    trs = table.find_all('tr')
    count = 1
    while count < len(trs):
        tds = trs[count].find_all('td')
        county_temp = []
        for i in range(len(tds)):
            if i == 0:
                county_temp.append(tds[i].get_text().lstrip().strip())
            else:
                county_temp.append(convert_string_to_number(tds[i].get_text().lstrip().strip()))
        state_result.append(makeup_state_list(state_name, county_temp, col_names))
        count += 1
    return state_result

In [854]:
national_info_2004 = []
for i in range(len(state_names_all)):
    state_name = state_names_all[i]
    print('Start state ' + state_name)
    if state_name == 'Alaska':
        print('Finished state Alaska\n')
        continue
    content_state = requests.get(state_urls[i], headers = header)
    soup_state = BeautifulSoup(content_state.text, 'html.parser')
    print('good for initialization')
    
    # Set up bench mark
    bench_mark = {}
    if state_name == 'South Dakota':
        bench_mark = make_bench_south_dakota(soup_state, state_name)
    elif state_name == 'Nebraska':
        bench_mark = make_bench_Nebraska(soup_state, state_name)
    elif state_name == 'Vermont':
        bench_mark = make_bench_Vermont(soup_state, state_name)
    elif state_name == 'District of Columbia':
        bench_mark = make_bench_DC(soup_state, state_name)
    elif state_name == 'Indiana':
        bench_mark = make_bench_Indiana(soup_state, state_name)
    else:
        tables = soup_state.find_all('table')
        bench_table = find_bench_table_2004(tables)
        bench_mark = makeup_bench_mark_2004(bench_table)
    print('good for setting bench mark')
    
    # Run main Program
    state_result = []
    if state_name == 'Nebraska':
        table = soup_state.find('table',{'border': 1})
    elif state_name == 'District of Columbia':
        table = []
    else:
        table = soup_state.find('table', {'class': 'wikitable sortable'})
        indicator, col_names = find_format(table) # find (indicator)1.total number of columns 2.the No. of 'Total' (col_names) list of candidates
    total_count = 0
    print('good for finding format')
    
    if state_name == 'Florida':
        state_result = operate_for_florida_2004(soup_state, state_name)
    elif state_name == 'Oklahoma' or state_name == 'Utah':
        state_result = operate_for_oklahoma_2004(soup_state, state_name)
    elif state_name == 'Michigan':
        state_result = operate_for_Michigan_2004(soup_state, state_name)
    elif state_name == 'District of Columbia':
        state_result = operate_for_DC_2004(soup_state, state_name)
    else:
        for row in table.find_all('tr'):
            if total_count != 0: # Skip the first row of table
                county_temp = []
                tds = row.find_all('td')
                td_count = 0
                while td_count < indicator['Cols']:
                    if td_count == 0:
                        county_temp.append(tds[td_count].get_text().lstrip().strip())
                        td_count += 1
                    elif td_count != indicator['Total']:
                        county_temp.append(convert_string_to_number_2004(tds[td_count].get_text().lstrip().strip().replace('\xa0','').replace(' ','')))
                        td_count += 1
                    else:
                        td_count += 1
                secure_col_name(col_names, county_temp)
                if (state_name == 'Arizona' or state_name == 'Georgia' or state_name == 'Iowa' 
                    or state_name == 'Nevada' or state_name == 'Virginia' or state_name == 'New Hampshire' 
                    or state_name == 'New Jersey' or state_name == 'Nebraska' or state_name == 'Vermont'):
                    col_names = ['Kerry%','Kerry#','Bush%','Bush#','Others%','Others#']
                if state_name == 'Indiana':
                    col_names = ['Bush%','Bush#','Kerry%','Kerry#','Others%','Others#']
                if state_name == 'Kansas' or state_name == 'Montana' or state_name == 'North Dakota' or state_name == 'West Virginia' or state_name == 'Wyoming':
                    col_names = ['Bush#','Bush%','Kerry#','Kerry%','Others#','Others%']
                if state_name == 'Mississippi':
                    col_names = ['Kerry#','Kerry%','Bush#','Bush%','Others#','Others%']
                state_result.append(makeup_state_list(state_name, county_temp, col_names))
                total_count += 1
            else:
                total_count += 1
    
    # convert state_result to list for further creation of data frame
    state_result_list = []
    count1 = 0
    #print(state_result[0])
    for county in state_result:
        state_temp = []
        state_county = county[0] + ' - ' + county[1]
        candidates_col, votes_col, pct_col = make_up_columns(county[2])
        party_col = makeup_party(candidates_col, bench_mark)
        state_temp = [state_county, candidates_col, votes_col, pct_col, party_col]
        state_result_list.append(state_temp)
        count1+=1
    print('good for setting up list')
    national_info_2004 = national_info_2004 + state_result_list
    print('Finished state ' + state_name + '\n')
print('Done')

Start state Alabama
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Alabama

Start state Alaska
Finished state Alaska

Start state Arizona
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Arizona

Start state Arkansas
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Arkansas

Start state Colorado
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Colorado

Start state Florida
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Florida

Start state Georgia
good for initialization
Table found
good for setting bench mark
good for finding format
good for setting up list
Finished state Georgia

Start state 

In [855]:
national_df_2004 = pd.DataFrame(national_info_2004)
national_df_2004 = national_df_2004.rename(columns = {0:'State-County', 1:'1st, 2nd, 3rd', 2:'votes1, votes2, votes3', 3:'pct1, pct2, pct3', 4:'party1, party2, party3'})
national_df_2004.head(500)

Unnamed: 0,State-County,"1st, 2nd, 3rd","votes1, votes2, votes3","pct1, pct2, pct3","party1, party2, party3"
0,Alabama - Autauga,Bush / Kerry / Others,15196 / 4758 / 127,75.7% / 23.7% / 0.6%,Republican / Democratic / Others
1,Alabama - Baldwin,Bush / Kerry / Others,52971 / 15599 / 750,76.4% / 22.5% / 1.1%,Republican / Democratic / Others
2,Alabama - Barbour,Bush / Kerry / Others,5899 / 4832 / 46,54.7% / 44.8% / 0.4%,Republican / Democratic / Others
3,Alabama - Bibb,Bush / Kerry / Others,5472 / 2089 / 39,72.0% / 27.5% / 0.5%,Republican / Democratic / Others
4,Alabama - Blount,Bush / Kerry / Others,17386 / 3938 / 180,80.9% / 18.3% / 0.8%,Republican / Democratic / Others
5,Alabama - Bullock,Kerry / Bush / Others,3210 / 1494 / 13,68.1% / 31.7% / 0.3%,Democratic / Republican / Others
6,Alabama - Butler,Bush / Kerry / Others,4979 / 3413 / 24,59.2% / 40.6% / 0.3%,Republican / Democratic / Others
7,Alabama - Calhoun,Bush / Kerry / Others,29814 / 15083 / 352,65.9% / 33.3% / 0.8%,Republican / Democratic / Others
8,Alabama - Chambers,Bush / Kerry / Others,7622 / 5347 / 63,58.5% / 41.0% / 0.5%,Republican / Democratic / Others
9,Alabama - Cherokee,Bush / Kerry / Others,5923 / 3040 / 86,65.5% / 33.6% / 1.0%,Republican / Democratic / Others


In [856]:
national_df_2004.to_csv('election_2004.csv')