In [10]:
#!/usr/bin/env python

'''
GA Data Science Q2 2016

Code walk-through 4: Web scraping using BeautifulSoup
'''

import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from bs4 import BeautifulSoup

BREXIT_URL = 'https://ig.ft.com/sites/brexit-polling/'

# Read HTML
html = requests.get(BREXIT_URL).text

# Parse HTML into a BeautifulSoup object
soup = BeautifulSoup(html)

# Extract first (and only) table, and pass it to pandas for parsing
brexit = pd.read_html(soup.table.decode())[0]

# Alternatively, pandas can parse the entire document and extract all tables it
# finds directly (hence the indexing to extract the first element)
brexit = pd.read_html(html)[0]

# Rename columns
brexit.columns = ['stay', 'leave', 'undecided', 'date', 'pollster', 'n']

# Convert 'date' to `datetime`
brexit.date = pd.to_datetime(brexit.date)

# Convert 'n' to `int`
brexit.n = pd.to_numeric(brexit.n, errors='coerce')

# Homogenise pollster names
brexit.pollster = brexit.pollster.replace({
    'ComRes/Sunday Mirror/Independent': 'ComRes',
    'ICM/The Guardian': 'ICM',
    'ICM/The Telegraph': 'ICM',
    'Opinium/Observer': 'Opinium',
    'Populus/The Times': 'Populus',
    'Survation/Mail on Sunday': 'Survation',
    'TNS': 'TNS BMRB',
    'TNS-BMRB': 'TNS BMRB',
    'YouGov/Sunday Times': 'YouGov',
    'YouGov/The Sun': 'YouGov',
    'YouGov/The Times': 'YouGov'
})

# Select only polls from 2015 onwards
brexit = brexit[brexit.date >= '2015-01-01']

# Keep only data from pollster with >= 10 polls
pollster_n = brexit.pollster.value_counts()
brexit = brexit[brexit.pollster.isin(pollster_n[pollster_n >= 10].index)]

# Convert percentages to counts
brexit[['stay', 'leave', 'undecided']] =\
    brexit[['stay', 'leave', 'undecided']].multiply(brexit.n / 100, axis='index')

# Group by 'date' and 'pollster'
brexit = brexit.groupby(['date', 'pollster'], as_index=False).sum()

# Convert counts back to percentages
brexit[['stay', 'leave', 'undecided']] =\
    brexit[['stay', 'leave', 'undecided']].divide(brexit.n, axis='index')

# Save the data to disk
path_d = r'D:\Users\SHILAU\Desktop\Data Science Bootcamp\Data'
brexit.to_csv(os.path.join(path_d, 'brexit.csv'))




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
