In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
from zipfile import ZipFile
from io import BytesIO

import requests
import pandas as pd
import re

In [2]:
# Scrape the Capital Bikeshare index page for the URLs of the zipped CSV archives.
# Result: 'links', a list of every absolute https:// href found on the page.

#define 'sourceURL' as the target page that contains the list of zipped csv files
sourceURL = "https://s3.amazonaws.com/capitalbikeshare-data/index.html"

#path to a local chrome driver binary
#requires chrome driver from https://sites.google.com/a/chromium.org/chromedriver/downloads
#Must change path to location of your chrome driver
chromedriverPath = '/Users/cheese/lData/chromedriver'

#call chrome webdriver from the local source as 'driver'
driver = webdriver.Chrome(chromedriverPath)

try:
    #use webdriver to call the URL
    #Target URL launches in Chrome browser controlled by automated test software
    driver.get(sourceURL)

    #use WebDriverWait to make sure the page loads before moving on, check by table visibility
    #(raises a timeout error if no element with class "table" becomes visible within 10 seconds)
    wait = WebDriverWait(driver, 10)
    element = wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "table")))

    #pass the rendered page to beautiful soup as 'soup'
    soup = BeautifulSoup(driver.page_source, 'lxml')
finally:
    #always shut the browser down, even when the wait times out; otherwise every
    #run of this cell leaves a Chrome/chromedriver process behind.
    #The page source is already captured in 'soup', so the session is not needed afterwards.
    driver.quit()

#scrape links from beautiful soup into 'links' (only anchors whose href contains "https://")
links = [link.get('href') for link in soup.findAll('a', attrs={'href': re.compile("https://")})]

#confirm by viewing all entries in the 'links' list
print(links)

['https://s3.amazonaws.com/capitalbikeshare-data/2010-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/2011-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/2012-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/2013-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/2014-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/2015-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/2016-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/2017-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/201801-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/201802-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/201803-capitalbikeshare-tripdata.zip', 'https://s3.amazonaws.com/capitalbikeshare-data/201804-cap

In [3]:
# Download every zip archive listed in 'links' and combine the rows of all CSV
# files they contain into a single dataframe 'df'.
#
# Every CSV member of every archive is read exactly once. Some archives hold
# several CSV files (e.g. one per quarter), so each member is opened by its own
# name rather than always opening the first entry of the archive.
frames = [] #one dataframe per extracted csv; concatenated once after the loop

for fileURL in links: #iterate over each stored URL in the 'links' list
    if not fileURL.endswith('.zip'): #Only process URLs ending in .zip to skip scraped links such as index.html
        continue

    content = requests.get(fileURL, timeout=60) #use requests.get to store each URL's response as 'content'
    content.raise_for_status() #fail loudly on an HTTP error instead of handing an error page to ZipFile

    #use ZipFile to open and read zip data directly from memory with BytesIO;
    #the 'with' blocks close the archive and each member handle when done
    with ZipFile(BytesIO(content.content)) as zf:
        for item in zf.namelist():
            #skip anything that is not a csv (directories, metadata entries such as __MACOSX/)
            if item.startswith('__MACOSX/') or not item.lower().endswith('.csv'):
                continue

            print("File in zip: "+ item)

            #extract the current csv from the zip and read it into a dataframe 'df_n'
            with zf.open(item) as csvFile:
                df_n = pd.read_csv(csvFile, low_memory=False) #df_n will be overwritten with each iteration
            frames.append(df_n)

#Build 'df' once from all collected frames (appending inside the loop re-copies
#all earlier rows on every iteration, and DataFrame.append no longer exists in pandas 2.x).
#ignore_index=True gives the combined frame one continuous, unique row index.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
#all file names should display in the order that they are processed

File in zip: 2010-capitalbikeshare-tripdata.csv
File in zip: 2011-capitalbikeshare-tripdata.csv
File in zip: 2012Q1-capitalbikeshare-tripdata.csv
File in zip: 2012Q2-capitalbikeshare-tripdata.csv
File in zip: 2012Q3-capitalbikeshare-tripdata.csv
File in zip: 2012Q4-capitalbikeshare-tripdata.csv
File in zip: 2013Q1-capitalbikeshare-tripdata.csv
File in zip: 2013Q2-capitalbikeshare-tripdata.csv
File in zip: 2013Q3-capitalbikeshare-tripdata.csv
File in zip: 2013Q4-capitalbikeshare-tripdata.csv
File in zip: 2014Q1-capitalbikeshare-tripdata.csv
File in zip: 2014Q2-capitalbikeshare-tripdata.csv
File in zip: 2014Q3-capitalbikeshare-tripdata.csv
File in zip: 2014Q4-capitalbikeshare-tripdata.csv
File in zip: 2015Q1-capitalbikeshare-tripdata.csv
File in zip: 2015Q2-capitalbikeshare-tripdata.csv
File in zip: 2015Q3-capitalbikeshare-tripdata.csv
File in zip: 2015Q4-capitalbikeshare-tripdata.csv
File in zip: 2016Q1-capitalbikeshare-tripdata.csv
File in zip: 2016Q2-capitalbikeshare-tripdata.csv
File

In [4]:
#display the dataframe 'df' that contains all data from Cap Bike Share
#(a bare last expression is rendered by the notebook as a table; no print call is involved)
df

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,1012,2010-09-20 11:27:04,2010-09-20 11:43:56,31208,M St & New Jersey Ave SE,31108,4th & M St SW,W00742,Member
1,61,2010-09-20 11:41:22,2010-09-20 11:42:23,31209,1st & N St SE,31209,1st & N St SE,W00032,Member
2,2690,2010-09-20 12:05:37,2010-09-20 12:50:27,31600,5th & K St NW,31100,19th St & Pennsylvania Ave NW,W00993,Member
3,1406,2010-09-20 12:06:05,2010-09-20 12:29:32,31600,5th & K St NW,31602,Park Rd & Holmead Pl NW,W00344,Member
4,1413,2010-09-20 12:10:43,2010-09-20 12:34:17,31100,19th St & Pennsylvania Ave NW,31201,15th & P St NW,W00883,Member
5,982,2010-09-20 12:14:27,2010-09-20 12:30:50,31109,7th & T St NW,31200,Massachusetts Ave & Dupont Circle NW,W00850,Member
6,930,2010-09-20 12:15:26,2010-09-20 12:30:56,31109,7th & T St NW,31105,14th & Harvard St NW,W00804,Member
7,1659,2010-09-20 12:16:36,2010-09-20 12:44:15,31111,10th & U St NW,31208,M St & New Jersey Ave SE,W01084,Member
8,2496,2010-09-20 12:18:38,2010-09-20 13:00:15,31600,5th & K St NW,31601,19th & East Capitol St SE,W00812,Member
9,1487,2010-09-20 12:19:46,2010-09-20 12:44:34,31703,Minnesota Ave Metro/DOES,31603,1st & M St NE,W00803,Member
