In [8]:
import pandas as pd
import numpy as np
import requests
import zipfile
import os
import time
from azure.storage.blob import BlobServiceClient
from bs4 import BeautifulSoup



In [2]:
# Fetch the bucket's index page, identifying this client with a User-Agent header.
headers = {'User-Agent': 'Student-Project-Research-v1.0'}
# The timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(
    'https://s3.amazonaws.com/hubway-data/index.html',
    headers=headers,
    timeout=30,
)
# Fail loudly on a 4xx/5xx reply instead of carrying an error page into later cells.
response.raise_for_status()

In [27]:
# The base URL for the hubway data bucket
base_url = "https://s3.amazonaws.com/hubway-data/"
# The specific filename you want to download
file_name = "201512-hubway-tripdata.zip"
# Complete URL
full_url = base_url + file_name
# Data is written here first so a failed download never leaves a truncated
# archive under the final name.
partial_path = file_name + ".part"

# Best Practice: Identify your request with a User-Agent header
headers = {
    'User-Agent': 'Data-Analysis-Project-Research-v1.0'
}

print(f"Starting download for: {file_name}...")

try:
    # stream=True defers fetching the body; the `with` block releases the
    # connection even when the status is not 200 or the transfer fails midway.
    # The timeout stops the cell from hanging on an unresponsive server.
    with requests.get(full_url, headers=headers, stream=True, timeout=30) as response:
        # Check if the request was successful (Status Code 200)
        if response.status_code == 200:
            with open(partial_path, 'wb') as f:
                # 64 KiB chunks: far fewer write calls than tiny chunks.
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
            # Publish the file under its real name only once it is complete.
            os.replace(partial_path, file_name)
            print(f"Success! {file_name} has been downloaded.")
        else:
            print(f"Failed to download. Status code: {response.status_code}")

except (requests.RequestException, OSError) as e:
    # Network failures and local file errors are reported, not raised;
    # anything else (a programming error) is allowed to surface.
    print(f"An error occurred: {e}")
    # Drop whatever was written before the failure.
    if os.path.exists(partial_path):
        os.remove(partial_path)

Starting download for: 201512-hubway-tripdata.zip...
Success! 201512-hubway-tripdata.zip has been downloaded.


In [6]:
import zipfile
import os

# Archive to unpack and the folder that receives its contents.
# NOTE(review): the download cell in this notebook fetches
# "201512-hubway-tripdata.zip", while this cell opens the 201501 archive;
# confirm that file is already present before running from a fresh kernel.
zip_file_path = "201501-hubway-tripdata.zip"
extract_to_path = "./extracted_data"

# exist_ok=True makes re-running the cell harmless when the folder already exists.
os.makedirs(extract_to_path, exist_ok=True)

# Unpack every member of the archive into the target folder, then report where.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_to_path)
    print(f"Extracted files to: {extract_to_path}")

Extracted files to: ./extracted_data


In [7]:
import requests
from bs4 import BeautifulSoup

# Address of the bucket's HTML index page.
url = "https://s3.amazonaws.com/hubway-data/index.html"
# The timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply so an error page is never parsed as the listing.
response.raise_for_status()

In [10]:
# Parse the body of the fetched response using the 'html.parser' backend.
soup = BeautifulSoup(markup=response.text, features='html.parser')
# Print the parsed document as indented, well-formatted HTML.
print(soup.prettify())

<html>
 <head>
  <!--

  Amazon S3 Bucket listing.


  Copyright (C) 2008 Francesco Pasqualini

      This program is free software: you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation, either version 3 of the License, or
      (at your option) any later version.

      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with this program.  If not, see <http://www.gnu.org/licenses/>.

  -->
  <!--

  Modified by Nolan Lawson!  (http://nolanlawson.com).  I'm keeping the spirit of the
  GPL alive by issuing this with the same license!

  -->
  <title>
   Bucket loading...
  </title>
  <link href="//netdna.bootstra

In [11]:
# Look up the first <table> element in the parsed page; as the cell's last
# expression, the result is what the notebook displays.
soup.find(name='table')

<table class="hide-while-loading table table-striped">
<thead>
<tr>
<th>Name</th>
<th>Date Modified</th>
<th>Size</th>
<th>Type</th>
</tr>
</thead>
<tbody id="tbody-content">
</tbody>
</table>

In [41]:
# Grab the first <tbody> element of the parsed page. Despite its name, `table`
# holds the <tbody>, not the enclosing <table>; the next cell reads this name.
table = soup.find(name='tbody')
# Show the element's markup to check whether it contains any rows.
print(table)

<tbody id="tbody-content">
</tbody>


In [42]:
# Lazily turn every <tr> under `table` into the list of its stripped
# header/data cell texts.
row_texts = (
    [cell.get_text(strip=True) for cell in tr.find_all(["th", "td"])]
    for tr in table.find_all("tr")
)

# Keep only rows that actually produced at least one cell.
rows = [cells for cells in row_texts if cells]

print(rows)


[]


In [44]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# The listing rows are absent from the statically fetched HTML (the <tbody>
# printed earlier in this notebook is empty), so the page is loaded in a real
# browser and the rendered DOM is scraped instead.
driver = webdriver.Chrome()
try:
    driver.get("https://s3.amazonaws.com/hubway-data/index.html")

    # wait until rows are loaded; raises if none appear within 10 seconds
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#tbody-content tr"))
    )

    # Snapshot the rendered markup while the browser session is still alive.
    page_html = driver.page_source
finally:
    # Always shut the browser down, even if navigation or the wait fails,
    # so no Chrome process is left running.
    driver.quit()

soup = BeautifulSoup(page_html, "html.parser")

tbody = soup.find("tbody", id="tbody-content")

# Print each listing row as a list of its stripped cell texts.
for tr in tbody.find_all("tr"):
    cells = [td.get_text(strip=True) for td in tr.find_all("td")]
    print(cells)


['201501-hubway-tripdata.zip', 'May 26th 2016, 09:22:57 am', '182 KB', 'ZIP file']
['201502-hubway-tripdata.zip', 'May 26th 2016, 09:22:58 am', '94 KB', 'ZIP file']
['201503-hubway-tripdata.zip', 'May 26th 2016, 09:22:59 am', '258 KB', 'ZIP file']
['201504-hubway-tripdata.zip', 'May 26th 2016, 09:23:00 am', '1.50 MB', 'ZIP file']
['201505-hubway-tripdata.zip', 'May 26th 2016, 09:23:01 am', '4.14 MB', 'ZIP file']
['201506-hubway-tripdata.zip', 'May 26th 2016, 09:23:02 am', '4.03 MB', 'ZIP file']
['201507-hubway-tripdata.zip', 'May 26th 2016, 09:23:06 am', '4.89 MB', 'ZIP file']
['201508-hubway-tripdata.zip', 'May 26th 2016, 09:23:07 am', '4.45 MB', 'ZIP file']
['201509-hubway-tripdata.zip', 'May 26th 2016, 09:23:09 am', '4.21 MB', 'ZIP file']
['201510-hubway-tripdata.zip', 'May 26th 2016, 09:23:10 am', '3.60 MB', 'ZIP file']
['201511-hubway-tripdata.zip', 'May 26th 2016, 09:23:12 am', '2.53 MB', 'ZIP file']
['201512-hubway-tripdata.zip', 'May 26th 2016, 09:23:13 am', '1.32 MB', 'ZIP fil