#### QUESTION 1 Scrape the following website and store the data as json file(url = 'http://www.bu.edu/president/boston-university-facts-stats/').

In [1]:
import requests
from bs4 import BeautifulSoup
import json

url = 'http://www.bu.edu/president/boston-university-facts-stats/'

def scrape_bu_facts(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        data = {}

        # Extract relevant information from the HTML
        data['title'] = soup.title.text.strip()
        data['facts'] = []

        for fact_element in soup.find_all('div', class_='factoid'):
            fact = {}
            fact['category'] = fact_element.find('span', class_='category').text.strip()
            fact['value'] = fact_element.find('span', class_='value').text.strip()
            data['facts'].append(fact)

        return data
    else:
        print(f"Failed to fetch content. Status code: {response.status_code}")
        return None

def save_to_json(data, filename='bu_facts.json'):
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=2)
    print(f"Data saved to {filename}")

# Scrape the data
bu_data = scrape_bu_facts(url)

if bu_data:
    # Save the data to a JSON file
    save_to_json(bu_data)


Data saved to bu_facts.json


#### QUESTION 2: Extract the table in this url (https://archive.ics.uci.edu/ml/datasets.php) and change it to a json file


In [10]:
import pandas as pd

# URL of the page containing the table
url = 'https://archive.ics.uci.edu/ml/datasets.php'

# Read HTML table into a list of DataFrame objects
tables = pd.read_html(url)

uci_dataset_table = tables[0]

# Convert DataFrame to JSON
json_data = uci_dataset_table.to_json(orient='records', lines=True)

# Save JSON data to a file
with open('uci_dataset_table.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_data)

print("Data saved to uci_dataset_table.json")


HTTPError: HTTP Error 404: Not Found

#### QUESTION 3 Scrape the presidents table and store the data as json(https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). The table is not very structured and the scrapping may take very long time.

In [11]:
import requests
from bs4 import BeautifulSoup
import json

# URL of the Wikipedia page
url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'

# Send an HTTP request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table containing the list of presidents
    presidents_table = soup.find('table', {'class': 'wikitable'})

    # Extract data from the table
    presidents_data = []
    for row in presidents_table.find_all('tr')[1:]:  # Skip the header row
        columns = row.find_all(['th', 'td'])
        president_info = {
            'number': columns[0].text.strip(),
            'name': columns[1].text.strip(),
            'start_date': columns[3].text.strip(),
            'end_date': columns[4].text.strip() if len(columns) > 4 else ''
        }
        presidents_data.append(president_info)

    # Store the data as JSON
    with open('presidents_data.json', 'w', encoding='utf-8') as json_file:
        json.dump(presidents_data, json_file, ensure_ascii=False, indent=2)

    print('Data saved as presidents_data.json')
else:
    print(f'Failed to retrieve the page. Status code: {response.status_code}')


Data saved as presidents_data.json
