### Proxy Checker

This section checks whether a proxy is in use and whether it is working. It requests the same IP-echo URL with and without the proxy and compares the two responses: if they match, traffic is not going through the proxy.

In [4]:
# Proxy Checker
#
# Requests the same URL once through the configured proxy and once without an
# explicit proxy, then compares the two response bodies to decide whether the
# proxy is actually in effect.

import requests

url = "http://ident.me/"  # response body is treated as the caller's IP address

proxies = ""  # Fill in your proxy details

# Upper bound (seconds) for each lookup. requests applies no timeout by
# default, so without one a dead proxy would hang this cell indefinitely.
# Timeouts raise a RequestException subclass and are handled below.
PROXY_CHECK_TIMEOUT = 10

#Check Proxy IP
try:
    proxy_ip = requests.get(
        url,
        proxies={"http": proxies, "https": proxies},
        timeout=PROXY_CHECK_TIMEOUT,
    )
    proxy_ip.raise_for_status()
except requests.RequestException as e:
    print(f"Proxy check failed: {e}")
    proxy_ip = None

#Check for Real IP
try:
    real_ip = requests.get(url, timeout=PROXY_CHECK_TIMEOUT)
    real_ip.raise_for_status()
except requests.RequestException as e:
    print(f"Failed to get real IP: {e}")
    real_ip = None

if proxy_ip is not None and real_ip is not None:
    # Compare stripped bodies so stray surrounding whitespace (e.g. a trailing
    # newline) cannot make two identical addresses look different.
    proxy_address = proxy_ip.text.strip()
    real_address = real_ip.text.strip()

    if proxy_address == real_address:
        print("Currently not using proxy")
    else:
        print("Proxy working, current IP:", proxy_address)
else:
    # At least one lookup failed; its error was already printed above.
    print("Unable to determine proxy status")

Currently not using proxy


### Smithsonian Web Scraper

This cell scrapes the collections search of the Smithsonian's American History website. For each item listed on a results page it follows the link to the item's detail page, extracts the title, image URL, and topics, and saves everything to `data.csv`.


In [3]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

data = []

base_url = "https://americanhistory.si.edu/collections/search?page="
site_root = "https://americanhistory.si.edu"
pages = 1  # result pages to scrape (indices 0..pages-1); at most 1001, as only 1001 unique pages exist on this website

proxies = ""  # Update your proxy settings if needed (a single proxy URL)

# requests expects a mapping of URL scheme -> proxy URL rather than a bare
# string. None keeps requests' default behaviour when no proxy is configured.
proxy_map = {"http": proxies, "https": proxies} if proxies else None

# Upper bound (seconds) for every HTTP request. requests applies no timeout by
# default, so one stalled connection would otherwise block the whole scrape.
SCRAPE_TIMEOUT = 30

# Full class attribute of the <div> that wraps an item's image on a details page.
PHOTO_CONTAINER_CLASS = "media-container media--no-openaccess has-ids type--slideshowhtml"


def _get_soup(target_url, label):
    """Fetch ``target_url`` and return it parsed, or None if the request failed.

    ``label`` names the resource in the printed failure message. Timeouts,
    connection errors and HTTP error statuses are reported and turned into
    None so the caller can skip the resource and keep scraping.
    """
    try:
        response = requests.get(target_url, proxies=proxy_map, timeout=SCRAPE_TIMEOUT)
        response.raise_for_status()  # Ensure that the request was successful
    except requests.RequestException as e:
        print(f"Failed to retrieve {label}: {e}")
        return None
    return BeautifulSoup(response.text, "html.parser")


def _extract_photo(detail_soup):
    """Return the ``src`` of the first linked image in the photo container, or None."""
    container = detail_soup.find("div", class_=PHOTO_CONTAINER_CLASS)
    if container is None:
        return None
    a_tag = container.find("a")
    if a_tag is None:
        return None
    img_tag = a_tag.find("img")
    if img_tag is None:
        return None
    return img_tag.attrs.get("src")


def _extract_topics(detail_soup):
    """Return the list of topic strings on a details page, or None if there is no topic section."""
    topic_container = detail_soup.find("p", class_="edan-record__meta-detail freetext:topic")
    if topic_container is None:
        return None
    topic_spans = topic_container.find_all("span", class_="edan-record__meta-content")
    return [span.get_text(strip=True) for span in topic_spans]


for i in range(pages):
    soup = _get_soup(base_url + str(i), f"page {i}")
    if soup is None:
        continue

    for link in soup.find_all("h3", class_="c-card__title"):
        title_tag = link.find("a")
        if title_tag is None:
            print("No title found for a link")
            continue

        item = {'Title': title_tag.get_text()}

        href = title_tag.attrs.get("href", "")
        if href:
            # urljoin copes with both site-relative and absolute hrefs.
            url_B = urljoin(site_root, href)
            soupB = _get_soup(url_B, f"details page {url_B}")
            if soupB is None:
                # Details unavailable: the item is skipped entirely.
                continue
            photo = _extract_photo(soupB)
            topics = _extract_topics(soupB)
        else:
            # Without a link there is no details page to fetch; keep the title
            # and fall back to the placeholders instead of requesting the site root.
            print(f"No details link found for item: {item['Title']}")
            photo = None
            topics = None

        # An empty src counts as missing, an empty topic list does not.
        item['Picture'] = photo if photo else "Photo not found"
        item['Topic'] = topics if topics is not None else "Topics were not found"

        data.append(item)

# Explicit columns keep the CSV header stable even when nothing was scraped.
df = pd.DataFrame(data, columns=["Title", "Picture", "Topic"])

#Saving data to CSV
df.to_csv("data.csv", index=False)
