## Tutorial Notebook

In [None]:
import csv
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from requests.exceptions import RequestException
import time

def extract_words_from_url(url, timeout=10):
    """Scrape five descriptive fields from the page at *url*.

    The result always has exactly five entries, in a fixed order, so that
    each position maps to the same output column for every page:

        1. price range
        2. review score
        3. categories (comma-separated)
        4. address
        5. opening hours (comma-separated "day: hours" pairs)

    A field that cannot be found on the page is returned as an empty string
    instead of being dropped, which keeps later fields from shifting into
    the wrong position.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``session.get``. Without a timeout a stalled
            connection would block the whole run indefinitely.

    Returns:
        A list of five strings. If the request fails (connection error,
        timeout, or an HTTP error status), the error is printed and five
        empty strings are returned.
    """
    try:
        # The context manager closes the session's pooled connections even
        # when the request raises.
        with requests.Session() as session:
            # Retry transient server errors on GET with exponential backoff.
            retry_strategy = Retry(
                total=3,
                backoff_factor=1,
                status_forcelist=[500, 502, 503, 504],
                allowed_methods=["GET"]
            )
            adapter = HTTPAdapter(max_retries=retry_strategy)
            session.mount("https://", adapter)
            session.mount("http://", adapter)

            # Browser-like headers; some sites reject the default client
            # User-Agent.
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
            }

            response = session.get(url, headers=headers, timeout=timeout)
            # Turn 4xx/5xx responses into an exception handled below.
            response.raise_for_status()
            html = response.text

        soup = BeautifulSoup(html, 'html.parser')

        # NOTE(review): the class names below look build-generated and may
        # change when the site is redeployed -- verify against a live page.

        # Slot 1: price range
        price_range = ""
        price_range_element = soup.find('span', class_='y-css-1tsir1e')
        if price_range_element is not None:
            price_range = price_range_element.text.strip()

        # Slot 2: review score
        # NOTE(review): this is the same class used for the category spans
        # below, so the first match may not be the score -- confirm.
        review_score = ""
        review_score_element = soup.find('span', class_='y-css-1jz061g')
        if review_score_element is not None:
            review_score = review_score_element.text.strip()

        # Slot 3: categories -- the text of the link inside each matching span
        category_links = (
            span.find('a')
            for span in soup.find_all('span', class_='y-css-1jz061g')
        )
        categories = [
            link.text.strip() for link in category_links if link is not None
        ]

        # Slot 4: address
        address = ""
        directions_element = soup.find('p', class_='y-css-jbomhy')
        if directions_element is not None:
            address = directions_element.text.strip()

        # Slot 5: opening hours, one "day: hours" entry per table row
        opening_hours = []
        hours_table = soup.find('table', class_='hours-table__09f24__KR8wh')
        if hours_table is not None:
            # Not every table has an explicit <tbody>; fall back to the
            # table itself rather than calling find_all on None.
            rows_parent = hours_table.find('tbody')
            if rows_parent is None:
                rows_parent = hours_table
            for row in rows_parent.find_all('tr'):
                day_element = row.find('th')
                hours_element = row.find('td')
                if day_element is not None and hours_element is not None:
                    day = day_element.text.strip()
                    hours = hours_element.text.strip()
                    opening_hours.append(f"{day}: {hours}")

        return [
            price_range,
            review_score,
            ", ".join(categories),
            address,
            ", ".join(opening_hours),
        ]

    except RequestException as e:
        print(f"Error scraping {url}: {e}")  # Log the error message
        return [""] * 5  # Same shape as the success path

    finally:
        # Placeholder per-request pause; currently adds no delay.
        time.sleep(0)

def scrape_urls_from_csv(input_csv, output_csv):
    """Scrape every URL listed in *input_csv* and write the results to *output_csv*.

    Each input row is copied to the output with the scraped values added in
    the columns 'Label 1' .. 'Label 5'. The output header is the input header
    followed by any of those label columns the input does not already have;
    without them in the header, ``csv.DictWriter`` rejects the row with a
    ``ValueError`` as soon as a label is added.

    Rows whose 'restaurant_url' cell is missing or blank are reported and
    skipped (they are not written to the output).

    Args:
        input_csv: Path of the CSV to read; must have a 'restaurant_url' column.
        output_csv: Path of the CSV to create (overwritten if it exists).
    """
    label_columns = [f'Label {number}' for number in range(1, 6)]

    # Open the input CSV for reading and the output CSV for writing
    with open(input_csv, mode='r', newline='', encoding='utf-8') as infile, \
         open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)

        # fieldnames is None for an empty input file; treat that as no columns.
        input_columns = list(reader.fieldnames or [])

        # Keep the input columns and append only the label columns that are
        # not already present, so the header never contains duplicates.
        fieldnames = input_columns + [
            column for column in label_columns if column not in input_columns
        ]

        # restval fills any label a page did not yield with an empty cell.
        writer = csv.DictWriter(outfile, fieldnames=fieldnames, restval='')
        writer.writeheader()

        for row in reader:
            # A short row yields None for the missing cell; normalise to ''.
            url = (row.get('restaurant_url') or '').strip()
            if not url:
                print(f"Skipping row with missing URL: {row}")
                continue

            # Print the URL being processed for debugging purposes
            print(f"Processing URL: {url}")

            extracted_words = extract_words_from_url(url)

            # zip stops at the shorter sequence, so a result with fewer than
            # five values fills only the first labels and more than five can
            # never create a column that is missing from the header.
            for column, word in zip(label_columns, extracted_words):
                row[column] = word

            writer.writerow(row)

            # Pause between URLs to avoid overwhelming the server
            time.sleep(1)  # 1-second delay

if __name__ == "__main__":
    # Where to read the list of URLs from, and where to write the results.
    # These are machine-specific paths; edit them before running.
    input_csv, output_csv = (
        '/Users/julesvalois/Downloads/URLs to scrape.csv',
        '/Users/julesvalois/Downloads/Scraped_results.csv',
    )

    # Run the scrape over every row of the input file.
    scrape_urls_from_csv(input_csv=input_csv, output_csv=output_csv)


### Key Details of the Code

**Session with Retry Logic**:  
This makes the code more robust by handling intermittent failures when accessing the URLs. It retries a request up to 3 times (`total=3`), including when the server answers with a 500, 502, 503, or 504 status, with increasing delays between attempts (exponential backoff).

**Browser-Like Headers**:  
The headers help disguise the script as a browser, preventing some websites from blocking it.

**HTML Parsing with BeautifulSoup**:  
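The code looks up specific elements by tag and CSS class and extracts their text: `<span>` elements for the price range, review score, and categories, a `<p>` element for the address, and a `<table>` for the opening hours.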

**CSV Handling**:  
The input CSV is read row by row, URLs are processed, and extracted data is written back into a new CSV with additional columns (`Label 1` to `Label 5`).

**Error Handling**:  
If a request fails, the error is logged, and the script moves to the next URL, ensuring that an error doesn't stop the entire process.

**Delay Between Requests**:  
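The `time.sleep(0)` in `extract_words_from_url` is a placeholder that currently adds no delay; the actual pause between requests is the `time.sleep(1)` at the end of each loop iteration in `scrape_urls_from_csv`. Either value can be increased to avoid overloading a server or getting IP-blocked.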
