In [1]:
import concurrent.futures
import threading

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [101]:
#Scraping all the features
def get_all_specs(soup:BeautifulSoup) -> str:
    """Look up the specifications container of a parsed page.

    Args:
        soup: Parsed page to search.

    Returns:
        Whatever ``soup.find`` yields for the ``<div id="specs">`` lookup: the
        matching element, or ``None`` when there is no such div. Note that,
        despite the ``str`` annotation, the result is not a plain string.
    """
    specs_container = soup.find("div", attrs={"id": "specs"})
    return specs_container

#Putting all the features into a dictionary
def get_all_features(soup: BeautifulSoup) -> dict:
    """Collect every scraped attribute of one boat page into a flat dict.

    The specification entries are read from each ``<li>`` of each ``<ul>``
    inside the specs container returned by ``get_all_specs``:

    * an entry holding a ``list__value list__value--large`` span is stored as
      ``"Original price"`` and its key text (if any) as ``"TAXES PAID"``;
    * an entry with a value span but no key span is stored as ``"Price"``;
    * any other entry with both spans is stored under its own key text.

    Afterwards the results of ``get_location``, ``get_description``,
    ``get_categories``, ``get_equipment``, ``get_vat`` and ``get_id`` are added.

    Args:
        soup: Parsed boat detail page.

    Returns:
        Dict mapping feature names to the scraped values.
    """
    features = {}

    # A page without a specs container simply contributes no spec entries.
    # (Previously a "No specs" string was iterated character by character,
    # which crashed on the first ``.find_all`` call.)
    specs = get_all_specs(soup)
    spec_lists = specs.find_all("ul") if specs else []

    for ul_element in spec_lists:
        for li in ul_element.find_all("li"):
            key_element = li.find("span", attrs={"class": "list__key"})

            # The large value span marks the headline price entry.
            large_value = li.find("span", attrs={"class": "list__value list__value--large"})
            if large_value:
                features["Original price"] = large_value.text.strip()
                features["TAXES PAID"] = key_element.text.strip() if key_element else None
                continue

            value_element = li.find("span", attrs={"class": "list__value"})
            if value_element is None:
                # Nothing to record for an entry without a value span.
                continue

            value = value_element.text.strip()
            if key_element is None:
                features["Price"] = value
            else:
                features[key_element.text.strip()] = value

    features["Location"] = get_location(soup)
    features["Description"] = get_description(soup)
    features["Categories"] = get_categories(soup)

    # Equipment is parsed once; each item becomes its own key.
    equipment = get_equipment(soup)
    if equipment:
        features.update(equipment)

    features["VAT"] = get_vat(soup)
    features["Views, favourites, ID and listings"] = get_id(soup)
    return features
            
        

#Getting the selling location of the boat
def get_location(soup: BeautifulSoup) -> str:
    """Return the text of the ``<p class="text">`` inside ``<div id="location">``.

    Args:
        soup: Parsed boat detail page.

    Returns:
        The stripped paragraph text, or ``"No location"`` when either the
        location div or its text paragraph is missing.
    """
    location_block = soup.find("div", attrs={"id": "location"})
    if location_block is None:
        # Previously the "No location" string itself was searched with
        # ``.find("p", attrs=...)``, which raises TypeError on a str.
        return "No location"

    location_text = location_block.find("p", attrs={"class": "text"})
    if location_text is None:
        return "No location"
    return location_text.text.strip()
    

#Each page has a list of boats, this function extracts the URL of each boat
def get_individual_URL(soup: BeautifulSoup) -> list:
    """Extract the link of every boat listed on a results page.

    Each ``<h3 class="blurb__title">`` is expected to wrap an ``<a href=...>``.
    Headings without an anchor, or anchors without an ``href``, are skipped so
    that one malformed entry does not discard the other links of the page.

    Args:
        soup: Parsed listing page.

    Returns:
        List of ``href`` values in document order (possibly empty).
    """
    urls = []
    for heading in soup.find_all("h3", attrs={"class": "blurb__title"}):
        anchor = heading.find("a")
        href = anchor.get("href") if anchor is not None else None
        if href:
            urls.append(href)
    return urls

#Extracting the preview image of the boat
def get_image(soup: BeautifulSoup) -> str:
    """Return the ``src`` of the image inside the first ``<picture>`` element.

    Args:
        soup: Parsed boat detail page.

    Returns:
        The ``src`` attribute of the first ``<img>`` in the first ``<picture>``,
        or ``"No image"`` when there is no picture, no image inside it, or the
        image carries no ``src``.
    """
    pictures = soup.find_all("picture")
    if not pictures:
        return "No image"

    # Guard both lookups: a picture without <img>, or an <img> without src,
    # used to raise instead of falling back to the sentinel.
    first_img = pictures[0].find("img")
    src = first_img.get("src") if first_img is not None else None
    return src if src else "No image"

#Extracting all the images of the boat
def all_images(soup: BeautifulSoup) -> list:
    """Gather the image links found on a boat page.

    Two sources are combined, in this order:

    1. every ``<img>`` inside a ``<picture>`` whose ``src`` ends with ``.jpg``;
    2. the ``<li data-image-id="imageN">`` items, probed for N = 3, 4, 5, ...
       until the first N with no matching item. For each item the second
       candidate of its ``srcset`` (or ``data-srcset``) is taken, when the
       attribute lists at least two candidates.

    Args:
        soup: Parsed boat detail page.

    Returns:
        The list of collected links, or the string ``"No images"`` when the
        page contains no ``<picture>`` element at all.
    """
    picture_tags = soup.find_all("picture")

    # Source 1: direct .jpg sources of the <picture> images.
    collected = []
    for picture_tag in picture_tags:
        for img_tag in picture_tag.find_all("img"):
            source = img_tag.get("src")
            if source and source.endswith(".jpg"):
                collected.append(source)

    # Source 2: remaining gallery items, numbered consecutively from image3.
    image_number = 3
    gallery_item = soup.find("li", attrs={"data-image-id": f"image{image_number}"})
    while gallery_item:
        gallery_img = gallery_item.find("img")
        if gallery_img:
            srcset = gallery_img.get('srcset') or gallery_img.get('data-srcset')
            if srcset:
                candidates = srcset.split(',')
                if len(candidates) >= 2:
                    # Each candidate looks like "<url> <descriptor>"; keep the url.
                    collected.append(candidates[1].strip().split(' ')[0])
        image_number += 1
        gallery_item = soup.find("li", attrs={"data-image-id": f"image{image_number}"})

    if picture_tags:
        return collected
    return "No images"



#Extracting the description of the boat
def get_description(soup: BeautifulSoup) -> str:
    """Return the stripped text of the first ``<div class="content">``.

    Args:
        soup: Parsed boat detail page.

    Returns:
        The div's text without surrounding whitespace, or ``"No description"``
        when the page has no such div.
    """
    content_block = soup.find("div", attrs={"class": "content"})
    if content_block:
        return content_block.text.strip()
    return "No description"

#Getting the categories

def get_categories(soup: BeautifulSoup) -> list:
    """Return the comma-separated entries of the first paragraph in the specs.

    The direct children of ``<div id="specs">`` are scanned in order; the first
    child that contains a ``<p>`` provides the text, which is split on ``","``
    (the pieces are not stripped).

    Args:
        soup: Parsed boat detail page.

    Returns:
        List of text pieces, or ``None`` when the specs div is missing or none
        of its children contains a paragraph.
    """
    specs = soup.find("div", attrs={"id": "specs"})
    if specs is None:
        return None

    for child in specs:
        # Text nodes between tags are str subclasses: their ``.find`` is
        # ``str.find`` and returns a (truthy) -1 instead of an element, so
        # they must be skipped rather than searched.
        if isinstance(child, str):
            continue
        paragraph = child.find("p")
        if paragraph:
            return paragraph.text.split(",")
    return None

#Get likes, favourites, ID and listing date
def get_id(soup: BeautifulSoup) -> list:
    """Collect the text of the first ``<strong>`` inside every ``<li>`` of the page.

    Args:
        soup: Parsed boat detail page.

    Returns:
        List of ``<strong>`` texts in document order; empty when the page has
        no list items or none of them contains a ``<strong>``.
    """
    strong_texts = []
    # Iterate the (possibly empty) result directly. The former "No ID" string
    # fallback was iterated character by character and crashed.
    for item in soup.find_all("li"):
        strong = item.find("strong")
        if strong:
            strong_texts.append(strong.text)
    return strong_texts
            


#Get the equipment that is included with the boat

def get_equipment(soup: BeautifulSoup) -> dict:
    """Turn the equipment list of a boat page into presence flags.

    Args:
        soup: Parsed boat detail page.

    Returns:
        Dict mapping the text of every ``<li>`` inside ``<div id="equipment">``
        to ``1``, or ``None`` when the page has no equipment div.
    """
    equipment_section = soup.find("div", attrs={"id": "equipment"})
    if not equipment_section:
        return None

    flags = {}
    for entry in equipment_section.find_all("li"):
        flags[entry.text] = 1
    return flags
    

#Get the VAT status of the boat
def get_vat(soup: BeautifulSoup) -> str:
    """Return the text of the first ``<span class="list__key">`` on the page.

    Args:
        soup: Parsed boat detail page.

    Returns:
        The span's text, or ``"No VAT"`` when the span is missing or its text
        is empty.
    """
    key_span = soup.find("span", attrs={"class": "list__key"})
    if key_span is None:
        # Without this guard the ".text" access raised before the fallback
        # could ever apply to a page lacking the span.
        return "No VAT"
    return key_span.text or "No VAT"


def find_id(soup: BeautifulSoup) -> str:
    """Return the first ``<strong>`` text found in the page's info list.

    The direct children of ``<ul class="list list--space-8 l-mb-32">`` are
    scanned in order and the text of the first ``<strong>`` found inside one of
    them is returned.

    Args:
        soup: Parsed boat detail page.

    Returns:
        The ``<strong>`` text, or ``None`` when the list is missing or none of
        its children contains a ``<strong>``.
    """
    info_list = soup.find("ul", attrs={"class": "list list--space-8 l-mb-32"})
    if info_list is None:
        return None

    for child in info_list:
        # Text nodes are str subclasses whose ``.find`` is ``str.find``; it
        # returns a truthy -1 on a miss, so only real elements are searched.
        if isinstance(child, str):
            continue
        strong = child.find("strong")
        if strong:
            return strong.text
    return None


In [98]:
# Request headers passed to ``requests.get`` (browser-like User-Agent and
# preferred language).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept-Language": "en-UK, en;q=0.5",
}

# Download and parse a single boat detail page; ``soup`` is left in the
# namespace so the parsing helpers can be tried on it.
URL = "https://boat24.com/uk/sailingboats/beneteau/beneteau-57/detail/564009/"
webpage = requests.get(URL, headers=HEADERS)

soup = BeautifulSoup(webpage.content, "html.parser")



# NO MULTITHREADING


In [131]:
# Request headers passed to ``requests.get`` by the scraping functions.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept-Language": "en-UK, en;q=0.5",
}

# Global running boat number used by ``scrape`` to build the "Boat_<k>" keys.
k = 1
# Serialises access to the global boat counter ``k`` so that worker threads
# running ``scrape`` concurrently never hand out the same "Boat_<k>" key.
_BOAT_COUNTER_LOCK = threading.Lock()


def scrape(i: int, all_descriptors: dict, images: dict, timeout: float = 30):
    """Scrape every boat linked from one listing page into the shared dicts.

    The listing page ``?page={i*20}`` is downloaded, the boat links are
    extracted with ``get_individual_URL`` and each boat page is parsed with
    ``get_all_features`` and ``all_images``. Every successfully parsed boat is
    stored under a fresh ``"Boat_<k>"`` key, where ``k`` is the module-level
    counter, which is incremented under a lock.

    Errors are reported with ``print`` and never propagate: a failing listing
    page skips that page, a failing boat skips only that boat.

    Args:
        i: Listing page index; the requested ``page`` parameter is ``i * 20``.
        all_descriptors: Dict receiving ``"Boat_<k>" -> features`` (the
            features dict additionally gets a ``"URL"`` entry).
        images: Dict receiving ``"Boat_<k>" -> all_images(...)`` result.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block a worker indefinitely.
    """
    global k
    print(i)

    try:
        # TIMES 20 BECAUSE OF PAGE NUMBER
        page_url = f"https://www.boat24.com/uk/secondhandboats/?page={i*20}"
        listing_page = requests.get(page_url, headers=HEADERS, timeout=timeout)
        listing_soup = BeautifulSoup(listing_page.content, "html.parser")
        links = get_individual_URL(listing_soup)
    except Exception as e:
        # ``i`` is the listing page index, not a boat number.
        print(f"Error scraping page {i}: {e}")
        return

    for link in links:
        # Isolate each boat: one broken page must not drop the rest of the listing.
        try:
            boat_page = requests.get(link, headers=HEADERS, timeout=timeout)
            boat_soup = BeautifulSoup(boat_page.content, "html.parser")
            specs = get_all_features(boat_soup)
            if specs is None:
                continue
            specs["URL"] = link
            boat_images = all_images(boat_soup)
        except Exception as e:
            print(f"Error scraping boat {link}: {e}")
            continue

        # Reserve a unique key atomically; an unsynchronised read/increment of
        # ``k`` lets two threads write different boats under the same key.
        with _BOAT_COUNTER_LOCK:
            boat_key = f"Boat_{k}"
            k += 1

        all_descriptors[boat_key] = specs
        images[boat_key] = boat_images
            
            
            
def scrape_images(i: int, images: dict, timeout: float = 30):
    """Collect the image links of every boat linked from one listing page.

    The listing page ``?page={i*20}`` is downloaded, each linked boat page is
    parsed, and the result of ``all_images`` is stored in ``images`` under the
    identifier returned by ``find_id``. Identifiers already present in
    ``images`` are left untouched.

    Args:
        i: Listing page index; the requested ``page`` parameter is ``i * 20``.
        images: Dict receiving ``identifier -> all_images(...)`` result.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller indefinitely.
    """
    print(i)
    page_url = f"https://www.boat24.com/uk/secondhandboats/?page={i*20}"
    listing_page = requests.get(page_url, headers=HEADERS, timeout=timeout)
    listing_soup = BeautifulSoup(listing_page.content, "html.parser")

    for link in get_individual_URL(listing_soup):
        boat_page = requests.get(link, headers=HEADERS, timeout=timeout)
        boat_soup = BeautifulSoup(boat_page.content, "html.parser")

        key = find_id(boat_soup)
        if key is None:
            # No identifier found: storing under ``None`` would make all such
            # boats collide on a single meaningless key.
            continue
        if key not in images:
            images[key] = all_images(boat_soup)

# SCRAPING WITH MULTITHREADING

In [None]:
# Request headers passed to ``requests.get`` by the scraping functions.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept-Language": "en-UK, en;q=0.5",
}

# Result containers filled by ``scrape``, and the boat counter it increments.
all_descriptors = {}
images = {}
k = 1

# One ``scrape`` task per listing page index, run on a pool of 20 threads.
num_pages = 1295
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    tasks = [
        executor.submit(scrape, page_index, all_descriptors, images)
        for page_index in range(num_pages)
    ]

    # Wait for every task; ``result()`` re-raises anything a task let escape.
    for future in concurrent.futures.as_completed(tasks):
        future.result()
    
        
        

In [136]:
# One row per key of ``images``; each list element lands in its own column.
df_lists = pd.DataFrame.from_dict(images, orient="index")

# Move the keys out of the index into a regular first column.
df_lists_reset = df_lists.reset_index()

# Name the first column "Key" and the remaining ones "image 1", "image 2", ...
df_lists_reset.columns = [
    "Key",
    *(f"image {position}" for position in range(1, len(df_lists_reset.columns))),
]

In [142]:
# Keep the key plus the first 13 image columns ("image 1" ... "image 13").
keep = df_lists_reset[["Key"] + [f"image {number}" for number in range(1, 14)]]

In [152]:
# ``pd.DataFrame(all_descriptors)`` creates one column per top-level key of
# ``all_descriptors``; transposing turns those keys into the row index so that
# each row holds the features stored under one key.
df = pd.DataFrame(all_descriptors).T

In [151]:
# Turn the row index into a regular column and call that column "ID".
df = df.reset_index().rename(columns={'index': 'ID'})

In [146]:
# Join the feature rows with their image columns on the shared boat key
# (``ID`` on the left, ``Key`` on the right); only keys present in both remain.
merged_df = pd.merge(df, keep, how="inner", left_on="ID", right_on="Key")


In [150]:
# Write the merged table to CSV without the row index column.
merge_path = "../Data/For_henry/full_data_27k.csv"
merged_df.to_csv(path_or_buf=merge_path, index=False)