In [1]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq playwright==1.46.0 --progress-bar off
!pip install -qqq html2text==2024.2.26 --progress-bar off
!pip install -qqq langchain-groq==0.1.9 --progress-bar off

In [2]:
# Download the Chromium build that the Playwright cells below launch.
!playwright install chromium

In [3]:
import re
from pprint import pprint
from typing import List, Optional
import os

import html2text
import nest_asyncio
import pandas as pd
from google.colab import userdata
from langchain_groq import ChatGroq
from playwright.async_api import async_playwright
from pydantic import BaseModel, Field
from tqdm import tqdm

# NOTE(review): presumably applied so asyncio code can run inside the event loop
# the notebook kernel already owns — confirm it is still required.
nest_asyncio.apply()

## Fetch Web Content as Markdown

In [4]:
# User-Agent header sent by every browser context created in this notebook.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/129.0.6668.90 Safari/537.36"
)

In [5]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch()

context = await browser.new_context(user_agent=USER_AGENT)

page = await context.new_page()
await page.goto("https://playwright.dev/")
content = await page.content()

await browser.close()
await playwright.stop()

In [6]:
# Show the raw HTML returned by the browser.
print(content)

<!DOCTYPE html><html lang="en" dir="ltr" class="plugin-pages plugin-id-default" data-has-hydrated="false" data-theme="light" data-rh="lang,dir"><head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v3.5.2">
<title>Playwright</title><meta data-rh="true" name="twitter:card" content="summary_large_image"><link data-rh="true" rel="icon" href="/img/playwright-logo.svg"><link rel="search" type="application/opensearchdescription+xml" title="Playwright" href="/opensearch.xml">
<script src="/js/redirection.js"></script><link rel="stylesheet" href="/assets/css/styles.2cb9826f.css">
<script src="/assets/js/runtime~main.75118302.js" defer="defer"></script>
<script src="/assets/js/main.ea4099e6.js" defer="defer"></script>
<meta property="og:title" content="Playwright" data-rh="true"><meta name="viewport" content="width=device-width, initial-scale=1.0" data-rh="true"></head>
<body class="navigation-with-keyboard" data-rh="class">
<script>!function(){function t(t){document.document

In [7]:
# Convert the fetched HTML to Markdown text.
markdown_converter = html2text.HTML2Text()
# Keep hyperlinks in the converted output.
markdown_converter.ignore_links = False
markdown_content = markdown_converter.handle(content)

In [8]:
# Show the Markdown produced from the page HTML.
print(markdown_content)

Skip to main content

[![Playwright logo](/img/playwright-logo.svg)![Playwright
logo](/img/playwright-
logo.svg)**Playwright**](/)[Docs](/docs/intro)[API](/docs/api/class-
playwright)

Node.js

  * [Node.js](/)
  * [Python](/python/)
  * [Java](/java/)
  * [.NET](/dotnet/)

[Community](/community/welcome)

[](https://github.com/microsoft/playwright)[](https://aka.ms/playwright/discord)

Search

# Playwright enables reliable end-to-end testing for modern web apps.

[Get
started](/docs/intro)[Star](https://github.com/microsoft/playwright)[66k+](https://github.com/microsoft/playwright/stargazers)

  
  
  

![Browsers \(Chromium, Firefox, WebKit\)](img/logos/Browsers.png)

### Any browser • Any platform • One API

**Cross-browser.** Playwright supports all modern rendering engines including
Chromium, WebKit, and Firefox.

**Cross-platform.** Test on Windows, Linux, and macOS, locally or on CI,
headless or headed.

**Cross-language.** Use the Playwright API in
[TypeScript](https://playwrig

## LLM Setup

In [9]:
MODEL = "llama-3.1-70b-versatile"

# Never commit the API key to the notebook. Use GROQ_API_KEY from the
# environment when it is already set, otherwise read it from Colab's secret
# store (add a secret named GROQ_API_KEY in the Colab "Secrets" panel).
# NOTE(review): the key that was previously hardcoded here has been exposed and
# should be revoked.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")

# temperature=0 is passed so extraction output is as repeatable as the model allows.
llm = ChatGroq(temperature=0, model_name=MODEL, api_key=os.environ["GROQ_API_KEY"])


In [10]:
# System message sent with every extraction request (see the `invoke` call below).
SYSTEM_PROMPT = """
You're an expert text extractor. You extract information from webpage content.
Always extract data without changing it and any other output.
"""


def create_scrape_prompt(page_content: str) -> str:
    """Build the user message asking the model to extract data from a page.

    Args:
        page_content: Text of the web page; it is embedded unchanged between
            two triple-backtick fence lines.

    Returns:
        The instruction line followed by the fenced page content, with no
        leading or trailing newline.
    """
    instruction = "Extract the information from the following web page:"
    fence = "```"
    return f"{instruction}\n{fence}\n{page_content}\n{fence}"

In [11]:
async def fetch_page(url, user_agent=USER_AGENT) -> str:
    """Load a URL in headless Chromium and return the page as Markdown.

    Args:
        url: Address of the page to open.
        user_agent: User-Agent string for the browser context. Defaults to the
            notebook-level ``USER_AGENT``.

    Returns:
        The page HTML converted to Markdown, with hyperlinks kept.

    Raises:
        Whatever Playwright raises when launching the browser or navigating;
        the browser and the Playwright driver are shut down before the
        exception propagates.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        try:
            # Use the caller-supplied user agent (the global constant was
            # previously used here, silently ignoring the parameter).
            context = await browser.new_context(user_agent=user_agent)

            page = await context.new_page()
            await page.goto(url)
            content = await page.content()
        finally:
            # Always release the browser, even when navigation fails.
            await browser.close()

    markdown_converter = html2text.HTML2Text()
    markdown_converter.ignore_links = False
    return markdown_converter.handle(content)

## Scrape Car Listings

In [12]:
# AutoScout24 search-results URL, written one query parameter per line.
url = (
    "https://www.autoscout24.com/lst"
    "?atype=C"
    "&cy=D%2CA%2CB%2CE%2CF%2CI%2CL%2CNL"
    "&desc=0"
    "&fregfrom=2018"
    "&gear=M"
    "&powerfrom=309"
    "&powerto=478"
    "&powertype=hp"
    "&search_id=1tih4oks815"
    "&sort=standard"
    "&ustate=N%2CU"
)

In [13]:
# Fetch the search-results page and keep its Markdown rendering for the LLM.
auto_content = await fetch_page(url)

In [14]:
# Show the Markdown that will be sent to the model.
print(auto_content)

Skip to main content

AutoScout24 is currently only available to a limited extent due to maintenance
work. This affects some functions such as contacting salespeople, logging in
or managing your vehicles for sale.

[ ![auto24-logo](/assets/as24-header-footer/as24-horizontal-
inverse.d34ff335.svg) ](https://www.autoscout24.com/ "AutoScout24 - Used and
new cars")

[ ](https://www.autoscout24.com/favorites)

  * [ Used and New Cars ](https://www.autoscout24.com/)
  * [ Motorbikes ](https://www.autoscout24.com/motorcycle/)

  * [ ](https://www.autoscout24.com/favorites)
  * English 

[ ![](/assets/as24-header-footer/flag-de.013a09fe.svg) Deutschland
](https://www.autoscout24.de/) [ ![](/assets/as24-header-footer/flag-
it.5021f5d3.svg) Italia ](https://www.autoscout24.it/) [
![](/assets/as24-header-footer/flag-at.6eb37b36.svg) Österreich
](https://www.autoscout24.at/) [ ![](/assets/as24-header-footer/flag-
nl.843ae853.svg) Nederland  ](https://www.autoscout24.nl/) [
![](/assets/as24-header-

In [15]:
class CarListing(BaseModel):
    """Information about a car listing.

    The help text of every field is passed as ``description=`` so it ends up in
    the schema handed to the model. Passing it as the first positional argument
    of ``Field`` made it the field's *default value* instead, which left the
    schema without descriptions and let a missing field silently take its own
    help text as its value.

    ``mileage`` and ``year`` may be absent from a listing and default to
    ``None``; all other fields are required.
    """

    make: str = Field(description="Make of the car e.g. Toyota")
    model: str = Field(
        description="Model of the car, maximum 3 words e.g. Land Cruiser"
    )
    horsepower: int = Field(description="Horsepower of the engine e.g. 231")
    price: int = Field(description="Price in euro e.g. 34000")
    mileage: Optional[int] = Field(
        default=None,
        description="Number of kilometers on the odometer e.g. 73400",
    )
    year: Optional[int] = Field(
        default=None,
        description="Year of registration (if available) e.g. 2015",
    )
    url: str = Field(
        description="Url to the listing e.g. https://www.autoscout24.com/offers/lexus-rc-f-advantage-coupe-gasoline-grey-19484ec1-ee56-4bfd-8769-054f03515792"
    )


class CarListings(BaseModel):
    """List of car listings.

    Container schema used as the structured-output target. ``cars`` is
    required; its help text is supplied via ``description=`` rather than as the
    positional default value of ``Field``.
    """

    cars: List[CarListing] = Field(description="List of cars for sale.")

In [16]:
# Wrap the chat model so its replies are returned as CarListings objects.
car_listing_scraper_llm = llm.with_structured_output(CarListings)

In [17]:
# Send the system instructions plus the fenced page Markdown to the model.
messages = [
    ("system", SYSTEM_PROMPT),
    ("user", create_scrape_prompt(auto_content)),
]
extraction = car_listing_scraper_llm.invoke(messages)

In [18]:
# Inspect the listings the model extracted.
extraction.cars

[CarListing(make='Dacia', model='SANDERO STREETWAY 1.0 TCE ECO-G COMFORT SL DACIA', horsepower=540, price=13900, mileage=42278, year=2021, url='/offers/dacia-others-sandero-streetway-1-0-tce-eco-g-comfort-sl-dacia-lpg-red-ddc51fca-5769-44c4-9439-9cb16bc92fcc'),
 CarListing(make='Porsche', model='718 Spyder PCCB/Bose/Full PPF/Cupseats/PDLS+/Manual', horsepower=420, price=107900, mileage=19500, year=2020, url='/offers/porsche-718-spyder-pccb-bose-full-ppf-cupseats-pdls-manual-gasoline-blue-364a3ceb-bb85-44bd-bc2b-115cedacff18'),
 CarListing(make='Tesla', model='Model S Long Range *FSD* *21Alu*', horsepower=562, price=39820, mileage=61600, year=2019, url='/offers/tesla-model-s-long-range-fsd-21alu-electric-white-4d4a1be4-7e9b-40d6-9fb9-1b1aae56e381'),
 CarListing(make='BMW', model='M3 M3 M Schalensitze Carbon/LED/LIVE COCKPIT/M Drive', horsepower=480, price=68950, mileage=30000, year=2021, url='/offers/bmw-m3-m3-m-schalensitze-carbon-led-live-cockpit-m-drive-gasoline-grey-1f5c7725-2967-45

In [19]:
def filter_model(row, max_words=3):
    """Reduce a free-text model name to its first few alphanumeric words.

    Every run of characters other than ASCII letters and digits is treated as
    a separator. Empty tokens are dropped, so leading or trailing punctuation
    (e.g. ``"*FSD* Model S"``) neither adds stray spaces to the result nor uses
    up one of the word slots.

    Args:
        row: Model name as extracted from the listing.
        max_words: Maximum number of words to keep. Defaults to 3.

    Returns:
        Up to ``max_words`` words joined by single spaces; an empty string
        when ``row`` contains no letters or digits.
    """
    cleaned = re.sub("[^0-9a-zA-Z]+", " ", row)
    # split() without an argument discards the empty tokens that split(" ")
    # would keep at the edges.
    words = cleaned.split()
    return " ".join(words[:max_words])


# One attribute dict per extracted listing becomes one DataFrame row.
rows = [vars(car) for car in extraction.cars]

listings_df = pd.DataFrame(rows)
# Shorten the verbose model strings returned by the model.
listings_df["model"] = listings_df["model"].apply(filter_model)
listings_df

Unnamed: 0,make,model,horsepower,price,mileage,year,url
0,Dacia,SANDERO STREETWAY 1,540,13900,42278.0,2021.0,/offers/dacia-others-sandero-streetway-1-0-tce...
1,Porsche,718 Spyder PCCB,420,107900,19500.0,2020.0,/offers/porsche-718-spyder-pccb-bose-full-ppf-...
2,Tesla,Model S Long,562,39820,61600.0,2019.0,/offers/tesla-model-s-long-range-fsd-21alu-ele...
3,BMW,M3 M3 M,480,68950,30000.0,2021.0,/offers/bmw-m3-m3-m-schalensitze-carbon-led-li...
4,Ford,Mustang Fastback 5,446,51980,,,/offers/ford-mustang-fastback-5-0-v8-gt-4-99-f...
5,BMW,M3 Competition M,510,88980,6811.0,2023.0,/offers/bmw-m3-competition-m-xdrive-touring-se...
6,BMW,M3 Limousine M,480,69700,33062.0,2021.0,/offers/bmw-m3-limousine-m-drivers-pack-carbon...
7,Dodge,Challenger 6 4,492,42487,35320.0,2019.0,/offers/dodge-challenger-6-4-r-t-scat-pack-bli...
8,Porsche,991 911 GT3,500,178911,9000.0,2018.0,/offers/porsche-991-911-gt3-touring-gasoline-r...
9,Porsche,911 GT3 991,500,220000,10758.0,2018.0,/offers/porsche-911-gt3-991-2-handschalter-gas...


In [20]:
# Write the listings to disk without the DataFrame index column.
listings_df.to_csv("car-listings.csv", index=False)