#

# Exercise: Web scrape flexcar.gr

Web scrape [flexcar.gr](https://flexcar.gr).  
Get the features (brand, model, price, hp, gearbox, extras, etc.) for all leasing car offers.

In [1]:
# Site root; the relative car links scraped from the listing are appended to it.
base_url = 'https://flexcar.gr'
# English listing restricted to the category codes MIN, HAT, SED, SUV and LUX
# (the same five codes the paginated listing URLs further down use).
filtered_url = base_url + '/cars/?category=MIN&category=HAT&category=SED&category=SUV&category=LUX&lang=en'

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pandas.core.dtypes.common import is_hashable
import re
import time
import random

In [3]:
# Browser-like User-Agent passed to the requests made in this notebook.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36"
}


# Fetch the filtered listing page. Without a timeout, requests waits
# indefinitely on a server that never answers.
response = requests.get(filtered_url, headers=headers, timeout=30)

In [4]:
# Name the parser explicitly so the resulting tree does not depend on which
# optional parser libraries happen to be installed on the machine.
soup = BeautifulSoup(response.content, 'html.parser')

In [5]:
# Probe: href of the second <a> inside the first 'itemContainer' div.
soup.find_all('div', class_='itemContainer')[0].find_all('a')[1]['href']
# Keep every 'itemContainer' div of the listing page for the link extraction.
item_containers = soup.find_all('div', class_='itemContainer')

In [6]:
# Extract the car link (href of the second <a>) of every item container on one page
page_links = [container.find_all('a')[1]['href'] for container in item_containers]

In [7]:
# Show the hrefs collected from the listing page
page_links

['/cars/opel/opel-astra-gs-line-phev/?lang=en',
 '/cars/vw/vw-polo-life/?lang=en',
 '/cars/vw/vw-t-roc-life/?lang=en',
 '/cars/renault/renault-arkana-hybrid-petrol-e-tech-techno/?lang=en',
 '/cars/smart/smart-for-two-electric-pulse/?lang=en',
 '/cars/audi/audi-a3-petrol-30-comfort/?lang=en',
 '/cars/dacia/dacia-duster-prestige-4x4/?lang=en',
 '/cars/volvo/volvo-xc-40-electric-recharge-core/?lang=en',
 '/cars/skoda/skoda-enyaq-iv-80/?lang=en',
 '/cars/peugeot/peugeot-3008-petrol-active-plus-3/?lang=en',
 '/cars/vw/vw-golf-life-e-tsi-dsg/?lang=en',
 '/cars/skoda/skoda-fabia-petrol-ambition/?lang=en',
 '/cars/peugeot/peugeot-3008-diesel-active-plus-1/?lang=en',
 '/cars/hyundai/hyundai-tucson-distinctive/?lang=en',
 '/cars/mercedes/mercedes-glc-petrol-200-coupe/?lang=en',
 '/cars/mercedes/mercedes-glc-coupe-200d-4-matic-coupe/?lang=en',
 '/cars/renault/renault-clio-petrol-expression/?lang=en',
 '/cars/range-rover/range-rover-evoque-mild-hybrid-petrol-r-dynamic-s/?lang=en',
 '/cars/zhidou/z

In [8]:
def get_car_links(soup: BeautifulSoup) -> list[str]:
    """Return the car-page href of every listing entry in *soup*.

    Each ``div`` with class ``itemContainer`` contributes the ``href`` of
    its second ``<a>`` element. The hrefs are returned exactly as they
    appear in the page, in document order.

    Raises:
        IndexError: if a container holds fewer than two ``<a>`` elements.
        KeyError: if that ``<a>`` element has no ``href`` attribute.
    """
    return [
        container.find_all('a')[1]['href']
        for container in soup.find_all('div', class_='itemContainer')
    ]

In [9]:
# Fetch and parse the first car page found on the listing.
# The timeout stops the cell from hanging on an unresponsive server; the
# explicit parser keeps the parse independent of the installed parser libraries.
response = requests.get(base_url + page_links[0], headers=headers, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')

In [10]:
# Fetch and parse one fixed car page to explore its markup.
# The timeout stops the cell from hanging on an unresponsive server; the
# explicit parser keeps the parse independent of the installed parser libraries.
response = requests.get('https://flexcar.gr/cars/opel/opel-astra-gs-line-phev/?lang=en', headers=headers, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')

In [11]:
# Basic car info: show the first div with class 'carChars'
soup.find('div', class_='carChars')

<div class="carChars d-flex justify-content-between mt20 mb20 flex-wrap" data-v-46b0d998="" data-v-bce8aa42=""><div class="col3 col6-sm d-flex flex-column align-items-center text-center" data-v-bce8aa42=""><img alt="hp.svg" class="mb10" data-v-bce8aa42="" loading="lazy" src="https://d1icjjohiliu1c.cloudfront.net/hp_77f3714f49.svg"/> <span class="dark f500 pxs" data-v-bce8aa42="">180 hp</span></div> <div class="col3 col6-sm d-flex flex-column align-items-center text-center" data-v-bce8aa42=""><img alt="cc.svg" class="mb10" data-v-bce8aa42="" loading="lazy" src="https://d1icjjohiliu1c.cloudfront.net/cc_aa34a8596d.svg"/> <span class="dark f500 pxs" data-v-bce8aa42="">1600 cc</span></div> <div class="col3 col6-sm d-flex flex-column align-items-center text-center" data-v-bce8aa42=""><img alt="kwh.svg" class="mb10" data-v-bce8aa42="" loading="lazy" src="https://d1icjjohiliu1c.cloudfront.net/kwh_0ffed9c32d.svg"/> <span class="dark f500 pxs" data-v-bce8aa42="">1.2 liters / 100 km</span></div> 

In [12]:
# Text of every <span> inside the 'carChars' div
[span.text for span in soup.find('div', class_='carChars').find_all('span')]

['180 hp', '1600 cc', '1.2 liters / 100 km', 'petrol | automatic']

In [13]:
def get_hp(soup: BeautifulSoup):
    """Return the leading token of the first ``carChars`` span.

    For a span text of ``'180 hp'`` this yields ``'180'``. The value is
    returned as a string; no numeric conversion happens here.
    """
    char_spans = soup.find("div", class_="carChars").find_all("span")
    hp_text = char_spans[0].text
    return hp_text.split()[0]


def get_engine_size(soup: BeautifulSoup):
    """Return the leading token of the second ``carChars`` span.

    For a span text of ``'1600 cc'`` this yields ``'1600'`` (a string).
    """
    chars_div = soup.find("div", class_="carChars")
    second_span = chars_div.find_all("span")[1]
    tokens = second_span.text.split()
    return tokens[0]


def get_consumption(soup: BeautifulSoup):
    """Return the leading token of the third ``carChars`` span.

    For a span text of ``'1.2 liters / 100 km'`` this yields ``'1.2'``
    (a string).
    """
    chars_div = soup.find("div", class_="carChars")
    third_span = chars_div.find_all("span")[2]
    tokens = third_span.text.split()
    return tokens[0]

def get_capacity(soup: BeautifulSoup):
    """Return the leading token of the second ``carChars`` span.

    This reads the same span position as ``get_engine_size``; the scraping
    loop stores the value as battery capacity when the fuel type is
    ``"electric"``.
    """
    spans = soup.find("div", class_="carChars").find_all("span")
    capacity_text = spans[1].text
    return capacity_text.split()[0]


def get_autonomy(soup: BeautifulSoup):
    """Return the leading token of the third ``carChars`` span.

    This reads the same span position as ``get_consumption``; the scraping
    loop stores the value as autonomous range when the fuel type is
    ``"electric"``.
    """
    spans = soup.find("div", class_="carChars").find_all("span")
    autonomy_text = spans[2].text
    return autonomy_text.split()[0]


def get_fuel_type(soup: BeautifulSoup):
    """Return the fuel type from the fourth ``carChars`` span, or ``None``.

    The span text has the form ``'<fuel> | <gearbox>'``, for example
    ``'petrol | automatic'``. Everything left of the ``|`` separator is
    returned with surrounding whitespace removed, so a fuel type made of
    several words is kept whole instead of being cut at its first space.

    Returns ``None`` when the fourth span does not exist or the fuel part
    is empty.
    """
    try:
        label = soup.find("div", class_="carChars").find_all("span")[3].text
    except IndexError:
        return None
    fuel_type = label.partition("|")[0].strip()
    return fuel_type or None


def get_gearbox(soup: BeautifulSoup):
    """Return the gearbox from the fourth ``carChars`` span, or ``None``.

    The span text has the form ``'<fuel> | <gearbox>'``, for example
    ``'petrol | automatic'``. Everything right of the ``|`` separator is
    returned with surrounding whitespace removed. Picking the third
    whitespace token instead would return ``'|'`` whenever the fuel part
    has two words.

    Returns ``None`` when the fourth span does not exist, the separator is
    missing, or the gearbox part is empty.
    """
    try:
        label = soup.find("div", class_="carChars").find_all("span")[3].text
    except IndexError:
        return None
    gearbox = label.partition("|")[2].strip()
    return gearbox or None


# Try the extractors on the sample page:
# (hp, consumption, engine size, fuel type, gearbox)
(
    get_hp(soup),
    get_consumption(soup),
    get_engine_size(soup),
    get_fuel_type(soup),
    get_gearbox(soup),
)

('180', '1.2', '1600', 'petrol', 'automatic')

In [14]:
# Car highlights: show the first div with class 'carHighlights'
soup.find('div', class_='carHighlights')

<div class="carHighlights d-flex justify-content-between mt20 mb20 flex-wrap" data-v-46b0d998=""><h3 class="f700 dark h5" data-v-46b0d998="">Highlights</h3> <ul data-v-46b0d998=""><li data-v-46b0d998=""><span class="dark f500 p" data-v-46b0d998="">Cruise control</span></li><li data-v-46b0d998=""><span class="dark f500 p" data-v-46b0d998="">Apple car play</span></li><li data-v-46b0d998=""><span class="dark f500 p" data-v-46b0d998="">Android auto</span></li><li data-v-46b0d998=""><span class="dark f500 p" data-v-46b0d998="">Park assist camera</span></li><li data-v-46b0d998=""><span class="dark f500 p" data-v-46b0d998="">Park assist sensors</span></li></ul></div>

In [15]:
# Text of every <span> inside the 'carHighlights' div
[span.text for span in soup.find('div', class_='carHighlights').find_all('span')]

['Cruise control',
 'Apple car play',
 'Android\xa0auto',
 'Park assist camera',
 'Park assist sensors']

In [16]:
def _clean_list(lst: list[str]):
    return list(map(lambda x: x.replace('\xa0', ' '), lst))

In [17]:
def get_highlights(soup: BeautifulSoup) -> list[str]:
    """Return the highlight texts of a car page.

    Collects the text of every ``<span>`` inside the ``carHighlights`` div
    and passes the list through ``_clean_list``. A page without that div
    yields an empty list.
    """
    container = soup.find('div', class_='carHighlights')
    texts = [span.text for span in container.find_all('span')] if container else []
    return _clean_list(texts)

In [18]:
# Split the <h1> title on single spaces: first word is the name, the rest is the model
name_model = soup.find('h1').text.split(' ')
name, model = name_model[0], ' '.join(name_model[1:])
name, model

('Opel', 'Astra Plug In Hybrid Petrol (GS Line)')

In [19]:
def _title_tokens(soup: BeautifulSoup) -> list[str]:
    """Return the words of the page's first ``<h1>``.

    Splitting on any whitespace (rather than on a single ``' '``) avoids
    empty tokens from leading or repeated spaces and also breaks on
    non-breaking spaces.
    """
    return soup.find('h1').text.split()


def get_car_brand(soup: BeautifulSoup):
    """Return the first word of the ``<h1>`` title, or ``None`` if it is empty."""
    tokens = _title_tokens(soup)
    return tokens[0] if tokens else None


def get_car_model(soup: BeautifulSoup):
    """Return the second word of the ``<h1>`` title.

    Returns ``None`` when the title has fewer than two words, in line with
    the ``None``-on-missing behaviour of ``get_fuel_type`` and
    ``get_gearbox``.
    """
    tokens = _title_tokens(soup)
    return tokens[1] if len(tokens) > 1 else None


def get_model_version(soup: BeautifulSoup):
    """Return the ``<h1>`` title from its third word on, joined by single spaces.

    The result is an empty string when the title has at most two words.
    """
    return ' '.join(_title_tokens(soup)[2:])

In [20]:
# Extract price
price_string = soup.find('h1').find_next_sibling().find('span', class_='f400').getText(strip=True)

# Raw-string pattern: '\d' in a plain string literal is an invalid escape sequence.
# Dots are matched and then removed so a dot-grouped amount is not cut at the
# first dot (same approach as get_price).
price_string = re.search(r'[\d\.]+', price_string).group(0).replace('.', '')
price_string

'680'

In [21]:
def get_price(soup:  BeautifulSoup)  -> str:
    """Return the price digits of a car page as a string.

    The price is read from the ``span`` with class ``f400`` inside the
    element that follows the ``<h1>``. The first run of digits and dots in
    its text is taken and the dots are removed.
    """
    price_span = soup.find('h1').find_next_sibling().find('span', class_='f400')
    raw_price = price_span.getText(strip=True)
    amount = re.search(r'[\d\.]+', raw_price)
    return amount.group(0).replace('.', '')

In [22]:
# Spec list: show the first div with class 'el-tab-pane'
soup.find('div', class_="el-tab-pane")



In [23]:
# First five <li> texts of the 'el-tab-pane' div
[item.text for item in soup.find('div', class_="el-tab-pane").find_all('li')][:5]

['Automatic air conditioning',
 'Front / rear power windows',
 'Electrically adjustable / heated door mirrors',
 'Auto-dimming rear view mirror',
 'Rain sensor']

In [24]:
def get_specs(soup: BeautifulSoup) -> list[str]:
    """Return the specification texts of a car page.

    Collects the text of every ``<li>`` inside the first ``div`` with class
    ``el-tab-pane`` and passes the list through ``_clean_list``. A page
    without that div yields an empty list, as ``get_highlights`` does for
    its own container.
    """
    pane = soup.find('div', class_="el-tab-pane")
    if pane is None:
        return []
    return _clean_list([item.text for item in pane.find_all('li')])

In [25]:
def _clean_spaces(string: str):
    string.replace("\xa0", " ")


def _df_clean_df_spaces(df: pd.DataFrame):
    for column in df.columns:
        if df[column].dtype.name == 'string':
            df[column].apply(_clean_spaces)

def _df_set_types(df: pd.DataFrame):
    df['consumption (lt/100km)'] = df['consumption (lt/100km)'].astype('Float64')
    df["battery capacity (kwh)"] = df["battery capacity (kwh)"].astype('Float64')
    df["autonomous range (km)"] = df["autonomous range (km)"].astype('Int64')
    df['price (EUR)'] = df['price (EUR)'].astype('Int64')
    df['hp'] = df['hp'].astype('Int64')
    for column in df.columns:
        if (
            df[column].dtype.name == "object"
            and df[column].apply(is_hashable).all()
        ):
            df[column] = df[column].astype('string')


def clean_df(df: pd.DataFrame):
    """Run the DataFrame post-processing steps on *df*, in order.

    Calls ``_df_clean_df_spaces`` and then ``_df_set_types`` with the same
    frame. Nothing is returned.
    """
    for step in (_df_clean_df_spaces, _df_set_types):
        step(df)


In [26]:
# check if the page is empty
url = 'https://flexcar.gr/cars/?lang=en&category=MIN&category=HAT&category=SED&category=SUV&category=LUX&page=1'
# The timeout stops the cell from hanging on an unresponsive server; the
# explicit parser keeps the parse independent of the installed parser libraries.
response = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(response.content, 'html.parser')

In [27]:
# an <h2> anywhere on the page is the signal that we can break
if soup.find('h2') is not None:
    print('yes')

In [28]:
def is_page_empty(soup: BeautifulSoup) -> bool:
    """Return True when *soup* contains at least one ``<h2>`` element.

    The scraping loop stops paginating as soon as this returns True.
    """
    return soup.find('h2') is not None

In [29]:
import logging

# Module-level logger that writes INFO and above to logflex.txt.
logger = logging.getLogger(__name__)
# getLogger returns the same object every time, so re-running this cell would
# stack another FileHandler and write every message twice. Add it only once.
if not any(isinstance(handler, logging.FileHandler) for handler in logger.handlers):
    logger.addHandler(logging.FileHandler("logflex.txt"))
logger.setLevel(logging.INFO)

In [30]:
# Crawl the paginated listing and collect one feature dict per car offer.
results = []
page = 1
page_template = "https://flexcar.gr/cars/?lang=en&category=MIN&category=HAT&category=SED&category=SUV&category=LUX&page={}"
first_page = page_template.format(page)
base_url = "https://flexcar.gr"
# Seconds to wait for the server; without a timeout a stalled connection
# blocks the whole crawl forever.
request_timeout = 30

response = requests.get(first_page, headers=headers, timeout=request_timeout)
soup = BeautifulSoup(response.content, "html.parser")
while not is_page_empty(soup):
    logger.info("page number: %s", page)
    car_links = get_car_links(soup)
    if not car_links:
        # A page with neither cars nor the "empty" marker (for example an
        # error page) would otherwise keep the loop paging forever.
        logger.info("no car links on page %s, stopping", page)
        break
    for link in car_links:
        url = base_url + link
        logger.info("processing %s", url)
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            # Fail with the HTTP error itself instead of a confusing parse
            # error further down when the car page could not be served.
            response.raise_for_status()
            # Separate name so the listing soup driving the while loop is
            # not overwritten by a car page.
            car_soup = BeautifulSoup(response.content, "html.parser")
            fuel_type = get_fuel_type(car_soup)
            # Electric cars show battery capacity and range in the span
            # positions where other cars show engine size and consumption.
            is_electric = fuel_type == "electric"
            results.append(
                {
                    "brand": get_car_brand(car_soup),
                    "model": get_car_model(car_soup),
                    "version": get_model_version(car_soup),
                    "price (EUR)": get_price(car_soup),
                    "fuel_type": fuel_type,
                    "hp": get_hp(car_soup),
                    "engine_size (cc)": (
                        None if is_electric else get_engine_size(car_soup)
                    ),
                    "consumption (lt/100km)": (
                        None if is_electric else get_consumption(car_soup)
                    ),
                    "battery capacity (kwh)": (
                        get_capacity(car_soup) if is_electric else None
                    ),
                    "autonomous range (km)": (
                        get_autonomy(car_soup) if is_electric else None
                    ),
                    "gearbox": get_gearbox(car_soup),
                    "highlights": get_highlights(car_soup),
                    "specifications": get_specs(car_soup),
                    "link": url,
                }
            )
        except Exception:
            # Record which URL failed (with traceback), then stop the crawl.
            logger.exception("error %s", url)
            raise
        # Short randomized pause between car pages.
        time.sleep(random.uniform(0.3, 0.4))

    page += 1
    next_page = page_template.format(page)
    response = requests.get(next_page, headers=headers, timeout=request_timeout)
    soup = BeautifulSoup(response.content, "html.parser")

In [31]:
# One row per scraped car; the dict keys become the column names
df = pd.DataFrame(results)

In [32]:
# Preview the first five scraped rows
df.head(5)

Unnamed: 0,brand,model,version,price (EUR),fuel_type,hp,engine_size (cc),consumption (lt/100km),battery capacity (kwh),autonomous range (km),gearbox,highlights,specifications,link
0,Opel,Astra,Plug In Hybrid Petrol (GS Line),680,petrol,180,1600.0,1.2,,,automatic,"[Cruise control, Apple car play, Android auto,...","[Automatic air conditioning, Front / rear powe...",https://flexcar.gr/cars/opel/opel-astra-gs-lin...
1,VW,Polo,Petrol (Life),430,petrol,95,1000.0,5.2,,,manual,"[Apple car play, Android auto]","[Air conditioning, Electrically adjustable / h...",https://flexcar.gr/cars/vw/vw-polo-life/?lang=en
2,VW,T-Roc,Petrol (Life),520,petrol,110,1000.0,6.0,,,manual,"[Cruise control, Apple car play, Android auto,...","[Automatic air conditioning, Front / rear powe...",https://flexcar.gr/cars/vw/vw-t-roc-life/?lang=en
3,Renault,Arkana,Hybrid Petrol (E-Tech Techno),610,petrol,145,1600.0,4.8,,,automatic,"[Cruise control, Navigation, Apple car play, A...","[Automatic air conditioning, Front / rear powe...",https://flexcar.gr/cars/renault/renault-arkana...
4,Smart,ForTwo,Electric (Pulse),495,electric,80,,,20.0,132.0,automatic,[Cruise control],"[Autonomous range (kms): 132, Automatic air co...",https://flexcar.gr/cars/smart/smart-for-two-el...


In [33]:
# Post-process df in place (clean_df returns nothing)
clean_df(df)

In [34]:
# Check column dtypes and non-null counts after cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   brand                   170 non-null    string 
 1   model                   170 non-null    string 
 2   version                 170 non-null    string 
 3   price (EUR)             170 non-null    Int64  
 4   fuel_type               169 non-null    string 
 5   hp                      170 non-null    Int64  
 6   engine_size (cc)        149 non-null    string 
 7   consumption (lt/100km)  149 non-null    Float64
 8   battery capacity (kwh)  21 non-null     Float64
 9   autonomous range (km)   21 non-null     Int64  
 10  gearbox                 169 non-null    string 
 11  highlights              170 non-null    object 
 12  specifications          170 non-null    object 
 13  link                    170 non-null    string 
dtypes: Float64(2), Int64(3), object(2), string

In [35]:
# Preview the first five rows again after clean_df(df)
df.head(5)

Unnamed: 0,brand,model,version,price (EUR),fuel_type,hp,engine_size (cc),consumption (lt/100km),battery capacity (kwh),autonomous range (km),gearbox,highlights,specifications,link
0,Opel,Astra,Plug In Hybrid Petrol (GS Line),680,petrol,180,1600.0,1.2,,,automatic,"[Cruise control, Apple car play, Android auto,...","[Automatic air conditioning, Front / rear powe...",https://flexcar.gr/cars/opel/opel-astra-gs-lin...
1,VW,Polo,Petrol (Life),430,petrol,95,1000.0,5.2,,,manual,"[Apple car play, Android auto]","[Air conditioning, Electrically adjustable / h...",https://flexcar.gr/cars/vw/vw-polo-life/?lang=en
2,VW,T-Roc,Petrol (Life),520,petrol,110,1000.0,6.0,,,manual,"[Cruise control, Apple car play, Android auto,...","[Automatic air conditioning, Front / rear powe...",https://flexcar.gr/cars/vw/vw-t-roc-life/?lang=en
3,Renault,Arkana,Hybrid Petrol (E-Tech Techno),610,petrol,145,1600.0,4.8,,,automatic,"[Cruise control, Navigation, Apple car play, A...","[Automatic air conditioning, Front / rear powe...",https://flexcar.gr/cars/renault/renault-arkana...
4,Smart,ForTwo,Electric (Pulse),495,electric,80,,,20.0,132.0,automatic,[Cruise control],"[Autonomous range (kms): 132, Automatic air co...",https://flexcar.gr/cars/smart/smart-for-two-el...


In [36]:
# Save dataframe to pickle (written to data2.pkl in the working directory)
df.to_pickle('data2.pkl')

In [325]:
# Load dataframe from pickle
# NOTE: only unpickle files you created yourself; loading a pickle from an
# untrusted source can execute arbitrary code.
df = pd.read_pickle('data2.pkl')

In [40]:
# Offers cheaper than 500 EUR whose engine size equals the text "1400"
# (the comparison is against a string, not a number)
df.loc[df['price (EUR)'].lt(500) & df['engine_size (cc)'].eq("1400")]

Unnamed: 0,brand,model,version,price (EUR),fuel_type,hp,engine_size (cc),consumption (lt/100km),battery capacity (kwh),autonomous range (km),gearbox,highlights,specifications,link
21,Suzuki,Swift,Mild Hybrid (Sport),440,petrol,129,1400,5.5,,,manual,"[Cruise control, Navigation, Apple car play, A...","[Automatic air conditioning, Front / rear powe...",https://flexcar.gr/cars/suzuki/suzuki-swift-mi...


In [41]:
# Export the dataframe as CSV (the file extension was misspelled as '.ccsv')
df.to_csv('flexcar.csv')