# Scraping OLX car listings using requests, Beautiful Soup and pandas

In [1]:
# importing packages/modules/libraries

from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# address of the listing page that gets scraped
URL = 'https://www.olx.com.pk/cars_c84'
# headers sent with every request: a desktop-browser User-Agent string and an
# Accept-Language preference
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/115.0.0.0 Safari/537.36'
    ),
    'Accept-Language': 'en-US, en;q=0.5',
}

In [3]:
# sending http get request to the webpage
# an explicit timeout (seconds) is passed because requests does not time out by
# default, so a stalled server would otherwise block this cell indefinitely

webpage = requests.get(URL, headers=HEADERS, timeout=30)

In [4]:
# ensuring the request was successful and we got the response (HTTP 200 means OK)

webpage.status_code

200

In [5]:
# take the raw response body of the listing page and parse it with the
# 'html.parser' backend so the HTML can be searched as a BeautifulSoup tree

soup = BeautifulSoup(markup=webpage.content, features='html.parser')

In [6]:
# find every <div> whose class is '_9bea76df' (the divs containing the item links)

links = soup.find_all('div', class_='_9bea76df')

In [7]:
# take the first <a> tag found inside each of the collected divs

final_links = [div_tag.find('a') for div_tag in links]

In [8]:
# display the collected anchor tags
final_links

[<a href="/item/honda-civic-vti-oriel-manual-2004-model-2008-reg-iid-1076020736" title="Honda Civic Vti Oriel Manual 2004 model 2008 Reg"><div class="c0e6d63c"></div></a>,
 <a href="/item/suzuki-alto-2010-model-iid-1075784732" title="Suzuki Alto 2010 model"><div class="c0e6d63c"></div></a>,
 <a href="/item/nissan-moco-2017-model-iid-1074919906" title="NISSAN MOCO 2017 MODEL"><div class="c0e6d63c"></div></a>,
 <a href="/item/cultus-euro-ii-iid-1075821624" title="Cultus Euro II"><div class="c0e6d63c"></div></a>,
 <a href="/item/suzuki-alto-2010-iid-1075981952" title="Suzuki Alto 2010"><div class="c0e6d63c"></div></a>,
 <a href="/item/total-geniune-iid-1076017221" title="HONDA CIVIC VTI ORIEL PROSMATEC 2013"><div class="c0e6d63c"></div></a>,
 <a href="/item/honda-city-13-automatic-iid-1073145363" title="Honda City  2005 - Automatic IDSI 1.3"><div class="c0e6d63c"></div></a>,
 <a href="/item/suzuki-cultus-vxr-2022-model-for-sale-iid-1075229422" title="CULTUS VXR 2022 MODEL FOR SALE"><div c

In [9]:
# getting the href of one item to use as a sample link
# NOTE(review): index 1 selects the SECOND anchor in final_links, while the name
# `firstlink` suggests index 0 was intended — confirm which item is wanted

firstlink = final_links[1].get('href')

In [25]:
# href attribute of every collected anchor, in the same order as final_links
links_list = [anchor.get('href') for anchor in final_links]
links_list

['/item/honda-civic-vti-oriel-manual-2004-model-2008-reg-iid-1076020736',
 '/item/suzuki-alto-2010-model-iid-1075784732',
 '/item/nissan-moco-2017-model-iid-1074919906',
 '/item/cultus-euro-ii-iid-1075821624',
 '/item/suzuki-alto-2010-iid-1075981952',
 '/item/total-geniune-iid-1076017221',
 '/item/honda-city-13-automatic-iid-1073145363',
 '/item/suzuki-cultus-vxr-2022-model-for-sale-iid-1075229422',
 '/item/honda-city-automatic-bumper-to-bumper-original-iid-1075969196',
 '/item/suzuki-bolan-mnc-387-18-iid-1076013392',
 '/item/suzuki-mehran-vxr-2018-model-iid-1075882410',
 '/item/toyota-xli-for-sale-iid-1076016864',
 '/item/suzuki-alto-vxl-2019-model-reg-2020-for-sale-iid-1076001515',
 '/item/gli-automatic-17-model-iid-1075962553',
 '/item/suzuki-cultus-2007-iid-1075740802',
 '/item/mint-condition-toyota-vitz-push-start-f-package-10-for-sale-askari-4-iid-1075480853',
 '/item/honda-city-13-auto-2018-model-iid-1074907768',
 '/item/suzuki-alto-2010-iid-1075152482',
 '/item/total-original-p

In [10]:
# generating new url
# urljoin resolves the href against the site root: a root-relative href such as
# '/item/...' gives the same result as plain concatenation, while an href that is
# already absolute is kept intact instead of being glued onto the domain
# NOTE: this rebinds URL, so from here on it points at the item page, not the listing page

URL = urljoin('https://www.olx.com.pk', firstlink)

In [11]:
# again sending get request to this new url
# an explicit timeout (seconds) is passed because requests does not time out by
# default, so a stalled server would otherwise block this cell indefinitely

new_webpage = requests.get(URL, headers=HEADERS, timeout=30)

In [12]:
# display the HTTP status code of the item-page response
new_webpage.status_code

200

In [13]:
# parse the raw response body of the item page with the 'html.parser' backend
# (this rebinds `soup`, replacing the tree built from the listing page)

soup = BeautifulSoup(markup=new_webpage.content, features='html.parser')

In [28]:
# item title taken from the <h1> with class 'a38b8112'
# get_text(strip=True) also works when the <h1> contains nested tags (where
# `.string` would be None), and a missing <h1> yields None instead of raising
title_tag = soup.find('h1', attrs={'class':'a38b8112'})
title = title_tag.get_text(strip=True) if title_tag is not None else None


In [29]:
# display the scraped title
title

'Suzuki Alto 2010 model'

In [16]:
# every <div> with class 'b44ca0b3' on the item page
data = soup.find_all('div', class_='b44ca0b3')

In [17]:
# for each detail div, the list of <span> tags it contains
new_data = [detail_div.find_all('span') for detail_div in data]

In [18]:
# display the span lists collected for each detail div
new_data

[[<span>Make</span>, <span>Suzuki</span>],
 [<span>Model</span>, <span>Alto</span>],
 [<span>Year</span>, <span>2010</span>],
 [<span>KM's driven</span>, <span>115,000</span>],
 [<span>Price</span>, <span>2,070,000</span>],
 [<span>Fuel</span>, <span>Petrol</span>],
 [<span>Registration city</span>, <span>Islamabad</span>],
 [<span>Car documents</span>, <span>Original</span>],
 [<span>Assembly</span>, <span>Imported</span>],
 [<span>Transmission</span>, <span>Automatic</span>],
 [<span>Condition</span>, <span>Used</span>]]

In [50]:
def append_keys_to_dict(existing_dict, sublists):
    """Ensure ``existing_dict`` has a list-valued entry for every label in ``sublists``.

    Args:
        existing_dict: Dictionary to extend; it is modified in place.
        sublists: Iterable of sequences whose first element exposes a ``.text``
            attribute (e.g. the ``[label_span, value_span]`` lists in ``new_data``).

    Returns:
        The same ``existing_dict`` object that was passed in.
    """
    for sublist in sublists:
        # a row with no elements has no label to use as a key
        if not sublist:
            continue
        # setdefault keeps whatever is already stored under the key, so calling
        # this again does not wipe previously collected values
        existing_dict.setdefault(sublist[0].text, [])
    return existing_dict

In [None]:
# start from a dict holding one entry (presumably a placeholder value — confirm),
# then add a key for the label of every row in new_data
my_dict = {"item":"ali"}
append_keys_to_dict(my_dict, new_data)

In [19]:
# text of the first span in the first row (the label)
new_data[0][0].text

'Make'

In [20]:
# text of the second span in the first row (the value)
new_data[0][1].text

'Suzuki'

In [21]:
# build a one-entry dict from the first row: label text -> value text
my_dict = {new_data[0][0].text: new_data[0][1].text}

In [22]:
# display the resulting dict
my_dict

{'Make': 'Suzuki'}