# Short demo of a Flipkart scraper

In [1]:
from bs4 import BeautifulSoup
import requests
import re
from time import sleep
import csv

Getting started with a simple search query for a computer monitor

In [2]:
# Flipkart search-results page for the query "monitor"; fetched in the next cell.
url = "https://www.flipkart.com/search?q=monitor"

In [3]:
# Plain GET with no custom headers or timeout; the page HTML is read from
# response.text when it is parsed below.
response = requests.get(url)

In [4]:
# Name the parser explicitly ('html.parser') rather than letting
# BeautifulSoup pick one.
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
# Collect every <a> tag carrying the CSS class '_31qSD5' (a bare string as the
# second argument to find_all() is matched against the class attribute).
# NOTE(review): '_31qSD5' looks like a generated class name and may change
# when the site is updated -- verify it still matches before re-running.
cards = soup.find_all('a', '_31qSD5')

Prototype a single card

In [6]:
# Work on the first result only while prototyping the extraction steps.
card = cards[0]

In [7]:
# The card's href is site-relative, so prefix the host to get an absolute link.
# NOTE: this rebinds `url`, which In [2] set to the search URL; re-running
# In [3] after this cell would fetch the product page instead of the search page.
url = 'https://www.flipkart.com' + card['href']
url

'https://www.flipkart.com/lg-21-5-inch-full-hd-ips-panel-monitor-22mk600m/p/itm297ef0f41b753?pid=MONFK842GG8RUHQQ&lid=LSTMONFK842GG8RUHQQW4GCES&marketplace=FLIPKART&srno=s_1_1&otracker=search&fm=organic&iid=16ec2e20-0d5c-4a8b-8b60-aa2c0d7a6603.MONFK842GG8RUHQQ.SEARCH&ssid=j6yli0ayz40000001604940535198&qH=08b5411f848a2581'

In [8]:
# The product title is taken from the alt text of the card's first <img>.
title = card.find('img').get('alt')
title

'LG 21.5 inch Full HD IPS Panel Monitor (22MK600M)'

In [9]:
# Build the description by joining the text of every <li> in the card with "; ".
description = "; ".join(item.text for item in card.find_all('li'))
description

'Panel Type: IPS Panel; Screen Resolution Type: Full HD; HDMI; Brightness: 250 nits; Response Time: 5 ms | Refresh Rate: 75 Hz; HDMI Ports - 2; 3 Years Manufacture Warranty'

In [10]:
# Take the first <div> whose text contains the rupee sign. find() returns None
# when nothing matches, so this line raises AttributeError on a card with no price.
price = card.find('div', text=re.compile(r'₹')).text
price

'₹6,999'

In [11]:
# The ratings count is the first <span> whose text contains "Ratings";
# strip() removes surrounding whitespace.
ratings = card.find('span', text=re.compile(r'Ratings')).text.strip()
ratings

'1,226 Ratings'

In [12]:
# Same approach as the ratings lookup, keyed on the word "Reviews".
reviews = card.find('span', text=re.compile(r'Reviews')).text.strip()
reviews

'251 Reviews'

Generalize the single-card prototype to all cards

In [13]:
def _find_text(card, tag_name, pattern):
    """Return the text of the first `tag_name` tag matching `pattern`, or "" if none."""
    # `string=` is the current name of the older `text=` filter argument.
    node = card.find(tag_name, string=re.compile(pattern))
    return node.text if node is not None else ""


def get_card_data(card):
    """Extract one product record from a search-result card.

    Args:
        card: The parsed <a> tag of a single search result. It must carry an
            ``href`` attribute.

    Returns:
        A tuple ``(title, description, price, ratings, reviews, url)`` of
        strings. Any of title, price, ratings or reviews that cannot be found
        in the card is returned as an empty string, so one incomplete card
        does not abort a whole scrape.

    Raises:
        KeyError: If the card has no ``href`` attribute.
    """
    image = card.find('img')
    title = image.get('alt', "") if image is not None else ""
    description = "; ".join(tag.text for tag in card.find_all('li'))
    price = _find_text(card, 'div', r'₹')
    ratings = _find_text(card, 'span', r'Ratings').strip()
    reviews = _find_text(card, 'span', r'Reviews').strip()
    # hrefs on the cards are site-relative; prefix the host to make them absolute.
    url = 'https://www.flipkart.com' + card['href']
    return (title, description, price, ratings, reviews, url)

How to get the next page?

In [14]:
# Find the <span> labelled "Next" and read the href of its parent tag, which
# is site-relative and so gets the host prefixed. soup.find() returns None when
# there is no such span, in which case this line raises AttributeError.
next_page = 'https://www.flipkart.com' + soup.find('span', text="Next").find_parent()['href']
next_page

'https://www.flipkart.com/search?q=monitor&page=2'

Consolidate all of this into a single function

In [15]:
def extract_page_data(html_text):
    """Extract the product records and the next-page link from one results page.

    Args:
        html_text: HTML source of a search-results page, as a string.

    Returns:
        A tuple ``(data, next_page)``. ``data`` is a list with one
        ``get_card_data`` tuple per product card on the page. ``next_page`` is
        the absolute URL of the following results page, or ``None`` when the
        page has no usable "Next" link.
    """
    # Name the parser explicitly so parsing does not depend on which parser
    # libraries happen to be installed, and matches the prototype cells.
    soup = BeautifulSoup(html_text, 'html.parser')
    cards = soup.find_all('a', '_31qSD5')
    data = [get_card_data(card) for card in cards]
    try:
        # `string=` is the current name of the older `text=` filter argument.
        next_page = 'https://www.flipkart.com' + soup.find('span', string="Next").find_parent()['href']
    except (AttributeError, KeyError):
        # AttributeError: no "Next" span was found; KeyError: its parent has no href.
        next_page = None
    return data, next_page

## Putting it all together

In [16]:
# Fetch the first results page, then keep following the "Next" link until a
# page has none, accumulating every product record along the way.
response = requests.get('https://www.flipkart.com/search?q=monitor')

product_data = []

while True:
    data, next_page = extract_page_data(response.text)
    product_data.extend(data)
    if not next_page:
        break  # no "Next" link on this page: it was the last one
    sleep(0.5) # delay a half second to prevent spamming the site
    # Pass only the URL: the second positional argument of requests.get() is
    # `params`, so a parser name given there is sent as part of the query string.
    response = requests.get(next_page)

In [17]:
# How many products were scraped in total, across all result pages?
len(product_data)

270

In [18]:
# Spot-check the first record: (title, description, price, ratings, reviews, url).
product_data[0]

('LG 21.5 inch Full HD IPS Panel Monitor (22MK600M)',
 'Panel Type: IPS Panel; Screen Resolution Type: Full HD; HDMI; Brightness: 250 nits; Response Time: 5 ms | Refresh Rate: 75 Hz; HDMI Ports - 2; 3 Years Manufacture Warranty',
 '₹6,999',
 '1,226 Ratings',
 '251 Reviews',
 'https://www.flipkart.com/lg-21-5-inch-full-hd-ips-panel-monitor-22mk600m/p/itm297ef0f41b753?pid=MONFK842GG8RUHQQ&lid=LSTMONFK842GG8RUHQQW4GCES&marketplace=FLIPKART&srno=s_1_1&otracker=search&fm=organic&iid=ceb86ece-7f2e-43ca-a852-a3817138cb71.MONFK842GG8RUHQQ.SEARCH&ssid=2crv6qbpo00000001604940576824&qH=08b5411f848a2581')

Save the data

In [20]:
# Write a header row followed by one row per scraped product.
# newline='' lets the csv module control line endings itself, and UTF-8 is
# needed because the price strings contain the rupee sign.
with open('product_data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # Column order matches the tuple returned by get_card_data.
    writer.writerow(['Title', 'Description', 'Price', 'Rating', 'Reviews', 'URL'])
    writer.writerows(product_data)