# FlipKart Web Scraping
#### Using Python

* Web scraping, also known as web harvesting or web data extraction, is the process of extracting data from websites.
* This project uses Python to extract data from **FlipKart**.

Every code chunk is explained below

In [None]:
# import libraries 

from bs4 import BeautifulSoup
import requests
import time
import datetime
import smtplib
import csv

### 2. Step 2 is to set the URL variable to the URL from which we want to extract the data
* **Requests** is the package which provides a method **GET** to get the HTML page of the link
* In the **try-except** block, send a request to the link to get a copy of the HTML page
* It is also good practice to add headers to the GET method
* The result is not formatted
* To convert it to a formatted HTML code we use **BeautifulSoup**
* Now soup2 has the prettified version of the HTML code

In [4]:
# connect to website
URL = "https://www.flipkart.com/search?q=shoes&sid=osp%2Ccil%2Ce1f&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_2_5_na_na_na&otracker1=AS_QueryStore_OrganicAutoSuggest_2_5_na_na_na&as-pos=2&as-type=RECENT&suggestionId=shoes%7CCasual+Shoes&requestId=c04f4b6f-d0dd-4f01-8f17-27adb9693103&as-searchtext=shoes"

# system info link to find this --> "https://httpbin.org/get"
header = {"User-Agent": "fill this with your system info"}

try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    page = requests.get(URL, headers=header, timeout=30)
    # Treat HTTP error statuses (4xx/5xx) as failures instead of parsing an error page.
    page.raise_for_status()
except requests.RequestException as e:
    # Without a usable response there is nothing to parse: report the problem
    # and stop here rather than failing below with a NameError on `page`.
    print(e)
    raise

# Parse the raw response body.
soup1 = BeautifulSoup(page.content, "html.parser")  # lxml for broken html pages
# Re-parse the prettified markup; soup2 is the object queried afterwards.
soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

### 3. Extract the required Info from the HTML code using tags and class names
* Different attributes like product name, brand name or the discount can be extracted
* But first, we need to get the div which has all this info
* To find all the different items, use `find_all`, which returns a list of the matching items

In [None]:
# Collect every product card: the <div> elements matched by this class string.
items = soup2.find_all(name='div', attrs={'class': '_1xHGtK _373qXS'})

### 4. Open a CSV file in which all the data will be stored and add the headers i.e. the name of the columns which will be entered in the file

In [None]:
# Create (or overwrite) the CSV file and write the column names as its first row.
header = [
    'brand_name',
    'product_name',
    'original_price',
    'discounted_price',
    'discount',
    'link',
    'date',
]
with open('FlipKartWebScraper.csv', mode='w', newline='', encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerow(header)


### 5. Use a for loop to iterate over all the items and extract the required details of each product
* To get a particular element from an item, use the `find` method
* It takes the name of the tag and the class name
* It returns the whole tag; to get the text, use `.text` on the result of `find`
* The data will have extra white space; use the `strip` method to remove it
* Numeric data should be converted into integers before storing
* Collect the cleaned values in a list and append it to the CSV file as a new row

In [None]:
# Walk over every product card, extract and clean its details, and append one
# row per product to the CSV file.
# The file is opened once for the whole loop instead of once per product.
with open('FlipKartWebScraper.csv', 'a+', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    for item in items:
        brand_tag = item.find('div', {'class': '_2WkVRV'})
        title_tag = item.find('a', {'class': 'IRpwTa'})
        discounted_tag = item.find('div', {'class': '_30jeq3'})
        original_tag = item.find('div', {'class': '_3I9_wc'})
        discount_tag = item.find('div', {'class': '_3Ay6Sb'})

        # find() returns None when a tag is absent; skip incomplete cards
        # instead of crashing on `None.text`.
        tags = (brand_tag, title_tag, discounted_tag, original_tag, discount_tag)
        if any(tag is None for tag in tags):
            continue

        # clean the data
        brand_name = brand_tag.text.strip()  # was mistakenly taken from product_name
        product_name = title_tag['title'].strip()
        link = title_tag['href'].strip()
        today = datetime.date.today()
        # Drop the first character (assumed to be the currency symbol) and the
        # thousands separators before converting to int.
        original_price = int(original_tag.text.strip()[1:].replace(',', ''))
        discounted_price = int(discounted_tag.text.strip()[1:].replace(',', ''))
        # Keep everything before the percent sign so that one- and three-digit
        # discounts parse too (assumes text like "60% off" -- TODO confirm).
        discount = int(discount_tag.text.strip().partition('%')[0])

        # append the data to the csv file
        data = [brand_name, product_name, original_price, discounted_price, discount, link, today]
        writer.writerow(data)

### To sum it up
* The above code will add the data one time when we run the code
* Web Scraping can be used to keep track of a product or the price of a stock etc
* To do this the above code must repeat itself after an interval
* To repeat the code, use the time package as shown below

In [None]:
def _parse_price(text):
    """Convert a price label (one leading symbol, digits, commas) to an int."""
    # Drop the first character (assumed to be the currency symbol) and the
    # thousands separators.
    return int(text.strip()[1:].replace(',', ''))


def _parse_discount(text):
    """Return the integer that precedes the percent sign in a discount label."""
    # Assumes text like "60% off" -- TODO confirm against the live page.
    return int(text.strip().partition('%')[0])


def _extract_row(item, today):
    """Build one CSV row from a product card, or return None if a tag is missing."""
    brand_tag = item.find('div', {'class': '_2WkVRV'})
    title_tag = item.find('a', {'class': 'IRpwTa'})
    discounted_tag = item.find('div', {'class': '_30jeq3'})
    original_tag = item.find('div', {'class': '_3I9_wc'})
    discount_tag = item.find('div', {'class': '_3Ay6Sb'})

    # find() returns None for an absent tag; report the card as incomplete
    # instead of crashing on `None.text`.
    tags = (brand_tag, title_tag, discounted_tag, original_tag, discount_tag)
    if any(tag is None for tag in tags):
        return None

    return [
        brand_tag.text.strip(),  # brand comes from its own tag, not the title
        title_tag['title'].strip(),
        _parse_price(original_tag.text),
        _parse_price(discounted_tag.text),
        _parse_discount(discount_tag.text),
        title_tag['href'].strip(),
        today,
    ]


def check_price():
    """Scrape the Flipkart search page once and rewrite FlipKartWebScraper.csv.

    Downloads the search results, extracts one row per product card and writes
    them below a header row. Cards that lack any of the expected tags are
    skipped. If the request fails or returns an HTTP error status, the error
    is printed and the function returns without touching the CSV file.

    Returns:
        None
    """
    URL = "https://www.flipkart.com/search?q=shoes&sid=osp%2Ccil%2Ce1f&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_2_5_na_na_na&otracker1=AS_QueryStore_OrganicAutoSuggest_2_5_na_na_na&as-pos=2&as-type=RECENT&suggestionId=shoes%7CCasual+Shoes&requestId=c04f4b6f-d0dd-4f01-8f17-27adb9693103&as-searchtext=shoes"
    request_headers = {"User-Agent": "fill this with your system info"}

    try:
        # A timeout keeps a stalled connection from blocking forever.
        page = requests.get(URL, headers=request_headers, timeout=30)
        page.raise_for_status()
    except requests.RequestException as e:
        # Nothing to parse: report and bail out before the CSV is truncated.
        print(e)
        return

    soup1 = BeautifulSoup(page.content, "html.parser")  # lxml package for broken html pages
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

    items = soup2.find_all('div', {'class': '_1xHGtK _373qXS'})

    csv_columns = ['brand_name', 'product_name', 'original_price', 'discounted_price', 'discount', 'link', 'date']
    today = datetime.date.today()

    # NOTE(review): mode 'w' replaces the file on every call, so earlier runs
    # are not kept -- confirm whether history should be appended instead.
    with open('FlipKartWebScraper.csv', 'w', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(csv_columns)
        for item in items:
            row = _extract_row(item, today)
            if row is not None:
                writer.writerow(row)


In [None]:
# check price every day
# `time` is already imported with the other libraries at the top of the file,
# so it is not re-imported on every iteration.
SECONDS_PER_DAY = 24 * 60 * 60  # 86400

while True:
    check_price()
    time.sleep(SECONDS_PER_DAY)

> #### This approach works for any website: change the URL and the attribute names and values accordingly