In [108]:
# Reference list
# Achin Sagar, Extracting Flipkart reviews through web scraping, Quora, accessed April 8, 2018, https://codeexperiments.quora.com/Extracting-Flipkart-reviews-through-web-scraping

In [1]:
import requests
import sys
from bs4 import BeautifulSoup as bs
import time

In [2]:
# From lab 3
def get_text_from_elements(elements):
    """Return the whitespace-stripped ``.text`` of every item in ``elements``.

    ``elements`` is any iterable of objects exposing a ``.text`` string
    attribute (for example the result of a BeautifulSoup selection).
    The returned list keeps the order of the input.
    """
    cleaned = []
    for element in elements:
        cleaned.append(element.text.strip())
    return cleaned

In [3]:
# Fetch the first page of the Flipkart search results for "laptop" and parse it.
payload = {'q':'laptop'}
# A timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get('http://www.flipkart.com/search', params = payload, timeout = 30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page as results.
r.raise_for_status()
page = r.content
soup = bs(page, "html.parser")

In [4]:
# Count the number of pages of the search result.
# The pagination label reads like 'Page 3 of 26'; its last space-separated
# token is the total page count.
pagination_labels = get_text_from_elements(soup.find_all("span", class_ = "_3v8VuN"))
if not pagination_labels:
    # Without this guard the failure is a bare IndexError with no hint of the cause.
    raise RuntimeError("Pagination label (span with class '_3v8VuN') not found in the search page")
link_pages = int(pagination_labels[0].split(" ")[-1])
print("# of pages is", link_pages)

# of pages is 26


In [5]:
# Accumulates one [title, rating, price] row per product that has a rating
products_table = []

# For each search result page, get all the review rating, title, and price data
for page_num in range(1, link_pages + 1):
    # Let requests build and encode the query string instead of concatenating it by hand
    query = {'page': page_num, 'q': 'laptop', 'viewType': 'list'}
    # A timeout keeps one stalled request from blocking the whole crawl
    r = requests.get('http://www.flipkart.com/search', params = query, timeout = 30)
    # Stop on an HTTP error rather than parsing an error page as if it held products
    r.raise_for_status()
    page = r.content
    soup = bs(page, "html.parser")

    # Get product data (eliminating recently reviewed items and recommended items)
    products = soup.find_all('div', class_ = "_1-2Iqu row")
    for product in products:
        # Look the rating element up once and reuse the result
        review_element = product.find_all("div", class_ = "hGSR34 _2beYZw")
        # Remove products which have no review rating
        if len(review_element) == 0:
            continue
        # Get review rating data: the rating is the first space-separated token
        review_text = review_element[0].text
        review = float(review_text.split(" ")[0])
        # Get title data
        title = product.find_all("div", class_ = "_3wU53n")[0].text
        # Get Price data (kept as the displayed string, e.g. with currency symbol)
        price = product.find_all("div", class_ = "_1vC4OE _2rQ-NK")[0].text
        # Merge the data into one row and add it to the table
        product_table = [title, review, price]
        products_table.append(product_table)

print(products_table)
print(len(products_table))

[['Dell Inspiron Core i3 6th Gen - (4 GB/1 TB HDD/Linux) 3467 Laptop', 4.1, '₹25,990'], ['HP 15 Core i5 8th Gen - (8 GB/1 TB HDD/DOS) 15-BS145TU Laptop', 4.2, '₹38,990'], ['Lenovo Ideapad Core i5 7th Gen - (8 GB/1 TB HDD/Windows 10 Home/2 GB Graphics) IP 320 Laptop', 4.3, '₹44,990'], ['HP 15 Pentium Quad Core - (4 GB/1 TB HDD/DOS) 15Q-BU009TU Laptop', 4.1, '₹21,990'], ['Lenovo Core i5 7th Gen - (8 GB/1 TB HDD/DOS/2 GB Graphics) IP 320E Laptop', 4.3, '₹40,990'], ['Lenovo Ideapad Core i3 6th Gen - (4 GB/1 TB HDD/DOS) IP 320E Laptop', 4.1, '₹25,990'], ['HP 15 Core i3 6th Gen - (4 GB/1 TB HDD/Windows 10 Home) 15Q-bu013TU Laptop', 4.1, '₹29,990'], ['Dell Inspiron 14 3000 Core i3 6th Gen - (4 GB/1 TB HDD/Windows 10 Home) 3467 Laptop', 4.1, '₹29,990'], ['Lenovo Core i3 6th Gen - (4 GB/1 TB HDD/Windows 10 Home) IP 320E Laptop', 4.1, '₹29,990'], ['HP 15 Core i3 6th Gen - (8 GB/1 TB HDD/DOS) 15Q-BU006TU Laptop', 4.2, '₹30,990'], ['Acer Aspire 3 Celeron Dual Core - (2 GB/500 GB HDD/Linux) A315-31

In [6]:
# Uniqueness check using 'dict': maps each product title (first item of a row)
# to the number of rows that carry it.
product_name = {}
for product in products_table:
    # dict.get supplies 0 for a first occurrence, so no separate branch is needed
    product_name[product[0]] = product_name.get(product[0], 0) + 1
print("# of unique products is", len(product_name), "out of", len(products_table))

# of unique products is 419 out of 444


In [8]:
# Build a three-column pandas DataFrame from the scraped rows
import pandas as pd
products_table_df = pd.DataFrame(
    data = products_table,
    columns = ["Name", "Ratings", "Price"],
)
print(products_table_df)

                                                  Name  Ratings    Price
0    Dell Inspiron Core i3 6th Gen - (4 GB/1 TB HDD...      4.1  ₹25,990
1    HP 15 Core i5 8th Gen - (8 GB/1 TB HDD/DOS) 15...      4.2  ₹38,990
2    Lenovo Ideapad Core i5 7th Gen - (8 GB/1 TB HD...      4.3  ₹44,990
3    HP 15 Pentium Quad Core - (4 GB/1 TB HDD/DOS) ...      4.1  ₹21,990
4    Lenovo Core i5 7th Gen - (8 GB/1 TB HDD/DOS/2 ...      4.3  ₹40,990
5    Lenovo Ideapad Core i3 6th Gen - (4 GB/1 TB HD...      4.1  ₹25,990
6    HP 15 Core i3 6th Gen - (4 GB/1 TB HDD/Windows...      4.1  ₹29,990
7    Dell Inspiron 14 3000 Core i3 6th Gen - (4 GB/...      4.1  ₹29,990
8    Lenovo Core i3 6th Gen - (4 GB/1 TB HDD/Window...      4.1  ₹29,990
9    HP 15 Core i3 6th Gen - (8 GB/1 TB HDD/DOS) 15...      4.2  ₹30,990
10   Acer Aspire 3 Celeron Dual Core - (2 GB/500 GB...      3.9  ₹15,990
11   HP APU Dual Core A9 - (4 GB/1 TB HDD/Windows 1...      3.8  ₹24,990
12   HP 15 Core i3 6th Gen - (4 GB/1 TB HDD/DOS) 15

In [9]:
# Persist the unprocessed scrape results as the raw-data CSV file
products_table_df.to_csv(path_or_buf = "flipkart_raw.csv")

In [24]:
# Data Cleaning
# Drop rows whose "Name" repeats an earlier row, then order the remaining
# rows by product name. The raw frame is left untouched.
products_table_df2 = (
    products_table_df
    .drop_duplicates(subset = "Name")
    .sort_values(by = "Name")
)

In [25]:
# Persist the de-duplicated, name-sorted frame as the processed-data CSV file
products_table_df2.to_csv(path_or_buf = "flipkart_clean.csv")

In [244]:
######################################
### All code below is for testing. ###
######################################

In [256]:
# Scrape one search result page (page 3) to try out the per-product parsing.
# NOTE: this cell rebinds `products_table`, replacing any rows collected earlier.
link = "/search?page=3&q=laptop&viewType=list"
# A timeout keeps the cell from hanging indefinitely if the server never answers
r = requests.get('http://www.flipkart.com' + link, timeout = 30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page
r.raise_for_status()
page = r.content
soup = bs(page, "html.parser")

# Get review rating, title, and price data for every product that has a rating
products_table = []
products = soup.find_all('div', class_ = "_1-2Iqu row")
for product in products:
    # Look the rating element up once and reuse the result
    review_element = product.find_all("div", class_ = "hGSR34 _2beYZw")
    if len(review_element) == 0:
        continue
    # Review rating data: the rating is the first space-separated token
    review_text = review_element[0].text
    review = float(review_text.split(" ")[0])
    # Title data
    title = product.find_all("div", class_ = "_3wU53n")[0].text
    # Price data
    price = product.find_all("div", class_ = "_1vC4OE _2rQ-NK")[0].text
    product_table = [title, review, price]
    products_table.append(product_table)
print(products_table)

[['Lenovo Ideapad Core i3 6th Gen - (4 GB/1 TB HDD/DOS/2 GB Graphics) IP 320 Laptop', 4.4, '₹32,490'], ['Lenovo IdeaPad Pentium Quad Core - (4 GB/1 TB HDD/DOS) 110-15IBR Laptop', 3.4, '₹19,990'], ['Dell Insprion Core i7 7th Gen - (8 GB/1 TB HDD/Windows 10/2 GB Graphics) 3567 Laptop', 4.0, '₹51,990'], ['Acer Aspire 3 Celeron Dual Core - (2 GB/500 GB HDD/Windows 10) A315-31 Laptop', 3.9, '₹17,990'], ['Acer Aspire 3 Pentium Quad Core - (4 GB/500 GB HDD/Linux) A315-31 Laptop', 4.0, '₹18,990'], ['Lenovo Yoga 510 Core i3 6th Gen - (4 GB/1 TB HDD/Windows 10 Home) Yoga 510 2 in 1 Laptop', 4.1, '₹38,990'], ['HP Core i5 7th Gen - (8 GB/1 TB HDD/Windows 10 Home/4 GB Graphics) 15-au623tx Laptop', 4.3, '₹53,990'], ['Asus EeeBook Celeron Dual Core - (2 GB/32 GB EMMC Storage/Windows 10 Home) E203NA-FD026T Laptop', 3.8, '₹13,990'], ['Micromax Atom Quad Core - (2 GB/32 GB EMMC Storage/Windows 10 Home) Canvas L1160 Laptop', 3.9, '₹8,990'], ['HP 15 Core i3 6th Gen - (4 GB/1 TB HDD/Windows 10 Home) 15-BS6

In [None]:
# Data loading: fetch and parse the first "laptop" search page for the test cells
payload = {'q':'laptop'}
# A timeout keeps the cell from hanging indefinitely if the server never answers.
r = requests.get('http://www.flipkart.com/search', params = payload, timeout = 30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page as results.
r.raise_for_status()
page = r.content
soup = bs(page, "html.parser")

In [91]:
# Test: extract the leading rating token from every rating element on the page
reviews = soup.find_all("div", class_ = "hGSR34 _2beYZw")
reviews_text_raw = get_text_from_elements(reviews)
# The rating is the first space-separated token of each element's text
reviews_text = [raw_text.split(" ")[0] for raw_text in reviews_text_raw]
print(reviews_text)
# len must be called on the list; print(len) would show the builtin function
# object rather than the number of ratings
print(len(reviews_text))

['4.1', '4.1', '4', '4.1', '4', '3.7', '3.9', '3.8', '3.9', '3.9', '4.4', '3.6', '4.4', '4.5', '4.6', '4.1', '4.4', '3.5', '3.6', '4', '4.3', '4.1', '3.9', '4.6', '3.9', '3.8', '3.7', '4.1']
28


In [92]:
# Test: gather title strings from two element classes into a single list
titles = soup.find_all("div", class_ = "_3wU53n")
titles_text = get_text_from_elements(titles)

# Second selection (class "_2hJpKs"); its texts are appended after the first batch
titles = soup.find_all("div", class_ = "_2hJpKs")
titles_text_popular = get_text_from_elements(titles)
titles_text.extend(titles_text_popular)

print(titles_text)
print(len(titles_text))

['Dell Inspiron Pentium Quad Core - (4 GB/500 GB HDD/Ubuntu) 3552 Notebook', 'Lenovo Ideapad Core i3 6th Gen - (8 GB/1 TB HDD/Windows 10 Home/2 GB Graphics) IP 320 Laptop', 'Asus FX503 Core i7 7th Gen - (8 GB/1 TB HDD/Windows 10 Home/4 GB Graphics) FX503VD-DM111T Gaming Lapto...', 'Acer Core i5 7th Gen - (8 GB/1 TB HDD/Linux) E5 - 575 Laptop', 'HP APU Quad Core A8 - (4 GB/1 TB HDD/Windows 10 Home) 15-BG004AU Laptop', 'Asus APU Dual Core E1 - (4 GB/500 GB HDD/Windows 10 Home) X540YA-XO547T Laptop', 'Acer Switch One Atom Quad Core - (2 GB/32 GB EMMC Storage/Windows 10 Home) SW110-1CT 2 in 1 Laptop', 'Dell Inspiron 15 5000 Core i3 6th Gen - (4 GB/1 TB HDD/Windows 10 Home) 5567 Laptop', 'Dell Inspiron Core i5 7th Gen - (4 GB/1 TB HDD/Windows 10 Home/2 GB Graphics) 3567 Notebook', 'Dell Inspiron 15 3000 Series Core i5 7th Gen - (4 GB/1 TB HDD/Ubuntu/2 GB Graphics) 3567 Laptop', 'Lenovo Ideapad Core i5 7th Gen - (4 GB/1 TB HDD/Windows 10 Home) IP 320E-15IKB Laptop', 'HP 15 Core i3 6th Gen - 

In [46]:
# Test: gather price strings from two element classes into a single list
prices = soup.find_all("div", class_ = "_1vC4OE _2rQ-NK")
prices_text = get_text_from_elements(prices)

# Second selection (class "_1vC4OE _1DTbR5"); its texts are appended after the first batch
prices = soup.find_all("div", class_ = "_1vC4OE _1DTbR5")
prices_text_popular = get_text_from_elements(prices)
prices_text += prices_text_popular

print(prices_text)
print(len(prices_text))


['₹25,990', '₹38,990', '₹17,990', '₹55,990', '₹44,990', '₹20,990', '₹24,990', '₹40,990', '₹25,490', '₹28,990', '₹29,990', '₹28,990', '₹30,990', '₹15,990', '₹28,990', '₹24,990', '₹26,990', '₹21,990', '₹25,490', '₹27,990', '₹67,990', '₹34,990', '₹77,990', '₹40,990', '₹67,990', '₹27,990', '₹15,990', '₹12,490', '₹38,990']
29


In [64]:
# Test: combine the three parallel lists into [title, rating, price] rows
# The lists are built by independent page-wide selections, so their lengths can
# differ; report that explicitly because positional pairing is then unreliable.
if not (len(titles_text) == len(reviews_text) == len(prices_text)):
    print("Warning: list lengths differ (titles, reviews, prices):",
          len(titles_text), len(reviews_text), len(prices_text))
# zip stops at the shortest list, so a shorter ratings or prices list can no
# longer raise IndexError part-way through
product_table = [list(row) for row in zip(titles_text, reviews_text, prices_text)]
print(product_table)

[['Dell Inspiron Core i3 6th Gen - (4 GB/1 TB HDD/Linux) 3467 Laptop', '4.1', '₹25,990'], ['HP 15 Core i5 8th Gen - (8 GB/1 TB HDD/DOS) 15-BS145TU Laptop', '4.2', '₹38,990'], ['HP 15 APU Dual Core E2 - (4 GB/500 GB HDD/DOS) 15Q-BY001AU Laptop', '3.6', '₹17,990'], ['Apple MacBook Air Core i5 5th Gen - (8 GB/128 GB SSD/Mac OS Sierra) MQD32HN/A A1466', '4.7', '₹55,990'], ['Lenovo Ideapad Core i5 7th Gen - (8 GB/1 TB HDD/Windows 10 Home/2 GB Graphics) IP 320 Laptop', '4.3', '₹44,990'], ['HP 15 Pentium Quad Core - (4 GB/1 TB HDD/DOS) 15Q-BU009TU Laptop', '4.1', '₹20,990'], ['Acer Aspire 3 Core i3 7th Gen - (4 GB/500 GB HDD/Linux) A315-51 Laptop', '4.3', '₹24,990'], ['Lenovo Core i5 7th Gen - (8 GB/1 TB HDD/DOS/2 GB Graphics) IP 320E Laptop', '4.3', '₹40,990'], ['Lenovo Ideapad Core i3 6th Gen - (4 GB/1 TB HDD/DOS) IP 320E Laptop', '4.1', '₹25,490'], ['HP 15 Core i3 6th Gen - (4 GB/1 TB HDD/Windows 10 Home) 15Q-bu013TU Laptop', '4.1', '₹28,990'], ['Dell Inspiron 14 3000 Core i3 6th Gen - (4 