In [6]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import pandas as pd
from urllib.parse import urlparse, parse_qs

# Build the Chrome configuration; flags are applied in the order listed.
options = Options()
for chrome_flag in (
    "--headless",  # Run in headless mode
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(chrome_flag)

# Location of the locally installed chromedriver binary -- update this path
# to match your machine before running the cell.
chromedriver_path = r'C:\New folder\chromedriver-win64\chromedriver.exe'
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service, options=options)

# Mapping of phone display name -> Flipkart review-listing URL.
# The key is the name stored with each scraped review; the URL's `pid` query
# parameter is what extract_product_id() reads, and the scraper paginates by
# appending "&page=N" to the URL.
phone_urls = {
    "Samsung Galaxy S23": "https://www.flipkart.com/samsung-galaxy-s23-5g-phantom-black-128-gb/product-reviews/itm1f3efe01d1c61?pid=MOBGNPGZVX4PCTTF&lid=LSTMOBGNPGZVX4PCTTFYWYWBL&marketplace=FLIPKART",
    "Vivo T3": "https://www.flipkart.com/vivo-t3-ultra-frost-green-256-gb/product-reviews/itme360ff5b7dbab?pid=MOBH4EACZ7SACMMM&lid=LSTMOBH4EACZ7SACMMMMVZG93&marketplace=FLIPKART",
    "Google Pixel 8": "https://www.flipkart.com/google-pixel-8-hazel-128-gb/product-reviews/itm67e2a2531aaac?pid=MOBGT5F2WD8HPTPZ&lid=LSTMOBGT5F2WD8HPTPZ4A9QHI&marketplace=FLIPKART",
    "Motorola Edge 50": "https://www.flipkart.com/motorola-edge-50-neo-pantone-grisaille-256-gb/product-reviews/itm5b85defa76389?pid=MOBHFHDRVUDB3HFX&lid=LSTMOBHFHDRVUDB3HFXLMEVTW&marketplace=FLIPKART",
    "Realme 12 Pro": "https://www.flipkart.com/realme-12-pro-5g-navigator-beige-256-gb/product-reviews/itmcc78f150eeabd?pid=MOBGYQ6BVDHRJRSG&lid=LSTMOBGYQ6BVDHRJRSGDKJZZD&marketplace=FLIPKART"
}

def fetch_html(url):
    """Load ``url`` in the shared Selenium driver and return the parsed page.

    Returns a BeautifulSoup tree built from the driver's page source, or
    ``None`` (after printing the error) if any step raises.
    """
    try:
        driver.get(url)
        # Fixed pause so the page has time to load before it is read.
        time.sleep(2)
        rendered_html = driver.page_source
        parsed_page = BeautifulSoup(rendered_html, "html.parser")
    except Exception as exc:
        print(f"Error fetching the page: {exc}")
        return None
    return parsed_page


def _strip_read_more(text):
    """Trim whitespace and drop a trailing 'READ MORE' expander label, if present."""
    marker = 'READ MORE'
    cleaned = text.strip()
    if cleaned.endswith(marker):
        # Only the trailing label is removed, so the same words inside the
        # review body are preserved; re-trim whatever whitespace preceded it.
        cleaned = cleaned[:-len(marker)].rstrip()
    return cleaned


def extract_reviews(soup):
    """Extract rating/review pairs from a parsed review-listing page.

    Args:
        soup: Parsed page (as returned by ``fetch_html``), or ``None``.

    Returns:
        A list of dicts with keys 'Rating' and 'Review' (both stripped
        strings). Blocks missing either element are skipped. An empty list
        is returned when ``soup`` is falsy or no review block matches.
    """
    reviews = []
    if not soup:
        return reviews  # Nothing was fetched, so there is nothing to parse

    # The class names below are tied to the current page markup and must be
    # updated if the site's structure changes.
    review_blocks = soup.find_all('div', {'class': 'cPHDOP col-12-12'})
    for block in review_blocks:
        try:
            rating_elem = block.find('div', {'class': 'XQDdHH Ga3i8K'})
            review_elem = block.find('div', {'class': 'ZmyHeo'})

            # Containers with this class that lack a rating or a review body
            # are not reviews; skip them.
            if rating_elem and review_elem:
                reviews.append({
                    'Rating': rating_elem.text.strip(),
                    'Review': _strip_read_more(review_elem.text),
                })
        except Exception as e:
            # Best effort: one malformed block must not abort the whole page.
            print(f"Error parsing a review block: {e}")
    return reviews
def extract_product_id(url):
    """Return the value of the ``pid`` query parameter of ``url``.

    If the parameter occurs several times the first value wins; ``None`` is
    returned when the URL carries no ``pid`` parameter.
    """
    query_params = parse_qs(urlparse(url).query)
    pid_values = query_params.get('pid', [None])
    return pid_values[0]

# Function to scrape reviews for a single phone
def scrape_reviews_for_phone(phone_name, phone_url, max_pages=None):
    """Scrape the review pages of one phone and return the collected reviews.

    Pages are requested one after another starting at page 1. Scraping stops
    at the first page that yields no reviews (end of the listing, or the page
    markup changed) or once ``max_pages`` pages have been fetched.

    Args:
        phone_name: Display name stored with each review under 'Phone Name'.
        phone_url: Review-listing URL for the phone, with or without an
            existing query string.
        max_pages: Optional cap on the number of pages fetched. ``None``
            (the default) keeps going until an empty page is returned.

    Returns:
        A list of review dicts as produced by ``extract_reviews``, each
        extended with 'Product ID' and 'Phone Name'. If an exception occurs
        mid-way it is printed and the reviews gathered so far are returned.
    """
    reviews = []
    product_id = extract_product_id(phone_url)
    # Use '&' only when the URL already has a query string; otherwise the
    # page parameter has to start the query with '?'.
    separator = '&' if '?' in phone_url else '?'
    page = 1
    print(f"Scraping reviews for: {phone_name} (Product ID: {product_id})")

    try:
        while max_pages is None or page <= max_pages:
            page_url = f"{phone_url}{separator}page={page}"
            print(f"Scraping page {page}...")
            soup = fetch_html(page_url)
            page_reviews = extract_reviews(soup)
            if not page_reviews:
                print(f"No more reviews found for {phone_name} or page structure changed.")
                break
            for review_data in page_reviews:
                # Add product ID and phone name to each review entry
                review_data['Product ID'] = product_id
                review_data['Phone Name'] = phone_name
            reviews.extend(page_reviews)
            page += 1
        else:
            # Reached only when the loop ends without `break`, i.e. the
            # optional page cap stopped the scrape.
            print(f"Reached the page limit ({max_pages}) for {phone_name}.")
    except Exception as e:
        print(f"An error occurred during scraping for {phone_name}: {e}")
    return reviews

# List to hold all reviews from all phones
all_reviews = []

try:
    # Loop through each phone and scrape reviews
    for phone_name, phone_url in phone_urls.items():
        phone_reviews = scrape_reviews_for_phone(phone_name, phone_url)
        all_reviews.extend(phone_reviews)
finally:
    # Always close the WebDriver, even if scraping is interrupted or raises,
    # so the browser process is not left running.
    driver.quit()

# Save all reviews into a single CSV file
if all_reviews:
    # Fix the column order explicitly so the CSV layout does not depend on
    # the key order of the scraped dicts.
    df = pd.DataFrame(all_reviews, columns=['Product ID', 'Phone Name', 'Rating', 'Review'])
    df.to_csv(r'E:\GUVI\FINAL PROJECT\combined_flipkart_phone_reviews.csv', index=False, encoding='utf-8')

    print("All reviews saved to 'combined_flipkart_phone_reviews.csv'.")
else:
    print("No reviews to save.")


Scraping reviews for: Samsung Galaxy S23 (Product ID: MOBGNPGZVX4PCTTF)
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
No more reviews found for Samsung Galaxy S23 or page structure changed.
Scraping reviews for: Vivo T3 (Product ID: MOBH4EACZ7SACMMM)
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
No more reviews found for Vivo T3 or page structure changed.
Scraping reviews for: Google Pixel 8 (Product ID: MOBGT5F2WD8HPTPZ)
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page

In [3]:
import pkg_resources
import sys

# importlib.metadata is the standard-library way to query installed
# distributions; it replaces the deprecated pkg_resources API and does not
# require setuptools to be installed.
from importlib import metadata

# Distribution names whose installed versions are reported below
packages = [
    "beautifulsoup4",
    "selenium",
    "pandas",
    "textblob",
    "transformers",
    "streamlit",
    "matplotlib",
    "seaborn",
]

# Check Python version
print(f"Python: {sys.version}")

# Check versions of specified packages
for package in packages:
    try:
        version = metadata.version(package)
    except metadata.PackageNotFoundError:
        print(f"{package} is not installed.")
    else:
        print(f"{package}: {version}")


Python: 3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]
beautifulsoup4: 4.12.3
selenium: 4.25.0
pandas: 2.2.2
textblob: 0.15.3
transformers: 4.45.1
streamlit: 1.32.0
matplotlib: 3.8.4
seaborn: 0.13.2
