In [4]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.options import Options
from bs4 import BeautifulSoup
import csv
import time

# Initialize WebDriver for Microsoft Edge.
# 'msedgedriver' is passed as the driver executable path as-is.
edge_driver_path = 'msedgedriver'
service = Service(executable_path=edge_driver_path)

edge_options = Options()
edge_options.add_argument("--headless")  # Enable headless mode
driver = webdriver.Edge(service=service, options=edge_options)

# JavaScript that clicks the first <span> whose text contains "See all reviews".
# It returns true when a matching span was found and clicked and false otherwise,
# so the Python side can tell whether the click actually happened.
click_see_all_reviews_script = """
    var seeAllReviewsButton = Array.from(document.querySelectorAll("span")).find(el => el.textContent.includes("See all reviews"));
    if (seeAllReviewsButton) {
        seeAllReviewsButton.click();
        return true;
    }
    return false;
"""

try:
    # Navigate to the app's Google Play page
    driver.get("https://play.google.com/store/apps/details?id=ai.replika.app&hl=en_US&gl=US")

    # Give the page some time to load
    time.sleep(5)

    wait = WebDriverWait(driver, 30)

    # Keep trying the click until the button exists instead of silently carrying
    # on without it; a TimeoutException with the message below is raised if the
    # button never shows up.
    wait.until(
        lambda d: d.execute_script(click_see_all_reviews_script),
        message='Could not find a "See all reviews" button to click.',
    )

    # Wait for at least one review to be present as an indication the modal content is loaded
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".RHo1pe")))
except BaseException:
    # The browser runs headless, so a failed setup would otherwise leave an
    # invisible Edge process behind. Shut it down, then surface the error.
    driver.quit()
    raise

# Scroll within the modal to load more reviews.
# JavaScript source that is passed to driver.execute_script by the scraping loop:
#   1. Look up the first element matching '.PFAhAf'.
#   2. If nothing matches, fall back to the first element in document order whose
#      content overflows (scrollHeight > clientHeight) and whose computed
#      overflow-y is 'scroll' or 'auto'.
#   3. If an element was found, set its scrollTop to its scrollHeight, i.e. jump
#      to the bottom of that element.
# The script returns nothing and does nothing when no element is found.
# NOTE(review): '.PFAhAf' is presumably the scroll container of the reviews modal
# and looks like a generated class name - confirm it still matches the live page.
scroll_script = """
    var modal = document.querySelector('.PFAhAf');  // Adjust if this is not the correct selector
    if (!modal) {
        modal = Array.from(document.querySelectorAll('*')).find(
            el => el.scrollHeight > el.clientHeight && (getComputedStyle(el).overflowY === 'scroll' || getComputedStyle(el).overflowY === 'auto')
        );
    }
    if (modal) {
        modal.scrollTop = modal.scrollHeight;
    }
"""

# --- Scraping state ---------------------------------------------------------
# reviews_data: one row per unique review, in CSV_HEADER column order.
# checkpoint_count: number of checkpoint files written so far.
# extracted_review_texts: every review text collected so far (still populated
#   for anything that inspects it; de-duplication uses seen_review_keys).
# max_retries / retry_delay: total error budget for the run and the pause, in
#   seconds, before trying again after an error.
reviews_data = []
checkpoint_count = 0
extracted_review_texts = set()
max_retries = 10
retry_delay = 120

CSV_HEADER = ["User Name", "Review Date", "Star Rating", "Review Text", "Response Date", "Response Text"]

# De-duplication key is (user name, review date, review text). Keying on the
# text alone would drop reviews from different users who wrote the same thing.
seen_review_keys = set()

# Stop after this many consecutive scroll passes that yield no new review;
# without it the loop never ends once the page has nothing more to load.
max_stalled_passes = 10
stalled_passes = 0
retries_left = max_retries


def _element_text(parent, selector, default=''):
    """Return the text of the first match of *selector* under *parent*, or *default*."""
    element = parent.select_one(selector)
    return element.text if element is not None else default


def _parse_review(review):
    """Turn one '.RHo1pe' element into a CSV row; return None if it has no review text."""
    text_element = review.select_one('.h3YV2d')
    if text_element is None:
        return None

    rating_element = review.select_one('.iXRFPc')
    star_rating = rating_element.get('aria-label', '') if rating_element is not None else ''

    developer_response = review.select_one('.ocpBU .ras4vb')
    if developer_response is not None:
        response_text = developer_response.text
        response_date = _element_text(review, '.ocpBU .I9Jtec', 'No response date')
    else:
        response_text = 'No response'
        response_date = 'No response date'

    return [
        _element_text(review, '.X5PpBb'),
        _element_text(review, '.bp9Aid'),
        star_rating,
        text_element.text,
        response_date,
        response_text,
    ]


def _write_reviews_csv(path, rows):
    """Write the header plus *rows* to *path*, replacing any existing file."""
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(CSV_HEADER)
        writer.writerows(rows)


try:
    while True:
        try:
            driver.execute_script(scroll_script)
            time.sleep(5)  # Adjust based on how quickly the page loads more reviews

            # Extract reviews using BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            new_reviews = 0
            for review in soup.select('.RHo1pe'):
                row = _parse_review(review)
                if row is None:
                    # Skip an element without review text rather than raising:
                    # it stays in the page, so every retry would fail the same way.
                    continue
                review_key = (row[0], row[1], row[3])
                if review_key in seen_review_keys:
                    continue
                seen_review_keys.add(review_key)
                extracted_review_texts.add(row[3])
                reviews_data.append(row)
                new_reviews += 1

            # Checkpoint the file every 500 new reviews
            if len(reviews_data) >= (checkpoint_count + 1) * 500:
                next_checkpoint = checkpoint_count + 1
                _write_reviews_csv(f'replika_reviews_checkpoint_{next_checkpoint}.csv', reviews_data)
                # Only count the checkpoint once the file was actually written.
                checkpoint_count = next_checkpoint
                print(f"Checkpoint {checkpoint_count}: Extracted and saved {len(reviews_data)} unique reviews.")

            if new_reviews:
                stalled_passes = 0
            else:
                stalled_passes += 1
                if stalled_passes >= max_stalled_passes:
                    print(f"No new reviews after {stalled_passes} consecutive scrolls. Stopping.")
                    break

        except Exception as e:
            # Best-effort boundary: report, wait, and try again until the
            # error budget is used up.
            print(f"Error occurred: {str(e)}")
            retries_left -= 1
            if retries_left <= 0:
                print("Max retries reached. Exiting the script.")
                break
            print(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
finally:
    # Runs on normal exit, on exhausted retries and on KeyboardInterrupt, so the
    # reviews gathered since the last checkpoint are not lost.
    try:
        _write_reviews_csv('replika_reviews_final.csv', reviews_data)
        print(f"Saved {len(reviews_data)} unique reviews to replika_reviews_final.csv.")
    finally:
        # Close the browser
        driver.quit()

Checkpoint 1: Extracted and saved 500 unique reviews.
Checkpoint 2: Extracted and saved 1000 unique reviews.
Checkpoint 3: Extracted and saved 1500 unique reviews.
Checkpoint 4: Extracted and saved 2000 unique reviews.
Checkpoint 5: Extracted and saved 2500 unique reviews.
Checkpoint 6: Extracted and saved 3000 unique reviews.
Checkpoint 7: Extracted and saved 3500 unique reviews.
Checkpoint 8: Extracted and saved 4000 unique reviews.
Checkpoint 9: Extracted and saved 4500 unique reviews.
Checkpoint 10: Extracted and saved 5000 unique reviews.
Checkpoint 11: Extracted and saved 5500 unique reviews.
Checkpoint 12: Extracted and saved 6000 unique reviews.
Checkpoint 13: Extracted and saved 6500 unique reviews.
Checkpoint 14: Extracted and saved 7000 unique reviews.
Checkpoint 15: Extracted and saved 7500 unique reviews.
Checkpoint 16: Extracted and saved 8000 unique reviews.
Checkpoint 17: Extracted and saved 8500 unique reviews.
Checkpoint 18: Extracted and saved 9000 unique reviews.
Ch

KeyboardInterrupt: 