In [1]:
!apt-get update
!apt-get install -y chromium-browser
!apt-get install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/chromedriverAQ\BaA\

!pip install selenium
!pip install beautifulsoup4

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to cloud.r-project.org] [Connected to ppa.launchpadcontent.net                                                                                                     Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                                                    Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
                                                                                                    Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 https://cloud.r-project.org/bin/

In [2]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from typing import List, Tuple, Dict, Any
from threading import Lock

class DriverSingleton:
    """Singleton class for managing a single instance of a Selenium WebDriver.

    This class ensures that only one instance of the WebDriver is created and shared across the application.
    It provides a thread-safe implementation to handle concurrent access.
    """
    _instance: 'DriverSingleton' = None
    _lock: Lock = Lock()

    def __new__(cls) -> 'DriverSingleton':
        """Create a new instance of DriverSingleton or return the existing instance.

        Returns:
            DriverSingleton: The singleton instance of this class.
        """
        with cls._lock:
            if cls._instance is None:
                cls._instance = super(DriverSingleton, cls).__new__(cls)
                cls._instance.driver = cls._init_driver()
            return cls._instance

    @staticmethod
    def _init_driver() -> webdriver.Chrome:
        """Initialize the Selenium WebDriver with specific options.

        Returns:
            WebDriver: A configured instance of Chrome WebDriver.
        """
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument('--disable-gpu')
        options.add_argument('--window-size=1920x1080')
        options.add_argument('--disable-extensions')
        options.add_argument('--proxy-server="direct://"')
        options.add_argument('--proxy-bypass-list=*')
        options.add_argument('--start-maximized')
        options.add_argument('--disable-infobars')
        options.add_argument('--disable-browser-side-navigation')
        options.add_argument('--disable-logging')
        options.add_argument('--remote-debugging-port=9222')
        return webdriver.Chrome(options=options)

    def quit(self) -> None:
        """Quit the WebDriver instance and reset the singleton instance."""
        if self.driver:
            self.driver.quit()
            DriverSingleton._instance = None

def Scraper(cls):
    """Decorator function to initialize and close the WebDriver for a scraper class.

    This decorator ensures that the WebDriver instance from DriverSingleton is initialized
    and made available to the scraper class.

    Args:
        cls (type): The class to be decorated.

    Returns:
        type: The wrapped class with WebDriver initialization.
    """
    class Wrapped(cls):
        def __init__(self, *args, **kwargs):
            self.driver = DriverSingleton().driver
            super().__init__(*args, **kwargs)

    return Wrapped

In [3]:
from typing import Dict, Optional

class LanguageFactory:
    """Factory class for managing search query templates in different languages.

    This class provides a way to manage and retrieve query templates in various languages.
    It supports dynamic generation of search queries based on the provided parameters.

    Attributes:
        languages (Dict[str, Dict[str, str]]): A dictionary storing query templates for different languages.
    """

    def __init__(self):
        """Initialize the LanguageFactory with predefined query templates."""
        self.languages: Dict[str, Dict[str, str]] = {
            "EN": {
                "search_query": "{name} {category} in {location}",
                "category_query": "{category} in {location}"
            },
            "ES": {
                "search_query": "{category} {name} en {location}",
                "category_query": "{category} en {location}"
            }
        }

    def get_query(self, query_type: str, language: str, **kwargs) -> Optional[str]:
        """Get the query template for the specified language and query type.

        Args:
            query_type (str): The type of query template to retrieve (e.g., 'search_query', 'category_query').
            language (str): The language code for the desired query template (e.g., 'EN', 'ES').
            **kwargs: Additional keyword arguments to format the query template.

        Returns:
            Optional[str]: The formatted query string if the template is found, otherwise None.
        """
        query_template = self.languages.get(language, {}).get(query_type)
        if query_template:
            return query_template.format(**kwargs)
        return None


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.webdriver import WebDriver
from bs4 import BeautifulSoup
import time
import pandas as pd
from typing import List, Dict, Any, Optional

@Scraper
class GoogleMapsScraperSearchPlaces:
    """Class for scraping place information from Google Maps.

    This class uses the Selenium WebDriver to interact with Google Maps and scrape information about various places
    based on specified search queries.

    Attributes:
        div_class (str): CSS class for place result div elements.
        scrollable_div_class (str): CSS class for the scrollable div containing the results.
        result_item_class (str): CSS class for result item elements.
        end_of_list_text (str): Text indicating the end of the result list.
        rating_class (str): CSS class for rating elements.
        reviews_count_class (str): CSS class for reviews count elements.
        category_class (str): CSS class for category elements.
        website_data_value (str): Data value attribute for website elements.
        initial_wait (int): Initial wait time in seconds after loading the page.
        scroll_wait (int): Wait time in seconds after scrolling the page.
        language_factory (LanguageFactory): Instance of LanguageFactory to get search query templates.
        driver (WebDriver): Selenium WebDriver instance.
    """

    def __init__(self) -> None:
        self.div_class: str = 'Nv2PK'
        self.scrollable_div_class: str = 'm6QErb.DxyBCb.kA9KIf.dS8AEf'
        self.result_item_class: str = 'hfpxzc'
        self.end_of_list_text: str = "You've reached the end of the list."
        self.rating_class: str = 'MW4etd'
        self.reviews_count_class: str = 'UY7F9'
        self.category_class: str = 'W4Efsd'
        self.website_data_value: str = 'Website'
        self.language_factory: LanguageFactory = LanguageFactory()

        self.initial_wait: int = 10
        self.scroll_wait: int = 5

    def _search_places(self, query_type: str, name: Optional[str] = None, category: Optional[str] = None, location: Optional[str] = None, language: str = "EN") -> List[Dict[str, Any]]:
        """Private method to perform the search and scrape place data.

        Args:
            query_type (str): The type of query to perform.
            name (Optional[str]): The name of the place to search for.
            category (Optional[str]): The category of the place to search for.
            location (Optional[str]): The location to search in.
            language (str): The language to use for the search query.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries containing place data.
        """
        search_query: str = self.language_factory.get_query(query_type, language, name=name, category=category, location=location)
        search_url: str = f"https://www.google.com/maps/search/{search_query}"
        print(f'---search_url: {search_query}')
        self.driver.get(search_url)
        time.sleep(self.initial_wait)

        scrollable_div = self.driver.find_element(By.CLASS_NAME, self.scrollable_div_class)

        all_place_data: List[Dict[str, Any]] = []
        processed_urls: set = set()

        while True:
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            divs = soup.find_all('div', class_=self.div_class)

            if not divs:
                break

            total_iterations = len(divs)
            last_index = len(all_place_data)

            for index, div in enumerate(divs):
                if index < last_index:
                    continue
                try:
                    place_name_element = div.find('a', class_=self.result_item_class)
                    place_name = place_name_element['aria-label'] if place_name_element else 'N/A'
                    link_element = place_name_element['href'] if place_name_element else 'N/A'

                    if link_element in processed_urls:
                        continue

                    processed_urls.add(link_element)

                    rating_element = div.find('span', class_=self.rating_class)
                    rating = rating_element.text if rating_element else 'N/A'

                    reviews_count_element = div.find('span', class_=self.reviews_count_class)
                    reviews_count = reviews_count_element.text.strip('()') if reviews_count_element else 'N/A'

                    category_element = div.find_all('div', class_=self.category_class)[1].find_all('span')[1]
                    category = category_element.text if category_element else 'N/A'

                    address_element = div.find_all('div', class_=self.category_class)[2].find_all('span')[-1]
                    address = address_element.text if address_element else 'N/A'
                    print(f'---Iteration: {index + 1} of {total_iterations} -----> {address}')

                    hours_element = None
                    for span in div.find_all('span'):
                        if 'Opens' in span.text or 'Closed' in span.text:
                            hours_element = span
                            break
                    hours = hours_element.text if hours_element else 'N/A'

                    website_element = div.find('a', href=True, attrs={'data-value': self.website_data_value})
                    website = website_element['href'] if website_element else 'N/A'

                    all_place_data.append({
                        'Name': name,
                        'Category': category,
                        'Location': location,
                        'Place Name': place_name,
                        'Link': link_element,
                        'Rating': rating,
                        'Reviews Count': reviews_count,
                        'Address': address,
                        'Hours': hours,
                        'Website': website
                    })
                except Exception as e:
                    print(f"Error processing place element: {e}")

            if self.driver.page_source.find(self.end_of_list_text) != -1:
                print("Reached the end of the list.")
                break

            results = self.driver.find_elements(By.CLASS_NAME, self.result_item_class)
            self.driver.execute_script("arguments[0].scrollIntoView(true);", results[-1])
            time.sleep(self.scroll_wait)

        return all_place_data

    def run(self, queries: List[Dict[str, Any]]) -> pd.DataFrame:
        """Method to execute the search and return the results as a DataFrame.

        Args:
            queries (List[Dict[str, Any]]): A list of query dictionaries specifying the search parameters.

        Returns:
            pd.DataFrame: A DataFrame containing the scraped place data.
        """
        all_place_data: List[Dict[str, Any]] = []

        for query in queries:
            query_type = query.get("query_type", "search_query")
            name = query.get("name")
            category = query.get("category")
            location = query.get("location")
            language = query.get("language", "EN")
            place_data = self._search_places(query_type, name, category, location, language)
            all_place_data.extend(place_data)

        DriverSingleton().quit()
        return pd.DataFrame(all_place_data)


In [5]:
def main():
    """Main function to execute the scraping for predefined queries."""
    scraper = GoogleMapsScraperSearchPlaces()
    queries = [
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Toronto Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Montreal Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Vancouver Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Calgary Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Edmonton Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Ottawa Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Winnipeg Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Quebec City Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Hamilton Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Kitchener Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "London Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Halifax Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Victoria Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Saskatoon Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Regina Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "St. John's Canada", "language": "EN"},
        {"query_type": "search_query", "name": "Scotiabank", "category": "", "location": "Windsor Canada", "language": "EN"}
    ]

    df = scraper.run(queries)
    return df

In [None]:
df_location = main()

---search_url: Scotiabank  in Toronto Canada
---Iteration: 1 of 12 -----> 44 King St W
---Iteration: 2 of 12 -----> 222 Queen St W
---Iteration: 3 of 12 -----> 145 King St W
---Iteration: 4 of 12 -----> 292 Spadina Ave.
---Iteration: 5 of 12 -----> 992 Bloor St W
---Iteration: 6 of 12 -----> 416 Spadina Rd
---Iteration: 7 of 12 -----> 19 Bloor St W
---Iteration: 8 of 12 -----> 279 King St E
---Iteration: 9 of 12 -----> 392 Bay St.
---Iteration: 10 of 12 -----> 643 College St
---Iteration: 11 of 12 -----> 522 University Ave
---Iteration: 12 of 12 -----> 41 Harbour Square
---Iteration: 13 of 20 -----> 2080 Queen St E
---Iteration: 14 of 20 -----> 410 Bathurst St Unit A2
---Iteration: 15 of 20 -----> 332 Bloor St W
---Iteration: 16 of 20 -----> 1241 St Clair Ave W
---Iteration: 17 of 20 -----> 1046 Queen St E
---Iteration: 18 of 20 -----> 720 King St W
---Iteration: 19 of 20 -----> 1 St Clair Ave E
---Iteration: 20 of 20 -----> 438 Eglinton Ave W
---Iteration: 21 of 32 -----> 1391 Lawrenc

In [None]:
df_location

Unnamed: 0,Name,Category,Location,Place Name,Link,Rating,Reviews Count,Address,Hours,Website
0,Scotiabank,Bank,Toronto Canada,Scotiabank,https://www.google.com/maps/place/Scotiabank/d...,2.4,158,44 King St W,Closed ⋅ Opens 8 AM Mon,https://maps.scotiabank.com/locator
1,Scotiabank,Bank,Toronto Canada,Scotiabank,https://www.google.com/maps/place/Scotiabank/d...,2.8,58,222 Queen St W,Closed ⋅ Opens 9:30 AM Mon,https://maps.scotiabank.com/locator
2,Scotiabank,Bank,Toronto Canada,Scotiabank,https://www.google.com/maps/place/Scotiabank/d...,2.5,41,145 King St W,Closed ⋅ Opens 9:30 AM Mon,https://maps.scotiabank.com/locator
3,Scotiabank,Bank,Toronto Canada,Scotiabank,https://www.google.com/maps/place/Scotiabank/d...,2.4,80,292 Spadina Ave.,Closed ⋅ Opens 9:30 AM Mon,https://maps.scotiabank.com/locator
4,Scotiabank,Bank,Toronto Canada,Scotiabank,https://www.google.com/maps/place/Scotiabank/d...,2.7,61,992 Bloor St W,Closed ⋅ Opens 9:30 AM Mon,https://maps.scotiabank.com/locator
...,...,...,...,...,...,...,...,...,...,...
390,Scotiabank,Bank,Windsor Canada,Scotiabank,https://www.google.com/maps/place/Scotiabank/d...,2.8,39,3745 Tecumseh Rd E,Closed ⋅ Opens 9:30 AM Mon,https://maps.scotiabank.com/locator
391,Scotiabank,Bank,Windsor Canada,Scotiabank,https://www.google.com/maps/place/Scotiabank/d...,3.5,48,1570 Huron Church Rd,Closed ⋅ Opens 9:30 AM Mon,https://maps.scotiabank.com/locator
392,Scotiabank,Bank,Windsor Canada,Scotiabank,https://www.google.com/maps/place/Scotiabank/d...,2.7,51,7191 Tecumseh Rd E,Closed ⋅ Opens 9:30 AM Mon,https://maps.scotiabank.com/locator
393,Scotiabank,Bank,Windsor Canada,Scotiabank,https://www.google.com/maps/place/Scotiabank/d...,3.8,16,5795 Malden Rd,Closed ⋅ Opens 9:30 AM Mon,https://maps.scotiabank.com/locator


In [7]:
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)

In [None]:
from gspread_dataframe import set_with_dataframe

sheet = gc.open('extracted_data')

sheet.del_worksheet(sheet.worksheet("placeSearchResults"))
worksheet = sheet.add_worksheet(title="placeSearchResults", rows= df_location.shape[0], cols = df_location.shape[1])
set_with_dataframe(worksheet, df_location)

In [44]:
def saveAsCSV(filename, dataframe):
    path = f'/content/drive/MyDrive/Colab Notebooks/ScrapperBanksComments/{filename}.csv'

    dataframe.to_csv(path, encoding='utf-8-sig', index=False)

In [None]:
saveAsCSV('placeSearchResults', df_location)

In [3]:
import pandas as pd

csv_file_path = '/content/drive/MyDrive/Colab Notebooks/ScrapperBanksComments/placeSearchResults.csv'
df_location = pd.read_csv(csv_file_path)

In [7]:
@Scraper
class GoogleMapsScraperDetails:
    def __init__(self):
        self.address_class = 'Io6YTe fontBodyMedium kR99db'
        self.review_class = 'jftiEf fontBodyMedium'
        self.review_scroll_panel = 'm6QErb DxyBCb kA9KIf dS8AEf XiKgde'
        self.initial_wait = 5  # Initial wait time in seconds after loading the page
        self.scroll_wait = 5  # Scroll wait time in seconds

    def get_place_details(self, url: str, max_reviews: int) -> Dict[str, Any]:
        self.driver.get(url)
        time.sleep(self.initial_wait)  # Wait for the page to load completely
        print('Page loaded, parsing content')

        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        address_element = soup.find('div', class_=self.address_class)
        address = address_element.text if address_element else 'N/A'
        print(f'Captured address: {address}')

        # Click on the "Reviews" tab
        try:
            tab_buttons = self.driver.find_elements(By.XPATH, "//div[@role='tablist']/button")
            reviews_tab_button = tab_buttons[1]  # The second button on the list
            self.driver.execute_script("arguments[0].scrollIntoView(true);", reviews_tab_button)
            time.sleep(1)  # Allow time for the button to be visible
            self.driver.execute_script("arguments[0].click();", reviews_tab_button)
            time.sleep(5)  # Wait for the comments tab to load
        except Exception as e:
            print(f'Error clicking on reviews tab: {e}')
            return {
                'URL': url,
                'Address': address,
                'Reviews': []
            }

        reviews = set()  # Use a set to avoid duplicates
        scroll_pause_time = 3  # Waiting time between iterations
        last_review_id = None  # ID of the last captured review
        iteration_count = 0  # Count iterations without new reviews

        while len(reviews) < max_reviews:
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            review_elements = soup.find_all('div', class_=self.review_class)
            print(f'Found {len(review_elements)} reviews on the page')

            new_reviews_captured = False  # Flag to check if new reviews have been captured in this iteration

            for review in review_elements:
                try:
                    review_id = review['data-review-id']
                    if review_id == last_review_id:
                        continue

                    reviewer_name = review.find('div', class_='d4r55').text if review.find('div', class_='d4r55') else 'N/A'
                    review_date = review.find('span', class_='rsqaWe').text if review.find('span', class_='rsqaWe') else 'N/A'
                    review_rating = review.find('span', class_='kvMYJc')['aria-label'] if review.find('span', class_='kvMYJc') else 'N/A'

                    more_button = review.find('button', {'aria-label': 'See more'})
                    if more_button:
                        selenium_review = self.driver.find_element(By.XPATH, f"//button[@aria-label='See more'][@data-review-id='{review_id}']")
                        selenium_review.click()
                        time.sleep(1)
                        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                        review = soup.find('div', {'data-review-id': review_id})

                    review_text = review.find('span', class_='wiI7pd').text if review.find('span', class_='wiI7pd') else 'N/A'

                    review_tuple = (reviewer_name, review_text, review_date, review_rating)
                    if review_tuple not in reviews:
                        reviews.add(review_tuple)
                        last_review_id = review_id
                        new_reviews_captured = True  # Mark that new reviews have been captured in this iteration
                        print(f'|-----> ({len(reviews)}/{max_reviews}) Captured review from {reviewer_name}')

                except Exception as e:
                    print(f'Error capturing review: {e}')

            if new_reviews_captured:
                iteration_count = 0  # Reset iteration count if new reviews are captured
            else:
                iteration_count += 1

            if iteration_count > 2:  # Break loop if no new reviews captured in 2 iterations
                print('No new reviews captured in 2 iterations, moving to next link.')
                return {
                    'URL': url,
                    'Address': address,
                    'Reviews': [
                        {'Reviewer Name': r[0], 'Review Text': r[1], 'Review Date': r[2], 'Review Rating': r[3]}
                        for r in reviews
                    ]
                }

            # If no new reviews were captured, try scrolling down
            if not new_reviews_captured:
                try:
                    scroll_panel = self.driver.find_element(By.CSS_SELECTOR, '.' + self.review_scroll_panel.replace(' ', '.'))
                    self.driver.execute_script("arguments[0].scrollBy(0, 500);", scroll_panel)
                    time.sleep(self.scroll_wait)
                except Exception as e:
                    print('Error scrolling the review panel:', e)
            else:
                # Scroll to the last captured review
                if last_review_id:
                    try:
                        last_review_element = self.driver.find_element(By.XPATH, f"//div[@data-review-id='{last_review_id}']")
                        self.driver.execute_script("arguments[0].scrollIntoView(true);", last_review_element)
                        time.sleep(scroll_pause_time)
                    except Exception as e:
                        print(f'Error scrolling to last review: {e}')

        return {
            'URL': url,
            'Address': address,
            'Reviews': [
                {'Reviewer Name': r[0], 'Review Text': r[1], 'Review Date': r[2], 'Review Rating': r[3]}
                for r in reviews
            ]
        }

    def run(self, url_tuples: List[Tuple[str, int]]):
        all_details = []
        results = []

        for index, (url, max_reviews) in enumerate(url_tuples):
            try:
                print(f'Processing {index + 1} of {len(url_tuples)}: with max {max_reviews} reviews')
                details = self.get_place_details(url, max_reviews)
                all_details.append(details)
                print(f'Finished processing {index + 1} of {len(url_tuples)}')

                # Save progress every 10 links
                if (index + 1) % 10 == 0:
                    df_partial = self._convert_to_dataframe(all_details)
                    results.append(df_partial)
                    all_details = []  # Clear details after saving

            except Exception as e:
                print(f'Error processing {url}: {e}')

        DriverSingleton().quit()
        print('WebDriver quit, converting final results to DataFrame')

        # Convert remaining details to a DataFrame
        if all_details:
            df_final = self._convert_to_dataframe(all_details)
            results.append(df_final)

        # Concatenate all DataFrames
        return pd.concat(results, ignore_index=True)

    def _convert_to_dataframe(self, details: List[Dict[str, Any]]) -> pd.DataFrame:
        """Convert the scraped details to a pandas DataFrame."""
        all_details_flat = []
        for detail in details:
            for review in detail['Reviews']:
                review_data = {
                    'URL': detail['URL'],
                    'Address': detail['Address'],
                    **review
                }
                all_details_flat.append(review_data)
        return pd.DataFrame(all_details_flat)

In [8]:
def mainScrapper_2() -> pd.DataFrame:
    scraper = GoogleMapsScraperDetails()

    # Generate list of tuples (Link, Reviews Count)
    url_tuples = list(zip(df_location['Link'], df_location['Reviews Count']))

    df = scraper.run(url_tuples)
    return df

In [43]:
df_comments_by_location = mainScrapper_2()

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
|-----> (61/105.0) Captured review from Joshua Etherington
|-----> (62/105.0) Captured review from Cameron Horton
|-----> (63/105.0) Captured review from SIMRAN KAUR
|-----> (64/105.0) Captured review from Ratko K
|-----> (65/105.0) Captured review from FineFellah
|-----> (66/105.0) Captured review from Susan Graham
|-----> (67/105.0) Captured review from Benjamin Duke
|-----> (68/105.0) Captured review from rani sam
|-----> (69/105.0) Captured review from Steve
|-----> (70/105.0) Captured review from Isabel Mercanti
Found 80 reviews on the page
|-----> (71/105.0) Captured review from BVA
|-----> (72/105.0) Captured review from Vince Barone
|-----> (73/105.0) Captured review from Ryy M
|-----> (74/105.0) Captured review from Curious Nervous
|-----> (75/105.0) Captured review from Ramandeep Kaur
|-----> (76/105.0) Captured review from 정진훈
|-----> (77/105.0) Captured review from Kulwinder Singh
|-----> (78/105.0) 

In [49]:
df_comments_by_location

Unnamed: 0,URL,Address,Reviewer Name,Review Text,Review Date,Review Rating
0,https://www.google.com/maps/place/Scotiabank/d...,"44 King St W, Toronto, ON M5H 1H1, Canada",Wael Coutry,,4 years ago,5 stars
1,https://www.google.com/maps/place/Scotiabank/d...,"44 King St W, Toronto, ON M5H 1H1, Canada",S McDonald,Dishonest and incompetent. Do business with th...,a year ago,1 star
2,https://www.google.com/maps/place/Scotiabank/d...,"44 King St W, Toronto, ON M5H 1H1, Canada",Jigar Pandya,Fake,3 years ago,1 star
3,https://www.google.com/maps/place/Scotiabank/d...,"44 King St W, Toronto, ON M5H 1H1, Canada",Schaad Othman,,5 months ago,1 star
4,https://www.google.com/maps/place/Scotiabank/d...,"44 King St W, Toronto, ON M5H 1H1, Canada",Matheus Davel,I am writing to formally express my frustratio...,11 months ago,1 star
...,...,...,...,...,...,...
20231,https://www.google.com/maps/place/Scotiabank/d...,"21 Amy Croft Dr, Tecumseh, ON N9K 1C7, Canada",Boloniy,,a year ago,1 star
20232,https://www.google.com/maps/place/Scotiabank/d...,"21 Amy Croft Dr, Tecumseh, ON N9K 1C7, Canada",Makboy Makedonija,Good service need more tellers,7 years ago,5 stars
20233,https://www.google.com/maps/place/Scotiabank/d...,"21 Amy Croft Dr, Tecumseh, ON N9K 1C7, Canada",Joni Lariviere,Website says this branch is open till 3pm on S...,2 months ago,1 star
20234,https://www.google.com/maps/place/Scotiabank/d...,"21 Amy Croft Dr, Tecumseh, ON N9K 1C7, Canada",Shan MOHAMMADI,Careless management,3 years ago,1 star


In [48]:
from gspread_dataframe import set_with_dataframe

sheet = gc.open('extracted_data')

sheet.del_worksheet(sheet.worksheet("CommentsByPlaceSearchResults"))
worksheet = sheet.add_worksheet(title="CommentsByPlaceSearchResults", rows= df_comments_by_location.shape[0], cols = df_comments_by_location.shape[1])
set_with_dataframe(worksheet, df_comments_by_location)

In [45]:
saveAsCSV('CommentsByPlaceSearchResults', df_comments_by_location)

In [4]:
import time
import pandas as pd
from urllib.parse import urlparse
from typing import List, Tuple

@Scraper
class GoogleMapsScraperDetails:
    def __init__(self):
        self.initial_wait = 5  # Initial wait time in seconds after loading the page

    def extract_lat_lng(self, url: str) -> Tuple[float, float]:
        """
        Navigate to the given URL, wait for the page to load, and extract the latitude and longitude from the current URL.

        Args:
            url (str): The initial Google Maps URL.

        Returns:
            Tuple[float, float]: Latitude and longitude extracted from the URL.
        """
        try:
            self.driver.get(url)
            time.sleep(self.initial_wait)  # Wait for the page to load completely

            # Wait a bit longer to ensure the URL has updated
            time.sleep(2)
            current_url = self.driver.current_url

            # Parse the URL to extract the latitude and longitude
            parsed_url = urlparse(current_url)
            path_parts = parsed_url.path.split('@')
            if len(path_parts) > 1:
                lat_lng_zoom = path_parts[1].split(',')[:2]  # Getting only the latitude and longitude
                lat = float(lat_lng_zoom[0])
                lng = float(lat_lng_zoom[1])
                return lat, lng
            else:
                raise ValueError("Could not extract latitude and longitude from the URL.")

        except Exception as e:
            print(f'Error extracting latitude and longitude: {e}')
            return None, None

    def run(self, urls: List[str]) -> pd.DataFrame:
        results = []

        for index, url in enumerate(urls):
            try:
                print(f'Processing {index + 1} of {len(urls)}')
                lat, lng = self.extract_lat_lng(url)
                print(f'|-----> ( {index + 1} of {len(urls)}) Captured location: lat {lat} - lng {lng}')
                results.append({
                    'URL': url,
                    'Latitude': lat,
                    'Longitude': lng
                })

                # Save progress every 10 URLs
                if (index + 1) % 10 == 0:
                    pd.DataFrame(results).to_csv(f'progress_{index + 1}.csv', index=False)

            except Exception as e:
                print(f'Error processing {url}: {e}')

        DriverSingleton().quit()
        print('WebDriver quit, converting results to DataFrame')

        return pd.DataFrame(results)

In [5]:
def mainScrapper_3() -> pd.DataFrame:
    scraper = GoogleMapsScraperDetails()

    urls = list(df_location['Link'])
    df = scraper.run(urls)
    return df

In [6]:
df_geo_location = mainScrapper_3()

Processing 1 of 395
Error extracting latitude and longitude: Could not extract latitude and longitude from the URL.
|-----> ( 1 of 395) Captured location: lat None - lng None
Processing 2 of 395
Error extracting latitude and longitude: Could not extract latitude and longitude from the URL.
|-----> ( 2 of 395) Captured location: lat None - lng None
Processing 3 of 395
Error extracting latitude and longitude: Could not extract latitude and longitude from the URL.
|-----> ( 3 of 395) Captured location: lat None - lng None
Processing 4 of 395
Error extracting latitude and longitude: Could not extract latitude and longitude from the URL.
|-----> ( 4 of 395) Captured location: lat None - lng None
Processing 5 of 395
Error extracting latitude and longitude: Could not extract latitude and longitude from the URL.
|-----> ( 5 of 395) Captured location: lat None - lng None
Processing 6 of 395
Error extracting latitude and longitude: Could not extract latitude and longitude from the URL.
|-----> (

KeyboardInterrupt: 