### Lib imports

In [1]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
from bs4 import BeautifulSoup
import pandas as pd

### Main Script

In [None]:
# Column names; they become the first row of the CSV output file
HEADERS = ['title', 'type', 'address']  # default

# File names used by the script: a scratch file holding the HTML markup of the
# current page, and the CSV file that collects the extracted rows
MARKUP_FILENAME = 'html-markup.txt'
CSV_DATA_FILENAME = 'data.csv'

# 2GIS search URL to collect data from. Replace it with your own search URL;
# the value below is only an example.
URL = 'https://2gis.ru/kazan/search/%D0%B6%D0%BA?m=49.255846%2C55.793323%2F10.89'

# Create (or truncate) the CSV file so that it contains only the header row
with open(CSV_DATA_FILENAME, 'w', encoding='utf-8', newline='') as csv_file:
    csv.writer(csv_file).writerow(HEADERS)

def data_helper(markup_data, headers_list):
    """Pad the type and address lists with 'NA' where a card lacks that field.

    Every ``div`` with class ``_1hf7139`` found in ``markup_data`` is checked,
    in document order, for the type marker (``_oqoid``) and the address marker
    (``_tluih8``). When a marker does not occur in the div's markup, ``'NA'``
    is inserted at that div's index into the matching list, which keeps the
    lists positionally aligned.

    Args:
        markup_data: Parsed document exposing ``find_all(name, class_=...)``.
        headers_list: ``[titles, types, addresses]``; ``headers_list[1]`` and
            ``headers_list[2]`` are modified in place.

    Returns:
        None. The lists inside ``headers_list`` are mutated.
    """
    div_data_titles = markup_data.find_all('div', class_='_1hf7139')
    for idx, div in enumerate(div_data_titles):
        div_markup = str(div)  # serialise once, test both markers against it
        # The two checks are independent: a card may lack the type, the
        # address, or both, and each missing field needs its own placeholder.
        if '_oqoid' not in div_markup:
            headers_list[1].insert(idx, 'NA')
        if '_tluih8' not in div_markup:
            headers_list[2].insert(idx, 'NA')
    

def data_handler(page):
    """Parse the saved page markup and append its rows to the CSV file.

    Reads the HTML stored in ``MARKUP_FILENAME``, collects the text of the
    title, type and address spans, lets ``data_helper`` pad the type/address
    lists when their lengths differ from the title list, and appends one CSV
    row per title to ``CSV_DATA_FILENAME``. The entry at index 0 is not
    written.

    Args:
        page: Page number; only used in the progress message.

    Raises:
        IndexError: If the type or address list is still shorter than the
            title list when the rows are written. Rows written before the
            failing index stay in the file.
    """
    with open(MARKUP_FILENAME, 'r', encoding='utf-8') as markup_file:
        markup = markup_file.read()

    # Parsed markup of the page that was saved to the markup file
    doc = BeautifulSoup(markup, features='html5lib')

    def span_texts(css_class):
        # Text of every <span> that carries the given class
        return [span.text for span in doc.find_all('span', class_=css_class)]

    titles = span_texts('_hc69qa')
    kinds = span_texts('_oqoid')
    addresses = span_texts('_tluih8')

    # Some cards do not provide every field; pad the shorter lists
    if not (len(titles) == len(kinds) == len(addresses)):
        data_helper(doc, [titles, kinds, addresses])

    # Append the rows to the CSV file, starting from index 1
    with open(CSV_DATA_FILENAME, 'a', encoding='utf-8', newline='') as csv_file:
        write_row = csv.writer(csv_file).writerow
        for row_index, title in enumerate(titles[1:], start=1):
            write_row([title, kinds[row_index], addresses[row_index]])

    # Progress message for the page that has just been parsed
    print(f'\nfinished parsing page: {page}')

browser = webdriver.Chrome()  # Start a Chrome session driven by Selenium

try:
    # Maximizing the window helps the element lookups below and avoids
    # 'no such element' exceptions
    browser.maximize_window()

    browser.get(URL)  # opening the url via webdriver

    # Implicit wait in seconds: element lookups keep retrying for up to this
    # long before failing. Increase it if your internet connection is slow.
    browser.implicitly_wait(10)

    page_element = browser.find_element(
        By.XPATH, "(//span[@class='_18lf326a'])[1]")

    # Upper bound for the number of pages to visit, derived from the number
    # shown in the element above.
    # NOTE(review): presumably 12 results per page plus some slack - confirm.
    num_of_pages = (int(page_element.text)//12)+3

    # Main loop: save the current page markup, parse it, then move on to the
    # next page until the page limit of the search is reached
    for page in range(1, num_of_pages):
        with open(MARKUP_FILENAME, 'w', encoding='utf-8') as f:
            f.write(browser.page_source)

        data_handler(page)

        time.sleep(1.9)

        # Scroll container in the DOM; scrolling it brings the pagination
        # control into view so it can be clicked
        scroll_container = browser.find_element(
            By.XPATH, "(//div[@class='_15gu4wr'])[3]")

        # scrolling the container element
        browser.execute_script("arguments[0].scrollIntoView(false);", scroll_container)

        # clicking on the next page DOM element
        browser.find_element(By.XPATH, "//div[@class='_5ocwns']//div[2]").click()
except IndexError:  # This error is triggered when the script crosses the page limit in the UI
    print(f'Total pages parsed {page-1}')
    time.sleep(3)
finally:
    # Always close the Chrome session: also when the loop ends normally or an
    # unexpected exception (e.g. a missing element) interrupts the run
    browser.quit()

In [None]:
# Optional clean-up step: remove duplicate rows from the CSV data file, if any
df = pd.read_csv(CSV_DATA_FILENAME)
prev = len(df)  # number of rows before de-duplication
# Keep only the first occurrence of every row
df = df.drop_duplicates()
# Overwrite the original file with the cleaned data (without the index column)
df.to_csv(CSV_DATA_FILENAME, index=False)
print(f'\nduplicates dropped: {prev-df.shape[0]}')
