### Lib imports

In [57]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import csv
from bs4 import BeautifulSoup
import pandas as pd
import re

### Main Script

In [None]:
# Column names written as the first row of the CSV file
HEADERS = ['title', 'type', 'address', 'rating']  # default

# Scratch file that receives each page's markup, and the CSV output file
MARKUP_FILENAME = 'html-markup.txt'
CSV_DATA_FILENAME = 'data.csv'

# City and search term to collect data for; replace these example values with your own
CITY_NAME = 'Moscow'
SEARCH = 'hotels'

# Search URL built from the lower-cased city name and the search term
URL = f'https://2gis.ru/{CITY_NAME.lower()}/search/{SEARCH}?m'
# URL = 'https://2gis.ru/kazan/search/%D0%B6%D0%BA?m=49.255846%2C55.793323%2F10.89'

# (Re)creates the CSV file so that it contains only the header row
with open(CSV_DATA_FILENAME, 'w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(HEADERS)

def extract_content(markup_text):
    """Return the address text contained in *markup_text*.

    The markup is parsed and its first ``<span class="_14quei">`` is taken.
    Inside that span's serialized HTML, the first
    ``<span class="_1w9o2igt">...</span>`` fragment is located and its text
    content is returned.

    Args:
        markup_text: HTML markup as a string.

    Returns:
        The text of the matched fragment, or the string ``'null'`` when
        either the outer or the inner span is not present.
    """
    sub_doc = BeautifulSoup(markup_text, 'html.parser')

    # Only the first matching span is ever used, so find() is enough
    address_span = sub_doc.find('span', class_='_14quei')
    if address_span is None:
        return 'null'

    # Raw string avoids the invalid "\/" escape of a normal string literal;
    # DOTALL lets "." cross line breaks, so the match may span several lines.
    match = re.search(
        r'<span class="_1w9o2igt">.*?</span>', str(address_span), re.DOTALL)
    if match is None:
        return 'null'

    return BeautifulSoup(match.group(0), 'html.parser').text

def _text_or_default(tag, default='null'):
    """Return ``tag.text``, or *default* when *tag* is ``None``."""
    return tag.text if tag is not None else default


# Parses the markup file saved from the UI and appends its rows to the CSV file
def data_handler(page_number):
    """Parse the saved page markup and append the extracted rows to the CSV.

    Reads ``MARKUP_FILENAME``, collects one ``[title, type, address, rating]``
    row per ``<div class="_1kf6gff">`` element, and appends the rows to
    ``CSV_DATA_FILENAME``. A missing type or rating is stored as ``'null'``;
    a missing title falls back to the type value.

    Args:
        page_number: Page number, used only in the progress message printed
            at the end.
    """
    with open(MARKUP_FILENAME, 'r', encoding='utf-8') as f:
        contents = f.read()

    # <doc> holds the parsed markup read from the file
    doc = BeautifulSoup(contents, features='html.parser')

    rows = []
    for res_div in doc.find_all('div', class_='_1kf6gff'):
        # Each element is looked up once; "kind" avoids shadowing the builtin type()
        kind = _text_or_default(res_div.find('span', class_='_oqoid'))
        title = _text_or_default(
            res_div.find('span', class_='_1al0wlf'), default=kind)
        address = extract_content(str(res_div.find('span', class_='_14quei')))
        rating = _text_or_default(res_div.find('div', class_='_y10azs'))

        rows.append([title, kind, address, rating])

    # Writes row data for the current page
    with open(CSV_DATA_FILENAME, 'a', encoding='utf-8', newline='') as f:
        # NOTE(review): the first matched card of every page is skipped, exactly
        # as before (rows were written from index 1) - confirm this is intended.
        csv.writer(f).writerows(rows[1:])

    print(f'\nfinished parsing page: {page_number}')  # Logger for parsing status


# Webdriver for chrome
browser = webdriver.Chrome()

# Number of pages handed to data_handler so far; reported if paging stops early
pages_parsed = 0

# Everything that uses the driver runs inside try/finally so that the Chrome
# process is always shut down, even when an early step fails.
try:
    # Maximizing the window lets the selectors below work more reliably and
    # helps avoid 'no such element' exceptions
    browser.maximize_window()

    # Opens the url using webdriver
    browser.get(URL)

    # Implicit wait: element lookups keep retrying for up to this many seconds
    # before giving up. Increase the value if your internet is slow.
    browser.implicitly_wait(40)

    # Element whose text is the number used to work out the page count
    # (an earlier version of this script used the class '_18lf326a')
    page_element = browser.find_element(
        By.XPATH, "(//span[@class='_1xhlznaa'])[1]")

    # Calculates the number of pages
    # NOTE(review): presumably 12 results per page plus a safety margin - confirm
    num_of_pages = (int(page_element.text)//12)+3

    # Accessing the scroll element in DOM; the third matching div is tried
    # first, then the second one as a fallback
    try:
        scroll_container = browser.find_element(
            By.XPATH, "(//div[@class='_15gu4wr'])[3]")
    except NoSuchElementException:
        scroll_container = browser.find_element(
            By.XPATH, "(//div[@class='_15gu4wr'])[2]")

    # Parses and navigates through all the pages. This loop is deliberately not
    # in a `finally` clause: it must not run when no scroll container was found.
    for page in range(1, num_of_pages):
        # Saves the whole HTML markup so that BeautifulSoup can parse it
        with open(MARKUP_FILENAME, 'w', encoding='utf-8') as f:
            f.write(browser.page_source)

        # Handles the parsing and writing of CSV data
        data_handler(page)
        pages_parsed = page

        time.sleep(2)
        # Scrolls the results container into view
        browser.execute_script("arguments[0].scrollIntoView(false);", scroll_container)

        time.sleep(1)
        # Clicks on the next page <DOM element>
        try:
            browser.find_element(By.XPATH, "//div[@class='_5ocwns']//div[2]").click()
        except NoSuchElementException:
            # A missing next-page control means there is nothing left to visit
            print(f'Total pages parsed {pages_parsed}')
            break

except IndexError:  # Kept from the original script: treated as crossing the page limit in the UI
    print(f'Total pages parsed {pages_parsed}')
    time.sleep(3)
finally:
    time.sleep(2)
    browser.quit()  # Exiting the Chrome driver

### Delete duplicates if exists

In [67]:
# Deleting duplicate entries in the data CSV file if any, optional code block
# Every column is read back as plain text and nothing is converted to NaN, so
# the 'null' placeholders and the rating values are re-written unchanged
# (the pandas defaults would read 'null' as a missing value).
df = pd.read_csv(CSV_DATA_FILENAME, dtype=str, keep_default_na=False)

prev = df.shape[0]  # row count before de-duplication
# Dropping duplicates inplace so as not to make any copies of the original CSV data
df.drop_duplicates(inplace=True)
df.to_csv(CSV_DATA_FILENAME, index=False)  # re-writing the cleaned data to the original file
print(f'\nduplicates dropped: {prev-df.shape[0]}')



duplicates dropped: 11
