In [1]:
import os
import time
import uuid
import re
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions

In [9]:
class Scraper:
    """Download images ("memes") from the media gallery of a WhatsApp Web group.

    The scraper drives a Selenium-controlled browser: it opens WhatsApp Web,
    navigates to the gallery of a group, and walks through the media viewer,
    downloading every item that is a downloadable, non-video image. Each
    downloaded file is renamed to ``<uuid4>.<extension>`` inside ``path``.

    Attributes:
        driver:  name of the browser to use ("Edge"; "Chrome" is a stub).
        path:    directory the browser downloads into and where files are renamed.
        browser: the live WebDriver instance while ``meme_scraper`` runs, else None.
    """

    # Substring that identifies a finished WhatsApp image download by file name.
    _DOWNLOAD_MARKER = 'WhatsApp Image'
    # Suffixes Chromium-based browsers use for downloads still in progress.
    _PARTIAL_SUFFIXES = ('.crdownload', '.tmp')

    # NOTE: the default for ``path`` is evaluated once, when the class is defined,
    # so it is the working directory at definition time, not at call time.
    def __init__(self, driver = "Edge", path = os.getcwd()):
        self.driver = driver
        self.path = path
        self.browser = None

    def next(self):
        """Click the viewer's "chevron-left" arrow to move to the next gallery item."""
        self.browser.find_element(By.XPATH, '//span[@data-icon="chevron-left"]').click()

    def wait_until(self, xpath, timeout=100):
        """Block until an element matching ``xpath`` is present in the page.

        Args:
            xpath:   XPath expression of the element to wait for.
            timeout: maximum number of seconds to wait (default 100, as before).

        Raises:
            selenium's TimeoutException if the element does not appear in time.
        """
        WebDriverWait(self.browser, timeout).until(
            expected_conditions.presence_of_element_located((By.XPATH, xpath)))

    def chrome_browser(self):
        """Placeholder for a Chrome driver factory; not implemented yet.

        Raises:
            NotImplementedError: always, instead of silently returning None
                (which previously surfaced later as an AttributeError).
        """
        raise NotImplementedError("Chrome support is not implemented yet")

    def edge_browser(self):
        """Create a Chromium-Edge WebDriver that downloads into ``self.path``.

        Returns:
            A started ``Edge`` driver using the local ``msedgedriver.exe``.
        """
        options = EdgeOptions()
        options.use_chromium = True
        # options.add_argument('--headless')
        options.add_argument('--disable-notifications')
        # These preferences are booleans in Chromium; passing the strings
        # "true"/"false" does not set them reliably, so use real booleans.
        options.add_experimental_option("prefs", {
            "download.default_directory":  self.path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True})
        browser = Edge('msedgedriver.exe', options=options)
        return browser

    def access_group_and_gallery(self, group):
        """Open ``group`` in WhatsApp Web and click the first item of its gallery.

        Args:
            group: exact title of the group as shown in the chat list.
        """
        # find the group
        text_box = '//div[@role="textbox"]'
        self.wait_until(text_box)
        title_group = f'//span[@title="{group}"]'
        data = self.browser.find_elements(By.XPATH, title_group)
        if not data:
            # the group is not in the visible chat list: type its name in the search box
            self.browser.find_element(By.XPATH, text_box).send_keys(group)
        # access to the group
        self.wait_until(title_group)
        self.browser.find_element(By.XPATH, title_group).click()
        # access to the description of the group: once the chat is open the
        # title appears a second time, and that second match is the one clicked
        self.wait_until('//span[@data-testid="search-alt"]')
        self.browser.find_elements(By.XPATH, title_group)[1].click()
        # access to the gallery of the group
        chevron_right = '//span[@data-testid="chevron-right-alt"]'
        self.wait_until(chevron_right)
        time.sleep(1)
        self.browser.find_element(By.XPATH, chevron_right).click()
        # access to the first image from the gallery
        first_image = '//*[@class="_23fpc"]'
        self.wait_until(first_image)
        self.browser.find_element(By.XPATH, first_image).click()

    def select_browser(self):
        """Return a new WebDriver for the browser named by ``self.driver``.

        Raises:
            ValueError: if ``self.driver`` is neither "Edge" nor "Chrome".
            NotImplementedError: for "Chrome", which is still a stub.
        """
        if self.driver == "Edge":
            return self.edge_browser()
        if self.driver == "Chrome":
            return self.chrome_browser()
        raise ValueError(f'Unsupported driver: {self.driver!r} (expected "Edge" or "Chrome")')

    def find_download_button_in_image(self):
        """Return the in-image download buttons shown once loading has finished.

        Waits (up to 100 s) until the "media-cancel" icon is no longer visible,
        then returns every matching ``_1ZhO6 e1p1w`` button. The caller treats a
        non-empty result as a lost connection.

        Returns:
            A (possibly empty) list of WebElements.
        """
        # wait until the image is loaded
        WebDriverWait(self.browser, 100).until_not(
            expected_conditions.visibility_of_element_located((By.XPATH, '//span[@data-icon="media-cancel"]')))
        download_button_in_image = self.browser.find_elements(By.XPATH, '//button[@class="_1ZhO6 e1p1w"]')
        return download_button_in_image

    def is_downloadable(self):
        """Return True unless the "Download" control is marked ``aria-disabled="true"``."""
        disabled = self.browser.find_element(By.XPATH, '//div[@title="Download"]').get_attribute('aria-disabled')
        return disabled != 'true'

    def is_video(self):
        """Return the list of ``<video>`` elements on the page (empty, hence falsy, if none)."""
        video = self.browser.find_elements(By.XPATH, '//video')
        return video

    def _find_downloaded_image(self):
        """Return the name of the finished WhatsApp download in ``self.path``, or None.

        Files are matched by name rather than by position in ``os.listdir``,
        whose ordering is arbitrary. If several files match, the most recently
        modified one is returned.
        """
        candidates = [
            entry for entry in os.listdir(self.path)
            if self._DOWNLOAD_MARKER in entry
            and not entry.endswith(self._PARTIAL_SUFFIXES)
        ]
        if not candidates:
            return None
        return max(candidates,
                   key=lambda entry: os.path.getmtime(os.path.join(self.path, entry)))

    @staticmethod
    def _parse_download_name(image_file):
        """Split a download file name into ``(date, extension)``.

        The date is the third space-separated token of the name; the extension
        is whatever follows the final dot, so dots inside the time stamp
        (e.g. ``10.30.00.jpeg``) are not mistaken for it.
        """
        date = image_file.split(' ')[2]
        extension = os.path.splitext(image_file)[1].lstrip('.')
        return date, extension

    def wait_until_image_downloaded(self, timeout=100, poll_interval=0.1):
        """Wait until a finished WhatsApp image download appears in ``self.path``.

        Args:
            timeout:       maximum number of seconds to wait.
            poll_interval: seconds to sleep between directory scans (avoids a
                           CPU-burning busy loop).

        Returns:
            The file name of the downloaded image.

        Raises:
            TimeoutError: if no download shows up within ``timeout`` seconds.
        """
        deadline = time.monotonic() + timeout
        while True:
            image_file = self._find_downloaded_image()
            if image_file is not None:
                return image_file
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f'No "{self._DOWNLOAD_MARKER}" file appeared in {self.path} within {timeout} s')
            time.sleep(poll_interval)

    def rename_image_file(self, unique_id, extension):
        """Rename the downloaded WhatsApp image to ``<unique_id>.<extension>``.

        Raises:
            FileNotFoundError: if ``self.path`` holds no downloaded WhatsApp image.
        """
        image_file = self._find_downloaded_image()
        if image_file is None:
            raise FileNotFoundError(f'No "{self._DOWNLOAD_MARKER}" file found in {self.path}')
        os.rename(os.path.join(self.path, image_file),
                  os.path.join(self.path, f'{unique_id}.{extension}'))

    def features_for_redis(self):
        """Collect metadata about the image that was just downloaded.

        Returns:
            ``(name, date, unique_id, extension)`` where ``name`` is the text of
            the name element up to the first ``@``, ``date`` and ``extension``
            come from the download's file name, and ``unique_id`` is a fresh
            ``uuid.UUID``.

        Raises:
            FileNotFoundError: if ``self.path`` holds no downloaded WhatsApp image.
        """
        image_file = self._find_downloaded_image()
        if image_file is None:
            raise FileNotFoundError(f'No "{self._DOWNLOAD_MARKER}" file found in {self.path}')
        name = self.browser.find_element(By.XPATH, '//span[@class="ggj6brxn gfz4du6o r7fjleex g0rxnol2 lhj4utae le5p0ye3 i0jNr"]').text
        name = name.split('@')[0].strip()
        date, extension = self._parse_download_name(image_file)
        unique_id = uuid.uuid4()
        return name, date, unique_id, extension


    def meme_scraper(self, number_of_memes, group):
        """Download ``number_of_memes`` images from the gallery of ``group``.

        Opens WhatsApp Web, navigates to the group's gallery, and steps through
        the media viewer. Items that are not downloadable or are videos are
        skipped and do not count. The browser is always closed on exit.

        Args:
            number_of_memes: how many images to download.
            group:           exact title of the WhatsApp group.

        Raises:
            ConnectionError: if the in-image download button is present, which
                this scraper treats as a lost connection.
        """
        # the browser needs an existing download directory, and so do the scans below
        os.makedirs(self.path, exist_ok=True)
        self.browser = self.select_browser()
        try:
            self.browser.get('https://web.whatsapp.com/')
            self.access_group_and_gallery(group)

            while number_of_memes > 0:
                # check if there is a lost connection
                if self.find_download_button_in_image():
                    raise ConnectionError("Lost connection")
                # skip items that cannot be downloaded, and videos
                if not self.is_downloadable() or self.is_video():
                    self.next()
                    continue

                # else download the meme
                self.browser.find_element(By.XPATH, '//span[@data-icon="download"]').click()
                # short pause kept from the original flow before polling the directory
                time.sleep(1.2)
                number_of_memes -= 1
                # wait until the image is downloaded
                self.wait_until_image_downloaded()
                # features for redis database (name and date are not used yet)
                name, date, unique_id, extension = self.features_for_redis()
                # rename the image file
                self.rename_image_file(unique_id, extension)
                self.next()
        finally:
            # always release the browser and its driver process
            self.browser.quit()
            self.browser = None

In [10]:
# Download into a "memes" folder next to the notebook. os.path.join avoids the
# invalid '\m' escape sequence and the hard-coded Windows path separator.
scraper = Scraper(path = os.path.join(os.getcwd(), 'memes'))
scraper.meme_scraper(10, "Free memes")