In [76]:
from capmonstercloudclient import CapMonsterClient, ClientOptions
from capmonstercloudclient.requests import RecaptchaV2ProxylessRequest
import aiohttp
import re
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from urllib.parse import urlparse, parse_qs
import pypeln as pl
from urllib.parse import urljoin
from enum import Enum
import json


In [77]:
class UrlCrawler(Enum):
    Goaffpro = 'https://allpowers.goaffpro.com/login'
    Meross_Goaffpro = 'https://meross-affiliate.goaffpro.com/login'
    Shoutout = 'https://www.shoutout.global/login'
    Uppromote = 'https://af.uppromote.com/solar-power-store-canada/login'

    def __str__(self):
        return self.name

    @property
    def loginAPI(self):
        if self in (UrlCrawler.Goaffpro, UrlCrawler.Meross_Goaffpro):
            return 'https://api2.goaffpro.com/partner/login'
        if self is UrlCrawler.Shoutout:
            return 'https://www.shoutout.global/checklogin'        
        if self is UrlCrawler.Uppromote:
            return 'https://af.uppromote.com/solar-power-store-canada/login_aff'  

    @property
    def dataAPI(self):
        if self in (UrlCrawler.Goaffpro, UrlCrawler.Meross_Goaffpro):
            return 'https://api2.goaffpro.com/partner/sales/summary/1672506000000/1701502813931'
        if self is UrlCrawler.Shoutout:
            return 'https://www.shoutout.global/userdashboard'        
        if self is UrlCrawler.Uppromote:
            return 'https://af.uppromote.com/solar-power-store-canada/dashboard'   


In [78]:
class DataCrawler:
    def __init__(self, data):
        self.data = data

    # step 1: solve captcha
    async def solve_captcha(self, WEBSITE_URL, WEBSITE_KEY):
        API_KEY = "b238f538e55b7deb0da93267f61d8763"
        client_options = ClientOptions(api_key=API_KEY)
        cap_monster_client = CapMonsterClient(options=client_options)
        recaptcha2request = RecaptchaV2ProxylessRequest(
            websiteUrl= WEBSITE_URL,
            websiteKey= WEBSITE_KEY
        )
        return await cap_monster_client.solve_captcha(recaptcha2request)
    
    # step 2: Login and get authentication
    def extract_user_id(self, html_content):
        html_str = html_content.decode()
        match = re.search(r'id=([a-f\d]+)', html_str)
        if match:
            return match.group(1)
        else:
            return None

    def extract_cookies_from_header(self, set_cookie_header):
        cookies = re.findall(r'Set-Cookie: (.*?);', set_cookie_header)
        cookies_string = "; ".join(cookies)
        return cookies_string
    
    def extract_id_from_html(self, html_content):
        match = re.search(r'/userdashboard\?id=([a-zA-Z0-9]+)', html_content)
        if match:
            id_value = match.group(1)
            return id_value
        else:
            return None
        
    async def getAuthFromResponse(self, url, response):
        if url in (UrlCrawler.Goaffpro.loginAPI, UrlCrawler.Meross_Goaffpro.loginAPI):
            if response.status == 200:
                data = await response.json()
                token = data.get("access_token")
                if token:
                    return token
                else:
                    print("Authentication failed.")
            else:
                print(f"Error {response.status}: {await response.text()}")
            return None
        elif url == UrlCrawler.Shoutout.loginAPI:
            if response.status == 200:
                html_content = await response.text()
                id = self.extract_id_from_html(html_content)
                if id:
                    return id
                else:
                    print("Không tìm thấy id.")
            else:
                print(f"Error {response.status}: {await response.text()}")
            return None
        elif url == UrlCrawler.Uppromote.loginAPI:
            if response.status == 302:
                cookies = response.cookies
                if cookies:
                    return self.extract_cookies_from_header(str(cookies))
                else:
                    print("Không tìm thấy token.")
                    return None, None
            else:
                print(f"Error {response.status}: {response.text()}")
                response.raise_for_status()
                return None, None

    async def LoginAndGetAuthAsync(self, url, payload, headers):
        async with aiohttp.ClientSession() as session:
            async with session.post(url, data=payload, headers=headers, allow_redirects=False) as response:
                return await self.getAuthFromResponse(url, response)
    
    # step 3: fetch data
    def get_first_and_last_day(self, year, month):
        if not 1 <= month <= 12:
            raise ValueError("Month must be in the range 1 to 12.")

        if month == 12:
            next_month = datetime(year + 1, 1, 1)
        else:
            next_month = datetime(year, month + 1, 1)

        last_day = next_month - timedelta(days=1)
        return datetime(year, month, 1), last_day

    async def fetch_data(self, url, **kwargs):
        if url in (UrlCrawler.Goaffpro.value, UrlCrawler.Meross_Goaffpro.value):
            crawlUrl = f"{UrlCrawler.Goaffpro.dataAPI}?startDate={kwargs.get('startDate', '').strftime('%Y-%m-%d')}&endDate={kwargs.get('endDate', '').strftime('%Y-%m-%d')}"
            headers = {
                'Origin': urljoin(url, '/'),
                'Authorization': f'Bearer {kwargs.get("token", "")}'
            }
            async with aiohttp.ClientSession() as session:
                async with session.get(crawlUrl, headers=headers) as response:
                    data = await response.text()
                    return data
        elif UrlCrawler.Shoutout.value in url:
            async with aiohttp.ClientSession() as session:
                async with session.get(f"{UrlCrawler.Shoutout.dataAPI}?id={kwargs.get('id', '')}") as response:            
                    if response.status == 200:
                        html_content = await response.text()
                        soup = BeautifulSoup(html_content, 'html.parser')
                        
                        total_revenue_element = soup.find(id='totalRevenueTxt')
                        return {
                            "salesCommissionTxt": soup.select('.card .card-body h1.card-title')[0].get_text(strip=True),
                            "leadTxt": soup.select('.card .card-body h1.card-title')[1].get_text(strip=True),
                            "totalRevenueTxt": total_revenue_element.get_text(strip=True) if total_revenue_element is not None else None,
                            "totalCommissionTxt": soup.select('.card .card-body .col-12 h2')[1].get_text(strip=True),
                            "pendingCommissionTxt": soup.select('.card .card-body .col-12 h2')[2].get_text(strip=True)
                        }
                    else:
                        print(f"Error: {response.status}")
                        return None
        elif url == UrlCrawler.Uppromote.value:
            headers = {'Cookie': kwargs.get('cookie', '')}
            async with aiohttp.ClientSession() as session:
                async with session.get(UrlCrawler.Uppromote.dataAPI, headers=headers) as response:         
                    if response.status == 200:
                        html_content = await response.text()
                        soup = BeautifulSoup(html_content, 'html.parser')
                        
                        selected_elements = soup.select('#commission .panel-body__pending, #commission .panel-body__approved, #commission .panel-body__paid')
                        return {label_element.text.strip(): element.text.strip() for element in selected_elements if (label_element := element.find_next(class_='my-0')) is not None}
                    else:
                        print(f"Error: {response.status}")
                        return None

    # Crawl data func
    async def crawl_data(self, args):
        if len(args)<=0:
            print("Insufficient number of arguments.")
            return None
        url, email, password = args
        if url in (UrlCrawler.Goaffpro.value, UrlCrawler.Meross_Goaffpro.value):
            WEBSITE_KEY = '6Lf_jsQUAAAAAOLW40PpDXgZQDIjjnGldAE1fhYr'
            res = await self.solve_captcha(url,WEBSITE_KEY)
            payload = {
                "email": email,
                "password": password,
                "partner_portal_subdomain": "allpowers.goaffpro.com",
                "recaptcha_response": res["gRecaptchaResponse"]
            }
            headers = {
                'Origin': urljoin(url, '/')
            }
            
            token = await self.LoginAndGetAuthAsync(UrlCrawler.Meross_Goaffpro.loginAPI, payload, headers)
            if token:
                first_day, last_day = self.get_first_and_last_day(datetime.now().year, datetime.now().month)
                data = await self.fetch_data(url, startDate=first_day, endDate=last_day, token=token)                
                return data
            else:
                print("Failed to obtain token.")
                return None
        elif UrlCrawler.Shoutout.value in url:
            WEBSITE_KEY = '6LfvfrEUAAAAAPg5Dt1q3UsmCwD_Z5oELX4s95eB'
            encryptedID = parse_qs(urlparse(url).query)
            res = await self.solve_captcha(url,WEBSITE_KEY)
            payload = {
                "email": email,
                "password": password,
                "g-recaptcha-response": res["gRecaptchaResponse"],
                "encryptedID": encryptedID["id"][0]
            }
            headers = {
                'Origin': urljoin(url, '/')
            }                
            id = await self.LoginAndGetAuthAsync(UrlCrawler.Shoutout.loginAPI, payload, headers)
            if id:
                data = await self.fetch_data(url, id=id)
                return data
            else:
                print("Failed to obtain ID.")
                return None
        elif url == UrlCrawler.Uppromote.value:
            WEBSITE_KEY = '6LcfFqkaAAAAAODkHHT2DLE7UBeSbf7kVCdBkTQE'
            res = await self.solve_captcha(url, WEBSITE_KEY)
            payload = {
                "_token": "5xhC3w0BB9Cezh7fhtZJg5YaHJvyInW5AC4qy8Mi",
                "shop_id": "80375",
                "email": email,
                "password": password,
                "g-recaptcha-response": res["gRecaptchaResponse"]
            }
            headers = {}

            cookie = await self.LoginAndGetAuthAsync(UrlCrawler.Uppromote.loginAPI, payload, headers)

            return await self.fetch_data(url, cookie=cookie)

    async def crawl(self):
        result = await pl.task.map(self.crawl_data, data, workers=100)
        print(json.dumps(result, indent=2))


Main

In [79]:
data = [
    ("https://allpowers.goaffpro.com/login", "natashacook371sdas@gmail.com", "Qxwg0CN09v"),
    ("https://meross-affiliate.goaffpro.com/login", "natashacook371sdas@gmail.com", "Qxwg0CN09v"),
    ("https://www.shoutout.global/login?id=22wbe", "natashacook371sdas@gmail.com", "Qxwg0CN09v"), 
    ("https://www.shoutout.global/login?id=obbi7", "teamasmads@gmail.com", "E9vQRQmPG!a.7m6"),
    ("https://af.uppromote.com/solar-power-store-canada/login", "teamasmads@gmail.com", "2N*G5k$7ux5j2!F")
]
crawler = DataCrawler(data)
await crawler.crawl()


[
  {
    "salesCommissionTxt": "1%",
    "leadTxt": "0",
    "totalRevenueTxt": "$0.00",
    "totalCommissionTxt": "$0.00",
    "pendingCommissionTxt": "$0.00"
  },
  {
    "salesCommissionTxt": "30%",
    "leadTxt": "up to5%",
    "totalRevenueTxt": "$0.00",
    "totalCommissionTxt": "$0.00",
    "pendingCommissionTxt": "$0.00"
  },
  {
    "Pending": "$0",
    "Approved": "$0",
    "Paid": "$0"
  },
  "{\"referrals\":0,\"revenue\":0,\"num_orders\":0,\"commission\":0,\"net_earnings\":0}",
  "{\"referrals\":0,\"revenue\":0,\"num_orders\":0,\"commission\":0,\"net_earnings\":0}"
]
