In [44]:
import cloudscraper
from bs4 import BeautifulSoup

import pandas as pd
import multiprocessing as mp
from glob import glob

In [63]:
def get_captcha_provider(
    api_key=None,
    provider_name="2captcha"
):
    if api_key:
        return dict(
            provider=provider_name,
            api_key="YOUR_2CAPTCHA_API",
        )
    else:
        return None

In [74]:
CAPTCHA_API = None
CAPTCHA_PROVIDER = get_captcha_provider(CAPTCHA_API)
LOAD_DELAY=5

In [65]:
def get_urls(path, start=0, end=None, column_name='Location on Site', sep=','):
    urls = pd.read_csv(path, sep=sep)[column_name].tolist()
    if end is None:
        end = len(urls)
    
    return urls[start:end]

In [90]:
URLS = get_urls(glob('*.csv')[0])
print(len(URLS))
URLS[:5]

10087


['djbet888.com',
 'djbet9.co',
 'djbet9.com',
 'djbet9.net',
 'novawebmaroc.com.djbonafit.com;rtpbonafit88.pro.djbonafit.com;rtpbonafit88.xyz.djbonafit.com']

In [67]:
def get_url(url, secure=True):
    if '://' not in url:
        protocol = 'https' if secure else "http"
        url = protocol + '://' + url
    print(f'{url=}')
    return url

In [101]:
def fetch_site(url, captcha_provider=CAPTCHA_PROVIDER) -> str:
    url = get_url(url)
    scraper = cloudscraper.create_scraper(
        interpreter="nodejs",
        delay=LOAD_DELAY,
        browser={
            "browser": "chrome",
            "platform": "windows",
            "desktop": True,
        },
        captcha=captcha_provider,
    )

    response = scraper.get(url)
    print(f'{response.status_code=}')

    if response.status_code == 200:
        # Return the HTML content of the webpage
        return response.content
    else:
        # Print an error message if the request fails
        print(f"Failed to fetch URL: {url}. Status code: {response.status_code}")
        return None

def get_title(soup):
    title = soup.find('meta', {'property': 'og:title'})
    if title:
        title = title['content']
    else:
        title = soup.find('title').string if soup.title else None
    return title    


def get_description(soup):
    description = soup.find('meta', {'property': 'og:description'})
    if description:
        description = description['content']
    else:
        description = soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else None
    return description


def fetch_meta_from_page_content(content):
    if content is None:
        return None
    
    soup = BeautifulSoup(content, 'html.parser')
    head = soup.head

    title = get_title(head)
    description = get_description(head)

    print(f'{title=} {description=}')
    return dict(title=title, description=description)


def fetch_meta(url):
    content = fetch_site(url)
    meta = fetch_meta_from_page_content(content)
    print(f'{url=} {meta=}')
    return meta

In [102]:
diamonds = 'amauryjewelry.com'

In [104]:
fetch_meta(diamonds)

url='https://amauryjewelry.com'
response.status_code=200
title='Amaury' description='Amaury'
url='amauryjewelry.com' meta={'title': 'Amaury', 'description': 'Amaury'}


{'title': 'Amaury', 'description': 'Amaury'}

In [106]:
for url in URLS[:6]:
    if ';' in url:
        url = url.split(';')[0]
    fetch_meta(url)

url='https://djbet888.com'
response.status_code=200
title='综合平台官方直营_十年信誉' description=None
url='djbet888.com' meta={'title': '综合平台官方直营_十年信誉', 'description': None}
url='https://djbet9.co'
response.status_code=200
title='综合平台官方直营_十年信誉' description=None
url='djbet9.co' meta={'title': '综合平台官方直营_十年信誉', 'description': None}
url='https://djbet9.com'
response.status_code=200
title='综合平台官方直营_十年信誉' description=None
url='djbet9.com' meta={'title': '综合平台官方直营_十年信誉', 'description': None}
url='https://djbet9.net'
response.status_code=200
title='综合平台官方直营_十年信誉' description=None
url='djbet9.net' meta={'title': '综合平台官方直营_十年信誉', 'description': None}
url='https://djbpattaya.com'
response.status_code=200
title='D Varee Jomtien Beach, Pattaya' description='The best 4-star hotel in Jomtien Beach, Pattaya. Book direct a get 30% off from the best available ocean view room rate. D Varee Jomtien Beach, Pattaya'
url='djbpattaya.com' meta={'title': 'D Varee Jomtien Beach, Pattaya', 'description': 'The best 4-star h

In [38]:
def fetch_meta_mp(urls, n_jobs=2):
    p = mp.Pool(n_jobs)
    result = p.map(fetch_meta, urls)
    print(result)

In [107]:
fetch_meta_mp(URLS[:2])

Process SpawnPoolWorker-6:
Process SpawnPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/wqqco/miniconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/wqqco/miniconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/wqqco/miniconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/wqqco/miniconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/wqqco/miniconda3/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/Users/wqqco/miniconda3/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/Users/wqqco/miniconda3/lib/python3.11/multiprocessing/queues.py", line 367, in get
    return _ForkingPic

KeyboardInterrupt: 

In [3]:
!pip install pyperclip

Collecting pyperclip
  Using cached pyperclip-1.8.2-py3-none-any.whl
Installing collected packages: pyperclip
Successfully installed pyperclip-1.8.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import pyperclip as cb
import json

In [10]:
d = ['{"title": "\\u7efc\\u5408\\u5e73\\u53f0\\u5b98\\u65b9\\u76f4\\u8425_\\u5341\\u5e74\\u4fe1\\u8a89", "description": null, "url": "djbet888.com"}', '{"title": "\\u7efc\\u5408\\u5e73\\u53f0\\u5b98\\u65b9\\u76f4\\u8425_\\u5341\\u5e74\\u4fe1\\u8a89", "description": null, "url": "djbet9.co"}', '{"title": "\\u7efc\\u5408\\u5e73\\u53f0\\u5b98\\u65b9\\u76f4\\u8425_\\u5341\\u5e74\\u4fe1\\u8a89", "description": null, "url": "djbet9.com"}', '{"title": "\\u7efc\\u5408\\u5e73\\u53f0\\u5b98\\u65b9\\u76f4\\u8425_\\u5341\\u5e74\\u4fe1\\u8a89", "description": null, "url": "djbet9.net"}', '{"title": "BONAFIT88 BANDAR SLOT ONLINE | AGEN TOGEL ONLINE", "description": "BONAFIT88 Merupakan Sebuah Situs judi Online Terpercaya Di Tahun Ini yang menyediakan banyak games seperti togel online, live games, games slot dan game lainnya ", "url": "novawebmaroc.com.djbonafit.com"}', '{"title": "D Varee Jomtien Beach, Pattaya", "description": "The best 4-star hotel in Jomtien Beach, Pattaya. Book direct a get 30% off from the best available ocean view room rate. D Varee Jomtien Beach, Pattaya", "url": "djbpattaya.com"}', '{"title": "Chase Entertainment, Professional DJ & Photobooth Service in NY & PA", "description": "Chase Entertainment, Professional DJ\'s & Photobooth\'s in the Buffalo, Rochester, Syracuse New York areas, also servicing Erie PA.  Trust your special event to us! Our passion and ability to rock the dance floor sets us apart from everyone else! Contact us today to book your next event!", "url": "djbrandonchase.com"}', '{"title": "We are currently working on our website", "description": null, "url": "djbrandonchase.net"}', '{"error": "ConnectionError HTTPSConnectionPool(host=\'djburnz.com\', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\\"<urllib3.connection.HTTPSConnection object at 0x16c14a0d0>: Failed to resolve \'djburnz.com\' ([Errno 8] nodename nor servname provided, or not known)\\"))", "url": "djburnz.com"}', '{"error": "ConnectionError HTTPSConnectionPool(host=\'djc200.com\', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\\"<urllib3.connection.HTTPSConnection object at 0x16c500110>: Failed to resolve \'djc200.com\' ([Errno 8] nodename nor servname provided, or not known)\\"))", "url": "djc200.com"}', '{"error": "ConnectionError (\'Connection aborted.\', ConnectionResetError(54, \'Connection reset by peer\'))", "url": "djca.co.nz"}', '{"title": "DJCA | Everything your accountant isn\\u2019t | London Accountants", "description": "Increase your profits, reduce your costs & grow your business with your tech-powered, outsourced finance team.", "url": "djca.co.uk"}', '{"title": "Casamentos | S\\u00e3o Bernardo do Campo | DJ Casamento ABC", "description": "DJ para Casamento ABC tem de tudo para fazer uma festa de casamento inesquec\\u00edvel. Som e Ilumina\\u00e7\\u00e3o, tel\\u00e3o, festa de 15 anos, Retrospectivas, Sal\\u00e3o de Festas.", "url": "djcasamentoabc.com.br"}', '{"error": "ConnectionError HTTPSConnectionPool(host=\'djcastro.com\', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\\"<urllib3.connection.HTTPSConnection object at 0x16fd93c10>: Failed to resolve \'djcastro.com\' ([Errno 8] nodename nor servname provided, or not known)\\"))", "url": "djcastro.com"}', '{"title": "DJCO", "description": null, "url": "djco.com.au"}', '{"error": "ConnectionError (\'Connection aborted.\', ConnectionResetError(54, \'Connection reset by peer\'))", "url": "djcombo.com"}', '{"error": "ConnectionError (\'Connection aborted.\', ConnectionResetError(54, \'Connection reset by peer\'))", "url": "djcommerce.com"}', '{"error": "ConnectionError (\'Connection aborted.\', ConnectionResetError(54, \'Connection reset by peer\'))", "url": "djcpo.com"}', {'status_code': 403, 'url': 'https://djcre.com'}, '{"title": "Djcrusher | Dj Services | Montreal, QC, Canada", "description": null, "url": "djcrusher.net"}', '{"error": "ConnectionError HTTPSConnectionPool(host=\'djcrusherr.kred\', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\\"<urllib3.connection.HTTPSConnection object at 0x16cc79510>: Failed to resolve \'djcrusherr.kred\' ([Errno 8] nodename nor servname provided, or not known)\\"))", "url": "djcrusherr.kred"}', '{"error": "ConnectionError HTTPSConnectionPool(host=\'djd985.com\', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\\"<urllib3.connection.HTTPSConnection object at 0x16cc8f3d0>: Failed to resolve \'djd985.com\' ([Errno 8] nodename nor servname provided, or not known)\\"))", "url": "djd985.com"}', '{"error": "ConnectionError HTTPSConnectionPool(host=\'djdj685.com\', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\\"<urllib3.connection.HTTPSConnection object at 0x16cc7bd90>: Failed to resolve \'djdj685.com\' ([Errno 8] nodename nor servname provided, or not known)\\"))", "url": "djdj685.com"}', '{"error": "ConnectionError HTTPSConnectionPool(host=\'djdj868.com\', port=443): Max retries exceeded with url: / (Caused by NameResolutionError(\\"<urllib3.connection.HTTPSConnection object at 0x16cc75d50>: Failed to resolve \'djdj868.com\' ([Errno 8] nodename nor servname provided, or not known)\\"))", "url": "djdj868.com"}', {'status_code': 506, 'url': 'http://djdl666.app'}, '{"title": "\\u5927\\u5409\\u5927\\u5229", "description": "", "url": "djdl666.com"}', {'status_code': 506, 'url': 'http://djdl666.net'}, {'status_code': 506, 'url': 'http://djdl666.online'}, '{"title": "\\u5927\\u5409\\u5927\\u5229", "description": "", "url": "djdl777.app"}', '{"title": "\\u5927\\u5409\\u5927\\u5229", "description": "", "url": "djdl777.club"}']

In [14]:
for i in d:
    print(i)
    json.loads(i)

{"title": "\u7efc\u5408\u5e73\u53f0\u5b98\u65b9\u76f4\u8425_\u5341\u5e74\u4fe1\u8a89", "description": null, "url": "djbet888.com"}
{"title": "\u7efc\u5408\u5e73\u53f0\u5b98\u65b9\u76f4\u8425_\u5341\u5e74\u4fe1\u8a89", "description": null, "url": "djbet9.co"}
{"title": "\u7efc\u5408\u5e73\u53f0\u5b98\u65b9\u76f4\u8425_\u5341\u5e74\u4fe1\u8a89", "description": null, "url": "djbet9.com"}
{"title": "\u7efc\u5408\u5e73\u53f0\u5b98\u65b9\u76f4\u8425_\u5341\u5e74\u4fe1\u8a89", "description": null, "url": "djbet9.net"}
{"title": "BONAFIT88 BANDAR SLOT ONLINE | AGEN TOGEL ONLINE", "description": "BONAFIT88 Merupakan Sebuah Situs judi Online Terpercaya Di Tahun Ini yang menyediakan banyak games seperti togel online, live games, games slot dan game lainnya ", "url": "novawebmaroc.com.djbonafit.com"}
{"title": "D Varee Jomtien Beach, Pattaya", "description": "The best 4-star hotel in Jomtien Beach, Pattaya. Book direct a get 30% off from the best available ocean view room rate. D Varee Jomtien Bea

TypeError: the JSON object must be str, bytes or bytearray, not dict

In [3]:
import pandas as pd

In [4]:
proxies_csv = '/Users/wqqco/Downloads/Free Proxy List 1000+ IPs.txt'

In [5]:
df = pd.read_csv(proxies_csv, sep=',')
df.head()

Unnamed: 0,ip,anonymityLevel,asn,country,isp,latency,org,port,protocols,speed,upTime,upTimeSuccessCount,upTimeTryCount,updated_at,responseTime
0,203.13.32.47,elite,AS209242,CY,"Cloudflare London, LLC",2,Lachtaristo Holdings Limited,80,socks4,2,100,454,454,2024-06-15T20:46:17.596Z,1895
1,145.239.2.102,elite,AS16276,DE,OVH SAS,83,OVH GmbH,22813,socks4,1,100,553,553,2024-06-15T20:46:16.499Z,1093
2,5.135.137.13,elite,AS16276,FR,OVH SAS,5,OVH SAS,59124,socks4,1,100,490,490,2024-06-15T20:46:13.102Z,1210
3,199.60.103.126,elite,AS209242,US,"Cloudflare London, LLC",2,"HubSpot, Inc.",80,socks4,5,100,7324,7324,2024-06-15T20:46:04.190Z,3206
4,92.205.108.94,elite,AS21499,FR,Host Europe GmbH,11,GCN SXB1,58495,socks4,5,100,588,588,2024-06-15T20:45:43.300Z,879


In [9]:
proxy_list = (df.protocols + "://" + df.ip + ":" + df.port.astype(str)).tolist()

In [10]:
with open('proxies_example.txt', 'w') as f:
    f.write('\n'.join(proxy_list))