In [1]:
from lxml import etree
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import random

# 使用https://free-proxy-list.net
## Method 1. 快速使用pandas read_clipboard()來抓取proxy

In [None]:
df = pd.read_clipboard()

In [None]:
df['Port'] = df['Port'].astype('str')

In [None]:
df['Proxy'] = df['IP Address'] + ':' + df['Port']

In [None]:
df['Proxy'].tolist()

## Method 2. 使用爬蟲抓取

### 測試proxy是否可用可以用httpbin.org/ip

In [8]:
proxies = {
    'https': '159.65.69.186:9300',
    'http': '159.65.69.186:9300'
}
url = 'https://httpbin.org/ip' # 可以測試自己的ip
r = requests.get(url, proxies=proxies, timeout=1)
r.json()

ConnectTimeout: HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fe3c07cf820>, 'Connection to 159.65.69.186 timed out. (connect timeout=1)'))

### 建立可以抓取proxy list的function

In [2]:
def getProxies():
    url = 'https://free-proxy-list.net'
    r = requests.get(url)
    # r.status_code # 測試連線狀態
    soup = bs(r.content, 'html.parser')
    table = soup.find('tbody')
    proxies = []
    for row in table:
        if row.find_all('td')[4].text == 'elite proxy':
            proxy = ':'.join([row.find_all('td')[0].text, row.find_all('td')[1].text])
            proxies.append(proxy)
        else:
            pass
    return proxies




In [3]:
proxies = getProxies()

In [None]:
len(proxies)

## <font color='red'>找出可以使用的proxy</font>

### 方法一：使用for loop一個一個去試proxy，但是速度很慢，很浪費時間。

In [None]:
def extract(proxies):
    working_proxies = []
    for proxy in proxies:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0'}
        try:
            r = requests.get('https://httpbin.org/ip', headers=headers, proxies={'http': proxy, 'https': proxy}, timeout=1)
    #         print(r.json(), r.status_code)
            working_proxies.append(proxy)
        except:
            pass
    return working_proxies

In [None]:
pro_list = extract(proxies)

In [None]:
len(pro_list)

### 方法二：使用concurrent.futures來map
**<語法>：**
```python
with concucurrent.futures.ThreadPoolExcuter() as excutor:
    excutor.map(function_name, proxylist)
```

In [4]:
import concurrent.futures
import timeit

In [5]:
def extract_proxy(proxy):
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'}
    try:
        r = requests.get('https://httpbin.org/ip', headers=headers, proxies={'http': proxy, 'https': proxy}, timeout=1)
    except:
        pass
    return proxy

In [6]:
with concurrent.futures.ThreadPoolExecutor() as extractor:
    results = extractor.map(extract_proxy, proxies)
    
pro_list = [result for result in results]

In [None]:
len(pro_list)

## 隨機抽取可用ip發送requests

In [None]:
def random_proxy(working_proxy):
    while True:
        try:
            proxy = random.choice(working_proxy)
            r = requests.get('https://httpbin.org/ip', proxies={'http': proxy, 'https': proxy}, timeout=1)
#             print(r.json())
            return proxy
            break
        except:
            pass

In [None]:
test = random_proxy(pro_list)

In [None]:
test

## 知識補充

In [None]:
x = '1'
y = '4'
':'.join([x, y]) # .join()需要在前面先加上要用來join的str，然後argument只接受list形式傳入

In [None]:
response.json() # 可以顯示origin of ip
response.status_code # 可以顯示連線狀態

In [None]:
url = "https://www.mouser.tw/Passive-Components/Inductors-Chokes-Coils/_/N-5gb4?No=0"

In [None]:
my_headers = ["Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]

In [None]:
while True:
    try:
        headers = {
        "user-agent": random.choice(my_headers),
        "upgrade-insecure-requests": "1", "referer": "www.malico.com.tw",
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-TW,zh;q=0.9,ru;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5'}
        r = requests.get(url, headers=headers, proxies={'http': random.choice(pro_list), 'https': random.choice(pro_list)})
        soup = bs(r.content, 'html.parser')
        print(soup)
        break
    except:
        continue

In [None]:
requests.get('https://httpbin.org/ip', proxies={'http': random.choice(x), 'https': random.choice(x)}).json()