In [21]:
# TODO
# When instantiating the class, we want to scrape a set number of IP proxies and devices (user agents) to use for undetectable scraping
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
import random

In [32]:
class webScraper:
    """
    Web scraper that sends requests through rotating proxies and user agents.

    Given a csv file of user agents and the url of a proxy-listing site, this
    creates a scraper that uses those details to mask your own details when
    scraping given sites.
    """
    # Class-level defaults. Instances receive their own values in __init__
    # and update_live_address_list, so these lists are never mutated in place.
    device_list = []
    proxy_site = ""
    address_list = []

    def __init__(self, csv_path, proxy_site):
        """
        Load the user agents and remember the proxy-listing site.

        csv_path: path of a csv file with a "user-agent" column.
        proxy_site: url of the page that update_live_address_list scrapes
            for proxy addresses.
        """
        self.proxy_site = proxy_site
        # Start every instance with its own (empty) proxy list; it is filled
        # lazily on the first call to get_html_response.
        self.address_list = []
        device_df = pd.read_csv(csv_path)
        print("CSV Read.")
        self.device_list = [str(x) for x in device_df["user-agent"].tolist()]
        print("Initialisation Complete.")

    def get_html_response(self, url, timeout=10):
        """
        Fetch url through a random proxy with a random user agent.

        A proxy whose request fails is dropped and another one is tried;
        there is no upper bound on the number of retries.

        url: address to scrape.
        timeout: seconds passed to requests.get, so a dead proxy fails
            instead of hanging forever.

        Returns a tuple (response_json, page): response_json is the decoded
        JSON body, or None when the body is not JSON; page is the body parsed
        by BeautifulSoup with "html.parser".

        Raises IndexError when no proxy address is available even after the
        proxy list has been refreshed.
        """
        if not self.address_list:
            print("Updating Proxy List")
            self.update_live_address_list()

        # Get proxy and user agent to use for this request
        proxy = self.get_random_proxy()
        user_agent = self.get_random_user_agent()

        print("Getting HTML Response...")
        print("Proxy: {}".format(proxy))
        print("User Agent: {}".format(user_agent))

        # The user agent has to be sent explicitly, otherwise requests
        # identifies itself with its own default User-Agent header.
        headers = {"User-Agent": user_agent}
        while True:
            proxy_url = "http://" + proxy
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    proxies={"http": proxy_url, "https": proxy_url},
                    timeout=timeout,
                )
            except requests.exceptions.RequestException:
                # If proxy fails connection, remove it from the list and retry
                print("Skipping, Connection Failed.")
                proxy = self.replace_proxy(proxy)
                print("New Proxy selected")
            else:
                # Got a response: stop retrying.
                break

        # Get responses to return. Plain HTML pages are not valid JSON, so a
        # decoding failure is reported as None rather than raised.
        try:
            response_json = response.json()
        except ValueError:
            response_json = None
        page = soup(response.content, "html.parser")
        return response_json, page

    def replace_proxy(self, proxy):
        """
        Remove a faulty proxy from the proxy list and return a new proxy.

        If the list becomes empty, a fresh set of proxies is fetched first.
        A proxy that is not in the list (any more) is ignored.

        Raises IndexError when the refreshed list is empty as well.
        """
        # Build a new list instead of mutating in place: this drops every
        # occurrence of the proxy and tolerates a proxy that is already gone.
        self.address_list = [
            address for address in self.address_list if address != proxy
        ]
        if not self.address_list:
            self.update_live_address_list()
        return self.get_random_proxy()

    def update_live_address_list(self, max_rows=50, timeout=10):
        """
        Scrape self.proxy_site and replace the proxy list with the result.

        Every "tr" row after the first (presumably the table header) is
        inspected, up to max_rows rows. A row is kept when cell 4 reads
        "elite proxy" and cell 6 reads "yes"; cells 0 and 1 are joined as
        "ip:port". Rows with fewer than 7 cells are skipped.

        max_rows: maximum number of rows to inspect.
        timeout: seconds passed to requests.get.
        """
        response = requests.get(
            self.proxy_site,
            headers={'User-Agent': "Custom"},
            timeout=timeout,
        )
        page = soup(response.content, "html.parser")
        table_data = page.findAll("tr")
        ip_addresses = []
        # Slicing copes with pages that have fewer rows than max_rows.
        for row in table_data[1:max_rows + 1]:
            row_data = row.findAll("td")
            if len(row_data) < 7:
                # Not a full proxy row (e.g. a row from another table).
                continue
            proxy_type = str(row_data[4].text)
            https = str(row_data[6].text)
            if proxy_type == "elite proxy" and https == "yes":
                ip_address = str(row_data[0].text)
                port = str(row_data[1].text)
                ip_addresses.append(ip_address + ":" + port)
        print("IP Addresses: {}".format(ip_addresses))
        self.address_list = ip_addresses

    def get_random_proxy(self):
        """Return a random "ip:port" entry; IndexError if the list is empty."""
        return random.choice(self.address_list)

    def get_random_user_agent(self):
        """Return a random user agent; IndexError if none are loaded."""
        return random.choice(self.device_list)