In [1]:
from functools import partial
from multiprocessing import Process, Manager
from time import sleep
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from requests_tor import RequestsTor

In [2]:
# HTTP headers attached to every request issued through Tor in this notebook.
HEADERS = {
    "Accept": "*",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) "
        "Gecko/20100101 Firefox/47.0"
    ),
}

# Tor ports used by the notebook:
#   "main"       -> SOCKS port(s) for the session that fetches the results page
#   "controller" -> control port handed to every RequestsTor instance
#   "others"     -> one SOCKS port per result worker; 10 ports are needed since
#                   the maximum results number on a page is 10
TOR_PORTS = {
    "main": (9050,),
    "controller": 9051,
    "others": tuple(9000 + offset for offset in range(10)),
}

In [3]:
def main():
    '''Run a demo search for "ciao" through Tor and print the scraped data.'''
    session = RequestsTor(
        tor_ports=TOR_PORTS["main"],
        tor_cport=TOR_PORTS["controller"],
        autochange_id=0,
    )
    # Attach the module-level new_ip helper to the session so it can be called
    # as session.new_ip().
    session.new_ip = partial(new_ip, session)
    session.new_ip()
    search_url = query_builder("ciao")
    print(scrape(search_url, session))

In [4]:
def new_ip(self, delay=1, max_attempts=None):
    '''Request new Tor identities until the reported IP actually changes.

    Intended to be bound to a RequestsTor-like object (anything exposing
    ``check_ip()`` and ``new_id()``), e.g. ``rt.new_ip = partial(new_ip, rt)``.

    Args:
        self: object providing ``check_ip()`` (returns the current IP) and
            ``new_id()`` (asks for a new identity).
        delay: seconds to sleep before each ``new_id()`` request. Defaults to
            1, the value that was previously hard-coded.
        max_attempts: maximum number of ``new_id()`` requests before giving
            up. ``None`` (default) retries forever, as before.

    Returns:
        The first IP reported by ``check_ip()`` that differs from the initial one.

    Raises:
        RuntimeError: if ``max_attempts`` identities were requested and the
            IP is still unchanged.
    '''
    previous_ip = self.check_ip()
    attempts = 0
    while (current_ip := self.check_ip()) == previous_ip:
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                f"IP still {previous_ip!r} after {attempts} new identity requests"
            )
        sleep(delay)
        self.new_id()
        attempts += 1
    return current_ip

In [5]:
class ResultScraper(Process):
    '''Worker process that downloads the citation export of one search result.

    Each instance gets its own RequestsTor session bound to a single SOCKS
    port; all instances use the control port in TOR_PORTS["controller"].
    '''
    def __init__(self, data: list, result: BeautifulSoup, tor_port: int):
        '''
        Args:
            data: list-like container the downloaded citation text is appended
                to (scrape() passes a Manager list so it is shared across
                processes).
            result: parsed element of one search result; its ``data-cid``
                attribute identifies the result.
            tor_port: Tor SOCKS port used by this worker's session.
        '''
        super().__init__()
        self.result = result
        self.data = data
        self.rt = RequestsTor(tor_ports=(tor_port,),  tor_cport=TOR_PORTS["controller"], autochange_id=0)
        # Attach the module-level new_ip helper so it can be called as self.rt.new_ip().
        self.rt.new_ip = partial(new_ip, self.rt)

    def _fetch_citation(self) -> str:
        '''Download the citation text for this result once, raising on failure.

        Raises:
            RuntimeError: if the cite popup has no ``gs_citi`` link or the
                downloaded export contains "html" (treated as a blocked IP).
        '''
        cite_url = f"https://scholar.google.com/scholar?q=info:{self.result.get('data-cid')}:scholar.google.com/&output=cite"
        cite_page = BeautifulSoup(self.rt.get(cite_url, headers=HEADERS).content, "html.parser")
        # First export link in the cite popup (presumably the BibTeX one -- verify).
        export_link = cite_page.find("a", class_="gs_citi")
        if export_link is None:
            raise RuntimeError("IP Blocked")
        citation = self.rt.get(export_link.get("href"), headers=HEADERS).content.decode()
        # Explicit check instead of `assert`, so it is not stripped under `python -O`.
        if "html" in citation:
            raise RuntimeError("IP Blocked")
        return citation

    def run(self):
        '''Fetch the citation, switching to a new IP after every failure, then store it.

        Retries indefinitely: any exception raised while fetching triggers a
        new identity request followed by another attempt.
        '''
        self.rt.new_ip()
        # Drop the "gs_ggs gs_fl" element from the result, if present. Done once:
        # after extraction a second lookup would find nothing.
        if (no := self.result.find(class_="gs_ggs gs_fl")):
            no.extract()
        while True:
            try:
                citation = self._fetch_citation()
            except Exception:
                # Deliberate best-effort: network errors and blocked pages are
                # all handled the same way -- get a new exit node and retry.
                self.rt.new_ip()
                continue
            break
        self.data.append(citation)

In [6]:
def query_builder(basic_search: str = None, includes_all: str = None, exact_sentence: str = None, includes_one: str = None, 
                  exclude: str = None, in_title: bool = None, language: str = None, interval: tuple = None, 
                  only_science: bool = None, include_patents: bool = None, include_citations: bool = None, sort_by_date: bool = None, 
                  authors: list = None, sources: list = None, page: int = None)-> str:
    '''Create a search URL for Google Scholar.

    Exactly one of two modes must be used: a plain search (``basic_search``
    without ``authors``, ``sources`` or ``in_title``) or an advanced search
    (at least one of ``includes_all``, ``exact_sentence``, ``includes_one``).

    Args:
        basic_search: free-text query, sent as ``q``.
        includes_all: sent as ``as_q``.
        exact_sentence: sent as ``as_epq``.
        includes_one: sent as ``as_oq``.
        exclude: sent as ``as_eq``.
        in_title: when truthy adds ``as_occt=title``.
        language: language code, sent as ``lr=lang_<code>`` and ``hl=<code>``.
        interval: ``(first_year,)`` or ``(first_year, last_year)``, sent as
            ``as_ylo`` / ``as_yhi``; a bound that is ``None`` is omitted.
        only_science: when truthy adds ``as_rr=1``.
        include_patents: when truthy adds ``as_sdt=2007``.
        include_citations: when truthy adds ``as_vis=1``.
            NOTE(review): ``as_vis=1`` may actually *exclude* citations on
            Google Scholar -- confirm before relying on this flag.
        sort_by_date: when truthy adds ``scisbd=1``.
        authors: names, each wrapped in double quotes, sent as ``as_sauthors``.
        sources: publication names, each wrapped in double quotes, sent as
            ``as_publication``.
        page: 1-based page number, sent as ``start=(page-1)*10``.

    Returns:
        The search URL. Spaces in user text become ``+`` and other reserved
        characters are percent-encoded; a literal ``+`` is kept as-is so
        callers that pre-join terms with ``+`` keep working.

    Raises:
        AssertionError: if the combination of arguments is not a valid plain
            or advanced search.
    '''
    has_advanced_terms = bool(includes_all or exact_sentence or includes_one)
    has_plain_search = bool(basic_search and not (authors or sources or in_title))
    # Raised explicitly (same exception type as before) so the validation is
    # not stripped when Python runs with -O.
    if not (has_advanced_terms ^ has_plain_search):
        raise AssertionError("Invalid search query.")

    def encode(text: str) -> str:
        # Percent-encode user text so '&', '#', '=' ... cannot break the query string.
        return quote_plus(text, safe="+")

    def quoted_list(items: list) -> str:
        return "+".join(f'"{encode(item)}"' for item in items)

    params = []
    if basic_search:
        params.append(f"&q={encode(basic_search)}")
    if includes_all:
        params.append(f"&as_q={encode(includes_all)}")
    if exact_sentence:
        params.append(f"&as_epq={encode(exact_sentence)}")
    if includes_one:
        params.append(f"&as_oq={encode(includes_one)}")
    if exclude:
        params.append(f"&as_eq={encode(exclude)}")
    if in_title:
        params.append("&as_occt=title")
    if authors:
        params.append(f"&as_sauthors={quoted_list(authors)}")
    if sources:
        params.append(f"&as_publication={quoted_list(sources)}")
    if language:
        # 'hl' is a separate parameter: it must be introduced by '&', not '?'.
        params.append(f"&lr=lang_{language}&hl={language}")
    if interval:
        if interval[0] is not None:
            params.append(f"&as_ylo={interval[0]}")
        if len(interval) == 2 and interval[1] is not None:
            params.append(f"&as_yhi={interval[1]}")
    if only_science:
        params.append("&as_rr=1")
    if include_patents:
        params.append("&as_sdt=2007")
    if include_citations:
        params.append("&as_vis=1")
    if sort_by_date:
        params.append("&scisbd=1")
    if page:
        params.append(f"&start={(page-1)*10}")

    # Every parameter keeps its leading '&' (so the URL reads "...scholar?&q=..."),
    # matching the URLs this function has always produced.
    return "https://scholar.google.com/scholar?" + "".join(params)

In [7]:
def scrape(search: str, rt: RequestsTor) -> list:
    '''Fetch a results page and collect the data scraped from each result.

    Args:
        search: URL of the search results page to download.
        rt: Tor session used for the page request; its ``new_ip()`` is called
            after every failed attempt.

    Returns:
        A plain list with whatever the ResultScraper workers appended, in the
        order the workers appended it.
    '''
    hits = None
    # Retry forever: any exception while downloading or parsing the page
    # (including a missing <style> or <br> tag, which makes the attribute
    # access below fail) leads to a new IP and another attempt.
    while hits is None:
        try:
            response = rt.get(search, headers=HEADERS)
            page = BeautifulSoup(response.content, "html.parser")
            page.style.decompose()
            page.br.decompose()
            hits = page.find_all("div", class_ ="gs_r gs_or gs_scl")
        except Exception:
            rt.new_ip()

    with Manager() as manager:
        data = manager.list([])
        # One worker process per result, each on its own port from TOR_PORTS["others"].
        workers = []
        for index, hit in enumerate(hits):
            workers.append(ResultScraper(data, hit, TOR_PORTS["others"][index]))
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
        # Copy out of the proxy while the manager is still alive.
        return list(data)

In [8]:
# Entry point: run the demo search when this cell/script is executed directly.
if __name__ == "__main__":
    main()

['@article{ciao2011reducing,\n  title={Reducing obesity stigma: The effectiveness of cognitive dissonance and social consensus interventions},\n  author={Ciao, Anna C and Latner, Janet D},\n  journal={Obesity},\n  volume={19},\n  number={9},\n  pages={1768--1774},\n  year={2011},\n  publisher={Wiley Online Library}\n}\n', '@inproceedings{hong2021learning,\n  title={Learning by fixing: Solving math word problems with weak supervision},\n  author={Hong, Yining and Li, Qing and Ciao, Daniel and Huang, Siyuan and Zhu, Song-Chun},\n  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},\n  volume={35},\n  number={6},\n  pages={4959--4967},\n  year={2021}\n}\n', "@article{danielsdottir2010anti,\n  title={Anti-fat prejudice reduction: a review of published studies},\n  author={Dan{\\'\\i}elsd{\\'o}ttir, Sigr{\\'e}n and O'Brien, Kerry S and Ciao, Anna},\n  journal={Obesity facts},\n  volume={3},\n  number={1},\n  pages={47--58},\n  year={2010},\n  publisher={Karger Publish