In [14]:
from functools import partial
from multiprocessing import Process
from subprocess import run
from threading import Thread
from time import sleep
from urllib.parse import quote_plus

import bibtexparser
import pandas as pd
from bs4 import BeautifulSoup
from keybert import KeyBERT
from requests_tor import RequestsTor

In [15]:
# Author Kirelos Namroud, @knamroud on GitHub
# On 20/01/2023 

In [16]:
# HTTP headers sent with every request issued by the scraper
HEADERS = {
    "Accept": "*",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
}

# Tor ports used by the scraper: the SOCKS port of the main session, the
# control port, and a pool of SOCKS ports handed out to the per-result scrapers
TOR_PORTS = dict(
    main=(9050,),
    controller=9051,
    # 10 ports (9000-9009) are needed since a results page holds at most 10 results
    others=tuple(9000 + offset for offset in range(10)),
)

In [17]:
def main():
    '''Run one scrape for the query "ciso", print the result and hand it to create_csv'''
    session_options = {
        "tor_ports": TOR_PORTS["main"],
        "tor_cport": TOR_PORTS["controller"],
        "autochange_id": 0,
    }
    session = RequestsTor(**session_options)
    # Bind the module-level new_ip helper to this session so it can be called as session.new_ip()
    session.new_ip = partial(new_ip, session)
    session.new_ip()
    search_url = query_builder("ciso")
    scraped = scrape(search_url, session)
    print(scraped)
    create_csv(scraped, "prova")

In [18]:
def new_ip(self, delay: float = 1):
    '''Keep requesting a new Tor identity until the reported IP actually changes.

    self: session object exposing check_ip() and new_id()
    delay: seconds to wait before each new-identity request (default 1)

    Loops for as long as check_ip() keeps returning the address seen on entry.
    '''
    ip = self.check_ip()
    while ip == self.check_ip():
        sleep(delay)
        self.new_id()

In [19]:
class ResultScraper(Thread):
    '''Worker thread that scrapes a single search result.

    A thread is used rather than a separate process so that `data`, which is
    assigned inside run(), is visible on this same object once the caller has
    joined the worker (a child process would only update its own copy).

    After join(), `data` is `[keywords, citation_text]`; it stays None until
    the scrape has succeeded.
    '''
    def __init__(self, result: BeautifulSoup, tor_port: int):
        '''
        result: the parsed element of one search result
        tor_port: Tor SOCKS port this worker sends its requests through
        '''
        super().__init__()
        self.result = result
        self.data = None
        self.rt = RequestsTor(tor_ports=(tor_port,),  tor_cport=TOR_PORTS["controller"], autochange_id=0)
        self.rt.new_ip = partial(new_ip, self.rt)

    def run(self):
        '''Extract the keywords, then fetch the citation export, retrying with a new IP until it succeeds'''
        self.rt.new_ip()
        # Detach the "gs_ggs gs_fl" element from the result, if present
        if (no := self.result.find(class_="gs_ggs gs_fl")):
            no.extract()
        # The keywords depend only on the already-downloaded markup, so they are
        # computed once instead of on every network retry
        keywords = self._keywords()
        while True:
            try:
                data = [keywords, self._citation()]
                print(data)
                # An HTML body instead of the citation text is treated as a blocked IP
                if "html" in data[1]:
                    raise RuntimeError("IP Blocked")
            except Exception as e:
                print(e)
                self.rt.new_ip()
                continue
            self.data = data
            break

    def _keywords(self) -> str:
        '''Keywords of the result's "gs_rs" snippet, or "" when the result has no snippet text'''
        snippet = self.result.find("div", class_="gs_rs")
        if snippet is None:
            return ""
        text = " ".join(content for content in snippet.contents if isinstance(content, str)).replace(" \n", "")
        return get_keywords(text) if text.strip() else ""

    def _citation(self) -> str:
        '''Download the text behind the first "gs_citi" link of the result's cite page'''
        cite_url = f"https://scholar.google.com/scholar?q=info:{self.result.get('data-cid')}:scholar.google.com/&output=cite"
        cite_page = BeautifulSoup(self.rt.get(cite_url, headers=HEADERS).content, "html.parser")
        # find() returns None when the link is missing; the resulting
        # AttributeError is handled by the retry loop in run()
        export_url = cite_page.find("a", class_="gs_citi").get("href")
        return self.rt.get(export_url, headers=HEADERS).content.decode()

In [20]:
def query_builder(basic_search: str = None, includes_all: str = None, exact_sentence: str = None, includes_one: str = None, 
                  exclude: str = None, in_title: bool = None, language: str = None, interval: tuple = None, 
                  only_science: bool = None, include_patents: bool = None, include_citations: bool = None, sort_by_date: bool = None, 
                  authors: list = None, sources: list = None, page: int = None)-> str:
    '''Create search queries for Google Scholar.

    Either a plain `basic_search` (without authors, sources or in_title) or at
    least one of the advanced text fields (includes_all, exact_sentence,
    includes_one) must be given, but not both kinds at once.

    interval: (first_year,) or (first_year, last_year)
    authors / sources: lists of names; each name is sent as a quoted phrase
    page: 1-based results page (10 results per page)

    Returns the search URL; every text value is URL-encoded.
    Raises AssertionError when the combination of arguments is invalid.
    '''
    uses_advanced = bool(includes_all or exact_sentence or includes_one)
    uses_basic = bool(basic_search and not (authors or sources or in_title))
    assert uses_advanced ^ uses_basic, "Invalid search query."

    def quoted_phrases(names: list) -> str:
        # Each name becomes an encoded "quoted phrase"; phrases are joined with '+'
        return "+".join(quote_plus(f'"{name}"') for name in names)

    params = []
    if basic_search:
        params.append(f"q={quote_plus(basic_search)}")
    if includes_all:
        params.append(f"as_q={quote_plus(includes_all)}")
    if exact_sentence:
        params.append(f"as_epq={quote_plus(exact_sentence)}")
    if includes_one:
        params.append(f"as_oq={quote_plus(includes_one)}")
    if exclude:
        params.append(f"as_eq={quote_plus(exclude)}")
    if in_title:
        params.append("as_occt=title")
    if authors:
        params.append(f"as_sauthors={quoted_phrases(authors)}")
    if sources:
        params.append(f"as_publication={quoted_phrases(sources)}")
    if language:
        # Two separate parameters: the original joined them with '?', which
        # made "?hl=..." part of the lr value
        params.append(f"lr=lang_{language}")
        params.append(f"hl={language}")
    if interval:
        params.append(f"as_ylo={interval[0]}")
        if len(interval) == 2:
            params.append(f"as_yhi={interval[1]}")
    if only_science:
        params.append("as_rr=1")
    if include_patents:
        params.append("as_sdt=2007")
    if include_citations:
        # NOTE(review): as_vis=1 may actually *exclude* citations on Google
        # Scholar - confirm the intended meaning of this flag
        params.append("as_vis=1")
    if sort_by_date:
        params.append("scisbd=1")
    if page:
        params.append(f"start={(page - 1) * 10}")
    return "https://scholar.google.com/scholar?" + "&".join(params)

In [21]:
def scrape(search: str, rt: RequestsTor) -> list:
    '''Download the results page at `search` and scrape each result with its own ResultScraper.

    The page download is retried, switching IP through rt.new_ip(), until it
    can be fetched and parsed. Returns the `data` attribute of every worker,
    in page order.
    '''
    results = None
    while results is None:
        try:
            response = rt.get(search, headers=HEADERS)
            page = BeautifulSoup(response.content, "html.parser")
            # A page lacking either tag raises here and is retried with a new IP
            page.style.decompose()
            page.br.decompose()
            results = page.find_all("div", class_="gs_r gs_or gs_scl")
        except Exception as error:
            print(error)
            rt.new_ip()
    # One worker per result, each on its own port from the "others" pool
    workers = [ResultScraper(result, TOR_PORTS["others"][position]) for position, result in enumerate(results)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return [worker.data for worker in workers]

In [22]:
def get_keywords(text: str) -> str:
    '''Extract keywords from an article description with KeyBERT and return them comma-separated'''
    extracted = KeyBERT().extract_keywords(text)
    # Keep only the first item of each extracted entry
    first_items = (entry[0] for entry in extracted)
    return ", ".join(first_items)

In [23]:
def _bibtex_fields(bibtex: str) -> dict:
    '''Collect the one-per-line `name={value}` fields of a BibTeX entry into a dict keyed by lower-case name'''
    fields = {}
    for line in bibtex.splitlines():
        name, separator, value = line.strip().partition("=")
        value = value.strip().rstrip(",")
        # Lines without "name={...}" (the "@type{key," header, the closing "}") are ignored
        if separator and value.startswith("{") and value.endswith("}"):
            fields[name.strip().lower()] = value[1:-1]
    return fields


def create_csv(data: list, file: str):
    '''Write data on a csv file.

    data: list of `[keywords, bibtex]` pairs; entries that are None or shorter
          than two items are skipped
    file: path of the CSV file to write (used exactly as given)

    The file gets one row per entry with the columns Title, Author and
    Keywords; the title and author are read from the BibTeX text and left
    empty when the field is absent.
    '''
    columns = ["Title", "Author", "Keywords"]
    rows = []
    for entry in data or []:
        if not entry or len(entry) < 2:
            continue
        keywords, bibtex = entry[0], entry[1]
        fields = _bibtex_fields(bibtex)
        rows.append({
            "Title": fields.get("title", ""),
            "Author": fields.get("author", ""),
            "Keywords": keywords,
        })
    pd.DataFrame(rows, columns=columns).to_csv(file, index=False)
    

In [24]:
# Entry point: call main() only when this code runs as the top-level module
if __name__== "__main__":
    main()

'NoneType' object has no attribute 'get'
['ceo, cio, obtain, ascertain, questions', '@article{fitzgerald2007clarifying,\n  title={Clarifying the roles of information security: 13 questions the CEO, CIO, and CISO must ask each other},\n  author={Fitzgerald, Todd},\n  journal={Information Systems Security},\n  volume={16},\n  number={5},\n  pages={257--263},\n  year={2007},\n  publisher={Taylor \\& Francis}\n}\n']
['cyber, companies, role, mitigate, exposure', '@article{zwilling2022trends,\n  title={Trends and Challenges Regarding Cyber Risk Mitigation by CISOs—A Systematic Literature and Experts’ Opinion Review Based on Text Analytics},\n  author={Zwilling, Moti},\n  journal={Sustainability},\n  volume={14},\n  number={3},\n  pages={1311},\n  year={2022},\n  publisher={MDPI}\n}\n']
['auerbach, fax, handbook, publications, order', '@book{gentile2016ciso,\n  title={The CISO Handbook: A Practical Guide to Securing Your Company},\n  author={Gentile, Michael and Collette, Ron and August, Tho