In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
import numpy as np

In [2]:
def get_link_for_page(page_num: int, q: str):
    """Build the Google Scholar results URL for one page of a search query.

    Args:
        page_num: Results page number; it is multiplied by 10 to form the
            ``start`` offset, so page 0 yields ``start=0``.
        q: Free-text query. Runs of whitespace are collapsed and the terms are
            joined with ``+``; every other reserved character is
            percent-encoded so it cannot break the query string.

    Returns:
        The URL as a string, with the fixed ``hl=pl`` and ``as_sdt=0,5``
        parameters appended.
    """
    # Local import keeps this cell self-contained without touching the import cell.
    from urllib.parse import quote_plus

    # Collapse whitespace first so 'a   b' and ' a b ' both become 'a+b'.
    normalized_query = ' '.join(q.split())
    encoded_query = quote_plus(normalized_query)
    return f'https://scholar.google.com/scholar?start={10 * page_num}&q={encoded_query}&hl=pl&as_sdt=0,5'

In [3]:
def get_book_sections(soup: BeautifulSoup):
    """Collect the per-result containers from a parsed results page.

    Args:
        soup: Parsed page to search.

    Returns:
        Whatever ``soup.find_all`` yields for ``<div>`` elements matched
        against the class string ``'gs_r gs_or gs_scl'``.
    """
    result_class = 'gs_r gs_or gs_scl'
    matching_divs = soup.find_all('div', class_=result_class)
    return matching_divs

In [4]:
def get_book_title(book):
    """Return the text of the first ``<a>`` inside the first ``<h3>`` of ``book``.

    Raises:
        AttributeError: If the ``<h3>`` or the ``<a>`` lookup returns ``None``.
    """
    heading = book.find('h3')
    anchor = heading.find('a')
    return anchor.text

In [5]:
def get_is_book(book):
    """Return the text of the first ``<span>`` found in ``book``.

    Despite the name, the result is a string rather than a boolean; the
    caller inspects it for a marker substring.

    Raises:
        AttributeError: If no ``<span>`` is found.
    """
    first_span = book.find('span')
    return first_span.text

In [6]:
def get_book_link(book):
    """Return the ``href`` of the first linked ``<a>`` inside the first ``<h3>``.

    Only anchors that carry an ``href`` attribute are considered.

    Raises:
        AttributeError: If ``book`` has no ``<h3>``.
        TypeError: If the ``<h3>`` lookup succeeds but the anchor lookup
            returns ``None``.
    """
    heading = book.find('h3')
    linked_anchor = heading.find('a', href=True)
    return linked_anchor['href']

In [7]:
def get_citation_count(book):
    """Return the last space-separated token of the third footer link's text.

    The footer is the last ``<div class="gs_fl">`` in ``book``; only anchors
    with an ``href`` are counted when picking the third one.
    NOTE(review): presumably that third link is the "cited by N" link, so the
    token is the citation count as a string; verify against the live markup.

    Raises:
        IndexError: If there is no matching ``<div>`` or fewer than three
            linked anchors in the last one.
    """
    footer_rows = book.find_all('div', class_='gs_fl')
    last_footer = footer_rows[-1]
    linked_anchors = last_footer.find_all('a', href=True)
    third_link_text = linked_anchors[2].text
    # Everything after the final space (the whole text when there is no space).
    return third_link_text.rpartition(' ')[2]

In [8]:
def get_all_needed_elements(book_section):
    """Extract ``[title, link, citation_count]`` from one search-result section.

    Args:
        book_section: One result container, as consumed by the ``get_*``
            helper functions.

    Returns:
        A three-item list ``[title, article_link, citation_count]``, or
        ``[None, None, None]`` when the section is tagged ``'[KSIĄŻKA]'`` or
        when any expected element is missing from the markup.
    """
    try:
        title = get_book_title(book_section)
        is_book = get_is_book(book_section)
        # Entries carrying the '[KSIĄŻKA]' marker are deliberately skipped.
        if '[KSIĄŻKA]' in is_book:
            return [None] * 3
        article_link = get_book_link(book_section)
        citation_count = get_citation_count(book_section)
    except (AttributeError, IndexError, TypeError):
        # AttributeError: a lookup returned None before `.find` / `.text`.
        # IndexError: fewer footer divs or links than the helpers index into.
        # TypeError: subscripting a missing (None) anchor for its href.
        # Any of these means this section lacks the expected structure, so it
        # is reported as an empty row instead of aborting the whole scrape.
        return [None] * 3
    return [title, article_link, citation_count]

In [9]:
# Start a Chrome session through Selenium; the scraping loop drives this
# browser object to load each results page.
# NOTE(review): none of the visible cells calls browser.quit(), so the session
# presumably stays open until it is closed by hand.
browser = webdriver.Chrome()

In [10]:
# Accumulator for scraped results: the scraping loop appends one list of
# [title, link, citations] rows per results page.
lst = []

In [11]:
# Scrape 30 result pages for 'machine learning' and tabulate the rows.
# Empty the accumulator first so re-running this cell does not stack a second
# copy of every page on top of the previous run.
lst.clear()
for i in range(30):
    browser.get(get_link_for_page(i, 'machine learning'))
    soup = BeautifulSoup(browser.page_source, 'lxml')
    book_sections = get_book_sections(soup)
    # One entry per page, each a list of [title, link, citations] rows.
    lst.append([get_all_needed_elements(book_section) for book_section in book_sections])
    time.sleep(3)  # pause between page loads
# Flatten page -> row in plain Python rather than np.array(...).reshape(-1, 3):
# pages can yield different numbers of sections, and such ragged nesting cannot
# be turned into a rectangular array.
rows = [row for page_rows in lst for row in page_rows]
df = pd.DataFrame(rows, columns=['title', 'link', 'citations'])
# Display only: `df` itself keeps the all-None rows of skipped sections.
df.dropna()

Unnamed: 0,title,link,citations
0,Machine learning,https://www.annualreviews.org/doi/pdf/10.1146/...,211
2,Quantum machine learning,https://www.nature.com/articles/nature23474,2296
3,Machine learning algorithms-a review,https://www.researchgate.net/profile/Batta-Mah...,915
4,Machine learning,https://profs.info.uaic.ro/~ciortuz/SLIDES/201...,34730
5,Machine learning,https://books.google.com/books?hl=pl&lr=&id=2n...,227
...,...,...,...
295,Building machine learning systems with Python,https://books.google.com/books?hl=pl&lr=&id=C-...,187
296,(Machine) learning to do more with less,https://link.springer.com/article/10.1007/JHEP...,64
297,Using GPUs for machine learning algorithms,https://ieeexplore.ieee.org/abstract/document/...,327
298,MLC++: A machine learning library in C++,https://ieeexplore.ieee.org/abstract/document/...,324


In [12]:
# Write the scraped table to a CSV file at a path relative to the working
# directory. No `index=` argument is passed, so pandas' default index handling
# applies.
# NOTE(review): the `df.dropna()` in the scraping cell is not assigned back,
# so rows of None from skipped sections are written too; confirm that is intended.
df.to_csv('titles_links_and_citations.csv')