In [1]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

In [9]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [12]:
html=simple_get('https://realpython.com/blog/')

In [13]:
len(html)

591275

In [14]:
libgen=simple_get('https://libgen.is/')

In [15]:
len(libgen)

46196

In [16]:
yt=simple_get('https://www.youtube.com/')

In [17]:
len(yt)

303189

In [19]:
raw_html = simple_get('https://libgen.is/')
html = BeautifulSoup(raw_html, 'html.parser')
for i, li in enumerate(html.select('li')):
        print(i, li.text)

0 RU

1 FORUM


Sitemap
Error report



2 DOWNLOAD


Mirrors
Gen.lib.rus.ec
Libgen.lc
Libgen.pw
Z-Library
BookFI.net


P2P
Torrents
Usenet (*.nzb)
Database Dumps
gen.lib.rus.ec
libgen.lc


Other
Books catalog (XLS)
Source (PHP)
Import local files in LG format
Libgen Desktop application



3 UPLOAD



Libgen uploader
Fiction uploader
Scientific articles uploader
FTP

						(Login:password look at the forum sitemap)
					



4 LAST



Last added
Last modified
RSS
API




5 OTHERS



Comics
Fiction
Magazines
Standarts
Full-text search in LG content




6 TOPICS



Technology



Aerospace Equipment
Automation
Communication: Telecommunications
Communication
Construction
Construction: Cement Industry
Construction: Renovation and interior design: Saunas
Construction: Renovation and interior design


Construction: Ventilation and Air Conditioning
Electronics: Electronics
Electronics: Fiber Optics
Electronics: Hardware
Electronics: Home Electronics
Electronics: Microprocessor Technology
Electro

In [32]:
def get_names():
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get('https://libgen.is/')

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        names = set()
        for li in html.select('li'):
            for name in li.text.split('\n'):
                if len(name) > 0:
                    names.add(name.strip())
        return list(names)

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))

In [33]:
get_names()

['',
 'Microbiology',
 'Construction',
 'Metrology',
 'Rhetoric',
 'UPLOAD',
 'Materials',
 'Elementary',
 'Probability',
 'Surgery, Orthopedics',
 'Scientific articles uploader',
 'Crystal Physics',
 'Molecular: Bioinformatics',
 'Prose',
 'Information Systems',
 'Fuzzy Logic and Applications',
 'Game Theory',
 'Usenet (*.nzb)',
 'Diabetes',
 'Games: Board Games',
 'Zoology:Paleontology',
 'Food Manufacturing',
 'Cultural',
 'TOPICS',
 'Logistics',
 'Estestvoznananie',
 'Physical Educ. and Sport',
 'Plasma Physics',
 'Software: Office software',
 'Folklore',
 'Aerospace Equipment',
 'Mechanics: Nonlinear dynamics and chaos',
 'Algorithms and Data Structures',
 'Biostatistics',
 'Patent Business. Ingenuity. Innovation',
 'Cybernetics',
 'Physics',
 'libgen.lc',
 'Discrete Mathematics',
 'Books catalog (XLS)',
 'Management',
 'Music',
 'Philosophy',
 'Homeopathy',
 'Therapy',
 'Optics',
 'Mechanics',
 'Molecular Medicine',
 'Other',
 'Mathematical Economics',
 'Algorithms and Data Struc