# <center> Web scraping: pandas website <center><a class="tocSkip">

# Introduction

What this file does:
> Scrapes information about functions and what they do from the Pandas website, and exports it into a CSV file.

In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Extract HTML

Read a single webpage:

In [2]:
link = 'https://pandas.pydata.org/docs/reference/io.html'
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
driver.get(link)
print(driver.title)
page_source = driver.page_source
driver.quit()

Input/output — pandas 2.0.1 documentation


Read multiple webpages:

In [3]:
links = ['https://pandas.pydata.org/docs/reference/io.html',
         'https://pandas.pydata.org/docs/reference/general_functions.html',
         'https://pandas.pydata.org/docs/reference/series.html',
         'https://pandas.pydata.org/docs/reference/frame.html',
         'https://pandas.pydata.org/docs/reference/indexing.html',
         'https://pandas.pydata.org/docs/reference/offset_frequency.html',
         'https://pandas.pydata.org/docs/reference/window.html',
         'https://pandas.pydata.org/docs/reference/groupby.html',
         'https://pandas.pydata.org/docs/reference/resampling.html',
         'https://pandas.pydata.org/docs/reference/style.html',
         'https://pandas.pydata.org/docs/reference/plotting.html',
         'https://pandas.pydata.org/docs/reference/options.html',
         'https://pandas.pydata.org/docs/reference/extensions.html',
         'https://pandas.pydata.org/docs/reference/testing.html']
page_sources = []
for i in range(len(links)):
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
    driver.get(links[i])
    print(driver.title)
    page_sources.append(driver.page_source)
    driver.quit()

Input/output — pandas 2.0.1 documentation
General functions — pandas 2.0.1 documentation
Series — pandas 2.0.1 documentation
DataFrame — pandas 2.0.1 documentation
Index objects — pandas 2.0.1 documentation
Date offsets — pandas 2.0.1 documentation
Window — pandas 2.0.1 documentation
GroupBy — pandas 2.0.1 documentation
Resampling — pandas 2.0.1 documentation
Style — pandas 2.0.1 documentation
Plotting — pandas 2.0.1 documentation
Options and settings — pandas 2.0.1 documentation
Extensions — pandas 2.0.1 documentation
Testing — pandas 2.0.1 documentation


Number of webpages:

In [4]:
len(page_sources)

14

# HTML parsing

Here, we gather some insight into how the data is structured, before constructing our final table.

## Show page source

Commented out due to size:

In [15]:
soup = BeautifulSoup(page_source, 'html.parser')
# print(soup.prettify())

## Get headers

Get h1 header:

In [6]:
header_h1 = soup.find('h1').get_text()[:-1]
header_h1

'Input/output'

Get h2 headers:

In [7]:
headers_h2 = soup.find_all('h2')
headers_h2 = [x.get_text()[:-1] for x in headers_h2]
headers_h2

['Pickling',
 'Flat file',
 'Clipboard',
 'Excel',
 'JSON',
 'HTML',
 'XML',
 'Latex',
 'HDFStore: PyTables (HDF5)',
 'Feather',
 'Parquet',
 'ORC',
 'SAS',
 'SPSS',
 'SQL',
 'Google BigQuery',
 'STATA']

## Reading in html tables

In [8]:
html_tables = pd.read_html(link)
len(html_tables)

23

In [9]:
html_tables[-2]

Unnamed: 0,0,1
0,"read_stata(filepath_or_buffer, *[, ...])",Read Stata file into DataFrame.
1,"DataFrame.to_stata(path, *[, convert_dates, ...])",Export DataFrame object to Stata dta format.


## Construct our DataFrame

In [10]:
df = pd.DataFrame()
for i in range(len(page_sources)):
    soup = BeautifulSoup(page_sources[i], 'html.parser')
    header_h1 = soup.find('h1').get_text()[:-1]
    html_tables = pd.read_html(links[i])
    df_one = pd.DataFrame()
    for j in range(len(html_tables)):
        df_one = pd.concat([df_one, html_tables[j]])
    df_one['Category'] = header_h1
    df = pd.concat([df, df_one])
df

Unnamed: 0,0,1,Category,Data Type,Accessor
0,"read_pickle(filepath_or_buffer[, ...])",Load pickled pandas object (or any object) fro...,Input/output,,
1,"DataFrame.to_pickle(path[, compression, ...])",Pickle (serialize) object to file.,Input/output,,
0,"read_table(filepath_or_buffer, *[, sep, ...])",Read general delimited file into DataFrame.,Input/output,,
1,"read_csv(filepath_or_buffer, *[, sep, ...])",Read a comma-separated values (csv) file into ...,Input/output,,
2,"DataFrame.to_csv([path_or_buf, sep, na_rep, ...])",Write object to a comma-separated values (csv)...,Input/output,,
...,...,...,...,...,...
39,errors.UnsortedIndexError,Error raised when slicing a MultiIndex which h...,Testing,,
40,errors.UnsupportedFunctionCall,Exception raised when attempting to call a uns...,Testing,,
41,errors.ValueLabelTypeMismatch,Warning raised by to_stata on a category colum...,Testing,,
0,show_versions([as_json]),"Provide useful information, important for bug ...",Testing,,


## Data wrangling

Rename columns:

In [11]:
columns_rename = {0: 'Function', 1: 'Description'}
df.rename(columns=columns_rename, inplace=True)
df

Unnamed: 0,Function,Description,Category,Data Type,Accessor
0,"read_pickle(filepath_or_buffer[, ...])",Load pickled pandas object (or any object) fro...,Input/output,,
1,"DataFrame.to_pickle(path[, compression, ...])",Pickle (serialize) object to file.,Input/output,,
0,"read_table(filepath_or_buffer, *[, sep, ...])",Read general delimited file into DataFrame.,Input/output,,
1,"read_csv(filepath_or_buffer, *[, sep, ...])",Read a comma-separated values (csv) file into ...,Input/output,,
2,"DataFrame.to_csv([path_or_buf, sep, na_rep, ...])",Write object to a comma-separated values (csv)...,Input/output,,
...,...,...,...,...,...
39,errors.UnsortedIndexError,Error raised when slicing a MultiIndex which h...,Testing,,
40,errors.UnsupportedFunctionCall,Exception raised when attempting to call a uns...,Testing,,
41,errors.ValueLabelTypeMismatch,Warning raised by to_stata on a category colum...,Testing,,
0,show_versions([as_json]),"Provide useful information, important for bug ...",Testing,,


Format the Function column to remove stuff inside brackets:

In [12]:
df['Function'] = df['Function'].str.replace(r'(?<=\()[^()]*(?=\))', '')
df['Function']

  df['Function'] = df['Function'].str.replace(r'(?<=\()[^()]*(?=\))', '')


0                      read_pickle()
1              DataFrame.to_pickle()
0                       read_table()
1                         read_csv()
2                 DataFrame.to_csv()
                   ...              
39         errors.UnsortedIndexError
40    errors.UnsupportedFunctionCall
41     errors.ValueLabelTypeMismatch
0                    show_versions()
0                             test()
Name: Function, Length: 1868, dtype: object

# Exporting

In [13]:
filename = 'pandas functions.csv'
df.to_csv(filename)