In [1]:
import requests
import re
from bs4 import BeautifulSoup
from warnings import warn
from time import sleep
import random
import numpy as np, pandas as pd
import seaborn as sns
import urllib.request
from tqdm import tqdm

In [30]:
# Test: extract the category names of a single Wikipedia article.
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url=url, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')
# find() returns None when the page has no element with this id, so guard
# before calling find_all() on the result.
cat_box = soup.find(id="mw-normal-catlinks")
all_links = cat_box.find_all("a") if cat_box is not None else []

my_list = []
for link in all_links:
    # .get() avoids a KeyError on anchors that carry no title attribute.
    cat = str(link.get("title", ""))
    # Skip untitled anchors and the box's own "Help:Category" link.
    if not cat or re.search(r"Help:Category", cat):
        continue
    # Titles look like "Category:<name>"; keep only the name.
    my_list.append(re.sub(r"^Category:", "", cat))
print(my_list)

['Python (programming language)', 'Class-based programming languages', 'Notebook interface', 'Computer science in the Netherlands', 'Concurrent programming languages', 'Cross-platform free software', 'Cross-platform software', 'Dutch inventions', 'Dynamically typed programming languages', 'Educational programming languages', 'High-level programming languages', 'Information technology in the Netherlands', 'Multi-paradigm programming languages', 'Object-oriented programming languages', 'Pattern matching programming languages', 'Programming languages', 'Programming languages created in 1991', 'Scripting languages', 'Text-oriented programming languages']


# Scraper
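This is the scraper. `Scraper(url, data_frame)` fetches one Wikipedia article, appends its title, description, link and categories as a new row to the given `pd.DataFrame`, and returns the extended DataFrame together with a randomly chosen link to scrape next.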

In [2]:
# Regex of href fragments that disqualify a link from being followed
# (non-article namespaces and in-page "#" anchors).
Dont_include_in_links = r"User|File\:|wiki\/Wikipedia\:|Special\:|Help\:|\#|Template|Category\:|Wikidata\:"


def _fetch_page(url, timeout):
    """GET `url`; on a request failure wait 5 seconds and try exactly once more."""
    try:
        return requests.get(url=url, timeout=timeout)
    except requests.exceptions.RequestException:
        print("Sleeping...")
        sleep(5)
        # A second failure is deliberately allowed to propagate to the caller.
        return requests.get(url=url, timeout=timeout)


def _first_paragraph_text(soup, fallback):
    """Return the text of the first non-blank <p>, or `fallback` if there is none."""
    for paragraph in soup.find_all("p"):
        # get_text() drops the markup and decodes HTML entities.
        text = paragraph.get_text()
        if text.strip():
            return text
    print("Cannot find paragraph?")
    return fallback


def _category_names(soup):
    """Return the titles of the links in the category box, without the "Category:" prefix."""
    cat_box = soup.find(id="mw-normal-catlinks")
    if cat_box is None:
        print("Error! No category found...")
        return []
    names = []
    for anchor in cat_box.find_all("a"):
        label = str(anchor.get("title", ""))
        # Skip untitled anchors and the box's own "Help:Category" link.
        if not label or re.search(r"Help:Category", label):
            continue
        names.append(re.sub(r"^Category:", "", label))
    return names


def _random_article_href(soup):
    """Return a random followable "/wiki/..." href from the page body, or None."""
    body = soup.find(id="bodyContent")
    if body is None:
        return None
    candidates = [
        anchor["href"]
        # href=True skips anchors that have no href attribute at all.
        for anchor in body.find_all("a", href=True)
        if re.search(r"^\/wiki\/", anchor["href"])
        and not re.search(Dont_include_in_links, anchor["href"])
    ]
    return random.choice(candidates) if candidates else None


def Scraper(url, data_frame, timeout=10):
    """Scrape one Wikipedia article and choose a random article to visit next.

    Parameters
    ----------
    url : str
        Address of the article to scrape. It is stored unchanged in the
        "Link" column of the new row.
    data_frame : pd.DataFrame
        Rows collected so far; it is not modified.
    timeout : float, optional
        Seconds to wait for the server before a request is abandoned.

    Returns
    -------
    (pd.DataFrame, str)
        A new frame equal to `data_frame` plus one row with the columns
        "Title", "Description", "Link" and "Category", and the absolute URL
        of a randomly chosen article linked from the page body.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be fetched even after the single retry.
    RuntimeError
        If the page body contains no link that may be followed.
    """
    response = _fetch_page(url, timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    heading = soup.find(id="firstHeading")
    # Without a heading the URL is the only identifier left for the row.
    title = heading.text if heading is not None else url

    # Leading <p> elements can be blank, so take the first one that has text
    # and fall back to the title when the page has none.
    description = _first_paragraph_text(soup, fallback=str(title))
    cate_list = _category_names(soup)

    df_temp = pd.DataFrame(
        [[title, description, url, cate_list]],
        columns=["Title", "Description", "Link", "Category"],
    )
    data_result = pd.concat([data_frame, df_temp], ignore_index=True)

    href = _random_article_href(soup)
    if href is None:
        # The crawl cannot continue from here; say so explicitly.
        raise RuntimeError("No followable article link found on " + str(url))

    # Every candidate href starts with "/wiki/", so a plain prefix yields a
    # valid absolute URL. No scheme clean-up is applied: a greedy
    # "^https.*http" substitution would truncate URLs whose title contains
    # "http" (for example "/wiki/Lighttpd").
    next_link = "https://en.wikipedia.org" + href

    return data_result, next_link

If the notebook is shut down during scraping, the data saved so far can be restored from `data.csv` with the following cell.

In [35]:
# Reload the saved scrape and keep only its first 20000 rows.
df = pd.read_csv("data.csv", index_col=False).head(20000)
display(df)

Unnamed: 0,Title,Description,Link,Category
0,Web scraping,"Web scraping, web harvesting, or web data extr...",https://en.wikipedia.org/wiki/Web_scraping,
1,Wrapper (data mining),Wrapper in data mining is a procedure that ext...,https://en.wikipedia.org/wiki/Wrapper_(data_mi...,
2,Microformat,Microformats (μF)[note 1] are a set of defined...,https://en.wikipedia.org/wiki/Microformat,
3,Common Logic,Common Logic (CL) is a framework for a family ...,https://en.wikipedia.org/wiki/Common_Logic,
4,ISLISP,ISLISP (also capitalized as ISLisp) is a progr...,https://en.wikipedia.org/wiki/ISLISP,
...,...,...,...,...
19995,Order Police battalions,The Order Police battalions were militarised f...,https://en.wikipedia.org/wiki/Order_Police_bat...,
19996,War crimes of the Wehrmacht,"During World War II, the German Wehrmacht (com...",https://en.wikipedia.org/wiki/War_crimes_of_th...,
19997,War crimes in the Gaza War (2008–2009),Accusations of violations regarding internatio...,https://en.wikipedia.org/wiki/War_crimes_in_th...,
19998,Richard Kemp,\n,https://en.wikipedia.org/wiki/Richard_Kemp,


We first scraped the data without categories, but categories turned out to be useful. The following cells therefore revisit the links that were already scraped and add the categories of each page.

In [36]:
# Work on an independent (deep) copy so that `df` itself is left untouched.
df_with_cat = df.copy()
display(df_with_cat.head(3500))

Unnamed: 0,Title,Description,Link,Category
0,Web scraping,"Web scraping, web harvesting, or web data extr...",https://en.wikipedia.org/wiki/Web_scraping,
1,Wrapper (data mining),Wrapper in data mining is a procedure that ext...,https://en.wikipedia.org/wiki/Wrapper_(data_mi...,
2,Microformat,Microformats (μF)[note 1] are a set of defined...,https://en.wikipedia.org/wiki/Microformat,
3,Common Logic,Common Logic (CL) is a framework for a family ...,https://en.wikipedia.org/wiki/Common_Logic,
4,ISLISP,ISLISP (also capitalized as ISLisp) is a progr...,https://en.wikipedia.org/wiki/ISLISP,
...,...,...,...,...
3495,ISO 31-10,\nISO 31-10 is the part of international stand...,https://en.wikipedia.org/wiki/ISO_31-10,
3496,ISO 15924,"ISO 15924, Codes for the representation of nam...",https://en.wikipedia.org/wiki/ISO_15924,
3497,Armenian alphabet,"The Armenian alphabet (Armenian: Հայոց գրեր, H...",https://en.wikipedia.org/wiki/Armenian_alphabet,
3498,Saurashtra script,The Saurashtra script is an abugida script tha...,https://en.wikipedia.org/wiki/Saurashtra_script,


In [66]:
# df_with_cat = df.copy(deep=True)
# Back-fill the "Category" column for one slice of already-scraped rows.
start_row, stop_row = 14000, 16000  # use df.shape[0] as stop_row to reach the end

# Each cell stores a Python list, which requires an object-dtype column.
df_with_cat["Category"] = df_with_cat["Category"].astype(object)

for i in tqdm(range(start_row, stop_row)):
    # Translate the position into the row label once, so that reading from
    # `df` and writing to `df_with_cat` always address the same row.
    row_label = df.index[i]
    # Look the link up by column name rather than by a hard-coded position.
    link = df.at[row_label, "Link"]
    try:
        response = requests.get(url=link, timeout=10)
    except requests.exceptions.RequestException:
        # Pause, then retry exactly once; a second failure stops the loop.
        print("Sleeping...")
        sleep(5)
        response = requests.get(url=link, timeout=10)

    soup = BeautifulSoup(response.content, 'html.parser')
    # Make categories
    cate_list = []
    cat_box = soup.find(id="mw-normal-catlinks")
    if cat_box is None:
        print("Error! No category found...")
    else:
        for anchor in cat_box.find_all("a"):
            cat = str(anchor.get("title", ""))
            # Skip untitled anchors and the box's own "Help:Category" link.
            if not cat or re.search(r"Help:Category", cat):
                continue
            cate_list.append(re.sub(r"^Category:", "", cat))

    df_with_cat.at[row_label, "Category"] = cate_list

display(df_with_cat)

  1%|▋                                                                               | 16/2000 [00:08<15:30,  2.13it/s]

Error! No category found...


 16%|████████████▍                                                                  | 314/2000 [02:42<09:15,  3.03it/s]

Error! No category found...


 54%|██████████████████████████████████████████▏                                   | 1081/2000 [08:24<09:33,  1.60it/s]

Error! No category found...


 60%|██████████████████████████████████████████████▌                               | 1195/2000 [09:16<13:08,  1.02it/s]

Error! No category found...


 60%|███████████████████████████████████████████████▏                              | 1210/2000 [09:24<06:24,  2.06it/s]

Error! No category found...


 65%|██████████████████████████████████████████████████▌                           | 1297/2000 [09:56<05:45,  2.03it/s]

Error! No category found...


 67%|████████████████████████████████████████████████████▌                         | 1348/2000 [10:20<05:21,  2.03it/s]

Sleeping...


 68%|███████████████████████████████████████████████████▍                        | 1355/2000 [11:18<1:17:05,  7.17s/it]

Sleeping...


 72%|███████████████████████████████████████████████████████▉                      | 1435/2000 [12:49<03:04,  3.06it/s]

Error! No category found...


 78%|████████████████████████████████████████████████████████████▋                 | 1556/2000 [13:52<03:29,  2.12it/s]

Error! No category found...


 82%|███████████████████████████████████████████████████████████████▉              | 1639/2000 [14:30<02:24,  2.50it/s]

Error! No category found...


 82%|████████████████████████████████████████████████████████████████              | 1643/2000 [14:32<02:07,  2.80it/s]

Error! No category found...


 84%|█████████████████████████████████████████████████████████████████▋            | 1684/2000 [14:53<02:07,  2.47it/s]

Error! No category found...


 84%|█████████████████████████████████████████████████████████████████▊            | 1687/2000 [14:54<02:07,  2.45it/s]

Error! No category found...


 85%|██████████████████████████████████████████████████████████████████▌           | 1706/2000 [15:04<02:27,  2.00it/s]

Error! No category found...


 85%|██████████████████████████████████████████████████████████████████▌           | 1707/2000 [15:04<02:32,  1.92it/s]

Error! No category found...


 86%|██████████████████████████████████████████████████████████████████▉           | 1715/2000 [15:07<01:28,  3.21it/s]

Error! No category found...


 87%|████████████████████████████████████████████████████████████████████          | 1746/2000 [15:18<00:59,  4.25it/s]

Error! No category found...


 88%|████████████████████████████████████████████████████████████████████▎         | 1753/2000 [15:20<01:28,  2.80it/s]

Error! No category found...


 88%|████████████████████████████████████████████████████████████████████▌         | 1757/2000 [15:21<01:15,  3.22it/s]

Error! No category found...


 88%|████████████████████████████████████████████████████████████████████▊         | 1766/2000 [15:27<01:34,  2.47it/s]

Error! No category found...


 92%|███████████████████████████████████████████████████████████████████████▍      | 1833/2000 [17:57<00:56,  2.97it/s]

Error! No category found...


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [19:01<00:00,  1.75it/s]


Unnamed: 0,Title,Description,Link,Category
0,Web scraping,"Web scraping, web harvesting, or web data extr...",https://en.wikipedia.org/wiki/Web_scraping,[Web scraping]
1,Wrapper (data mining),Wrapper in data mining is a procedure that ext...,https://en.wikipedia.org/wiki/Wrapper_(data_mi...,[Data mining]
2,Microformat,Microformats (μF)[note 1] are a set of defined...,https://en.wikipedia.org/wiki/Microformat,"[Microformats, Knowledge representation, Seman..."
3,Common Logic,Common Logic (CL) is a framework for a family ...,https://en.wikipedia.org/wiki/Common_Logic,"[Knowledge representation languages, ISO stand..."
4,ISLISP,ISLISP (also capitalized as ISLisp) is a progr...,https://en.wikipedia.org/wiki/ISLISP,"[Lisp programming language family, Lisp (progr..."
...,...,...,...,...
19995,Order Police battalions,The Order Police battalions were militarised f...,https://en.wikipedia.org/wiki/Order_Police_bat...,
19996,War crimes of the Wehrmacht,"During World War II, the German Wehrmacht (com...",https://en.wikipedia.org/wiki/War_crimes_of_th...,
19997,War crimes in the Gaza War (2008–2009),Accusations of violations regarding internatio...,https://en.wikipedia.org/wiki/War_crimes_in_th...,
19998,Richard Kemp,\n,https://en.wikipedia.org/wiki/Richard_Kemp,


In [68]:
# Show the first 16000 rows, i.e. everything up to the end of the slice processed above.
display(df_with_cat.head(16000))

Unnamed: 0,Title,Description,Link,Category
0,Web scraping,"Web scraping, web harvesting, or web data extr...",https://en.wikipedia.org/wiki/Web_scraping,[Web scraping]
1,Wrapper (data mining),Wrapper in data mining is a procedure that ext...,https://en.wikipedia.org/wiki/Wrapper_(data_mi...,[Data mining]
2,Microformat,Microformats (μF)[note 1] are a set of defined...,https://en.wikipedia.org/wiki/Microformat,"[Microformats, Knowledge representation, Seman..."
3,Common Logic,Common Logic (CL) is a framework for a family ...,https://en.wikipedia.org/wiki/Common_Logic,"[Knowledge representation languages, ISO stand..."
4,ISLISP,ISLISP (also capitalized as ISLisp) is a progr...,https://en.wikipedia.org/wiki/ISLISP,"[Lisp programming language family, Lisp (progr..."
...,...,...,...,...
15995,Exaggeration,Exaggeration is the representation of somethin...,https://en.wikipedia.org/wiki/Exaggeration,"[Cognitive biases, Defence mechanisms, Diversi..."
15996,Henry Samueli,"Henry Samueli (born September 20, 1954) is an ...",https://en.wikipedia.org/wiki/Henry_Samueli,"[1954 births, American billionaires, American ..."
15997,San Diego Gulls,The San Diego Gulls are a professional ice hoc...,https://en.wikipedia.org/wiki/San_Diego_Gulls,"[San Diego Gulls, Anaheim Ducks minor league a..."
15998,National Association of Intercollegiate Athletics,The National Association of Intercollegiate At...,https://en.wikipedia.org/wiki/National_Associa...,[National Association of Intercollegiate Athle...


Save the rows that now have categories to a CSV file.

In [69]:
# Write the first 16000 rows to disk; index=False leaves the row index out of the file.
df_with_cat.head(16000).to_csv('data_with_cat.csv', index=False)

For scraping: set the starting link below, then run the loop cell to extend `df` with newly scraped pages.

In [37]:
# Starting point of the crawl; the scraping loop overwrites this with each newly chosen link.
next_link = "https://en.wikipedia.org/wiki/ISLISP"

In [38]:
# Advance the crawl by five pages; each call returns the grown frame and the
# link that the following call will scrape.
for step in tqdm(range(5)):
    df, next_link = Scraper(next_link, df)

display(df)
print(next_link)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  2.51it/s]


Unnamed: 0,Title,Description,Link,Category
0,Web scraping,"Web scraping, web harvesting, or web data extr...",https://en.wikipedia.org/wiki/Web_scraping,
1,Wrapper (data mining),Wrapper in data mining is a procedure that ext...,https://en.wikipedia.org/wiki/Wrapper_(data_mi...,
2,Microformat,Microformats (μF)[note 1] are a set of defined...,https://en.wikipedia.org/wiki/Microformat,
3,Common Logic,Common Logic (CL) is a framework for a family ...,https://en.wikipedia.org/wiki/Common_Logic,
4,ISLISP,ISLISP (also capitalized as ISLisp) is a progr...,https://en.wikipedia.org/wiki/ISLISP,
...,...,...,...,...
27911,ISLISP,ISLISP (also capitalized as ISLisp) is a progr...,https://en.wikipedia.org/wiki/ISLISP,"[Help:Category, Category:Lisp programming lang..."
27912,NetLogo,NetLogo is a programming language and integrat...,https://en.wikipedia.org/wiki/NetLogo,"[Help:Category, Category:Logo programming lang..."
27913,William Clinger (computer scientist),William D. Clinger is an associate professor i...,https://en.wikipedia.org/wiki/William_Clinger_...,"[Help:Category, Category:Living people, Catego..."
27914,"University of California, Berkeley",\n,https://en.wikipedia.org/wiki/University_of_Ca...,"[Help:Category, Category:University of Califor..."


https://en.wikipedia.org/wiki/C_(programming_language)


In [39]:
# Keep only the first row of every title; later visits of the same page are
# dropped and the surviving rows keep their original index labels.
df = df.drop_duplicates(subset="Title", keep="first")
display(df)
print(next_link)

Unnamed: 0,Title,Description,Link,Category
0,Web scraping,"Web scraping, web harvesting, or web data extr...",https://en.wikipedia.org/wiki/Web_scraping,
1,Wrapper (data mining),Wrapper in data mining is a procedure that ext...,https://en.wikipedia.org/wiki/Wrapper_(data_mi...,
2,Microformat,Microformats (μF)[note 1] are a set of defined...,https://en.wikipedia.org/wiki/Microformat,
3,Common Logic,Common Logic (CL) is a framework for a family ...,https://en.wikipedia.org/wiki/Common_Logic,
4,ISLISP,ISLISP (also capitalized as ISLisp) is a progr...,https://en.wikipedia.org/wiki/ISLISP,
...,...,...,...,...
27908,Category:Women's association football clubs by...,This category has the following 101 subcategor...,https://en.wikipedia.org/wiki/Category:Women%2...,
27909,Category:Women's football clubs in Japan,This category has the following 3 subcategorie...,https://en.wikipedia.org/wiki/Category:Women%2...,
27910,Category:1947 establishments in Slovenia,"The following 10 pages are in this category, o...",https://en.wikipedia.org/wiki/Category:1947_es...,
27912,NetLogo,NetLogo is a programming language and integrat...,https://en.wikipedia.org/wiki/NetLogo,"[Help:Category, Category:Logo programming lang..."


https://en.wikipedia.org/wiki/C_(programming_language)


In [40]:
# Write the current frame to data.csv; index=False leaves the row index out of the file.
df.to_csv('data.csv', index=False)

In [9]:
# Load data.csv into a separate frame and display it.
# NOTE(review): presumably a sanity check of the saved file — confirm.
read_df = pd.read_csv("data.csv", index_col=False)
display(read_df)

Unnamed: 0,Title,Description,Link
0,Web scraping,"Web scraping, web harvesting, or web data extr...",https://en.wikipedia.org/wiki/Web_scraping
1,Wrapper (data mining),Wrapper in data mining is a procedure that ext...,https://en.wikipedia.org/wiki/Wrapper_(data_mi...
2,Microformat,Microformats (μF)[note 1] are a set of defined...,https://en.wikipedia.org/wiki/Microformat
3,Common Logic,Common Logic (CL) is a framework for a family ...,https://en.wikipedia.org/wiki/Common_Logic
4,ISLISP,ISLISP (also capitalized as ISLisp) is a progr...,https://en.wikipedia.org/wiki/ISLISP
...,...,...,...
24904,American Samoa,American Samoa[c] is an unincorporated territo...,https://en.wikipedia.org/wiki/American_Samoa
24905,U.S. provisional government of New Mexico,Under the provisions of the Kearny Code as pro...,https://en.wikipedia.org/wiki/U.S._provisional...
24906,United States Military Government of the Ryuky...,The United States Military Government of the R...,https://en.wikipedia.org/wiki/United_States_Mi...
24907,Imperial Chinese missions to the Ryukyu Kingdom,Imperial Chinese missions to the Ryukyu Kingdo...,https://en.wikipedia.org/wiki/Imperial_Chinese...


# Scraping Test
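The following cell runs `Scraper` for 10 steps, starting from an empty DataFrame called `test`, and displays the collected rows together with the next link that would be scraped.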

In [4]:
# Smoke test: start from an empty frame and let the scraper take ten random
# steps beginning at the ISLISP article.
next_link = "https://en.wikipedia.org/wiki/ISLISP"
test = pd.DataFrame()
for step in tqdm(range(10)):
    test, next_link = Scraper(next_link, test)

display(test)
print(next_link)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.69it/s]


Unnamed: 0,Title,Description,Link,Category
0,ISLISP,ISLISP (also capitalized as ISLisp) is a progr...,https://en.wikipedia.org/wiki/ISLISP,"[Lisp programming language family, Lisp (progr..."
1,International Standard Music Number,The International Standard Music Number or ISM...,https://en.wikipedia.org/wiki/International_St...,"[ISO standards, Checksum algorithms, Identifie..."
2,International Bank Account Number,The International Bank Account Number (IBAN) i...,https://en.wikipedia.org/wiki/International_Ba...,"[Banking terms, Financial regulation, ISO stan..."
3,ISO 11170,ISO 11170:2003 is an international standard wh...,https://en.wikipedia.org/wiki/ISO_11170,"[ISO standards, Standards and measurement stubs]"
4,International Standard Name Identifier,The International Standard Name Identifier (IS...,https://en.wikipedia.org/wiki/International_St...,"[ISO standards, Unique identifiers, Library ca..."
5,FDI World Dental Federation notation,"FDI World Dental Federation notation (also ""FD...",https://en.wikipedia.org/wiki/FDI_World_Dental...,"[Dentistry terminology, Human mouth anatomy, I..."
6,Wayback Machine,The Wayback Machine is a digital archive of th...,https://en.wikipedia.org/wiki/Wayback_Machine,"[History of the Internet, Internet Archive pro..."
7,Internet Archive,The Internet Archive is an American nonprofit ...,https://en.wikipedia.org/wiki/RECAP_US_Federal...,"[Internet Archive, 1996 establishments in Cali..."
8,Boston Public Library,The Boston Public Library is a municipal publi...,https://en.wikipedia.org/wiki/Boston_Public_Li...,"[Libraries in Boston, Organizations establishe..."
9,Boston Common,The Boston Common is a public park in downtown...,https://en.wikipedia.org/wiki/Boston_Common,"[Boston Common, 1634 establishments in the Mas..."


https://en.wikipedia.org/wiki/National_Register_of_Historic_Places_listings_in_Methuen,_Massachusetts
