## Web scraping

In [86]:
# !pip install selenium
#!pip install webdriver_manager

## The following is code to run a python controlled chrome browser environment

### We will

### 1. Open the wikipedia page for someone
### 2. Gather all the links in the page
### 3. Maintain a set of links that we visited, we start visiting unvisited webpages
### 4. Extract text from each page
### 5. Filter clean the text

## Download chromedriver [here](https://googlechromelabs.github.io/chrome-for-testing)

Keep the chromedriver binary in the same directory as the jupyter notebook


In [3]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.28.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Using cached trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Using cached wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->selenium)
  Using cached PySocks-1.7.1-py3-none-any.whl.metadata (13 kB)
Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m53.8 MB/s[0m eta [36m0:0

In [4]:
import time
import itertools
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

## Add the path and user profile to chromedriver. You should add your own path. User profile is optional.
chrome_driver_path = "/home/yaning/Documents/LLM/E5/chromedriver-linux64/chromedriver"
chromium_path = "/home/yaning/Documents/LLM/E5/chrome-linux64/chrome"
# user_profile_path = "Add/your/user/profile/path"

chrome_options = Options()
chrome_options.binary_location = chromium_path
# chrome_options.add_argument(f"user-data-dir={user_profile_path}")
chrome_options.add_argument("--headless")  # Run Chrome in headless mode
chrome_options.add_argument("--no-sandbox")  # Disable sandboxing
# Set up the Chrome driver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

In [5]:
# Open a website
driver.get("https://www.sachsen-lese.de/streifzuege/mundartliches/")


In [89]:
# h2_element = driver.find_element(By.Class, 'listeText')
# h2_text = h2_element.text
# print(h2_text)

## We find all urls in the page

In [6]:
anchor_elements = driver.find_elements(By.TAG_NAME, "a")
# Extract the href attribute from each anchor element
urls = []

for anchor in anchor_elements:
    url = anchor.get_attribute("href")
    if url != '' and url[:54] == "https://www.sachsen-lese.de/streifzuege/mundartliches/":
        urls.append(url)
# urls = [anchor.get_attribute("href") for anchor in anchor_elements]
url_set = set(urls)        

In [91]:
test = "https://www.sachsen-lese.de/streifzuege/mundartliches/"
len(test)
test[:54]

'https://www.sachsen-lese.de/streifzuege/mundartliches/'

In [12]:
test = "https://www.sachsen-lese.de/streifzuege/mundartliches/a-schalle-kaffee/"
test[54:-1]

'a-schalle-kaffee'

In [None]:
url_set

TypeError: 'set' object is not subscriptable

In [5]:
url_set

{'https://www.sachsen-lese.de/streifzuege/mundartliches/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/#',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/#content',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/#mainnav',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/a-schalle-kaffee/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/ae-fillosohf/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/ae-philospphisches-gemiedhe/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/am-ostermorgen-erschoepfende-auskunft/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/an-ae-gachelofen/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/an-dn-mai/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/april/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/auf-saechsisch/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/aus-dr-schul/',
 'https://www.sachsen-lese.de/streifzuege/mundartliches/bee

In [8]:
from urllib.parse import urlparse, urlunparse

def remove_url_fragments(url_set):
  cleaned_urls = set()
  for url in url_set:
    parsed_url = urlparse(url)
    cleaned_url = urlunparse(parsed_url._replace(fragment=''))
    if cleaned_url is not None and cleaned_url not in cleaned_urls and cleaned_url != b'':
      cleaned_urls.add(cleaned_url)
  return cleaned_urls

cleaned_url_set = remove_url_fragments(url_set)
print(cleaned_url_set)

{'https://www.sachsen-lese.de/streifzuege/mundartliches/s-raachermannel/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/gebet-eines-arzgebirgersch/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/vogtlaender-madle/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/ewos-as-dr-schul/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/saechsische-elegie/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/raechenlied/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/ae-philospphisches-gemiedhe/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/dr-lumpenma/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/april/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/der-versteckte-stapilz/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/de-ficht-ofn-fels/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/saechsisches-allzusaechsisches/', 'https://www.sachsen-lese.de/streifzuege/mundartliches/wenns-in-winter-schne

### We only visit webpages which we didn't visit in the past and collect text information from the page

In [13]:
visited_urls = set()
page_text_dict = dict()
poem_names = []
# Loop through the URLs
# for url in itertools.islice(cleaned_url_set, 10): # Limit the number of URLs visited for demonstration purposes
for url in cleaned_url_set:
    if url is not None and url not in visited_urls:
        # Visit the URL
        print(url)
        driver.get(url)
        page_text = ""
        poem_names.append(url[54:-1])
        absatz_divs = driver.find_elements(By.CLASS_NAME, "absatz")

        for div in absatz_divs:
            page_text += div.get_attribute("textContent")
            # paragraphs = div.find_elements(By.TAG_NAME, "p")
            # for p in paragraphs:
            #     if p.is_displayed():
            #         page_text.append(p.text)
            #     else:
            #         print("This <p> is hidden.")

        # Store the page text in the dictionary
        page_text_dict[url] = page_text
        visited_urls.add(url)
        

https://www.sachsen-lese.de/streifzuege/mundartliches/s-raachermannel/
https://www.sachsen-lese.de/streifzuege/mundartliches/gebet-eines-arzgebirgersch/
https://www.sachsen-lese.de/streifzuege/mundartliches/vogtlaender-madle/
https://www.sachsen-lese.de/streifzuege/mundartliches/ewos-as-dr-schul/
https://www.sachsen-lese.de/streifzuege/mundartliches/saechsische-elegie/
https://www.sachsen-lese.de/streifzuege/mundartliches/raechenlied/
https://www.sachsen-lese.de/streifzuege/mundartliches/ae-philospphisches-gemiedhe/
https://www.sachsen-lese.de/streifzuege/mundartliches/dr-lumpenma/
https://www.sachsen-lese.de/streifzuege/mundartliches/april/
https://www.sachsen-lese.de/streifzuege/mundartliches/der-versteckte-stapilz/
https://www.sachsen-lese.de/streifzuege/mundartliches/de-ficht-ofn-fels/
https://www.sachsen-lese.de/streifzuege/mundartliches/saechsisches-allzusaechsisches/
https://www.sachsen-lese.de/streifzuege/mundartliches/wenns-in-winter-schneie-tut/
https://www.sachsen-lese.de/st

In [15]:
poem_names

['s-raachermannel',
 'gebet-eines-arzgebirgersch',
 'vogtlaender-madle',
 'ewos-as-dr-schul',
 'saechsische-elegie',
 'raechenlied',
 'ae-philospphisches-gemiedhe',
 'dr-lumpenma',
 'april',
 'der-versteckte-stapilz',
 'de-ficht-ofn-fels',
 'saechsisches-allzusaechsisches',
 'wenns-in-winter-schneie-tut',
 'mei-scheenes-saksenland',
 'wenn-tausend-voegle-singe',
 'im-familchenbad',
 'waldfrieden',
 'does-is-nu-su-mei-labn',
 'in-winter-bei-dan-groessten-schnee',
 'richard-strauss-und-die-dresdner-hofoper',
 'beim-aufbau-der-christecke',
 'bluehe-un-welken',
 'vum-kobelchen',
 'gruegeniffte',
 'beschauliche-bilanz',
 'de-glaeser-minel',
 'wie-dr-hustersch-lob-zu-sann-neie-haisel-kumme-is',
 'does-will-e-lama-sei',
 'da-haustuer-in-wasserbottich',
 'vun-dr-langeweil',
 'o-du-maigriener-wald',
 'das-heilig-obnd-lied',
 'neigoahrschwuensch-sprueche',
 'sei-weihnachtsfichtel',
 'an-ae-gachelofen',
 'laewerworscht',
 'zu-neijahr',
 'dialekte-in-sachsen/wie-s-dr-schnawel-een-gibt-ein-dialekte

In [16]:
import re
regex_poem_names = []
for name in poem_names:
    regex_poem_names.append(re.sub(r"-", " ", name))

In [17]:
regex_poem_names

['s raachermannel',
 'gebet eines arzgebirgersch',
 'vogtlaender madle',
 'ewos as dr schul',
 'saechsische elegie',
 'raechenlied',
 'ae philospphisches gemiedhe',
 'dr lumpenma',
 'april',
 'der versteckte stapilz',
 'de ficht ofn fels',
 'saechsisches allzusaechsisches',
 'wenns in winter schneie tut',
 'mei scheenes saksenland',
 'wenn tausend voegle singe',
 'im familchenbad',
 'waldfrieden',
 'does is nu su mei labn',
 'in winter bei dan groessten schnee',
 'richard strauss und die dresdner hofoper',
 'beim aufbau der christecke',
 'bluehe un welken',
 'vum kobelchen',
 'gruegeniffte',
 'beschauliche bilanz',
 'de glaeser minel',
 'wie dr hustersch lob zu sann neie haisel kumme is',
 'does will e lama sei',
 'da haustuer in wasserbottich',
 'vun dr langeweil',
 'o du maigriener wald',
 'das heilig obnd lied',
 'neigoahrschwuensch sprueche',
 'sei weihnachtsfichtel',
 'an ae gachelofen',
 'laewerworscht',
 'zu neijahr',
 'dialekte in sachsen/wie s dr schnawel een gibt ein dialekte

In [19]:
import numpy as np

In [20]:
names = np.array(regex_poem_names)

In [22]:
np.save("poem_names.npy", names)

In [96]:
print(page_text_dict["https://www.sachsen-lese.de/streifzuege/mundartliches/wu-heier-bluss-de-schwamme-stacken/"])


Alte Buche im Wald. (1)
Gestern war ich drauß‘n
Waldkaa
Schwammel war ze sah.Bie
überol rümhargewürgtun
hatt bluß kalte Ba.Ne
Wag zengst naus durchn Klötzerwald,dan
harrlichn Buchnwald na,de
Hirschlack nauf bis ofn Barg,´s
kam bal der Mittig ra.

Nu
dacht ich: Wart när, nu is Schluß,  
die
Laaferei härt auf.´s
Sackel stok noch leer in Rockun‘s
Masserle ubn drauf.De
Hemmbarger Wand ging‘s laut neizu.Vun
Schwamme gar kaa Red.´s
war nischt ze sah, un vunne Gahr,do
standen se wie geseet.

 "Stallhasen". (2)
Un wenn ich en getroffen hob,dan
ging‘s genau wie mir:´s
tat geder wie spaziern bluß gieh,´s
war geden wie schenieren.Ich
bie noch nei zen Poller-Mannun
hob for mich gelacht.A
Schieböcker un aa mol Bierhot
allus gutgemacht.

Un
hamzu hob ich Hosenfutterfor
de Hasle ogeruppt.Die
sei im Stalle kreiz un quarvor
Frad rümhargehuppt.Mr
muß abn Gott for allis dankenof
unnerer schinn Ard.Un
wenn‘s aa kaane Schwamme gob,der
Spaß war doch wos wart.

BildnachweisBild 1: Roland ReißmannKopfbild, B

In [97]:
print(page_text_dict.values())
print(page_text_dict.keys())

dict_values(['\nAlte Buche im Wald. (1)\nGestern war ich drauß‘n\nWaldkaa\nSchwammel war ze sah.Bie\nüberol rümhargewürgtun\nhatt bluß kalte Ba.Ne\nWag zengst naus durchn Klötzerwald,dan\nharrlichn Buchnwald na,de\nHirschlack nauf bis ofn Barg,´s\nkam bal der Mittig ra.\n\nNu\ndacht ich: Wart när, nu is Schluß,  \ndie\nLaaferei härt auf.´s\nSackel stok noch leer in Rockun‘s\nMasserle ubn drauf.De\nHemmbarger Wand ging‘s laut neizu.Vun\nSchwamme gar kaa Red.´s\nwar nischt ze sah, un vunne Gahr,do\nstanden se wie geseet.\n\n "Stallhasen". (2)\nUn wenn ich en getroffen hob,dan\nging‘s genau wie mir:´s\ntat geder wie spaziern bluß gieh,´s\nwar geden wie schenieren.Ich\nbie noch nei zen Poller-Mannun\nhob for mich gelacht.A\nSchieböcker un aa mol Bierhot\nallus gutgemacht.\n\nUn\nhamzu hob ich Hosenfutterfor\nde Hasle ogeruppt.Die\nsei im Stalle kreiz un quarvor\nFrad rümhargehuppt.Mr\nmuß abn Gott for allis dankenof\nunnerer schinn Ard.Un\nwenn‘s aa kaane Schwamme gob,der\nSpaß war doch wo

In [98]:
# Change the following line to print the page text for a different URL
collected_url = 'https://en.wikipedia.org/wiki/Alfred_Aho'
print(page_text_dict[collected_url])

KeyError: 'https://en.wikipedia.org/wiki/Alfred_Aho'

In [9]:
# Clean the extracted text
def clean_text(text):
    # Remove leading and trailing white spaces
    text = text.strip()
    # Remove extra white spaces
    text = " ".join(text.split())
    # remove special characters
    text = ''.join(e for e in text if e.isalnum() or e.isspace())
    # remove "[edit] from the text"
    text = text.replace("[edit]", "")
    # Remove Main menu, Search, Donate, Create account, Log in, Personal tools, Contents hide, (Top), History, More, Jump to content Top  Toggle
    text = text.replace("Jump to content", "")
    text = text.replace("Top", "")
    text = text.replace("Toggle", "")
    text = text.replace("subsection", "")
    text = text.replace("Main menu", "")
    text = text.replace("Search", "")
    text = text.replace("Donate", "")
    text = text.replace("Create account", "")
    text = text.replace("Log in", "")
    text = text.replace("Personal tools", "")
    text = text.replace("Contents hide", "")
    text = text.replace("(Top)", "")
    text = text.replace("History", "")
    
    return text

for url, text in page_text_dict.items():
    page_text_dict[url] = clean_text(text)

In [None]:
print(page_text_dict["https://en.wikipedia.org/wiki/Nicholas_Lydon"])

   Appearance      the table of contents Nicholas Lydon 3 languages Article Talk Read Edit View history Tools From Wikipedia the free encyclopedia Nicholas Lydon Born 27 February 1957 age 67 citation needed Alma mater University of Leeds BSc University of Dundee PhD Known for Gleevec AnaptysBio BluePrint Medicines Awards Lasker Clinical Award 2009 Japan Prize 2012 FRS 2013 Scientific career Institutions Amgen ScheringPlough CibaGeigy Thesis Studies on the hormonesensitive adenylate cyclase from bovine corpus luteum 1982 Website royalsocietyorgpeoplenicholaslydon Nicholas B Lydon FRS born 27 February 1957 is a British scientist and entrepreneur1 In 2009 he was awarded the Lasker Clinical Award and in 2012 the Japan Prize for the development of Gleevec also known as Imatinib a selective BCRABL inhibitor for the treatment of chronic myeloid leukaemia CML which converted a fatal cancer into a manageable chronic condition2345 67 Educationedit Lydon was educated at Strathallan School near Pe

### Now we store the extracted text

In [20]:
import csv
with open ('/home/yaning/Documents/LLM/E5/extracted.csv', 'w', newline='') as file:
  writer = csv.writer(file, delimiter=',')
  for url in page_text_dict.keys():
    # f.write(f"URL: {url}\n")
    # f.write(f"Text: {text}\n\n")
    writer.writerow(page_text_dict[url])

with open('/home/yaning/Documents/LLM/E5/extracted.csv', mode='w', newline='') as file:
    writer = csv.writer(file, delimiter=',')  # Use comma as delimiter
    
    # Write the values of the dictionary as a single row
    writer.writerow(page_text_dict.values())  # Write the values as a row


In [None]:
driver.quit()