# Scraping Websites for Text Content

In [None]:
from bs4 import BeautifulSoup # parses the downloaded HTML into a searchable tree
import requests # the library that provides the HTTP calls etc.


url = "https://research.google/research/pubs/" # The url we want to scrape
# Always pass a timeout: without one, requests can wait indefinitely on a stalled server.
reqs = requests.get(url, timeout=30) # open the page and get the html
soup = BeautifulSoup(reqs.text, "html.parser") # parse html
# Collect the href of every anchor tag; anchors without an href yield None and are skipped.
hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
links = [href for href in hrefs if href is not None]


links

In [None]:
# We inspect the HTML source code of the website to identify where to find what we want to crawl

from urllib.parse import urljoin


url = "https://research.google/pubs/?&category=2024" # The url we want to scrape
# Always pass a timeout: without one, requests can wait indefinitely on a stalled server.
reqs = requests.get(url, timeout=30) # open the page and get the html
soup = BeautifulSoup(reqs.text, "html.parser") # parse html
# Only the anchors carrying exactly this class string are kept.
heading_anchors = soup.find_all("a", class_="row-card__heading headline-6 glue-link")
hrefs = (anchor.get("href") for anchor in heading_anchors)
# Resolve every href against the page url: urllib.request.urlopen() in the next
# step rejects relative paths such as "/pubs/..." ("unknown url type").
# Hrefs that are already absolute pass through urljoin unchanged.
links = [urljoin(url, href) for href in hrefs if href is not None] # filter the empty links


links

#### Get the titles and abstracts

In [None]:
from bs4 import BeautifulSoup
import urllib.request

# Text of every <h1> (title) and every <p> (abstract) found on the linked pages.
# NOTE(review): both are flat lists across all pages, so they only line up
# one-to-one if each page has exactly one <h1> and one <p> - confirm against the site.
title = []
abstract = []


for link in links:
    try:
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(link, timeout=30) as response:
            raw = response.read()
    except Exception as e:
        # Best effort: report the page that could not be fetched and carry on.
        print(e)
        continue

    # Each page is parsed exactly once; naming the parser keeps results
    # independent of which optional parsers happen to be installed.
    soup = BeautifulSoup(raw, "html.parser")
    # get_text() also works for tags with nested markup, where .string is None
    # and .string.strip() would raise AttributeError.
    title.extend(t.get_text().strip() for t in soup.find_all("h1"))
    abstract.extend(p.get_text().strip() for p in soup.find_all("p"))

title

In [None]:
# Pair the i-th collected title with the i-th collected abstract; pairing stops
# at the end of the shorter list.
papers = [(paper_title, paper_abstract) for paper_title, paper_abstract in zip(title, abstract)]
papers

#### Analyze with nltk

In [None]:
# Flatten all abstracts into one list of lower-cased, whitespace-separated words.
words = []
for abstract_text in abstract:
    words.extend(token.lower() for token in abstract_text.split())

# Peek at the first ten words.
words[:10]

In [None]:
import nltk # Natural Language Tool Kit

# Tokenizer models used by nltk.word_tokenize(). Recent NLTK releases load the
# 'punkt_tab' package instead of the older pickled 'punkt' one, so fetch both
# to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# We use nltk to filter out certain words we don't want in our analysis

# The stop word lists are a separate NLTK data package; without this download
# nltk.corpus.stopwords raises LookupError on a fresh installation.
nltk.download('stopwords')

stop = nltk.corpus.stopwords.words('english')
# Membership tests against a set are O(1); `stop` itself stays a list.
stop_set = set(stop)
words_without_stopwords = [word for word in words if word not in stop_set]
print(words_without_stopwords[:10])

In [None]:
from nltk.collocations import *

# Build one token stream from all paper titles.
raw = " ".join(title)
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)

# Rank adjacent word pairs in the titles with the PMI association measure.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text)

# Show the ten best-scoring bigrams.
top_n = 10
finder.nbest(bigram_measures.pmi, top_n)

In [None]:
from prettytable import PrettyTable
from collections import Counter 

porter = nltk.PorterStemmer()
# Test the lower-cased token against the stop word list, as the lemma cell does,
# so that capitalised stop words such as "The" are removed as well.
text_stemmed = [porter.stem(t).lower() for t in tokens if t.lower() not in stop]

# Table of the 20 most frequent stems.
pt = PrettyTable(field_names=['Stem', 'Frequency'])
c = Counter(text_stemmed)
for row in c.most_common(20):
    pt.add_row(row)
# The alignment keys must be the table's own field names; any other key has no
# effect on the columns.
pt.align['Stem'], pt.align['Frequency'] = 'l', 'r' # Set column alignment
print(pt)

In [None]:
import nltk
# Fetch the WordNet data package; presumably required by the
# nltk.WordNetLemmatizer used in the lemma cell below.
nltk.download('wordnet')

In [None]:
from collections import Counter

from prettytable import PrettyTable

wnl = nltk.WordNetLemmatizer()
tokens_lower = [w.lower() for w in tokens]
text_lemmata = [wnl.lemmatize(t) for t in tokens_lower if t not in stop]

# Drop tokens that are just one of these punctuation marks. An exact-match set
# is used because a substring test against ',-:' would also match '' and ',-'.
# Building a list (not a filter iterator) keeps text_lemmata reusable after
# Counter has consumed it.
punctuation = {',', '-', ':'}
text_lemmata = [lemma for lemma in text_lemmata if lemma not in punctuation]

# Table of the 20 most frequent lemmata.
pt = PrettyTable(field_names=['Lemma', 'Frequency'])
c = Counter(text_lemmata)
for row in c.most_common(20):
    pt.add_row(row)
# The alignment keys must be the table's own field names.
pt.align['Lemma'], pt.align['Frequency'] = 'l', 'r' # Set column alignment
print(pt)

In [None]:
# Show the occurrences of "learning" in the title text, each with its surrounding context.
text.concordance('learning')

In [None]:
# Text.similar() prints its matches itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
# NOTE(review): the empty string looks like a placeholder - replace it with the
# word whose contextually similar words you want to list.
text.similar("")