# Chapter 6 - Data Sourcing via Web


## QA

1. What is the difference between stemming and lemmatization?

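Stemming reduces a word to its stem by stripping affixes with heuristic rules, so the result need not be a real word (e.g. "studies" → "studi"), whereas lemmatization reduces a word to its base (dictionary) form, the lemma, using vocabulary and morphological analysis (e.g. "studies" → "study").
Stemming is the process of reducing the inflectional forms of each word to a common root, and lemmatization is the process of reducing the inflectional forms of each word to its base word.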

## Import

In [None]:
import sys
from bs4 import BeautifulSoup as bs4
import pandas as pd

import urllib
import urllib.request
import re
from IPython.display import HTML

## Segment 1 - Objects in BeautifulSoup

In [None]:
# Show which Python interpreter version this notebook is running on.
print(sys.version)

### BeautifulSoup objects 

In [None]:
# Load the first sample HTML document from disk. The encoding is passed
# explicitly so the read does not depend on the platform's default encoding.
# NOTE(review): assumes the data file is UTF-8 — confirm against the file.
with open('data/parsing1.txt', 'r', encoding='utf-8') as file:
    our_html_document = file.read()

# Parse the markup with Python's built-in HTML parser.
soup = bs4(our_html_document, 'html.parser')

In [None]:
# Preview only the first 50 characters of the pretty-printed markup.
print(soup.prettify()[0:50])

### Tag objects



#### Tag names

In [None]:
# Attribute-style access returns the first <h1> element of the document.
tag = soup.h1
print(tag)
# Last expression of the cell, so the notebook echoes the object's type.
type(tag)

In [None]:
# Show the whole element, then just its tag name.
print(tag)
print(tag.name)


In [None]:
# Rename the tag in place; the markup printed below reflects the new name.
# NOTE(review): 'heading 1' contains a space, so the result is not a valid
# HTML tag name — fine as a demo, but confirm that is intended.
tag.name = 'heading 1'
print(tag)
print(tag.name)

#### Tag attributes

In [None]:
# Build a one-element document to demonstrate attribute handling.
# The attribute value is closed by exactly one '"'; a doubled closing quote
# would be picked up by the lenient parser as a spurious extra attribute.
soup_atr = bs4('<h1 attribute_1 = "Heading Level 1">Future Trends for IoT in 2018</h1>', 'html.parser')
tag = soup_atr.h1
# Last expression of the cell, so the notebook echoes the tag.
tag

In [None]:
# Read one attribute by key. It is printed explicitly because a bare
# expression in the middle of a cell produces no output.
print(tag['attribute_1'])
print(tag.attrs)
# Attributes are added by item assignment ...
tag['attribute_2'] = 'Heading Level 1*'
print(tag.attrs)
print(tag)
# ... and removed with del.
del tag['attribute_2']
print(tag)
del tag['attribute_1']
print(tag.attrs)


#### Navigating a parse tree using tags

In [None]:

# Load the second sample document. The encoding is passed explicitly so the
# read does not depend on the platform's default encoding.
# NOTE(review): assumes the data file is UTF-8 — confirm against the file.
with open('data/parsing2.txt', 'r', encoding='utf-8') as file:
    our_html_document = file.read()

sp = bs4(our_html_document, 'html.parser')

# Attribute-style navigation returns the first matching element at each step.
print(sp.head,'\n')
print(sp.title,'\n')
print(sp.body.b,'\n')
#print(sp.body,'\n')
print(sp.li,'\n')
print(sp.a,'\n')

## Segment 2 - NavigableString Objects

### NavigableString objects

In [None]:
# Build a one-element document to inspect the tag and its text node.
# The attribute value is closed by exactly one '"'; a doubled closing quote
# would be picked up by the lenient parser as a spurious extra attribute.
soup_object = bs4('<h1 attribute_1 = "Heading Level 1">Future Trends in IoT in 2018</h1>', "html.parser")
tag = soup_object.h1
print(type(tag))
print(tag.name)
# .string is the tag's single text child; its type is shown on the next line.
print(tag.string)
print(type(tag.string))

In [None]:
# Keep a reference to the tag's text node and show it.
our_navigatable_string = tag.string
print(our_navigatable_string)

# Swap the text node for the literal string 'NaN', then show the tag's new text.
our_navigatable_string.replace_with('NaN')
print(tag.string)

#### Utilizing NavigableString objects

In [None]:
# Load the third sample document. The encoding is passed explicitly so the
# read does not depend on the platform's default encoding.
# NOTE(review): assumes the data file is UTF-8 — confirm against the file.
with open('data/parsing3.txt', 'r', encoding='utf-8') as file:
    our_html_document = file.read()

sp = bs4(our_html_document, 'html.parser')

In [None]:
# Walk every whitespace-stripped text fragment of the document.
for string in sp.stripped_strings:
    # Uncomment to inspect each fragment:
    #print(repr(string))
    pass  # explicit no-op instead of a throwaway assignment

In [None]:
# Take the first <a> element and walk up and down from it.
first_link = sp.a
print(first_link)
# The element that directly contains the link.
print(first_link.parent)
# The link's text node ...
print(first_link.string)
# ... and that text node's parent.
print(first_link.string.parent)

## Segment 3 - Data parsing

In [None]:
# Load the fourth sample document. The encoding is passed explicitly so the
# read does not depend on the platform's default encoding.
# NOTE(review): assumes the data file is UTF-8 — confirm against the file.
with open('data/parsing4.txt', 'r', encoding='utf-8') as file:
    our_html_document = file.read()

# Rebinds `soup`, replacing the document parsed in Segment 1.
soup = bs4(our_html_document, 'html.parser')

In [None]:
# Echo the type of the parsed document object.
type(soup)

In [None]:
# Preview only the first 50 characters of the pretty-printed markup.
print(soup.prettify()[0:50])

In [None]:
# Extract the document's text content without the markup.
text_only = soup.get_text()
#print(text_only)

In [None]:
# Search by tag name. Printed explicitly: a notebook echoes only the last
# expression of a cell, so this result would otherwise be discarded.
print(soup.find_all("li"))
# Search by attribute value; echoed as the cell's last expression.
soup.find_all(id="link 7")

In [None]:
#soup.find_all('ol')
#soup.find_all(['ol', 'b'])

In [None]:
# A compiled regex passed to find_all is searched against tag names, so this
# lists every tag whose name contains the letter "t".
t = re.compile("t")
for tag in soup.find_all(t):
    print(tag.name)

In [None]:
# True matches every tag, so this prints the name of each tag in the document.
for tag in soup.find_all(True):
    print(tag.name)

In [None]:
# Print the href of every anchor; .get() yields None when the attribute is absent.
for link in soup.find_all('a'):
    print(link.get('href'))

In [None]:
# Search text nodes (not tags) for those containing "data".
soup.find_all(string=re.compile("data"))

## Segment 4 - Web scraping

In [None]:
import requests
from bs4 import BeautifulSoup as bs4

url = 'https://analytics.usa.gov'
# Browser-like request headers sent with every page fetch.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    # Advertise only encodings the HTTP stack can always decode. Brotli ("br")
    # is decoded only when an optional Brotli package is installed; offering
    # it without that package can leave the response body compressed.
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

def connect(URL, timeout=30):
    """Fetch a web page and return it as a parsed BeautifulSoup document.

    Args:
        URL: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on an unresponsive host.

    Returns:
        A BeautifulSoup object built from the prettified markup of the page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # `headers` is the module-level dict of request headers.
    page = requests.get(URL, headers=headers, timeout=timeout)
    # Fail loudly instead of silently parsing an error page.
    page.raise_for_status()
    soup1 = bs4(page.content, "html.parser")
    # Re-parse the prettified markup, as before.
    # NOTE(review): prettify() inserts whitespace, so text nodes in the
    # returned tree carry extra newlines/indentation compared with the raw page.
    return bs4(soup1.prettify(), 'html.parser')

# Download and parse the live page; rebinds `soup` to the fetched document.
soup = connect(url)

In [None]:
# Print the href of every anchor on the fetched page.
for link in soup.find_all('a'):
    print(link.get('href'))

In [None]:
#print(soup.get_text())
#print(soup.prettify()[0:1000])
# Keep only absolute links, i.e. anchors whose href starts with "http".
# find_all is the current method name; findAll is a deprecated alias.
for link in soup.find_all('a', attrs={'href': re.compile("^http")}):
    print(link)
# `link` still holds the last anchor of the loop (needs at least one match).
type(link)

In [None]:
# Save every absolute link to disk. The with-block closes (and flushes) the
# file even if a write fails; UTF-8 keeps non-ASCII link text writable
# regardless of the platform's default encoding.
with open("data/save_files/parsed_data.txt", "w", encoding="utf-8") as file:
    for link in soup.find_all('a', attrs={'href': re.compile("^http")}):
        soup_link = str(link)
        print(soup_link)
        # One link per entry; without a separator consecutive tags run together.
        file.write(soup_link + "\n")
%pwd

In [None]:
import certifi
import ssl


def _certifi_https_context():
    """Return a certificate-verifying TLS context backed by certifi's CA bundle."""
    return ssl.create_default_context(cafile=certifi.where())


# Point the default HTTPS context at certifi's CA bundle instead of switching
# certificate verification off: an unverified context would accept any
# certificate and expose every urllib HTTPS request to interception.
ssl._create_default_https_context = _certifi_https_context


## Segment 5 - Introduction to NLP

In [None]:
import nltk

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from nltk.probability import FreqDist

import matplotlib.pyplot as plt

import ssl

import certifi


def _certifi_https_context():
    """Return a certificate-verifying TLS context backed by certifi's CA bundle."""
    return ssl.create_default_context(cafile=certifi.where())


# Keep certificate verification on for HTTPS downloads by pointing the default
# HTTPS context at certifi's CA bundle rather than an unverified context.
ssl._create_default_https_context = _certifi_https_context

In [None]:
# nltk.download()

### Intro NLP

In [None]:
# Sample two-sentence passage used as input by the cells that follow.
text = "On Wednesday, the Association for Computing Machinery, the world's largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year's Turing Award for their work on neural networks. The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share."

In [None]:
# Split the passage into word tokens.
# NOTE(review): `tokens` is not referenced again in the visible cells; the
# later cells use `word_tk` instead — confirm whether this is still needed.
tokens = nltk.word_tokenize(text)

In [None]:
# Split the passage into sentences and show the resulting list.
sent_tk = sent_tokenize(text)
print("Sentence tokenizing the text: \n")
print(sent_tk)

#### Word Tokenizer

In [None]:
# Split the passage into word tokens; `word_tk` feeds the cleaning,
# tagging and frequency cells below.
word_tk = word_tokenize(text)
print("Word tokenizing the text: \n")
print(word_tk)

### Cleaning and stemming textual data

#### Removing stop words

In [None]:
#nltk.download('stopwords')

In [None]:
# English stop words as a set, which gives fast membership tests when filtering.
sw = set(stopwords.words("english"))
print("Stop words in English language are: \n")
print(sw)

In [None]:
# Russian stop words, shown for comparison with the English list.
sw_ru= set(stopwords.words("russian"))
print("Stop words in Russian language are: \n")
print(sw_ru)

In [None]:
# Drop stop words from the token list. The stop-word list is lowercase, so the
# comparison is done case-insensitively; otherwise capitalised stop words such
# as "On" or "The" slip through the filter.
filtered_words = [w for w in word_tk if w.lower() not in sw]
print("The text after removing stop words \n")
print(filtered_words)

#### Stemming

In [None]:
port_stem = PorterStemmer()
# Stem every token that survived stop-word removal.
stemmed_words = [port_stem.stem(w) for w in filtered_words]

print("Filtered Sentence: \n", filtered_words, "\n")
print("Stemmed Sentence: \n", stemmed_words)

### Lemmatizing and Analysing textual data

In [None]:
# nltk.download('wordnet')

lem = WordNetLemmatizer()
# Lemmatize every filtered token. No part of speech is passed, so the
# lemmatizer's default (noun) is used for every word.
lemm_words = [lem.lemmatize(w) for w in filtered_words]

# Show the input and the lemmatized output for comparison.
print(filtered_words,'\n')
print(lemm_words,'\n')


#### Parts of Speech Tagging

In [None]:
# nltk.download('averaged_perceptron_tagger')

# Tag the unfiltered word tokens with their parts of speech and show the result.
pos_tagged_words = pos_tag(word_tk)
print(pos_tagged_words)

#### Frequency Distribution Plots

In [None]:
from pylab import rcParams
import seaborn as sb

%matplotlib inline
# Default figure size (width, height) for the plots below.
rcParams['figure.figsize'] = 6,5
# Apply seaborn's "whitegrid" plot style.
sb.set(style="whitegrid")

In [None]:
# Count how often each word token occurs in the passage.
fd = FreqDist(word_tk)
print(fd)

In [None]:
# Plot the 30 most frequent tokens as plain (non-cumulative) counts.
fd.plot(30, cumulative=False)
plt.show()

In [None]:
# NOTE(review): FreqDist is given the raw string here, so it counts individual
# characters (spaces and punctuation included) rather than word tokens —
# confirm that a character-level distribution is what is intended.
fd_alpha = FreqDist(text)
print(fd_alpha)
# Plot the 30 most frequent characters as plain (non-cumulative) counts.
fd_alpha.plot(30, cumulative=False)