# Chapter 6 - Data Sourcing via Web

## Segment 1 - Objects in BeautifulSoup

In [None]:


import sys
print(sys.version)
from bs4 import BeautifulSoup
### BeautifulSoup objects
# NOTE(review): the HTML literal that belongs here is missing from this source;
# only a `# parsing1` marker followed the `=`, which is a SyntaxError. The short
# document below is a stand-in so the cell runs -- swap in the original course
# HTML if it is available.
our_html_document = '''
<html><head><title>IoT Articles</title></head>
<body>
<p class="title"><b>Sample IoT article listing</b></p>
<p class="description">A short stand-in page used to demonstrate parsing.</p>
<ol>
<li><a href="http://example.com/iot-trends" class="preview" id="link 1">Future Trends for IoT</a></li>
<li><a href="http://example.com/iot-devices" class="preview" id="link 2">New IoT Device Ideas</a></li>
</ol>
</body>
</html>
'''
# Build the parse tree with Python's built-in HTML parser.
our_soup_object = BeautifulSoup(our_html_document, 'html.parser')
print(our_soup_object)
# prettify() returns the re-indented markup as a string; show only its first 300 characters.
print(our_soup_object.prettify()[0:300])
### Tag objects
#### Tag names
# The attribute value is closed by exactly one double quote, so the markup is
# well formed and the parsed attributes do not depend on parser error recovery.
soup_object = BeautifulSoup('<h1 attribute_1 = "Heading Level 1">Future Trends for IoT in 2018</h1>', "lxml")

# Attribute-style access returns the first <h1> element in the tree.
tag = soup_object.h1
type(tag)
print(tag)
tag.name
# Assigning to .name renames the element in place.
# NOTE(review): a name containing a space does not serialize to a valid element
# name; the value is kept exactly as in the original demonstration.
tag.name = 'heading 1'
tag
tag.name
#### Tag attributes
# The attribute value is closed by exactly one double quote, so the markup is
# well formed and `tag.attrs` does not depend on parser error recovery.
soup_object = BeautifulSoup('<h1 attribute_1 = "Heading Level 1">Future Trends for IoT in 2018</h1>', "lxml")
tag = soup_object.h1
tag
# A tag supports dictionary-style access to its attributes.
tag['attribute_1']
tag.attrs
# Assigning to a new key adds an attribute to the element.
tag['attribute_2'] = 'Heading Level 1*'
tag.attrs
tag
# `del` removes an attribute from the element.
del tag['attribute_2']
tag
del tag['attribute_1']
tag.attrs
#### Navigating a parse tree using tags
# First we will recreate our original parse tree.
# NOTE(review): the HTML literal that belongs here is missing from this source;
# only a `# parsing2` marker followed the `=`, which is a SyntaxError. The short
# document below is a stand-in that contains every element accessed below --
# swap in the original course HTML if it is available.
our_html_document = '''
<html><head><title>IoT Articles</title></head>
<body>
<p class="title"><b>Sample IoT article listing</b></p>
<p class="description">A short stand-in page used to demonstrate parsing.</p>
<ol>
<li><a href="http://example.com/iot-trends" class="preview" id="link 1">Future Trends for IoT</a></li>
<li><a href="http://example.com/iot-devices" class="preview" id="link 2">New IoT Device Ideas</a></li>
</ol>
</body>
</html>
'''
our_soup_object = BeautifulSoup(our_html_document, 'html.parser')
# Attribute-style access returns the FIRST matching element at or below the
# object it is called on; accesses can be chained to walk down the tree.
our_soup_object.head
our_soup_object.title
our_soup_object.body.b
our_soup_object.body
our_soup_object.li
our_soup_object.a


## Segment 2 - NavigableString Objects

In [None]:
# Chapter 6 - Data Sourcing via Web

import sys
print(sys.version)
from bs4 import BeautifulSoup
### NavigableString objects
# The attribute value is closed by exactly one double quote, so the markup is
# well formed and parsing does not depend on parser error recovery.
soup_object = BeautifulSoup('<h1 attribute_1 = "Heading Level 1">Future Trends in IoT in 2018</h1>', "lxml")

tag = soup_object.h1

type(tag)
tag.name
# .string is the text held inside the tag, exposed as a NavigableString.
tag.string
type(tag.string)
our_navigatable_string = tag.string
our_navigatable_string
# replace_with() swaps this text node for the given string inside the tree.
our_navigatable_string.replace_with('NaN')
tag.string
#### Utilizing NavigableString objects
# NOTE(review): the HTML literal that belongs here is missing from this source;
# only a `# parsing3` marker followed the `=`, which is a SyntaxError. The short
# document below is a stand-in so the cell runs -- swap in the original course
# HTML if it is available.
our_html_document = '''
<html><head><title>IoT Articles</title></head>
<body>
<p class="title"><b>Sample IoT article listing</b></p>
<p class="description">A short stand-in page used to demonstrate parsing.</p>
<ol>
<li><a href="http://example.com/iot-trends" class="preview" id="link 1">Future Trends for IoT</a></li>
<li><a href="http://example.com/iot-devices" class="preview" id="link 2">New IoT Device Ideas</a></li>
</ol>
</body>
</html>
'''

our_soup_object = BeautifulSoup(our_html_document, 'html.parser')
# stripped_strings yields each text node with surrounding whitespace removed;
# repr() makes the exact string boundaries visible.
for string in our_soup_object.stripped_strings:
    print(repr(string))
# First <a> element in the document.
first_link = our_soup_object.a
print(first_link)
# .parent moves one level up: from the link to its enclosing element, and from
# the link's text node back to the link itself.
first_link.parent
first_link.string
first_link.string.parent


## Segment 3 - Data parsing

In [None]:
# Chapter 6 -  Data Sourcing via Web

from bs4 import BeautifulSoup

import urllib
import urllib.request
import re
# Download the demo page; the `with` block closes the HTTP response once the
# body has been read into `html` (raw bytes).
with urllib.request.urlopen('https://raw.githubusercontent.com/BigDataGal/Data-Mania-Demos/master/IoT-2018.html') as response:
    html = response.read()
# Build the parse tree with the lxml parser.
soup = BeautifulSoup(html, "lxml")
type(soup)
### Parsing your data
# prettify() returns the re-indented markup as a string; only its first 100
# characters are printed.
print(soup.prettify()[0:100])
### Getting data from a parse tree
# get_text() returns the document's text with the markup removed.
text_only = soup.get_text()
print(text_only)
### Searching and retrieving data from a parse tree
# Every example below calls find_all() on the `soup` built above and varies
# only the kind of filter that is passed in.
#### Retrieving tags by filtering with name arguments
# A tag name selects every element with that name.
soup.find_all("li")
##### Retrieving tags by filtering with keyword arguments
# A keyword argument filters on the attribute of the same name.
soup.find_all(id="link 7")
##### Retrieving tags by filtering with string arguments
soup.find_all('ol')
##### Retrieving tags by filtering with list objects
# A list matches an element whose name equals any item in the list.
soup.find_all(['ol', 'b'])
##### Retrieving tags by filtering with regular expressions
# A compiled pattern is matched against tag names; "t" has no anchors, so it
# is satisfied by any tag name that contains the letter t.
t = re.compile("t")
for tag in soup.find_all(t):
    print(tag.name)
##### Retrieving tags by filtering with a Boolean value
# True matches every tag in the document.
for tag in soup.find_all(True):
    print(tag.name)
##### Retrieving weblinks by filtering with string objects
# get('href') returns the attribute value, or None when the anchor has no href.
for link in soup.find_all('a'):
    print(link.get('href'))
##### Retrieving strings by filtering with regular expressions
# The `string` argument filters text nodes rather than tags.
soup.find_all(string=re.compile("data"))


## Segment 4 - Web scraping

In [None]:
# Chapter 6 -  Data Sourcing via Web

from bs4 import BeautifulSoup
import urllib.request
from IPython.display import HTML
import re
r = urllib.request.urlopen('https://analytics.usa.gov/').read()
soup = BeautifulSoup(r, "lxml")
type(soup)
print(soup.prettify()[:100])
for link in soup.find_all('a'):
    print(link.get('href'))
print(soup.get_text())
print(soup.prettify()[0:1000])
for link in soup.findAll('a', attrs={'href': re.compile("^http")}):
    print(link)
type(link)
file = open("parsed_data.txt", "w")
for link in soup.findAll('a', attrs={'href': re.compile("^http")}):
    soup_link = str(link)
    print(soup_link)
    file.write(soup_link)
file.flush()
file.close()
%pwd


## Segment 5 - Introduction to NLP

In [None]:
# Chapter 6 -  Data Sourcing via Web
## Segment 5 - Introduction to NLP
import nltk
# Sample passage used by every NLP example in this segment.
text = "On Wednesday, the Association for Computing Machinery, the world’s largest society of computing professionals, announced that Hinton, LeCun and Bengio had won this year’s Turing Award for their work on neural networks. The Turing Award, which was introduced in 1966, is often called the Nobel Prize of computing, and it includes a $1 million prize, which the three scientists will share."
# Tokenizer models. Older NLTK releases load the `punkt` resource; newer
# releases look the same models up under `punkt_tab`, so both are requested.
# nltk.download() reports a failed or unknown package instead of raising.
nltk.download('punkt')
nltk.download('punkt_tab')
### Sentence Tokenizer
from nltk.tokenize import sent_tokenize
# Split the passage into a list of sentences.
sent_tk = sent_tokenize(text)
# One call emits the caption and the sentence list on separate lines.
print("Sentence tokenizing the text: \n", sent_tk, sep="\n")
### Word Tokenizer
from nltk.tokenize import word_tokenize
# Split the passage into a list of word and punctuation tokens.
word_tk = word_tokenize(text)
# One call emits the caption and the token list on separate lines.
print("Word tokenizing the text: \n", word_tk, sep="\n")
### Removing stop words
nltk.download('stopwords')
from nltk.corpus import stopwords

# A set gives constant-time membership tests in the filter below.
sw = set(stopwords.words("english"))
print("Stop words in English language are: \n")
print(sw)
# Compare the lowercased token: the stop-word list is lowercase, so a
# case-sensitive test would let sentence-initial words such as "On" and "The"
# slip through. The token itself is kept with its original casing.
filtered_words = [w for w in word_tk if w.lower() not in sw]

print("The text after removing stop words \n")
print(filtered_words)
### Stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

port_stem = PorterStemmer()

# Reduce each remaining token to its Porter stem, preserving token order.
stemmed_words = [port_stem.stem(token) for token in filtered_words]

print("Filtered Sentence: \n", filtered_words, "\n")
print("Stemmed Sentence: \n", stemmed_words)
### Lemmatizing
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()

# Import kept from the original cell; the lemmatizer below does not use it.
from nltk.stem.porter import PorterStemmer

# lemmatize() is called without a part-of-speech argument, so every token is
# looked up with the lemmatizer's default part of speech.
lemm_words = [lem.lemmatize(word) for word in filtered_words]

print(lemm_words)
### Parts of Speech Tagging

# Tagger model. Older NLTK releases load `averaged_perceptron_tagger`; newer
# releases look the English model up under `averaged_perceptron_tagger_eng`,
# so both are requested. nltk.download() reports a failed or unknown package
# instead of raising.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk import pos_tag
# Tag the unfiltered tokens so each word keeps its sentence context; the
# result is a list of (token, tag) pairs.
pos_tagged_words = pos_tag(word_tk)

print(pos_tagged_words)
### Frequency Distribution Plots

from nltk.probability import FreqDist
# Count how often each token occurs in the word-tokenized passage.
fd = FreqDist(word_tk)
print(fd)
import matplotlib.pyplot as plt
# Plot the 30 most frequent tokens as plain (non-cumulative) counts.
fd.plot(30, cumulative=False)
plt.show()
# `text` is a plain string, so iterating it yields single characters: this
# second distribution counts characters (spaces and punctuation included),
# not words.
fd_alpha = FreqDist(text)
print(fd_alpha)
fd_alpha.plot(30, cumulative=False)




