In [None]:
!pip install soupsieve
!pip install urllib3
!pip install chardet
!pip install idna
!pip install requests
!pip install beautifulsoup4
!pip install certifi

In [None]:
import json
import string
from collections import Counter
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from bs4.element import Comment

#### The imports above bring in every library this notebook needs.



def tag_visibility(element):
    """Decide whether a text node belongs to the text a reader sees.

    element: a text node from a parsed page; only ``element.parent.name``
    and the node's own type are inspected.

    Returns False when the node's direct parent is one of the non-rendered
    tags (head, title, meta, script, style, or the document root) or when the
    node is an HTML comment. Returns True otherwise.
    """
    hidden_parents = ('head', 'title', 'meta', 'script', 'style', '[document]')
    if element.parent.name in hidden_parents:
        return False
    # Comments are text nodes as well, but they are never rendered.
    return not isinstance(element, Comment)

def visibility(page):
    """Return the reader-visible text of a fetched page as a single string.

    page: the result of a requests.get() call; only its ``.text`` attribute
    (the HTML source) is read.

    Every text node accepted by tag_visibility() is stripped of surrounding
    whitespace, and the pieces are joined with single spaces. Whitespace-only
    nodes contribute empty strings, so the result may contain runs of spaces.
    """
    soup = BeautifulSoup(page.text, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); both yield every text node in the document.
    text_nodes = soup.find_all(string=True)
    return ' '.join(node.strip() for node in text_nodes if tag_visibility(node))


def _count_words(text):
    """Count case-insensitive word occurrences in a plain-text string."""
    # Non-ASCII characters are dropped and ASCII punctuation is deleted, so
    # "don't" is counted as DONT and typographic quotes simply disappear.
    ascii_text = text.encode('ascii', 'ignore').decode()
    cleaned = ascii_text.translate(str.maketrans('', '', string.punctuation))
    # split() with no argument breaks on any run of whitespace. Splitting on a
    # literal ' ' left words separated by newlines or tabs glued together.
    return dict(Counter(word.upper() for word in cleaned.split()))


def calculate_freq(url=None):
    """Download a page and count how often each visible word occurs.

    url: string, the URL of the page to parse for (visible) word frequency.
        When omitted, the module-level privacy_policy_url is used.

    Returns a dictionary whose keys are upper-cased words and whose values
    are the number of times each word occurs in the page's visible text.

    Raises requests.exceptions.RequestException if the page cannot be
    fetched within 30 seconds or the server answers with an error status.
    """
    if url is None:
        url = privacy_policy_url
    # Without a timeout requests waits indefinitely on an unresponsive server.
    page = requests.get(url, timeout=30)
    # Fail loudly rather than counting the words of an error page.
    page.raise_for_status()
    return _count_words(visibility(page))

cfcurl = 'https://www.cfcunderwriting.com'     # Base URL of the CFC site to scrape, as given in the specification.
privacy_policy_url = "" # Placeholder string; it is reassigned once the privacy policy link has been located.

def external_check(url, base_url=None):
    """Tell whether a link target points at another host.

    url: the raw value of an href/src attribute; may be empty or None.
    base_url: optional URL of the site being scraped. When given, absolute
        links whose host equals its host are reported as not external.

    Returns True only for URLs that name a host: absolute URLs such as
    "https://cdn.example.com/a.js" and protocol-relative ones such as
    "//cdn.example.com/a.js". Empty values, root-relative paths ("/a.css"),
    path-relative references ("img/a.png"), fragments ("#top"), host-less
    schemes ("data:", "mailto:") and values that cannot be parsed as a URL
    are all reported as not external.

    Raises ValueError if base_url itself cannot be parsed.
    """
    if not url:
        return False

    site_host = urlparse(base_url).hostname if base_url else None

    try:
        host = urlparse(url.strip()).hostname
    except ValueError:
        # Malformed value (e.g. unbalanced IPv6 brackets): nothing to fetch.
        return False

    if not host:
        # No host component means the reference resolves against the page.
        return False
    return host != site_host

def find_attributes(tag, attribute, url=None):
    """Collect one attribute's value from every matching tag on a page.

    tag: name of the HTML tag to look for, e.g. 'img' or 'script'.
    attribute: name of the attribute to read from each tag, e.g. 'src'.
    url: page to download; defaults to the module-level cfcurl.

    Returns a list with the attribute value of every such tag that carries
    the attribute, in document order. Tags without it are skipped.

    Raises requests.exceptions.RequestException if the page cannot be
    fetched within 30 seconds or the server answers with an error status.
    """
    if url is None:
        url = cfcurl
    # Without a timeout requests waits indefinitely on an unresponsive server.
    page = requests.get(url, timeout=30)
    # Fail loudly rather than scraping an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    return [
        item[attribute]
        for item in soup.find_all(tag)
        if item.has_attr(attribute)
    ]

def get_external_resources():
    """List the externally hosted resources referenced by the target page.

    Gathers the 'src' of every <img>, the 'href' of every <link> and the
    'src' of every <script> through find_attributes(), in that order, and
    keeps only the locations that external_check() accepts.

    Returns a list of those locations.
    """
    # Tag/attribute pairs chosen from the specification.
    wanted = (('img', 'src'), ('link', 'href'), ('script', 'src'))

    collected = []
    for tag_name, attribute_name in wanted:
        collected.extend(find_attributes(tag_name, attribute_name))

    # Drop every location that is not an external one.
    return [location for location in collected if external_check(location)]

def enumeration(url=None):
    """List every hyperlink on a page together with its text.

    url: page to download; defaults to the module-level cfcurl.

    Returns a list of (href, text) tuples, one per <a> tag that has an href
    attribute, in document order. ``text`` is the first text node found
    inside the tag, or None when the tag contains no text.

    Raises requests.exceptions.RequestException if the page cannot be
    fetched within 30 seconds or the server answers with an error status.
    """
    if url is None:
        url = cfcurl
    # Without a timeout requests waits indefinitely on an unresponsive server.
    page = requests.get(url, timeout=30)
    # Fail loudly rather than scraping an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # find_all / string= are the current names of the deprecated
    # findAll / text= spellings.
    return [
        (link['href'], link.find(string=True))
        for link in soup.find_all('a')
        if link.has_attr('href')
    ]


def search_link_text(hyperlinks, targeted_text):
    """Find the destination of the first link whose text matches a phrase.

    hyperlinks: iterable of (hyperlink, link_text) pairs; link_text may be
        None or any other non-string, in which case the pair is skipped.
    targeted_text: the phrase to look for, e.g. "privacy policy".

    The comparison ignores case, leading/trailing whitespace and the amount
    of whitespace between words, because anchor text taken from HTML often
    carries newlines and indentation.

    Returns the hyperlink of the first match, or None when nothing matches
    or targeted_text is not a string.
    """
    def normalise(text):
        # Collapse every whitespace run to one space, then fold case.
        return ' '.join(text.split()).casefold()

    if not isinstance(targeted_text, str):
        return None
    wanted = normalise(targeted_text)

    for hyperlink, link_text in hyperlinks:
        if isinstance(link_text, str) and normalise(link_text) == wanted:
            return hyperlink
    return None


def save_json_file(file_name, data):
    """Write data to '<file_name>.json' and announce it on stdout.

    file_name: path of the output file without the '.json' extension.
    data: any JSON-serialisable object.

    The file is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters as they are instead of escaping them.
    """
    target_path = file_name + ".json"
    with open(target_path, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=4)
        print("The data has been exported to the JSON file named:", file_name)
# 1/2: Get the external resources for the target page and save the results to a JSON file.
save_json_file("external_resources", get_external_resources())

# 3/4: List all links on the page, locate the privacy policy link, then count the words on that page.
privacy_policy_href = search_link_text(enumeration(), "privacy policy")
if privacy_policy_href is None:
    # search_link_text() returns None when no link matches. Stop here with a
    # clear message rather than failing on str + None.
    raise RuntimeError('No link with the text "privacy policy" was found on ' + cfcurl)

# urljoin resolves site-relative ("/privacy"), path-relative ("privacy") and
# absolute hrefs alike; plain concatenation only worked for the first kind.
privacy_policy_url = urljoin(cfcurl, privacy_policy_href)

# Count how often each word appears on the privacy policy page and save the result to a JSON file.
save_json_file("word_frequency", calculate_freq())