## Web Scraping.

In [19]:
# Connection libraries.
import urllib.request, urllib.parse, urllib.error
import requests
from bs4 import BeautifulSoup
#from langdetect import detect
import re
import hashlib 
import time
import langdetect

import spacy
import es_core_news_sm
import en_core_web_sm

In [44]:
# The witness class.
class TheWitness:
    """Scraping agent bound to a single destination URL.

    An instance downloads its destination page and extracts links, text,
    language, named entities, phone numbers, image URLs, payment accounts
    and e-mail addresses from it (see ``scrap``).
    """

    # Static variables.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }
    social_media_names_file = "./datasets/socialmedia_names.txt"

    # Seconds to wait for an HTTP response, so that one unresponsive site
    # cannot block the agent forever.
    request_timeout = 30

    # spaCy model to use for each supported language code.
    _spacy_models = {"en": "en_core_web_sm", "es": "es_core_news_sm"}
    # Pipelines already loaded, shared by every agent so that each model is
    # loaded from disk only once.
    _nlp_cache = {}

    # Entity labels that mark a person: Spanish pipelines use "PER",
    # English pipelines use "PERSON".
    _PERSON_LABELS = ("PER", "PERSON")

    _PHONE_PATTERNS = (
        re.compile(r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"),
        re.compile(r"(\(\d{1}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{2}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4})"),
        re.compile(r"(\([+]\d{1}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\([+]\d{2}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\([+]\d{3}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4})"),
    )
    _EMAIL_PATTERN = re.compile(
        r"[a-zA-Z0-9]+[a-zA-Z0-9.%\-\+]*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,4}"
    )
    # Word boundaries instead of ^...$ so that addresses embedded in a longer
    # text are found, not only a text that consists of a single address.
    _BTC_PATTERN = re.compile(
        r"\b([13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[ac-hj-np-zAC-HJ-NP-Z02-9]{11,71})\b"
    )

    _LOCALIDADES = (
        "Usaquén", "Chapinero", "Santa Fe", "San Cristóbal", "Usme",
        "Tunjuelito", "Bosa", "Kennedy", "Fontibón", "Engativá", "Suba",
        "Barrios Unidos", "Teusaquillo", "Los Mártires", "Antonio Nariño",
        "Puente Aranda", "La Candelaria", "Rafael Uribe Uribe",
        "Ciudad Bolívar", "Sumapaz",
    )
    _CARDINALES = ("Norte", "Sur", "Este", "Oeste", "Occidente", "Oriente")
    _DIRECCION = ("Calle", "Avenida", "Carrera", "Diagonal")
    _MONTHS = (
        "Enero", "Ene", "January", "Jan", "Febrero", "February", "Feb",
        "Marzo", "March", "Mar", "Abril", "April", "Mayo", "May", "Junio",
        "June", "Jun", "Julio", "July", "Jul", "Agosto", "Ago", "August",
        "Aug", "Septiembre", "September", "Sep", "Octubre", "October", "Oct",
        "Noviembre", "November", "Nov", "Diciembre", "December", "Dec",
    )

    def __init__(self, origin_url, destination_url, destination_hash,
                 distance_from_root):
        """Initialize witness features.

        Args:
            origin_url: URL of the page where this agent was created.
            destination_url: URL of the page this agent scrapes.
            destination_hash: identifier of the destination page.
            distance_from_root: value stored as-is on the agent.
        """
        # Where was the agent created?
        self.agent_origin = origin_url
        self.agent_destination = destination_url
        self.agent_session = requests.session()
        self.destination_hash = destination_hash
        self.distance_from_root = distance_from_root

    def scrap(self):
        """Scrap the destination website.

        Returns:
            A list with, in this order: destination hash, destination URL,
            URLs found (internal followed by external), social network
            URLs, language code (``None`` when it cannot be detected),
            page text, person names, locations, coordinates, organizations,
            dates, phone numbers, image URLs (``[url, sha256]`` pairs),
            payment accounts and e-mails.  Coordinates, organizations and
            dates are placeholders and are always empty lists.
        """
        # Get soup object.
        soup = self.getSoupObject()

        # Find all links on the page!
        (internal_website_urls, external_website_urls, _mailto_links,
         social_networks_urls, _whatsapp_cellphones,
         _linked_images) = self.find_website_links(self.agent_destination, soup)
        urls = self.join_lists(internal_website_urls, external_website_urls)

        # Get text of the website.
        text = self.get_text(soup)

        # Get language of the website and the matching spaCy pipeline
        # (None when the language is unknown or unsupported).
        language = self.get_language(text)
        nlp = self._load_nlp(language)
        if nlp is not None:
            print(language)

        # Get names that appear on the website.
        names = self.get_names(text, nlp)

        # Get locations.
        locations = self.find_locations(text, nlp)

        # Placeholders: not extracted yet.
        coordinates = []
        organizations = []
        dates = []

        # Get all phone numbers that appear on the website.
        phonenumbers = self.getPhoneNumbers(text)

        # Get images urls that appear on the website, reusing the page that
        # was already downloaded instead of requesting it a second time.
        images_urls = self.getImagesFromWebPage(
            self.agent_destination, download=False, secs=0, soup=soup
        )

        # Get payment accounts that appear on the website.
        payment_accounts = self.getPayment(text)

        # Get emails that appear on the website.
        emails = self.getEmails(text)

        return [self.destination_hash,  # ID of the website.
                self.agent_destination,  # URL of the website.
                urls,                  # URLS found on the website.
                social_networks_urls,  # Social networks urls.
                language,              # Language of the website.
                text,                  # Text of the website.
                names,                 # Names that appeared on the website.
                locations,             # Locations mentioned on website.
                coordinates,           # Coordinates of maps that appear on the website.
                organizations,         # Organization name's.
                dates,                 # Dates on the website.
                phonenumbers,          # Phonenumbers on the website.
                images_urls,           # Images urls.
                payment_accounts,      # Payment Accounts (banks, crytpo).
                emails                 # Emails on the website.
                ]

    ## -----------------------
    ## Webscrapping functions.
    ## -----------------------

    @classmethod
    def _load_nlp(cls, language):
        """Return the cached spaCy pipeline for ``language``, or ``None``
        when no model is configured for that language code."""
        model_name = cls._spacy_models.get(language)
        if model_name is None:
            return None
        if model_name not in cls._nlp_cache:
            cls._nlp_cache[model_name] = spacy.load(model_name)
        return cls._nlp_cache[model_name]

    def getImagesFromWebPage(self, url, download=False, secs=0, soup=None):
        """Collect the URLs of all images on a page and optionally download them.

        Args:
            url: address of the page; relative image sources are resolved
                against it.
            download: when true, each image is saved as
                ``./images/<sha256>.jpg``.
            secs: seconds to sleep after each successful download.
            soup: already parsed page.  When omitted the page is downloaded
                from ``url``.

        Returns:
            A list of ``[image_url, sha256_hex_of_image_url]`` pairs.
        """
        if soup is None:
            r = requests.get(url, headers=TheWitness.headers,
                             timeout=TheWitness.request_timeout)
            soup = BeautifulSoup(r.text, "html.parser")

        urlImages = []
        for image in soup.find_all("img"):
            src = image.attrs.get("src")
            if not src:
                continue
            # urljoin handles absolute sources, relative sources and page
            # URLs with or without a trailing slash.
            linkImage = urllib.parse.urljoin(url, src)
            digest = hashlib.sha256(linkImage.encode()).hexdigest()
            urlImages.append([linkImage, digest])
            if download:
                try:
                    urllib.request.urlretrieve(
                        linkImage, "./images/" + digest + ".jpg"
                    )
                except (OSError, ValueError):
                    # Best effort: an image that cannot be fetched or stored
                    # must not abort the whole scrape.
                    continue
                time.sleep(secs)
        return urlImages

    def getPhoneNumbers(self, text):
        """Detect phone numbers via regular expressions.

        Returns the matches of the three phone patterns, concatenated in
        pattern order.
        """
        matches = []
        for pattern in TheWitness._PHONE_PATTERNS:
            matches.extend(pattern.findall(text))
        return matches

    def getEmails(self, text):
        """Detect e-mail addresses via regular expressions."""
        return TheWitness._EMAIL_PATTERN.findall(text)

    def getPayment(self, text):
        """Detect Bitcoin addresses via regular expressions.

        Returns:
            The list of addresses found anywhere in ``text``.
        """
        return TheWitness._BTC_PATTERN.findall(text)

    def getSoupObject(self):
        """Request the destination website and return it as a
        BeautifulSoup object ready for processing."""
        r = self.agent_session.get(
            self.agent_destination,
            headers=TheWitness.headers,
            timeout=TheWitness.request_timeout,
        )
        return BeautifulSoup(r.text, "html.parser")

    def get_text(self, soup):
        """Return the visible text of ``soup`` with whitespace collapsed.

        Note: ``script`` and ``style`` elements are removed from ``soup``
        itself.
        """
        for node in soup(["script", "style"]):
            node.decompose()
        text = soup.get_text().replace("\n", " ")
        return re.sub(r"\s+", " ", text)

    def get_language(self, text):
        """Return the language code of ``text``, or ``None`` when the text
        is blank or the language cannot be detected."""
        if not text or not text.strip():
            return None
        try:
            return langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException:
            return None

    def get_names(self, text, nlp):
        """Make an attempt at finding person names.

        Args:
            text: text to analyse.
            nlp: spaCy pipeline, or ``None`` to skip the analysis.

        Returns:
            The entities labelled as persons.
        """
        if nlp is None:
            return []
        return [ent for ent in nlp(text).ents
                if ent.label_ in TheWitness._PERSON_LABELS]

    def find_locations(self, text, nlp=None):
        """Find locations mentioned in ``text``.

        Uses the entity labels from spaCy (when ``nlp`` is given) and regular
        expressions for known locality names, cardinal points and street
        types followed by a two-digit number.

        Returns:
            A list holding the ``LOC`` entities followed by one list of
            matched strings for every name or pattern found in the text.
        """
        found = []
        if nlp is not None:
            for ent in nlp(text).ents:
                if ent.label_ == "LOC":
                    print(ent.text, ent.label_)
                    found.append(ent)

        for name in TheWitness._LOCALIDADES + TheWitness._CARDINALES:
            matches = re.findall("(" + re.escape(name) + ")", text)
            if matches:
                found.append(matches)

        for street in TheWitness._DIRECCION:
            # The street type may be separated from its number by a space
            # ("Calle 45") or written together ("Calle45").
            matches = re.findall(
                "(" + re.escape(street) + r"\s?[0-9]{2}\s)", text
            )
            if matches:
                found.append(matches)
        return found

    def find_organizations(self, text, nlp=None):
        """Find organizations using the entity labels from spaCy.

        Returns the ``ORG`` entities, or an empty list when ``nlp`` is
        ``None``.
        """
        found = []
        if nlp is None:
            return found
        for ent in nlp(text).ents:
            if ent.label_ == "ORG":
                print(ent.text, ent.label_)
                found.append(ent)
        return found

    def find_dates(self, text, nlp=None):
        """Find dates mentioned in ``text``.

        Uses the entity labels from spaCy (when ``nlp`` is given) and a
        regular expression for "<2 digits> <month> <4 digits>".

        Returns:
            A list holding the ``DATE`` entities followed by one list of
            matched strings for every month name that produced matches.
        """
        lista = []
        if nlp is not None:
            for ent in nlp(text).ents:
                if ent.label_ == "DATE":
                    print(ent.text, ent.label_)
                    lista.append(ent)
        for m in TheWitness._MONTHS:
            matches = re.findall(
                r"([0-9]{2}\s" + re.escape(m) + r"\s[0-9]{4})", text
            )
            if matches:
                lista.append(matches)
        return lista

    @staticmethod
    def _read_social_media_keywords():
        """Read the social media names file and return its keywords."""
        with open(TheWitness.social_media_names_file, "r") as f:
            # split() without arguments drops empty entries; an empty
            # keyword would match every link.
            return f.read().replace("\n", "").split()

    # Find all the urls in the website.
    def find_website_links(self, url, soup):
        """Find all website links and the social media accounts, phone
        numbers and e-mails associated with the anchors of the page.

        Args:
            url: address of the page; links starting with "/" are resolved
                against it.
            soup: parsed page.

        Returns:
            A tuple ``(internal_website_urls, external_website_urls,
            emails_list, social_networks_urls, whatsapp_cellphones,
            images_urls)``; every list is free of duplicates.
        """
        # Get social media names.
        sm_keywords = self._read_social_media_keywords()

        # Remove last / in url if exists.
        if url.endswith("/"):
            url = url[:-1]

        # Initialize lists.
        internal_website_urls = []
        external_website_urls = []
        social_networks_urls = []
        whatsapp_cellphones = []
        images_urls = []
        emails_list = []

        for tag in soup.find_all("a"):

            # Anchors without a target carry no information.
            link = tag.attrs.get("href")
            if not link:
                continue

            # If the link is an image.
            if "jpg" in link or "png" in link:
                if link not in images_urls:
                    images_urls.append(link)
                continue

            # Fragment links are ignored.
            if "#" in link:
                continue

            # Internal navigation page ("//host/..." is protocol-relative,
            # hence external).
            if link.startswith("/") and not link.startswith("//"):
                new_link = url + link
                if new_link not in internal_website_urls:
                    internal_website_urls.append(new_link)
                continue

            # External link: check if it is a social network link!
            if any(name in link for name in sm_keywords):
                if link not in social_networks_urls:
                    # Add link to social networks list.
                    social_networks_urls.append(link)

                    # If link is of the form api.whatsapp.com, get the
                    # phonenumber (when the link carries one).
                    if "whatsapp" in link:
                        number = re.findall(r"phone=([0-9]*)", link)
                        if (number and number[0]
                                and number[0] not in whatsapp_cellphones):
                            whatsapp_cellphones.append(number[0])

            # We have an email!
            elif "mailto:" in link:
                if link not in emails_list:
                    emails_list.append(link)

            # Not a social network link, just add it to the external list.
            elif link not in external_website_urls:
                external_website_urls.append(link)

        return (internal_website_urls, external_website_urls, emails_list,
                social_networks_urls, whatsapp_cellphones, images_urls)

    ## -------------------
    ## Auxiliar functions.
    ## -------------------

    def join_lists(self, list1, list2):
        """Return a new list with the elements of ``list1`` followed by the
        elements of ``list2``."""
        return [*list1, *list2]

In [45]:
# Agent whose origin and destination are the same page.
agent = TheWitness(
    origin_url="https://dentalgomez.co/",
    destination_url="https://dentalgomez.co/",
    destination_hash="srgh2314",
    distance_from_root=1,
)

In [46]:
# Scrape the page and print only element 6 of the result, which holds the
# person names found on the page.
# To print every element of the result instead:
# for element in agent.scrap(): 
#     print(element)
print(agent.scrap()[6])

es
[Dental Gómez, Pago Múltiples, Instagram Mantente, Correo Contáctate]


In [23]:
# Omit pages.
# explore_network skips every link that contains one of these substrings.
omit_pages = ["google", "linkto", "odoo"]

In [24]:
# Global variables.
# Registry of visited pages: URL -> True. explore_network skips every link
# that is already a key of this dictionary.
hashes_map = {}

In [25]:
# Scrape the root page of the network: origin and destination coincide.
# NOTE(review): nodeID is defined in a later cell; that cell has to be run
# before this one.
root = "https://modelosalacarta.com/"
destination = root
hash_destination = nodeID(destination)
distance_from_root = 0

# Register the root as visited before scraping it.
hashes_map[destination] = True

witness = TheWitness(root, destination, hash_destination, distance_from_root)
result = witness.scrap()

NameError: name 'nodeID' is not defined

In [None]:
# Display the output of the last scrape.
result

In [None]:
# For link in links.
def explore_network(result, root="https://modelosalacarta.com/",
                    distance_from_root=1):
    """Scrape every page linked from an already scraped page.

    Links that are already registered in ``hashes_map`` or that contain one
    of the ``omit_pages`` substrings are skipped.  Every other link is
    registered in ``hashes_map`` and scraped with a new ``TheWitness`` agent,
    whose output is printed.

    Args:
        result: output of ``TheWitness.scrap``; element 2 must hold the
            list of URLs to visit.
        root: origin URL given to every agent.
        distance_from_root: distance value given to every agent.
    """
    # The agent number is the position of the link in the list, counting
    # skipped links too.
    for count, link in enumerate(result[2], start=1):

        # We already explored this website.
        if link in hashes_map:
            continue

        # Websites that must not be navigated.
        if any(website in link for website in omit_pages):
            continue

        # New website! Register it first so it is never visited twice.
        hashes_map[link] = True

        witness = TheWitness(root, link, nodeID(link), distance_from_root)
        # Separate name: `result` is still being iterated over.
        page_result = witness.scrap()

        print("---------------")
        print("Agent No.", count)
        print(page_result)
        print("---------------")

In [None]:
# Visit every URL listed in element 2 of the current scrape result.
explore_network(result)

In [None]:
# Scrape again with the current `witness` agent and keep its output.
result = witness.scrap()

In [None]:
# Element 2 of the scrape output is the list of URLs found on the page.
result[2]

In [None]:
# Download one page and parse it, for use with the standalone helpers.
# Alternative target, disabled:
#url = "https://co.mileroticos.com"
url = "https://dentalgomez.co/"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, "html.parser")

In [None]:
def nodeID(url):
    """Return the SHA-256 hex digest of ``url``, used as page identifier."""
    return hashlib.sha256(url.encode()).hexdigest()

In [None]:
# Find all the urls in the website.
def find_links(url, soup,
               social_media_names_file="./datasets/socialmedia_names.txt"):
    """Classify the targets of all anchors of a page.

    Args:
        url: address of the page; links starting with "/" are resolved
            against it.
        soup: parsed page.
        social_media_names_file: path of a text file with the social media
            keywords.

    Returns:
        A tuple ``(internal_website_urls, external_website_urls,
        emails_list, social_networks_urls, whatsapp_cellphones,
        images_urls)``; every list is free of duplicates.
    """
    # Get social media names. split() without arguments drops empty
    # entries; an empty keyword would match every link.
    with open(social_media_names_file, "r") as f:
        sm_keywords = f.read().replace("\n", "").split()

    # Remove last / in url if exists.
    if url.endswith("/"):
        url = url[:-1]

    # Initialize lists.
    internal_website_urls = []
    external_website_urls = []
    social_networks_urls = []
    whatsapp_cellphones = []
    images_urls = []
    emails_list = []

    for tag in soup.find_all("a"):

        # Anchors without a target carry no information.
        link = tag.attrs.get("href")
        if not link:
            continue

        # If the link is an image.
        if "jpg" in link or "png" in link:
            if link not in images_urls:
                images_urls.append(link)
            continue

        # Fragment links are ignored.
        if "#" in link:
            continue

        # Internal navigation page ("//host/..." is protocol-relative, hence
        # external).
        if link.startswith("/") and not link.startswith("//"):
            new_link = url + link
            if new_link not in internal_website_urls:
                internal_website_urls.append(new_link)
            continue

        print("External link: ", link)

        # Check if the external link is a social network link!
        if any(name in link for name in sm_keywords):
            if link not in social_networks_urls:
                # Add link to social networks list.
                social_networks_urls.append(link)

                # If link is of the form api.whatsapp.com, get the
                # phonenumber (when the link carries one).
                if "whatsapp" in link:
                    number = re.findall(r"phone=([0-9]*)", link)
                    if (number and number[0]
                            and number[0] not in whatsapp_cellphones):
                        whatsapp_cellphones.append(number[0])

        # We have an email!
        elif "mailto:" in link:
            if link not in emails_list:
                emails_list.append(link)

        # Not a social network link, just add it to the external list.
        elif link not in external_website_urls:
            external_website_urls.append(link)

    return (internal_website_urls, external_website_urls, emails_list,
            social_networks_urls, whatsapp_cellphones, images_urls)

In [None]:
# Classify the links of the downloaded page; element 1 of the returned
# tuple is the list of external URLs.
a = find_links(url, soup)
a[1]

In [None]:
# Display the address of the page being inspected.
url

In [None]:
# Display the SHA-256 identifier of the page being inspected.
nodeID(url)

In [None]:
# NOTE(review): neither `internal_external_links` nor `urls` is defined in
# the visible cells; this looks like a leftover of an earlier version of the
# link finder. Confirm before running.
internal_links, external_links = internal_external_links(url, urls)

In [None]:
# Display the internal links.
internal_links

In [None]:
# Display the external links.
external_links

In [None]:
# NOTE(review): `social_networks` is not assigned in any visible cell.
social_networks

In [None]:
# NOTE(review): `wa_cellphones` is not assigned in any visible cell.
wa_cellphones