In [14]:
# Connection libraries.
import urllib.request, urllib.parse, urllib.error
import requests
from bs4 import BeautifulSoup
#from langdetect import detect
import re
import hashlib 
import time
import langdetect

import spacy
import es_core_news_sm
import en_core_web_sm
import networkx as nx

%run ./text_clustering_final.ipynb

In [15]:
# The witness class.
class TheWitness: 
    
    # Static variables.
    headers = {}
    headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    social_media_names_file = "./datasets/socialmedia_names.txt"
    
    # Initialize witness features.
    def __init__(self, origin_url, destination_url, destination_hash, 
                 distance_from_root):
        
        # Where was the agent created?
        self.agent_origin = origin_url
        self.agent_destination = destination_url
        self.agent_session = requests.session()
        self.destination_hash = destination_hash
        self.distance_from_root = distance_from_root
        
    
    # Scrap the destination website!
    def scrap(self):
        
        # Get soup object.
        soup = self.getSoupObject()
        
        # Find all links on the page!
        result = self.find_website_links(self.agent_destination, soup)
        
        internal_website_urls = result[0]
        external_website_urls = result[1]
        urls = self.join_lists(internal_website_urls, external_website_urls)
        emails_list_1 = result[2]
        social_networks_urls = result[3]
        whatsapp_cellphones = result[4]
        images_urls_1 = result[5]
        
        # Get text of the website.
        text = self.get_text(soup)
        
        # Get language of the website.
        language = self.get_language(text)
        
        if language == "en":
            nlp = spacy.load("en_core_web_sm")
            #print("en")
        elif language == "es":
            nlp = spacy.load("es_core_news_sm")
            #print("es")
            
        # Get website content type
        
        content_type = evaluate_model(text, 0.5)
        
        # Get names that appear on the website.
        names = self.get_names(text, nlp)
        
        # Get locations
        locations = self.find_locations(text, nlp)
        
        # Get coordinates of maps that appear on the website.
        iframes = self.getWebsiteIFrames(soup)
        
        # Get orgazations name's that appear on the website.
        organizations = self.find_organizations(text, nlp)
        
        # Get dates that appear on the website.
        dates = self.find_dates(text, nlp)
        
        # Get all phone numbers that appear on the website.
        phonenumbers = self.join_lists(self.getPhoneNumbers(text),
                                       whatsapp_cellphones)
        
        # Get images urls that appear on the website. 
        images_urls = self.join_lists(self.getImagesFromWebPage(self.agent_destination, download=False, secs=0), 
                                      images_urls_1)
        
        # Get payment accounts that appear on the website.
        payment_accounts = self.getPayment(text)
        
        # Get emails that appear on the website.
        emails = self.getEmails(text)
        
        return [self.destination_hash, # ID of the website.
                self.agent_destination,# URL of the website.
                content_type,          # Content Type of the website.
                language,              # Language of the website.
                urls,                  # URLS found on the website.
                social_networks_urls,  # Social networks urls.
                text,                  # Text of the website.
                names,                 # Names that appeared on the website.
                locations,             # Locations mentioned on website.
                iframes,               # Iframes code (maps or videos) that appear on the website.
                organizations,         # Organization name's.
                dates,                 # Dates on the website.
                phonenumbers,          # Phonenumbers on the website.
                images_urls,           # Images urls.
                payment_accounts,      # Payment Accounts (banks, crytpo).
                emails                 # Emails on the website.
               ]
        
    ## -----------------------
    ## Webscrapping functions.
    ## -----------------------
    """
    This functions makes a requests to the destination website and
    initialize a BeautifulSoup object for processing it.
    """
    def getSoupObject(self):
        r = self.agent_session.get(self.agent_destination, headers = TheWitness.headers)
        data = r.text
        return BeautifulSoup(data, "html.parser")
    
    
    """
    Find google map info (if there is one).
    """
    def getWebsiteIFrames(self, soup):
        info = soup.find("iframe")
        return info
    
    """
    This function saves all the images urls and downloads them optionaly
    """
    def getImagesFromWebPage(self, url, download = False, secs = 0):

        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        images = soup.find_all("img")
        url_images = list()
        
        for x in soup.find_all("img"):
            try:
                
                # Extra image route.
                linkImage = x.attrs["src"]
                
                # The image is on the form of an internar page, so we need to add 
                # the website url.
                if (linkImage.find("http") == -1 or linkImage.find("https") == 1):
                    linkImage = url[:-1] + x.attrs["src"]
                
                # We don't want svg files.
                if linkImage.find("svg") == -1:

                    # Save image url.
                    if linkImage not in url_images:
                        url_images.append(linkImage)
            except:
                pass
        return url_images
    
    
    """
    Downloads the images from a list of images urls.
    """
    def download_images(self, images_urls):
        for image_url in images_urls: 
            hashImage = hashlib.sha256()
            hashImage.update(linkImage.encode())
            image_name = hashImage.hexdigest()
            urllib.request.urlretrieve(image_url,"./images/" + image_name + ".jpg")
            time.sleep(secs)
    
    """
    This function detect phone numbers via regular expressions (regex)
    """
    def getPhoneNumbers(self, text):
        patternPhones1 = "(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
        patternPhones2 = "(\(\d{1}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{2}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4})"
        patternPhones3 = "(\([+]\d{1}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\([+]\d{2}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\([+]\d{3}\)\s*\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4})"
        match1 = re.findall(patternPhones1, text)
        match2 = re.findall(patternPhones2, text)
        match3 = re.findall(patternPhones3, text)
        return match1 + match2 + match3
    
    """
    This function detect emails via regular expressions (regex)
    """
    def getEmails(self, text):
        patternEmails ="[a-zA-Z0-9]+[a-zA-Z0-9.%\-\+]*@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,4}"
        result = re.findall(patternEmails, text) 
        return result
    
    """
    This function detects BitCoin addresses via regular expressions (regex)
    """
    def getPayment(self, text):
        patternBtc ="^([13][a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[ac-hj-np-zAC-HJ-NP-Z02-9]{11,71})$"
        result = re.findall(patternBtc, text) 
        return result
    
    """
    This function gets the formated text of soup element.
    """
    def get_text(self, soup):
        for script in soup(["script", "style"]):                   
            script.decompose()
        text = soup.get_text().replace("\n"," ")
        text = re.sub('\s+',' ',text)
        return text
    
    """
    This function returns the language of the page.
    """
    def get_language(self, text):
        lang = langdetect.detect(text)
        return lang

    """
    This function makes an attemp of finding person names.
    """
    def get_names(self, text, nlp):
        doc = nlp(text)
        names = list()
        for ent in doc.ents:
            if ent.label_ == "PER" or ent.label_ == "PERSON": 
                if ent.text not in names:
                    names.append(ent.text)
        return names
    
    """
    This function uses the entity labels from spacy to find locations. It also use the re 
    library to find patterns in the text that could lead into a location or address.
    """
    def find_locations(self, text, nlp):
        localidades=["Usaquén","Chapinero","Santa Fe","San Cristóbal","Usme","Tunjuelito","Bosa","Kennedy","Fontibón","Engativá",
                     "Suba","Barrios Unidos","Teusaquillo","Los Mártires","Antonio Nariño","Puente Aranda","La Candelaria",
                     "Rafael Uribe Uribe","Ciudad Bolívar","Sumapaz"]
        cardinales=["Norte","Sur","Este","Oeste","Occidente","Oriente"]
        direccion=["Calle","Avenida", "Carrera","Diagonal"]
        doc = nlp(text)
        
        list_ = list()
        for ent in doc.ents:
            try: 
                if ent.label_=="LOC" and (ent.text not in list_):
                    list_.append(ent.text)
            except: 
                pass
                
        for l in localidades:
            try: 
                locations = re.findall("("+l+")", text)[0]
                if len(locations) > 0 and (locations not in list_):
                    list_.append(locations)
            except: 
                pass

        for c in cardinales:
            try: 
                locations = re.findall("("+c+")", text)[0]
                if len(locations) > 0 and (locations not in list_):
                    list_.append(locations)
            except: 
                pass
                
        for d in direccion:
            try: 
                locations = re.findall("("+d+"[0-9]{2}\s)", text)[0]
                if len(locations) > 0 and (locations not in list_):
                    list_.append(locations)
            except:
                pass
                
        return list_
    
    """
    This function uses of entity labels from spacy to find organizations
    """
    def find_organizations(self, text, nlp):
        doc = nlp(text)
        list_ =[]
        for ent in doc.ents:
            if ent.label_=="ORG" and (ent.text not in list_):
                list_.append(ent.text)
        return list_
    
    """
    This function uses of entity labels from spacy to find dates.It also use the re library to find patterns in the text
    that could lead in to a date.
    """
    def find_dates(self, text, nlp):
        months=["Enero","Ene", "January","Jan","Febrero","February","Feb","Marzo","March","Mar","Abril","April","Mayo","May",'Junio','June', "Jun","Julio","July", "Jul",
               "Agosto","Ago","August","Aug","Septiembre",'September',"Sep",'Octubre','October',"Oct","Noviembre",'November',"Nov","Diciembre","December", "Dec"]
        doc = nlp(text)
        lista=[]
        for ent in doc.ents:
            if ent.label_=="DATE":
                #print(ent.text, ent.label_)
                lista.append(ent)
        for m in months:
            if(len(re.findall("([0-9]{2}\s"+m+"\s[0-9]{4})", text))>0):
                lista.append(re.findall("([0-9]{2}\s"+m+"\s[0-9]{4})", text))       
        
        return lista

    """
    Find all website links and related social media and phonenumbers 
    associated with urls on the page.
    Input: 
    Output:
    """
    
    # Find all the urls in the website.
    def find_website_links(self, url, soup):
        
        # Get social media names.
        f = open(TheWitness.social_media_names_file, "r")
        sm_keywords = f.read().replace("\n", "").split(" ")
        f.close()

        # Remove last / in url if exists.
        if(url[-1] == "/"):
            url = url[0: -1]

        # Initialize lists.
        internal_website_urls = list()
        external_website_urls = list()
        social_networks_urls = list()
        whatsapp_cellphones = list()
        images_urls = list()
        emails_list = list()

        for tag in soup.find_all("a"):

            try:
                # Find href attribute.
                link = tag.attrs["href"]
                
                # If http or https is contain in the link.
                if (link.find("http") != -1 or link.find("https") != -1) and link.find("javascript") == -1:

                    # If link is not an images or #
                    if link.find("jpg") == -1 and link.find("png") == -1: 

                        # Determine if link corresponds to an internal navegation page.
                        if link.find("/") == 0: 
                            new_link = url + link
                            if new_link not in internal_website_urls: 
                                internal_website_urls.append(new_link)

                        # External link.
                        else: 

                            # Check if the external link is a social network link!
                            social_network_website = False
                            for social_network_name in sm_keywords: 
                                if link.find(social_network_name) != -1: 
                                    social_network_website = True
                                    break

                            if social_network_website:

                                if (link not in social_networks_urls): 

                                    # Add link to social networks list.
                                    social_networks_urls.append(link)


                                    # If link is of the form api.whatsapp.com, get the phonenumber.
                                    if link.find("whatsapp") != -1:

                                        number = re.findall("phone=([0-9]*)", link)
                                        if number[0] not in whatsapp_cellphones: 
                                            whatsapp_cellphones.append(number[0])     

                            # If link is not a social network link, just add it to website_urls list.
                            else: 

                                # We have an email!
                                if link.find("mailto:") != -1:
                                    if link not in emails_list: 
                                        emails_list.append(link)
                                elif link not in external_website_urls: 
                                    external_website_urls.append(link)

                    # If the link is an image.
                    elif link.find("jpg") != -1 or link.find("png") != -1:
                        if link not in images_urls: 
                            images_urls.append(link)

            except: 
                pass

        return internal_website_urls, external_website_urls, emails_list, social_networks_urls, whatsapp_cellphones, images_urls

    ## -------------------
    ## Auxiliar functions.
    ## -------------------

    """
    This method joins to python's lists.
    """   
    def join_lists(self, list1, list2): 
        new_list = list()
        for element in list1: 
            new_list.append(element)
        for element in list2: 
            new_list.append(element)
        return new_list

In [16]:
root = "https://es.cam4.com/"
witness = TheWitness(root, root, 1, 3)
result = witness.scrap()

In [17]:
result[2]

[(0, 0.25), (1, 0.25), (2, 0.25), (3, 0.25)]

In [81]:
parameters = """ Destination_Hash Agent_Destination URLS Social_Networks_URLS Language Text 
                 Names Locations Coordinates Organizations Dates PhoneNumbers ImagesURLS PaymentAccounts Emails
             """

In [82]:
lista_par = parameters.replace("\n", "").split()

In [83]:
def printFormat(self, list_):
    for i in range(len(list_)): 
        print("--------")
        print(lista_par[i])
        if type(result[i]) == list:
            for element in result[i]:
                print(element)
        else: 
            print(result[i])
        print("\n")

--------
Destination_Hash
1


--------
Agent_Destination
https://www.elespectador.com/noticias/investigacion/mario-paciolla-el-costo-de-la-caida-de-un-ministro/


--------
URLS
https://www.elespectador.com/deportes/partidos-de-futbol-hoy-en-vivo/?utm_source=elespectador.com&utm_medium=referral&utm_campaign=Boton-Parrilla-Header
https://blogs.elespectador.com/
http://suscripciones.elespectador.com/
http://foros.elespectador.com/
http://judiciales.elespectador.com/
http://bibo.elespectador.com/
https://www.caracolnext.com/
https://www.caracoltv.com/
https://www.noticiascaracol.com
https://www.golcaracol.com
https://www.caracolplay.com
https://www.caracoltvcorporativo.com/
https://www.bluradio.com/
https://www.lakalle.com
https://hjck.com/
https://www.shock.co/
https://volkgames.com/
https://www.iabcolombia.com/
https://ami.org.co/
https://www.sic.gov.co/


--------
Social_Networks_URLS
https://www.facebook.com/elespectadorcom
https://twitter.com/elespectador
https://www.youtube.com/user/



--------
Names
Mario Paciolla
Claudia Julieta Duque
”Nathalie Handal
Giuseppe Paciolla
Guillermo Botero
Misión
Rogelio Bolívar Córdova
Gildardo el Cucho
Mario Paciolla?Con
Herner Evelio Carreño
Verificación
U Roy Barreras
Botero
Fuentes
Carlos Ruiz Massieu
Rosende
Barreras
Dimar Torres
Ruiz Massieu
Iván Duque
Paz del Senado
Liliana Garavito
Farhan Haq
António Guterres
Paciolla
Juvenal Díaz Mateus
Díaz
Francesc Claret
Raúl Rosende
Jean Arnault
podrían comprometer
Cortés Reyes
Yhon Medina Vivanco
Garantías de Seguridad
Fiscalía
Maurizio Salvi
Christian Thompson
Mario PaciollaGermán Romero
Hernán Giraldo
Seguros Sura
Hidroituango


--------
Locations
ESPECTADOR
Nápoles
Francia
Italia
Colombia
San Vicente del Caguán
Caquetá
Aguas Claras
Cucho
Puerto Rico
Paciolla
Misión
Ministerio
Catatumbo
Además
Nueva York
de Colombia
Antioquia
Medellín
Bogotá
La Habana
La Repubblica
Ejército Christian Leonardo Thompson Garzón
Villa Ferro
Estarán
Sierra Nevada Investigación30 Aug
Salvatore Mancuso
Térm