## Web Crawler de críticas de filmes

In [1]:
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector
import urllib.robotparser as robotparser
from urllib.request import urlopen
import requests as rq
from collections import deque
import re
import time
import random
from random import randrange
import json
import ssl
import selenium
from selenium.webdriver.common.keys import Keys
from sklearn.externals import joblib

### Sites

In [2]:
# Sites to crawl, one (name, home page) pair per entry.
# The name is used as the per-site output folder; the home page is stored
# without a scheme and with a trailing slash because the crawlers prepend
# "https://" / "http://" themselves and append "robots.txt" directly.
sites = [
    ('Pocilga', 'pocilga.com.br/'),
    ('Cinema_com_Rapadura', 'cinemacomrapadura.com.br/'),
    ('Cineclick', 'www.cineclick.com.br/'),
    ('Plano_Critico', 'www.planocritico.com/'),
    ('Plano_Aberto', 'www.planoaberto.com.br/'),
    ('Cinemasim', 'www.cinemasim.com.br/'),
    ('Omelete', 'www.omelete.com.br/'),
    ('Jornal_da_Paraiba', 'www.jornaldaparaiba.com.br/'),
    ('Cinema_em_Cena', 'cinemaemcena.cartacapital.com.br/'),
    ('Elpais', 'brasil.elpais.com/'),
]

## Coletando páginas


#### Função que trata o caso de página com carregamento infinito (https://www.omelete.com.br/criticas)

In [3]:
def infinity_page(page_link, scroll_pause_time=1):
    """Render an infinitely-loading page in Firefox and return its final HTML.

    The page is scrolled to the bottom repeatedly. Whenever scrolling stops
    growing the document, the element with id ``loadMore`` is clicked; when
    that element no longer exists the page is considered fully loaded.

    :param page_link: page address without scheme ("https://" is prepended)
    :param scroll_pause_time: seconds to wait after each scroll so that new
        content can load
    :return: the page source as reported by the driver after the last scroll
    """
    # Imported here because the notebook's import cell never brings this name
    # into scope; without it the `except` clause below raises NameError exactly
    # when the "load more" button disappears.
    from selenium.common.exceptions import NoSuchElementException

    driver = selenium.webdriver.Firefox(executable_path='./geckodriver/geckodriver')
    try:
        driver.get("https://" + page_link)
        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            time.sleep(scroll_pause_time)

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # The document stopped growing: ask for more content, or stop
                # when the button is gone.
                try:
                    driver.find_element_by_id("loadMore").click()
                except NoSuchElementException:
                    break
            last_height = new_height

        return driver.page_source
    finally:
        # Always release the browser process, even when navigation fails.
        driver.quit()

### 1. BFS

In [4]:
def get_links_bfs(page):
    """Crawl one site breadth-first, saving every fetched page to disk.

    Progress is checkpointed after each saved page in ``lista_paginas.json``
    (every link already seen), ``fila_paginas.json`` (links still to visit)
    and ``cont.json`` (pages saved in this run). The first two files must
    already exist (containing ``[]`` for a fresh crawl), as must
    ``pages/bfs/<site name>/pages.json``, which maps each crawled link to the
    HTML file written next to it.

    :param page: ``(site_name, home_page)`` pair as stored in ``sites``;
        ``home_page`` has no scheme and ends with "/"
    :return: None. The crawl stops after 1000 saved pages or when the queue
        runs out of links.
    """
    max_pages = 1000
    site_name, home_page = page[0], page[1]
    print(home_page)

    # Load robots.txt (HTTPS first, plain HTTP if the certificate is rejected)
    rp = robotparser.RobotFileParser()
    try:
        rp.set_url("https://" + home_page + "robots.txt")
        rp.read()
    except ssl.CertificateError:
        rp.set_url("http://" + home_page + "robots.txt")
        rp.read()

    headers = {'user-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5);'}

    # Checkpoint files: links already seen and links still queued
    with open('lista_paginas.json') as f:
        lista_paginas = json.load(f)
    with open('fila_paginas.json') as f:
        fila_paginas_json = json.load(f)

    if len(lista_paginas) == 0:
        lista_paginas = [home_page]
    # The list keeps discovery order for the checkpoint; the set gives O(1)
    # "already seen?" checks instead of scanning the list for every link.
    visited = set(lista_paginas)

    # A single queue is used. Keeping a second "backup" copy in step by hand
    # broke as soon as an empty page was skipped.
    fila_paginas = deque(fila_paginas_json) if fila_paginas_json else deque([home_page])

    output_dir = "pages/bfs/" + site_name + "/"
    ignored_suffixes = (".xml", ".jpg", ".png", ".pdf", ".jpeg")
    pattern = re.compile("^(/|http)")

    # Number of pages saved in this run
    cont = 0
    while cont < max_pages and fila_paginas:
        page_link = fila_paginas.popleft()

        try:
            resp = rq.get("https://" + page_link, headers=headers, allow_redirects=False)
        except rq.exceptions.SSLError:
            resp = rq.get("http://" + page_link, headers=headers, allow_redirects=False)

        # Skip pages that came back empty
        if not resp.content:
            continue

        http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
        encoding = html_encoding or http_encoding
        # Explicit parser: otherwise bs4 picks whichever parser is installed,
        # which makes link extraction depend on the machine.
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding=encoding)

        for link in soup.find_all("a", href=pattern):
            href = link.get('href')
            if href.endswith(ignored_suffixes):
                continue
            if "#" in href:
                continue
            # Normalise to "<host>/<path>" without scheme
            if href.startswith("//"):
                href = href[2:]
            if href.startswith("/"):
                href = home_page + href[1:]
            href = href.replace("https://", "")
            href = href.replace("http://", "")
            if not href.startswith(home_page):
                continue
            if href in visited:
                continue
            # can_fetch() needs a real URL: with the scheme-less form the whole
            # string is treated as a path and never matches a "/..." rule, so
            # nothing was ever disallowed. Cinema em Cena is always allowed
            # (it has no usable robots.txt).
            if rp.can_fetch("*", "https://" + href) or site_name == 'Cinema_em_Cena':
                visited.add(href)
                lista_paginas.append(href)
                fila_paginas.append(href)

        cont = cont + 1

        # Number the file after what pages.json already holds, so a resumed
        # crawl appends new files instead of overwriting page_1.html onwards.
        with open(output_dir + "pages.json") as f:
            link_number = json.load(f)
        page_name = "page_" + str(len(link_number) + 1) + ".html"
        with open(output_dir + page_name, 'wb') as f:
            f.write(resp.content)
        link_number.append((page_link, page_name))
        with open(output_dir + "pages.json", 'w') as outfile:
            json.dump(link_number, outfile)

        # Short random pause every 10 pages
        if (cont % 10) == 0:
            time.sleep(randrange(3))

        with open('lista_paginas.json', 'w') as outfile:
            json.dump(lista_paginas, outfile)
        with open('fila_paginas.json', 'w') as outfile:
            json.dump(list(fila_paginas), outfile)
        with open('cont.json', 'w') as outfile:
            json.dump(cont, outfile)
    

# with open('lista_paginas.json', 'w') as outfile:
#     json.dump([], outfile)
# with open('fila_paginas.json', 'w') as outfile:
#     json.dump([], outfile)
    
    
# get_links_bfs(sites[9])



### 2. Heurística 1

In [5]:
# Word lists used by get_link_score. They are module-level names looked up at
# call time, and a later cell of this notebook rebinds all four of them, so the
# function uses whichever definitions were executed last.

# Terms that mark a link as a probable review.
good_words = ["critica", "resenha"]

# Terms that mark a link as loosely related to cinema.
ok_words = ["cultura", "filme", "cinema"]

# Terms that mark a link as off-topic.
bad_words = [
    "noticia", "serie", "lista", "eventos", "quadrinhos",
    "temporada", "album", "literatura", "televisao", "musica", "podcast", "sinopse",
]

# URL fragments that identify listing / pagination pages.
index_words = ["/page/", "/index/", "page=", "/a/"]


def get_link_score(href, home_page):
    """Score a link from the words found in its URL (higher is more promising).

    :param href: link in "<host>/<path>" form
    :param home_page: site prefix removed from ``href`` before matching
    :return: -1 if a bad word is present; otherwise 2 for a good word on an
        index/pagination URL, 3 for a good word elsewhere, 1 for an ok word,
        and 0 when nothing matches
    """
    path = href.replace(home_page, "")

    if any(term in path for term in bad_words):
        return -1

    if any(term in path for term in good_words):
        is_index = any(marker in path for marker in index_words)
        return 2 if is_index else 3

    if any(term in path for term in ok_words):
        return 1

    return 0


def get_links_heuristic(page):
    """Crawl one site best-first, ranking queued links with get_link_score.

    Links are kept in a queue sorted by descending score and the best one is
    fetched next. Every fetched page counts towards the 1000-page budget, but
    only pages whose own score is >= 2 are written to
    ``pages/heuristic/<site name>/``. Progress is checkpointed in
    ``lista_paginas.json``, ``fila_paginas.json`` and ``cont.json``; those
    files and ``pages/heuristic/<site name>/pages.json`` must already exist.

    :param page: ``(site_name, home_page)`` pair as stored in ``sites``
    :return: None. Stops when ``cont`` reaches 1000 or the queue is empty.
    """
    max_pages = 1000
    site_name, home_page = page[0], page[1]
    print(home_page)

    # Load robots.txt (HTTPS first, plain HTTP if the certificate is rejected)
    rp = robotparser.RobotFileParser()
    try:
        rp.set_url("https://" + home_page + "robots.txt")
        rp.read()
    except ssl.CertificateError:
        rp.set_url("http://" + home_page + "robots.txt")
        rp.read()

    headers = {'user-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5);'}

    # Checkpoint files: links already seen, ranked queue, pages fetched so far
    with open('lista_paginas.json') as f:
        lista_paginas = json.load(f)
    with open('fila_paginas.json') as f:
        fila_paginas_json = json.load(f)
    with open('cont.json') as f:
        cont = json.load(f)

    if len(lista_paginas) == 0:
        lista_paginas = [home_page]
    # Set for O(1) membership; the list is what gets checkpointed.
    visited = set(lista_paginas)

    # One ranked queue only. The former "backup" queue ended up being the very
    # same list object, so popping from both discarded a top-ranked link on
    # every iteration.
    page_ranking = fila_paginas_json if fila_paginas_json else [(home_page, 1)]

    output_dir = "pages/heuristic/" + site_name + "/"
    ignored_suffixes = (".xml", ".jpg", ".png", ".pdf", ".jpeg")
    pattern = re.compile("^(/|http)")

    while cont < max_pages and page_ranking:
        page_link, page_score = page_ranking.pop(0)

        if page_link == "www.omelete.com.br/criticas":
            # This page loads its content dynamically, so a locally saved
            # snapshot is parsed instead of fetching it.
            print("entrou aqui")
            with open("pages/criticas-omelete.html", 'rb') as f:
                content = f.read()
            soup = BeautifulSoup(content, "html.parser")
        else:
            try:
                resp = rq.get("https://" + page_link, headers=headers, allow_redirects=False)
            except rq.exceptions.SSLError:
                resp = rq.get("http://" + page_link, headers=headers, allow_redirects=False)
            content = resp.content

            http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(content, is_html=True)
            encoding = html_encoding or http_encoding
            soup = BeautifulSoup(content, "html.parser", from_encoding=encoding)

        # Skip pages that came back empty. `content` is set on both branches
        # above; the previous check read `resp`, which does not exist (or is
        # left over from another page) when the snapshot is used.
        if not content:
            continue

        for link in soup.find_all("a", href=pattern):
            href = link.get('href')
            if href.endswith(ignored_suffixes):
                continue
            if "#" in href:
                continue
            # Normalise to "<host>/<path>" without scheme
            if href.startswith("//"):
                href = href[2:]
            if href.startswith("/"):
                href = home_page + href[1:]
            href = href.replace("https://", "")
            href = href.replace("http://", "")

            if not href.startswith(home_page):
                continue
            if href in visited:
                continue
            # Mark every in-site link as seen exactly once, including links
            # that robots.txt forbids, so they are not re-evaluated later.
            visited.add(href)
            lista_paginas.append(href)

            # can_fetch() needs a real URL: a scheme-less string is treated as
            # a bare path and never matches a "/..." rule. Cinema em Cena is
            # always allowed (it has no usable robots.txt).
            if rp.can_fetch("*", "https://" + href) or site_name == 'Cinema_em_Cena':
                page_ranking.append((href, get_link_score(href, home_page)))

        # One stable sort per page yields the same order as sorting after every
        # append: equal scores keep their discovery order.
        page_ranking.sort(key=lambda entry: entry[1], reverse=True)

        cont = cont + 1
        if page_score >= 2:
            # Number the file after what pages.json already holds, so a
            # resumed crawl does not overwrite earlier pages.
            with open(output_dir + "pages.json") as f:
                link_number = json.load(f)
            page_name = "page_" + str(len(link_number) + 1) + ".html"
            with open(output_dir + page_name, 'wb') as f:
                f.write(content)
            link_number.append((page_link, page_name))
            with open(output_dir + "pages.json", 'w') as outfile:
                json.dump(link_number, outfile)

        # Short random pause every 10 pages
        if (cont % 10) == 0:
            time.sleep(randrange(3))

        with open('lista_paginas.json', 'w') as outfile:
            json.dump(lista_paginas, outfile)
        with open('fila_paginas.json', 'w') as outfile:
            json.dump(page_ranking, outfile)
        with open('cont.json', 'w') as outfile:
            json.dump(cont, outfile)
    

# with open('lista_paginas.json', 'w') as outfile:
#     json.dump([], outfile)
# with open('fila_paginas.json', 'w') as outfile:
#     json.dump([], outfile)
# with open('cont.json', 'w') as outfile:
#     json.dump(0, outfile)

# get_links_heuristic(sites[6])


### 3. Heurística 2

In [6]:
good_words = ["critica","resenha", "/cultura/cinema/"] 
import ssl
ok_words = ["cultura", "filme","cinema","cine"] 

bad_words = ["noticia", "serie", "lista","eventos","quadrinhos","governo", "tag/discos/","share=","vida_urbana","colunas/",
             "politica","actualidad", "cultura/agenda", "tag/fecha/","economia","internacional","esportes/","tv/",
             "temporada","album", "literatura", "televisao","critica_literaria/","libros","premios","resenha-de-livro",
             "musica", "podcast", "sinopse", "clipe","critica_arte/", "autor/", "emmy","tag/game_of_thrones" ] 

index_words = ["/page/", "/index/","page=","/a/"]

def get_link_score_2(href, text, home_page, pai_score):
    regexp = re.compile('[0-9]x[0-9]')
    
    link = href.replace(home_page, "")
    if regexp.search(link):
        return -1    
    for word in bad_words:
        if word in link:
            return -1
    for word in good_words:
        if word in link:            
            for word in index_words:
                if word in link:
                    #print(href)
                    return 3
            return 10
    for word in ok_words:
        if word in link:
            if pai_score == 3:
                return 4
            return 1
    if pai_score == 3:
                #print(href)
                return 2
    return 0


def get_links_heuristic_2(page):
    """Crawl one site best-first, ranking queued links with get_link_score_2.

    Links are kept in a queue sorted by descending score and the best one is
    fetched next. A page is written to ``pages/heuristic_2/<site name>/`` when
    its score is >= 2 and different from 3 (3 marks review index pages, which
    are followed but not stored). The crawl stops after 1000 fetched pages,
    when more than 30 pages in a row were fetched without saving any, or when
    the queue is empty. Progress is checkpointed in ``lista_paginas.json``,
    ``fila_paginas.json`` and ``cont.json``; those files and
    ``pages/heuristic_2/<site name>/pages.json`` must already exist.

    :param page: ``(site_name, home_page)`` pair as stored in ``sites``
    :return: None
    """
    max_pages = 1000
    max_idle_pages = 30
    site_name, home_page = page[0], page[1]

    # Load robots.txt.
    # NOTE(review): unlike the other crawlers, plain HTTP is tried first here
    # and HTTPS is the fallback. The order is kept as found; confirm it is
    # intentional.
    rp = robotparser.RobotFileParser()
    try:
        rp.set_url("http://" + home_page + "robots.txt")
        rp.read()
    except ssl.CertificateError:
        rp.set_url("https://" + home_page + "robots.txt")
        rp.read()
    print(home_page)

    headers = {'user-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5);'}

    # Checkpoint files: links already seen, ranked queue, pages fetched so far
    with open('lista_paginas.json') as f:
        lista_paginas = json.load(f)
    with open('fila_paginas.json') as f:
        fila_paginas_json = json.load(f)
    with open('cont.json') as f:
        cont = json.load(f)

    if len(lista_paginas) == 0:
        lista_paginas = [home_page]
    # Set for O(1) membership; the list is what gets checkpointed.
    visited = set(lista_paginas)

    # One ranked queue only. When resuming, the former "backup" queue was the
    # same list object as the working queue, so each iteration popped two
    # entries and appended every new link twice.
    page_ranking = fila_paginas_json if fila_paginas_json else [(home_page, 1)]

    output_dir = "pages/heuristic_2/" + site_name + "/"
    ignored_suffixes = (".xml", ".jpg", ".png", ".pdf", ".jpeg")
    pattern = re.compile("^(/|http)")

    # Value of `cont` when a page was last saved, used to detect idle crawling
    last_cont = cont
    while cont < max_pages:
        if not page_ranking:
            print("fila vazia")
            break

        page_link, page_score = page_ranking.pop(0)

        if page_link == "www.omelete.com.br/criticas":
            # This page loads its content dynamically, so a locally saved
            # snapshot is parsed instead of fetching it.
            print("entrou aqui")
            with open("pages/criticas-omelete.html", 'rb') as f:
                content = f.read()
            soup = BeautifulSoup(content, "html.parser")
        else:
            try:
                resp = rq.get("https://" + page_link, headers=headers, allow_redirects=False)
            except rq.exceptions.SSLError:
                resp = rq.get("http://" + page_link, headers=headers, allow_redirects=False)
            content = resp.content

            http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(content, is_html=True)
            encoding = html_encoding or http_encoding
            soup = BeautifulSoup(content, "html.parser", from_encoding=encoding)

        # Skip pages that came back empty. `content` is set on both branches
        # above; the previous check read `resp`, which does not exist (or is
        # left over from another page) when the snapshot is used.
        if not content:
            continue

        for link in soup.find_all("a", href=pattern):
            href = link.get('href')
            text = link.string
            if href.endswith(ignored_suffixes):
                continue
            if "#" in href:
                continue
            # Normalise to "<host>/<path>" without scheme
            if href.startswith("//"):
                href = href[2:]
            if href.startswith("/"):
                href = home_page + href[1:]
            href = href.replace("https://", "")
            href = href.replace("http://", "")

            if not href.startswith(home_page):
                continue
            if href in visited:
                continue
            # Mark every in-site link as seen exactly once, including links
            # that robots.txt forbids, so they are not re-evaluated later.
            visited.add(href)
            lista_paginas.append(href)

            # can_fetch() needs a real URL: a scheme-less string is treated as
            # a bare path and never matches a "/..." rule. Cinema em Cena is
            # always allowed (it has no usable robots.txt).
            if rp.can_fetch("*", "https://" + href) or site_name == 'Cinema_em_Cena':
                score = get_link_score_2(href, text, home_page, page_score)
                page_ranking.append((href, score))

        # One stable sort per page yields the same order as sorting after every
        # append: equal scores keep their discovery order.
        page_ranking.sort(key=lambda entry: entry[1], reverse=True)

        cont = cont + 1
        if page_score >= 2 and page_score != 3:
            last_cont = cont
            # Number the file after what pages.json already holds, so a
            # resumed crawl does not overwrite earlier pages.
            with open(output_dir + "pages.json") as f:
                link_number = json.load(f)
            page_name = "page_" + str(len(link_number) + 1) + ".html"
            with open(output_dir + page_name, 'wb') as f:
                f.write(content)
            link_number.append((page_link, page_name))
            with open(output_dir + "pages.json", 'w') as outfile:
                json.dump(link_number, outfile)

        # Short random pause every 10 pages
        if (cont % 10) == 0:
            time.sleep(randrange(3))

        with open('lista_paginas.json', 'w') as outfile:
            json.dump(lista_paginas, outfile)
        with open('fila_paginas.json', 'w') as outfile:
            json.dump(page_ranking, outfile)
        with open('cont.json', 'w') as outfile:
            json.dump(cont, outfile)

        if (cont - last_cont) > max_idle_pages:
            print("muito tempo ocioso")
            break
    

# with open('lista_paginas.json', 'w') as outfile:
#     json.dump([], outfile)
# with open('fila_paginas.json', 'w') as outfile:
#     json.dump([], outfile)
# with open('cont.json', 'w') as outfile:
#     json.dump(0, outfile)

# get_links_heuristic_2(sites[9])


In [7]:
# pag = sites[9]

# with open("pages/bfs/"+pag[0]+"/" +"pages.json") as f:
#     bfs_pages = json.load(f)
# with open("pages/heuristic/"+pag[0]+"/" +"pages.json") as f:
#     heuristic_pages = json.load(f)
# with open("pages/heuristic_2/"+pag[0]+"/" +"pages.json") as f:
#     heuristic_2_pages = json.load(f)

# unique_pages = []

# for p in bfs_pages:
#     unique_pages.append(p[0])
# for p in heuristic_pages:
#     if p[0] not in unique_pages:
#         unique_pages.append(p[0])
# for p in heuristic_2_pages:
#     if p[0] not in unique_pages:
#         unique_pages.append(p[0])
# len(unique_pages)
# #unique_pages
# with open("pages/unique_"+pag[0]+".json", 'w') as outfile:
#     json.dump(unique_pages, outfile)

### Usando classificador

In [9]:
import os
import re
import unicodedata
import random
from enum import Enum

import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest

from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

from sklearn.externals import joblib

from sklearn.exceptions import UndefinedMetricWarning
import warnings

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

class ETokenType(Enum):
    """
    Enumerable class with all token's types.

    Each value is the zero-based index of the capturing group in
    WordTokenizer._token_pattern that produced the token;
    WordTokenizer._tag_tokens builds members with ETokenType(index).

    Update this enum every time a new regex group is added to WordTokenizer._token_pattern
    The order of the values must match with WordTokenizer._token_pattern regexes' order
    """
    # E-mail addresses
    EMAIL = 0
    # URLs
    URL = 1
    # Titles glued to digits or other words
    GLUED_TITLES = 2
    # Words glued to digits or other words
    GLUED_WORD = 3
    # Lower-case words glued to digits or capitalized words
    GLUED_LOWER = 4
    # Telephone numbers and CEPs (both matched by the same regex group)
    TELEPHONE_CEP = 5
    # Values such as currencies, percentages and measures
    VALUE = 6
    # Dates
    DATE = 7
    # Values that were glued to digits or separators
    GLUED_VALUE = 8
    # Words and abbreviations
    WORD = 9
    # Anything that is not a letter, a digit, a space or a line break
    NON_WORD = 10

def read_corpus(corpus_dir, lang, ignored_token_types=(), min_token_size=2):
    """
    Read and tokenize every file found under the received directory.

    The directory tree is walked recursively and every file is read as UTF-8
    text, whatever its extension; files that cannot be decoded are reported
    and skipped. Sub-directory names act as class names: a document is
    prefixed with the last known class name that occurs in its path, or with
    an empty string when none does.

    :param corpus_dir: corpus directory
    :param lang: language passed to WordTokenizer (stopword list selection)
    :param ignored_token_types: ETokenType members whose tokens are discarded
    :param min_token_size: minimum length a token needs to be kept
    :return: {"<class>_<file name>": [doc_terms]}
    """

    tokenizer = WordTokenizer(lang, remove_stopwords=True, lower_case=True, do_stemming=True)

    corpus = {}
    classes = []
    for path, subdirs, files in os.walk(corpus_dir):
        # The most recently seen non-empty list of sub-directories is the
        # current set of candidate class names.
        if subdirs:
            classes = subdirs

        cur_class = ''
        for class_ in classes:
            if class_ in path:
                cur_class = class_

        for file in files:
            try:
                # Context manager so the handle is closed even when decoding
                # fails; previously the file object was never closed.
                with open(os.path.join(path, file), mode='r', encoding='utf-8') as handle:
                    html = handle.read()
            except UnicodeDecodeError:
                print('Error reading file:', file)
                continue

            corpus[cur_class + '_' + file] = tokenizer.tokenize(
                html,
                ignored_token_types=ignored_token_types,
                min_token_size=min_token_size
            )

    print('Corpus loaded, document count:', len(corpus))

    return corpus



class WordTokenizer(object):
    """Regex-based tokenizer for HTML documents.

    Markup is stripped, diacritics are removed and the text is split with
    ``_token_pattern``. Each token is tagged with the ETokenType of the regex
    group that matched it. Depending on the constructor flags, stopwords are
    then removed, tokens are lower-cased and stemmed.
    """

    # _token_pattern holds its state across instances of WordTokenizer
    # Every time a new regex group is added to _token_pattern, ETokenType must be updated
    # The order of the regexes' order must match with ETokenType values' order
    _token_pattern = r"""(?x)           # Set flag to allow verbose regexps
        ([\w\.-]+@[\w\.-]+(?:\.[\w]+)+) # E-mail regex
        | (                             # URL regex
            (?:http(?:s)?(?::)?(?:\\\\)?)?  # Optional http or https followed by optional : and //
            (?:[a-z0-9_-]+\.)?              # Optional domain
            [a-z0-9_-]+                     # host
            (?:\.
                (?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)
            )+
            (?::[0-9]+)?                    # Optional port
            (?!\w)(?:\/(?:[^\s\.,]|[\.,][^\s\.,])+)*(?![^\.,]$)  # Optional relative URI
        )
        | ([A-Z][a-z]+(?=\.?(?:[A-Z][A-Za-z]|\d)+)) # Capture titles glued to digits or other words
        | ([A-Z][A-Za-z]+(?=\.?(?:[A-Z][a-z]|\d)+)) # Capture words glued to digits or other words
        | ([a-z]+(?=\.?(?:[A-Z]|\d)+))              # Capture lower words glued to digits or captalized words
        | (         # Capture telephones and CEPs
            (?:         # Asserts telephones
                (?:(?:\(?\ *)\d{2,3}(?:\ *\))?)?    # Gets the DDD
                (?:\ *9\ *(?:\.|-|\/|\\)?)?         # Optional ninth digit
                (?!(?:1|2)\d{3})        # Negative lookahead to prevent from getting years
                \d{4}(?:\.|-|\/|\\)?        # First 4 telephone digits with optional separator
                \d{4}                       # Last 4 digits
            ) | (?:     # Asserts CEPs
                \d{2}(?:\.|-|\/|\\)?    # First two digits, followed by an optional separator
                \d{3}(?:\.|-|\/|\\)?    # Following three digits, followed by an optional separator
                \d{3}                   # Last three digits
            )   # Since the CEPs regex gets some telephones as false positives
        )       # both regexes are in same group
        | (             # Capture values (as in currencies, percentage, measures...)
            (?<![\d\.\/\\-])        # Negative lookbehind for digits or separators
            (?:(?:R?\$|€)(?:\ )*)?  # Currencies symbols
            (?!(?:1|2)\d{3})        # Negative lookahead to prevent from getting years
            \d+                     # Proper digits
            (?:
                (?:\.|,)            # Punctuation
                (?!(?:1|2)\d{3})    # Negative lookahead to prevent from getting years
                \d+                 # After punctuation digits
            )*
            (?:%|\w{1,3}\b)?        # Percentage or measures abbreviations
            # (?![\d\.\/\\-])         # Negative lookahead for digits or separators TODO: Fix it by 15%15%9999999999911111 199999999999999 12-1999 janeiro/2000 09/9/2000
        )
        | (         # Date regex
            # (?<![\d])   # Negative lookbehind for digits
            (?:(?:0?[1-9]|[1-2][0-9]|3[0-1])(?!\d)(?:\.|-|\/|\\))?    # Asserts the first of three parts of a date (optional)
            (?:(?:[A-Za-z_]+|0?[1-9]|[1-2][0-9]|3[0-1])(?!\d)(?:\.|-|\/|\\))?   # Asserts the second part, can be either a word or one to two digits (optional)
            (?:(?:(?:1|2)\d{3})|[0-9]{2})(?!\d)                       # Asserts the year
        )
        | (     # Capture (glued) values (as in currencies, percentage, measures...)
            (?:(?:R?\$|€)(?:\ )*)?  # Currencies symbols
            \d+                     # Proper digits
            (?:(?:\.|,)\d+)*        # Punctuation
            (?:%|\w{1,3}\b)?        # Percentage or measures abbreviations
        )       # This second search aims to get values that were glued to digits or separators
        | ((?:\w+\.?)*(?:\w+))   # Words and abbreviations with optional : at the end
        | ([^A-Za-z0-9\ \n])    # Every thing that is not a letter, a digit, space or line break
    """

    ## Regexes for html pages splitting, compiled once for the whole class
    #  instead of on every _extract_text call
    #  Remove script tags and its content
    _SCRIPT_TAG_REGEX = re.compile(r'<script.+?>(.|\n)+?</script>')
    # NOTE(review): this pattern has never been applied by _extract_text, so
    # <style> contents stay in the extracted text. It is deliberately left
    # unapplied here so tokens keep matching what the persisted vectorizer is
    # fed. TODO confirm against the training pipeline before enabling it.
    _STYLE_TAG_REGEX = re.compile(r'<style.+?>(.|\n)+?</style>')
    # Remove remaining tags, leaving content
    _HTML_TAGS_REGEX = re.compile(r'<[^>]*>')

    @property
    def token_pattern(self):
        """
        Read-only property. This property holds its state across instances of WordTokenizer.
        """
        return self._token_pattern

    @property
    def stopwords(self):
        # TODO: Set self.remove_stopwords setter to also set self._stopwords considering the lang
        #       Also set the lang setter to change self._stopwords accordingly
        """
        Read-only property. Returns the list of stopwords if and only if
        self.remove_stopwords is True, loading it on first use; None otherwise.
        """
        if self.remove_stopwords:
            if self._stopwords is None:
                self._stopwords = nltk.corpus.stopwords.words(self.lang)
            return self._stopwords

        return None

    def __init__(self, lang, remove_stopwords=False, lower_case=False, do_stemming=False):
        """
        :param lang: language name used to select the NLTK stopword list
        :param remove_stopwords: drop lower-case tokens found in the stopword list
        :param lower_case: lower-case every token
        :param do_stemming: stem every token with NLTK's RSLPStemmer
        """
        self.lang = lang
        self.remove_stopwords = remove_stopwords
        self.lower_case = lower_case
        self.do_stemming = do_stemming
        self._stopwords = None

        if self.remove_stopwords:
            self._stopwords = nltk.corpus.stopwords.words(lang)
        if do_stemming:
            self.stemmer = nltk.stem.RSLPStemmer()

    def _extract_text(self, html):
        """
        Replaces script blocks, then every remaining tag, with a single space.
        """
        without_scripts = self._SCRIPT_TAG_REGEX.sub(' ', html)
        return self._HTML_TAGS_REGEX.sub(' ', without_scripts)

    def _shave_marks(self, text):
        """
        Removes all diacritic marks from the given string
        """
        if text is None:
            return ''

        norm_text = unicodedata.normalize('NFD', text)
        shaved = ''.join(char for char in norm_text if not unicodedata.combining(char))
        return unicodedata.normalize('NFC', shaved)

    def _tag_tokens(self, document_tokens):
        """
        Turns each tuple of regex groups into a single (ETokenType, token) pair.

        Exactly one group per tuple is expected to be non-empty; its position
        selects the ETokenType.
        """
        typed_tokens = []
        for match_group in document_tokens:
            typed_group = []
            for index, match in enumerate(match_group):
                if match:
                    typed_group.append((ETokenType(index), match))

            # if typed_group:
            assert len(typed_group) > 0, "Token with no match, probably missing parenthesis on regex"
            assert len(typed_group) == 1, "Multiple matches for a single token %r" % ' '.join(match_group)
            typed_tokens.append(typed_group[0])

        return typed_tokens

    def tokenize(self, html, ignored_token_types=(), min_token_size=2):
        """
        Tokenize a string by: e-mail, url, date, glued words, values, abbreviations, words and
        every thing that isn't a letter, digit, blank space or line break.

        Returning only tokens of desirable types

        :param html: document to tokenize (markup is stripped first)
        :param ignored_token_types: ETokenType members whose tokens are discarded
        :param min_token_size: minimum length (after stripping) a token needs to be kept
        :return: list of token strings
        """

        # Extract text from html document
        text = self._extract_text(html)

        # Remove diacritcs
        shaved_text = self._shave_marks(text)

        # Returns an array where every position has a tuple with one position to
        # every regex on token_pattern
        document_tokens = nltk.regexp_tokenize(shaved_text, self._token_pattern)

        # Transform the array of tuples into another array of tuples where
        # the first position is the token_type and the second is the token itself
        document_tokens = self._tag_tokens(document_tokens)

        # Filter token types
        document_tokens = [token for token_type, token in document_tokens
                           if token_type not in ignored_token_types]

        if self.remove_stopwords:
            # Going through the property loads the list if the flag was turned
            # on after construction; the set makes each lookup O(1) instead of
            # a scan of the whole stopword list per token.
            stopword_set = frozenset(self.stopwords)
            # Keeps tokens that has at least one captalized letter (even if is a stopword)
            # Since only lower case words test the second condition, there is no need to lower the token
            document_tokens = [token for token in document_tokens
                               if not token.islower() or token not in stopword_set]

        if self.lower_case:
            document_tokens = [token.lower() for token in document_tokens]

        document_tokens = [token.strip() for token in document_tokens if len(token.strip()) >= min_token_size]

        if self.do_stemming:
            document_tokens = [self.stemmer.stem(token) for token in document_tokens]

        return document_tokens



In [10]:
# Load the persisted pipeline pieces in the order they are applied later:
# dt (classifier), vectorizer, selector (feature selector).
# NOTE(review): joblib files are pickles; only load artifacts you trust.
dt, vectorizer, selector = (
    joblib.load(artifact_path)
    for artifact_path in (
        '../classifier/decisiontree.joblib',
        '../classifier/vectorizer.joblib',
        '../classifier/featureselector.joblib',
    )
)



In [29]:
# Tokenize every page saved by the first heuristic crawl of Pocilga, dropping
# token types that carry no vocabulary (e-mails, URLs, phones/CEPs, dates and
# punctuation).
corpus = read_corpus(
    './pages/heuristic/Pocilga/',
    'portuguese',
    ignored_token_types=[
        ETokenType.EMAIL, ETokenType.URL, ETokenType.TELEPHONE_CEP,
        ETokenType.DATE, ETokenType.NON_WORD,
    ],
    min_token_size=2,
)

# Separate ids from documents; each document becomes one space-joined string
ids, documents = zip(*((doc_id, ' '.join(tokens)) for doc_id, tokens in corpus.items()))

# The label is whatever precedes the first "_" of the document id
labels = np.array([doc_id.split('_')[0] for doc_id in ids])
ids = np.array(ids)
documents = np.array(documents)

Error reading file: page_38.html
Error reading file: page_35.html
Error reading file: page_1.html
Corpus loaded, document count: 204


In [30]:
# Count how many crawled documents the classifier labels as 'positive'.
# Each document goes through the same steps: vectorize, select features, predict.
lista = []
cont = 0
for document in documents:
    features = selector.transform(vectorizer.transform([document]))
    prediction = dt.predict(features)
    if prediction[0] == 'positive':
        cont += 1
print(cont)

177
