In [1]:
# Reading PDF files and basic NLP
# import PyPDF2
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
import io

import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fuzzy string match
from fuzzywuzzy import fuzz

# To generate word clouds
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
% matplotlib inline

# Read unicode data from the extra_stop_words file
import unicodedata
import re
import os

# Frequency counting and collections
import numpy as np
import collections

# To navigate through all files in a directory
from os import listdir
from os.path import isfile, join

## 1. Define functions for the text extraction, preprocessing and analysis

In [2]:
# Adapted from https://gist.github.com/boniattirodrigo/67429ada53b7337d2e79
def remove_special_characters(word, only_numbers_letters_spaces=False):

    # Unicode normalize transforma um caracter em seu equivalente em latin.
    nfkd = unicodedata.normalize('NFKD', word)
    plain_word = u"".join([c for c in nfkd if not unicodedata.combining(c)])

    # return the word with only numbers, letters and spaces
    if only_numbers_letters_spaces:
        return re.sub('[^a-zA-Z0-9 \\\]', '', plain_word)
    return plain_word

In [3]:
# Adapted from https://medium.com/@rqaiserr/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f
# Updated the keywords removal 
def extract_keywords(path_pdf,
                     pdf_file,
                     path_output_raw_text,
                     language,#='portuguese', 
                     path_extra_stop_words,#='/Users/hmg/Dropbox/veve e heitor/Projeto_tese',
                     file_extra_stop_words#='extra_stop_words.txt',
                    ): 
    # The method used to transform the pdf to text ('pdfminer.six' or 'textract_OCR')
    method = ''
    # Check if the raw text is not already availabe in the path_output_raw_text
    text = ''
    try:
        candidate_raw = open(os.path.join(path_output_raw_text, pdf_file.replace('.pdf', '.txt')), 'r')
        text = candidate_raw.read()
#         print('successfully read the raw text! ')
    except IOError:
        
#         if use_pyPDF:
#     #         print('gotta scan the pdf...')
#             # open allows you to read the file
#             pdfFileObj = open(os.path.join(path_pdf, pdf_file), 'rb')
#             # The pdfReader variable is a readable object that will be parsed
#             pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
#             # discerning the number of pages will allow us to parse through all the pages
#             num_pages = pdfReader.numPages
#             # Just to show info about creator and creation time. 
#             # print(str(pdfReader.getDocumentInfo())
#             count = 0

#             # The while loop will read each page
#             while count < num_pages:
#                 pageObj = pdfReader.getPage(count)
#                 count += 1
#                 text += pageObj.extractText()
#             # This if statement exists to check if the above library returned #words. It's done because PyPDF2 cannot read scanned
#             # files.
            
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle)
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(os.path.join(path_pdf, pdf_file), 'rb') as candidate_raw:
            for page in PDFPage.get_pages(candidate_raw, 
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

            text = fake_file_handle.getvalue()

        # close open handles
        converter.close()
        fake_file_handle.close()
        
        if text:
            method = 'pdfminer.six'
            text = text
        # If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text
        else: # fileurl
            method = 'textract_OCR'
#             print('Activating OCR library for {0} file (language = {1})'.format(pdf_file,language))
            if language == 'portuguese':
                text = textract.process(os.path.join(path_pdf, pdf_file), method='tesseract', language='por', encoding='utf8')
            else:
                text = textract.process(os.path.join(path_pdf, pdf_file), method='tesseract', language='eng', encoding='utf8')

        file_raw_text = open(os.path.join(path_output_raw_text, pdf_file.replace('.pdf', '.txt')), 'w', encoding='utf8')
        file_raw_text.write(text)
    
    text = remove_special_characters(text)
    # The word_tokenize() function will break our text phrases into #individual words
    tokens = word_tokenize(text)
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '%', '-', '.', '|', '']
    stop_words = stopwords.words(language)
    extra_stop_words = open(os.path.join(path_extra_stop_words, file_extra_stop_words), 'r', encoding='utf8').read().split('\n')
    
    # We create a list comprehension which only returns a list of words #that are NOT IN stop_words and NOT IN
    # punctuations.
    keywords = [word.lower() for word in tokens 
                if 
                not word.lower() in stop_words and 
                not word.lower() in punctuations and 
                not word.lower() in extra_stop_words and 
                word.isalpha() and
                len(word) >= 2]
    
    return np.asarray(keywords), method

In [4]:
def word_cloud_from_keywords_frequency(keywords_frequency, file_name, path='./wordclouds/', show_image=False):
    wordcloud = WordCloud(width = 512, height = 512, background_color='white')
    fig = plt.figure(figsize=(20,16),facecolor = 'white', edgecolor='blue')
    plt.imshow(wordcloud.generate_from_frequencies(keywords_frequency), interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad=0)
    
    if show_image:
        plt.show()
    plt.savefig(path+file_name)
    plt.close(fig)

In [5]:
def calculate_statistics(keywords):
    total_words = 0
    keywords_dict = dict(collections.Counter(keywords).most_common())
    for k in keywords_dict:
        total_words += keywords_dict[k]

    stats = dict()
    for k in keywords_dict:
        stats[k] = {'count': keywords_dict[k], 'text_frequency': keywords_dict[k]/float(total_words)}    
#     stats['frequency'].most_common(10)
    
    return stats

In [6]:
def write_report(path, output_file, keyword_stats):
    output = open(path+output_file, 'w', encoding='utf8')
    header = u'word,count,text_frequency\n'
    output.write(header)
    for k in keyword_stats:
        line = u'%s,%d,%.5f\n' % (k, keyword_stats[k]['count'], keyword_stats[k]['text_frequency'])
        output.write(line)
    output.close()

## 2. Count occurrences of any word

* All PDFs in a given path are processed.
* This function generates 'raw reports', counting the occurrence and frequency of every word of the document. 
* A word is defined as a sequence of characters delimited by empty spaces, i.e. ' '. 
* **TODO:** This version of the code still doesn't handle pdfs that are in PT and EN.

In [7]:
def process_PDFs(path='/Users/hmg/Desktop/data/PDF_relatorios_sustentabilidade/', 
                     raw_text_path='./raw_texts/EN/',
                     output_path='./reports/EN/', 
                     wordclouds_path='./wordclouds/EN/',
                     language='english',
                     path_extra_stop_words='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/',
                     file_extra_stop_words='extra_stop_words_EN.txt',
                     wordclouds=False, 
                     start_index=0):
    
    num_pdfminersix = 0
    num_textract_OCR = 0
    # We assume there are only pdfs in this directory
    PDFs = [f for f in listdir(path) if isfile(join(path, f))]
    PDFs = list(filter(lambda pdf: pdf.find('.pdf') != -1, PDFs))
        
    counter = start_index
    for pdf in PDFs[counter:]:
        
        print('Processing PDFs ({0}) {1} of {2} ({3})'.format(language, counter+1, len(PDFs), pdf))
        keywords, method = extract_keywords(path, pdf, 
                                    path_output_raw_text=raw_text_path, 
                                    language=language,
                                    path_extra_stop_words=path_extra_stop_words,
                                    file_extra_stop_words=file_extra_stop_words)
        if method == 'pdfminer.six':
            num_pdfminersix += 1
        else:
            num_textract_OCR += 1
        keywords_statistics = calculate_statistics(keywords)
        write_report(output_path, pdf.replace('.pdf', '.csv'), keywords_statistics)
        counter = counter + 1
        if wordclouds:
            word_cloud_from_keywords_frequency(collections.Counter(keywords), 
                                               pdf.replace('.pdf', '.png'), 
                                               path=wordclouds_path,
                                               show_image=False)
    print('Stats for {0}: num_pdfminersix = {1}, num_textract_OCR = {2}'
          .format(raw_text_path, num_pdfminersix, num_textract_OCR))

In [19]:
%%time
# # ENglish PDFs processing
# process_PDFs(path='/Users/hmg/Desktop/Data/pdfs_EN/', 
#              raw_text_path='./raw_texts/EN/',
#              output_path='./reports/EN/', 
#              wordclouds_path='./wordclouds/EN/',
#              language='english',
#              wordclouds=True,
#             start_index = 0)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 9.78 µs


In [20]:
%%time
# # PorTuguese PDFs processing
# for i in range(1,5): # Hardcoded 5 because I know there are only 4 directories... 
#     print('Processing pdfs_PT{0}'.format(i))
#     process_PDFs(path='/Users/hmg/Desktop/Data/pdfs_PT{0}/'.format(i), 
#              raw_text_path='./raw_texts/PT/',
#              output_path='./reports/PT/', 
#              wordclouds_path='./wordclouds/PT/',
#              language='portuguese',
#              file_extra_stop_words='extra_stop_words_PT.txt',
#              wordclouds=True,
#              start_index=0)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 11.9 µs


## EXPERIMENTAL : <span style="color:red">trying 'por' in tesseract-ocr</span>

In [13]:
# %%time
# process_PDFs(path='/Users/hmg/Desktop/Data/pdfs_PT_dev/', 
#              raw_text_path='./raw_texts/PT_dev/',
#              output_path='./reports/PT_dev/', 
#              wordclouds_path='./wordclouds/PT_dev/',
#              language='portuguese',
#              file_extra_stop_words='extra_stop_words_PT.txt',
#              wordclouds=True,
#              start_index=0)

Processing PDFs (portuguese) 1 of 1 (RIO_35_Non_2014.pdf)
Extracted using pdfminer.six ...
CPU times: user 18.7 s, sys: 202 ms, total: 18.9 s
Wall time: 19.1 s


In [12]:
# %%time
# process_PDFs(path='/Users/hmg/Desktop/Data/pdfs_PT_dev/', 
#              raw_text_path='./raw_texts/PT_dev/',
#              output_path='./reports/PT_dev/', 
#              wordclouds_path='./wordclouds/PT_dev/',
#              language='english',
#              file_extra_stop_words='extra_stop_words_PT.txt',
#              wordclouds=True,
#              start_index=0)

Processing PDFs (english) 1 of 1 (RIO_35_Non_2014.pdf)


## 3. Count occurrences of predefined phrases/words

**IN-DEVELOPMENT:** Count occurrences of predefined text using 'fuzzy string match', added ```word_match_count``` lambda object. 

In [14]:
# def count_total_words(text, delim=' '): 
#     return len(remove_special_characters(text).split(delim))

In [8]:
def count_normal(text, word, confidence = 0, phrases = False, debug = False, output = ''):
    if phrases:
        return text.count(word)
    else:
        counter = 0
        for p in text.split(' '):
            if(p == word):
                counter = counter + 1
        return counter

def count_fuzzy(text, word, confidence = 95, phrases=False, debug = False, output = ''):
    counter = 0
    if len(word) == 0:
        return 0
    # Phrases
    if phrases:
        # Sliding window strategy: create a text_word from text with length word.len 
        #  and by moving 1 character at a time
        #     If match, then skip the next word.len, just to avoid double counting!
        i = 0
        ## DEBUG
#         print('count_fuzzy - word = {0} and, len(text) = {1}, confidence = {2}'.format(word, len(text), confidence))
        if debug:
            output.write('{0} ({1})'.format(word, confidence) + ',')
#             print('{0} ({1})'.format(word, confidence))
    
        while i < len(text):
            text_word_last_idx = i+len(word)
            match_confidence = 0
            if text_word_last_idx <= len(text):
                match_confidence = fuzz.ratio(text[i:text_word_last_idx], word)
#                 print('token_set_ratio({0},{1})={2}'.format(text[i:text_word_last_idx], word, match_confidence))
                if match_confidence > confidence:
                    counter += 1
                    if debug:
                        output.write('{0} ({1})'.format(text[i:text_word_last_idx], match_confidence) + ',')
#                         print('{0} =~ {1} ({2})'.format(text[i:text_word_last_idx], word, match_confidence))
                    i += len(word)
                else:
                    i += 1
            else:
                break
            ## DEBUG - only about 100 characters
#             if debug and (i % int(len(text)/10) == 0):
#                 print('{} of {} = {:.2f}%'.format(i, len(text), i/len(text)*100))
    else:
        text_vec = text.split()
        if debug:
            output.write('{0} ({1})'.format(word, confidence) + ',')
        for w in text_vec:
            match_confidence = fuzz.ratio(w, word)
            if match_confidence > confidence:
                if debug:
                    output.write('{0} ({1})'.format(w, match_confidence) + ',')
                counter += 1
    if debug:
        output.write('\n')
    return counter

In [21]:
def filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_EN.txt', 
                            raw_text_path='./raw_texts/EN/',
                            output_path='./reports_special/EN/',
                            output_file='output_EN.csv',
                            language='english',
                            path_extra_stop_words='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/',
                            file_extra_stop_words='extra_stop_words_EN.txt',
                            word_match_count=count_normal,
                            output_encoding='utf8',
                            confidence=95,
                            phrases=False,
                            output_debug_path='./debug_fuzzy/EN/',
                            debug=False,
                            start_index=0,
                            last_index=-1,
                            preprocessing=False):
    print('Starting filter_special_keywords')
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '%', '-', '.', '|', '']
    stop_words = stopwords.words(language)
    extra_stop_words = open(os.path.join(path_extra_stop_words, file_extra_stop_words), 'r', encoding='utf8').read().split('\n')
    
    if not phrases:
        print('[not phrases] Stopwords will be removed prior to processing the texts.')
        
    
    filter_words = ''
    try:
        filter_words = open(os.path.join(filter_list_path, filter_list_file), 'r', encoding='utf8').read().split('\n')
        print('successfully read the filter words at {0} named {1}'.format(filter_list_path, filter_list_file))
    except IOError:
        print('failed to read the filter words at {0} named {1}'.format(filter_list_path, filter_list_file))
    
    # assuming there are only the raw texts in the directory
    text_files = [f for f in listdir(raw_text_path) if isfile(join(raw_text_path, f))]
    text_files = list(filter(lambda text_file: text_file.find('.txt') != -1, text_files))
    
    stats_per_file = {}
    total_words_per_file = {}
    
    processed = start_index
    if last_index == -1:
        last_index = len(text_files)
    
    for text_file in text_files[processed:last_index]:
        print('Processing Text ({0}) {1} of {2} ({3})'.format(language, processed+1, 
                                                              last_index, text_file))
        debug_output = open(output_debug_path+'DEBUG_{0}_'.format(confidence)+text_file.replace('.txt','.csv'), 'w', encoding='utf8') 
        try:
            text = open(os.path.join(raw_text_path, text_file), 'r').read()
        except IOError:
            print('Failed to open file at {0} named {1}'.format(raw_text_path, text_file))
                
        # Initialize the stats dictionary that will hold counters for the search words {'word': #occurrences}
        stats = {}
        
        # Transform the original text, transform from unicode to ASCII (DOES NOT REMOVE SPECIAL) and set it to lower.
        search_text = remove_special_characters(text).lower()
        
#         if preprocessing: PREPROCESSING WILL CERTAINLY BE USED FOR COUNTING NUMBER OF WORDS!!
        # This remove stop_words, extra_words and punctuations before counting. 
        # TODO: merge this and the previous stop_word filter in a function. 
        search_text_nostopwords = [word for word in search_text.split(' ')
                               if 
                               not word in stop_words and
                               not word in punctuations and 
                               not word in extra_stop_words and
                               len(word) >= 1]
        
        if preprocessing:
            # If NOT phrases, then it is safe to remove stop words from the search_text.
            # That is an assumption! Maybe we run into a search word that is a stop word!
            if not phrases:
                search_text = ' '.join(search_text_nostopwords)
                
        # Update to ALWAYS count as total words after removing stop_words. 
        stats['@TOTAL_WORDS'] = len(search_text_nostopwords)
#         else:
#             stats['@TOTAL_WORDS'] = len(search_text.split())
        
        for filter_word in filter_words:
            # Transform the filter_word, transform from unicode to ASCII (DOES NOT REMOVE SPECIAL) and set it to lower.
            search_word = remove_special_characters(filter_word).lower()
            stats[filter_word] = word_match_count(search_text, search_word, confidence, phrases, debug, debug_output)
            # text.lower().count(filter_word.lower())
#             line = u'%s,%d,%.5f\n' % (filter_word, counter, -1)

        stats_per_file[text_file.replace('.txt','')] = stats
        
        processed = processed + 1
        
    output = open(output_path+output_file, 'w', encoding=output_encoding)   
    header = u',' + u','.join(text_files[start_index:last_index])+u'\n'
    header = header + u'words \ total_words_per_file,' + u','.join(total_words_per_file)
    output.write(header)
    
    for text_file in text_files[start_index:last_index]:
        output.write(str(stats_per_file[text_file.replace('.txt','')]['@TOTAL_WORDS']) + ',')
    output.write('\n')
    
    for filter_word in filter_words:
        output.write(filter_word + ',')
        for text_file in text_files[start_index:last_index]:
#             line = u'%s,%d,%.5f\n' % (filter_word, counter, -1)
            output.write(str(stats_per_file[text_file.replace('.txt','')][filter_word]) + ',')
        output.write('\n')
    output.close()
    print('Finishing filter_special_keywords')

# ~ + ~ + ~ + ~ + ~ + ~ + ~
# Filter the special words
# ~ + ~ + ~ + ~ + ~ + ~ + ~

## 3.1 EN 'exact match'


### <span style="color:blue">PREPROCESSING=OFF</span>
### <span style="color:blue">EXACT MATCH</span>

In [25]:
%%time
filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_EN.txt', 
                            raw_text_path='./raw_texts/EN/',
                            output_path='./reports_special/EN/',
                            output_file='relatorio_normal_frases_EN.csv',
                            word_match_count=count_normal,
                            confidence=-1,
                            phrases=True,
                            preprocessing=False
                            )

Starting filter_special_keywords
successfully read the filter words at /Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/ named lista_palavras_EN.txt
Processing Text (english) 1 of 209 (CPFL_RE_10_2014_EN.txt)
Processing Text (english) 2 of 209 (ESTC_11_2015_EN.txt)
Processing Text (english) 3 of 209 (BUN_8_Non_2012_EN.txt)
Processing Text (english) 4 of 209 (EMBR_19_2012_EN.txt)
Processing Text (english) 5 of 209 (LREN_17_2015_EN.txt)
Processing Text (english) 6 of 209 (Syngenta_24_2015_EN.txt)
Processing Text (english) 7 of 209 (TCSA_21_2017_EN.txt)
Processing Text (english) 8 of 209 (ALI_14_Non_2015_EN.txt)
Processing Text (english) 9 of 209 (END_2_Non_2011_EN.txt)
Processing Text (english) 10 of 209 (FIBR_7_2014_EN.txt)
Processing Text (english) 11 of 209 (SANB_14_2011_EN.txt)
Processing Text (english) 12 of 209 (IMA_33_Non_2013_EN.txt)
Processing Text (english) 13 of 209 (CIEL_14_2016_EN.txt)
Processing Text (english) 14 of 209 (INP_24_Non_2013_EN.txt)
Processing Text (english) 1

Processing Text (english) 138 of 209 (HSB_14_Non_2013_EN.txt)
Processing Text (english) 139 of 209 (Syngenta_24_2016_EN.txt)
Processing Text (english) 140 of 209 (FDC_11_Non_2017_EN.txt)
Processing Text (english) 141 of 209 (ITA_10_Non_2014_EN.txt)
Processing Text (english) 142 of 209 (BBDC_14_2012.txt)
Processing Text (english) 143 of 209 (NATU_15_2016_EN.txt)
Processing Text (english) 144 of 209 (LAME_25_2017_EN.txt)
Processing Text (english) 145 of 209 (TIET ELE_2_2015_EN.txt)
Processing Text (english) 146 of 209 (KLBN_7_2017_EN.txt)
Processing Text (english) 147 of 209 (B3SA_14_2015_EN.txt)
Processing Text (english) 148 of 209 (ELE_10_Non_2014_EN.txt)
Processing Text (english) 149 of 209 (INV_12_Non_2015_EN.txt)
Processing Text (english) 150 of 209 (BBAS_14_2015_EN.txt)
Processing Text (english) 151 of 209 (TET_6_Non_2018_EN.txt)
Processing Text (english) 152 of 209 (AMA_12_Non_2016_EN.txt)
Processing Text (english) 153 of 209 (VIM_1_Non_2015_EN.txt)
Processing Text (english) 154 o

In [27]:

# filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
#                             filter_list_file='lista_palavras_EN.txt', 
#                             raw_text_path='./raw_texts/EN_dev/',
#                             output_path='./reports_special/EN_dev/',
#                             output_file='relatorio_normal_frases_EN_dev.csv',
#                             word_match_count=count_normal,
#                             confidence=-1,
#                             phrases=True,
#                             preprocessing=False
#                             )

### 3.1.1 EN <span style="color:red">'fuzzy match'</span>


#### PREPROCESSING=OFF
# <span style="color:red">FUZZY MATCH (90%)</span>

In [24]:
%%time
CONFIDENCE = 90
START_INDEX = 0
LAST_INDEX = -1

filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_EN.txt', 
                            raw_text_path='./raw_texts/EN/',
                            output_path='./reports_special/EN/',
                            output_file='relatorio_aproximado({0})_frases_EN_{1}-{2}.csv'
                                        .format(CONFIDENCE, START_INDEX, str(LAST_INDEX) if LAST_INDEX != -1 else 'end'),
                            word_match_count=count_fuzzy,
                            confidence=CONFIDENCE,
                            phrases=True,
                            preprocessing=False,
                        
                            output_debug_path='./debug_fuzzy/EN/',
                            debug=True,
                            start_index=START_INDEX,
                            last_index=LAST_INDEX
                            )

Starting filter_special_keywords
successfully read the filter words at /Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/ named lista_palavras_EN.txt
Processing Text (english) 1 of 4 (ITUB_14_2015_EN.txt)
Processing Text (english) 2 of 4 (V_M_3_Non_2011_EN.txt)
Processing Text (english) 3 of 4 (BBAS_14_2012_EN.txt)
Processing Text (english) 4 of 4 (ALLL_5_2014_EN.txt)
Finishing filter_special_keywords
CPU times: user 5min 34s, sys: 312 ms, total: 5min 34s
Wall time: 5min 35s


### 3.1.2 EN <span style="color:red">'fuzzy match'</span>


#### PREPROCESSING=OFF
# <span style="color:red">FUZZY MATCH (95%)</span>

In [None]:
%%time
CONFIDENCE = 95
START_INDEX = 0
LAST_INDEX = -1

filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_EN.txt', 
                            raw_text_path='./raw_texts/EN/',
                            output_path='./reports_special/EN/',
                            output_file='relatorio_aproximado({0})_frases_EN_{1}-{2}.csv'
                                        .format(CONFIDENCE, START_INDEX, str(LAST_INDEX) if LAST_INDEX != -1 else 'end'),
                            word_match_count=count_fuzzy,
                            confidence=CONFIDENCE,
                            phrases=True,
                            preprocessing=False,
                        
                            output_debug_path='./debug_fuzzy/EN/',
                            debug=True,
                            start_index=START_INDEX,
                            last_index=LAST_INDEX
                            )

## 3.2 PT 'exact match'


### <span style="color:blue">PREPROCESSING=OFF</span>
### <span style="color:blue">EXACT MATCH</span>

In [26]:
%%time
filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_PT.txt', 
                            raw_text_path='./raw_texts/PT/',
                            output_path='./reports_special/PT/',
                            output_file='relatorio_normal_frases_PT.csv',
                            word_match_count=count_normal,
                            confidence=-1,
                            phrases=True,
                            preprocessing=False
                            )

Starting filter_special_keywords
successfully read the filter words at /Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/ named lista_palavras_PT.txt
Processing Text (english) 1 of 856 (DAYC4_14_2015.txt)
Processing Text (english) 2 of 856 (ARZZ_12_2012.txt)
Processing Text (english) 3 of 856 (PLA_12_Non_2013.txt)
Processing Text (english) 4 of 856 (IDVL_14_2013.txt)
Processing Text (english) 5 of 856 (BRFS_8_2016.txt)
Processing Text (english) 6 of 856 (VOK_29_Non_2014.txt)
Processing Text (english) 7 of 856 (APO_13_Non_2011.txt)
Processing Text (english) 8 of 856 (BAU_12_Non_2016.txt)
Processing Text (english) 9 of 856 (INS_21_Non_2012.txt)
Processing Text (english) 10 of 856 (BOBR_15_2015.txt)
Processing Text (english) 11 of 856 (NEOE_10_2013.txt)
Processing Text (english) 12 of 856 (BRA_8_Non_2016.txt)
Processing Text (english) 13 of 856 (ELET_2_2015.txt)
Processing Text (english) 14 of 856 (INT_7_Non_2017.txt)
Processing Text (english) 15 of 856 (PHI_36_Non_2012.txt)
Processing T

Processing Text (english) 143 of 856 (Radiante_4_2015.txt)
Processing Text (english) 144 of 856 (IPM_12_Non_2013.txt)
Processing Text (english) 145 of 856 (TRPL_10_2017.txt)
Processing Text (english) 146 of 856 (CEDO_17_2011.txt)
Processing Text (english) 147 of 856 (GRU_10_Non_2013.txt)
Processing Text (english) 148 of 856 (INV_12_Non_2014.txt)
Processing Text (english) 149 of 856 (GRR_33_Non_2014.txt)
Processing Text (english) 150 of 856 (Malwee_17_2017.txt)
Processing Text (english) 151 of 856 (ELET_2_2012.txt)
Processing Text (english) 152 of 856 (ROC_28_Non_2017.txt)
Processing Text (english) 153 of 856 (INS_21_Non_2015.txt)
Processing Text (english) 154 of 856 (NEOE_10_2014.txt)
Processing Text (english) 155 of 856 (USA_15_Non_2014.txt)
Processing Text (english) 156 of 856 (SUZ_7_Non_2015.txt)
Processing Text (english) 157 of 856 (VOL_29_Non_2017.txt)
Processing Text (english) 158 of 856 (BUN_8_Non_2017.txt)
Processing Text (english) 159 of 856 (FIBR_7_2015.txt)
Processing Text (

Processing Text (english) 286 of 856 (VOT_20_Non_2014.txt)
Processing Text (english) 287 of 856 (CAI_14_Non_2015.txt)
Processing Text (english) 288 of 856 (MET_5_Non_2017.txt)
Processing Text (english) 289 of 856 (CEL_10_Non_2015.txt)
Processing Text (english) 290 of 856 (ELE_2_Non_2014.txt)
Processing Text (english) 291 of 856 (BTOW_13_2016.txt)
Processing Text (english) 292 of 856 (MRVE_21_2014.txt)
Processing Text (english) 293 of 856 (GAU_28_Non_2017.txt)
Processing Text (english) 294 of 856 (EKTR_10_2011.txt)
Processing Text (english) 295 of 856 (UNI_11_Non_2017.txt)
Processing Text (english) 296 of 856 (MRVE_21_2015.txt)
Processing Text (english) 297 of 856 (JBSS_8_2015.txt)
Processing Text (english) 298 of 856 (OUR_28_Non_2018.txt)
Processing Text (english) 299 of 856 (BTOW_13_2017.txt)
Processing Text (english) 300 of 856 (VIVT_4_2011.txt)
Processing Text (english) 301 of 856 (MET_5_Non_2016.txt)
Processing Text (english) 302 of 856 (CEL_10_Non_2014.txt)
Processing Text (englis

Processing Text (english) 430 of 856 (ARMT_3_2017.txt)
Processing Text (english) 431 of 856 (LUP_17_Non_2013.txt)
Processing Text (english) 432 of 856 (USIM_3_2011.txt)
Processing Text (english) 433 of 856 (BBDC_14_2015.txt)
Processing Text (english) 434 of 856 (AES_2_Non_2013.txt)
Processing Text (english) 435 of 856 (END_2_Non_2012.txt)
Processing Text (english) 436 of 856 (SAF_28_Non_2015.txt)
Processing Text (english) 437 of 856 (SOL_1_Non_2016.txt)
Processing Text (english) 438 of 856 (EGIE_10_2016.txt)
Processing Text (english) 439 of 856 (V_M_3_Non_2015.txt)
Processing Text (english) 440 of 856 (CAIS_14_Non_2018.txt)
Processing Text (english) 441 of 856 (DUKB_10_2017.txt)
Processing Text (english) 442 of 856 (MEX_20_Non_2012.txt)
Processing Text (english) 443 of 856 (ALI_14_Non_2013.txt)
Processing Text (english) 444 of 856 (GRU_8_Non_2016.txt)
Processing Text (english) 445 of 856 (SAM_26_Non_2011.txt)
Processing Text (english) 446 of 856 (ECOR_12_2013.txt)
Processing Text (engl

Processing Text (english) 575 of 856 (FLRY_9_2011.txt)
Processing Text (english) 576 of 856 (FUN_34_Non_2014.txt)
Processing Text (english) 577 of 856 (ANG_26_Non_2011.txt)
Processing Text (english) 578 of 856 (JOH_36_Non_2012.txt)
Processing Text (english) 579 of 856 (FLRY_9_2013.txt)
Processing Text (english) 580 of 856 (HOS_9_Non_2013.txt)
Processing Text (english) 581 of 856 (TIMP_4_2017.txt)
Processing Text (english) 582 of 856 (USI_8_Non_2017.txt)
Processing Text (english) 583 of 856 (Redecard_14_2011.txt)
Processing Text (english) 584 of 856 (INS_29_Non_2015.txt)
Processing Text (english) 585 of 856 (UNI_9_Non_2017.txt)
Processing Text (english) 586 of 856 (CLSC_10_2013.txt)
Processing Text (english) 587 of 856 (Embasa_18_2016.txt)
Processing Text (english) 588 of 856 (ARMT_3_2012.txt)
Processing Text (english) 589 of 856 (CSAN_12_2015.txt)
Processing Text (english) 590 of 856 (HGTX_17_2016.txt)
Processing Text (english) 591 of 856 (ALGT_4_2017.txt)
Processing Text (english) 592

Processing Text (english) 718 of 856 (SAC_26_Non_2017.txt)
Processing Text (english) 719 of 856 (OIBR_4_2014.txt)
Processing Text (english) 720 of 856 (SIS_32_Non_2015.txt)
Processing Text (english) 721 of 856 (FHER_16_2014.txt)
Processing Text (english) 722 of 856 (SIR_9_Non_2015.txt)
Processing Text (english) 723 of 856 (BIO_24_Non_2015.txt)
Processing Text (english) 724 of 856 (APX_13_Non_2013.txt)
Processing Text (english) 725 of 856 (PRE_14_Non_2017.txt)
Processing Text (english) 726 of 856 (COS_2_Non_2014.txt)
Processing Text (english) 727 of 856 (GRU_14_Non_2017.txt)
Processing Text (english) 728 of 856 (INS_14_Non_2013.txt)
Processing Text (english) 729 of 856 (MER_29_Non_2012.txt)
Processing Text (english) 730 of 856 (CEEB_10_2017.txt)
Processing Text (english) 731 of 856 (KLBN_7_2016.txt)
Processing Text (english) 732 of 856 (EVEN_21_2016.txt)
Processing Text (english) 733 of 856 (ELEK_16_2014.txt)
Processing Text (english) 734 of 856 (KIM_15_Non_2013.txt)
Processing Text (en

### 3.2.1 PT <span style="color:red">'fuzzy match'</span>


#### PREPROCESSING=OFF
# <span style="color:red">FUZZY MATCH (90%)</span>

In [None]:
%%time
CONFIDENCE = 90
START_INDEX = 0
LAST_INDEX = -1

filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_PT.txt', 
                            raw_text_path='./raw_texts/PT/',
                            output_path='./reports_special/PT/',
                            output_file='relatorio_aproximado({0})_frases_PT_{1}-{2}.csv'
                                        .format(CONFIDENCE, START_INDEX, str(LAST_INDEX) if LAST_INDEX != -1 else 'end'),
                            word_match_count=count_fuzzy,
                            confidence=CONFIDENCE,
                            phrases=True,
                            preprocessing=False,
                        
                            output_debug_path='./debug_fuzzy/PT/',
                            debug=True,
                            start_index=START_INDEX,
                            last_index=LAST_INDEX
                            )

### 3.2.2 PT <span style="color:red">'fuzzy match'</span>


#### PREPROCESSING=OFF
# <span style="color:red">FUZZY MATCH (95%)</span>

In [None]:
%%time
CONFIDENCE = 95
START_INDEX = 0
LAST_INDEX = -1

filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
                            filter_list_file='lista_palavras_PT.txt', 
                            raw_text_path='./raw_texts/PT/',
                            output_path='./reports_special/PT/',
                            output_file='relatorio_aproximado({0})_frases_PT_{1}-{2}.csv'
                                        .format(CONFIDENCE, START_INDEX, str(LAST_INDEX) if LAST_INDEX != -1 else 'end'),
                            word_match_count=count_fuzzy,
                            confidence=CONFIDENCE,
                            phrases=True,
                            preprocessing=False,
                        
                            output_debug_path='./debug_fuzzy/PT/',
                            debug=True,
                            start_index=START_INDEX,
                            last_index=LAST_INDEX
                            )

## Sandbox

In [16]:
# keywords = extract_keywords('/Users/hmg/Downloads/', '3M_RS2014.pdf', '/Users/hmg/Downloads/')
# keywords_frequency = collections.Counter(keywords)
# 
# keywords_frequency.most_common(10)

In [15]:
# keywords = extract_keywords('/Users/hmg/Desktop/data/PDF_relatorios_sustentabilidade/FIBR_7_2012_EN.pdf', 
#                             language='english')
# keywords_frequency = collections.Counter(keywords)

# word_cloud_from_keywords_frequency(keywords_frequency, file_name='hey.png', plot=False)

# keywords_frequency.most_common(10)

In [None]:
# calculate_statistics(keywords)

In [17]:
# Tests with fuzzy string match
# fuzz.ratio(str.lower(u'Casarão'), str.lower(u'Caserão'))
# fuzz.token_set_ratio(str.lower(u'Casarão'), str.lower(u'Casârão'))

# normal_count = count_normal("mama mia this is a mama and there is no mia in this mama", "mãma")
# fuzzy_count = count_fuzzy("mama mia this is a mama and there is no mia in this mama", "mãma", 80)

# print('normal = {0}, fuzzy = {1}'.format(normal_count, fuzzy_count))

In [18]:
# fuzz.ratio(str.lower(u'vegeta“‰o'), str.lower(u'vegetacao'))
# fuzz.token_set_ratio(str.lower(u'EN-13'), str.lower(u'EN-'))

In [19]:
# palavra = 'joão-victor~2'
# print(remove_special_characters(palavra))

In [None]:
## 3.2.1 <span style="color:red">Experimental setting</span>

In [None]:
# %%time
#########################################################################
#########################################################################
################## PREPROCESSING ON!!! EXPERIMENTAL!!! ##################
#########################################################################
#########################################################################

# filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
#                             filter_list_file='lista_palavras_PT.txt', 
#                             raw_text_path='./raw_texts/PT_dev/',
#                             output_path='./reports_special/PT_dev/',
#                             output_file='relatorio_normal_frases_PT_preON_website_converter.csv',
#                             word_match_count=count_normal,
#                             confidence=-1,
#                             phrases=True,
#                             preprocessing=True
#                             )

In [None]:
# %%time
# #########################################################################
# #########################################################################
# #################### PREPROCESSING ON!!! UNSAFE!!! ######################
# #########################################################################
# #########################################################################
# ############################# APROXIMADO 95% ############################

# CONFIDENCE = 95

# filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
#                             filter_list_file='lista_palavras_PT.txt', 
#                             raw_text_path='./raw_texts/PT_dev/',
#                             output_path='./reports_special/PT_dev/',
#                             output_file='relatorio_aproximado({0})_frases_PT_preON.csv'.format(CONFIDENCE),
#                             word_match_count=count_fuzzy,
#                             confidence=CONFIDENCE,
#                             phrases=True,
#                             preprocessing=True
#                             )

In [None]:
# %%time
# #########################################################################
# #########################################################################
# ####################### PREPROCESSING OFF!!!!!! #########################
# #########################################################################
# #########################################################################
# ############################# APROXIMADO 95% ############################
# CONFIDENCE = 95

# filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Final_Dissertacao/', 
#                             filter_list_file='lista_palavras_PT.txt', 
#                             raw_text_path='./raw_texts/PT_dev/',
#                             output_path='./reports_special/PT_dev/',
#                             output_file='relatorio_aproximado({0})_frases_PT_preOFF.csv'.format(CONFIDENCE),
#                             word_match_count=count_fuzzy,
#                             confidence=CONFIDENCE,
#                             phrases=True,
#                             preprocessing=False
#                             )