In [1]:
# Reading PDF files and basic NLP
import PyPDF2
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fuzzy string match
from fuzzywuzzy import fuzz

# To generate word clouds
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
% matplotlib inline

# Read unicode data from the extra_stop_words file
import unicodedata
import re
import os

# Frequency counting and collections
import numpy as np
import collections

# To navigate through all files in a directory
from os import listdir
from os.path import isfile, join

## 1. Define functions for the text extraction, preprocessing and analysis

In [2]:
# Adapted from https://gist.github.com/boniattirodrigo/67429ada53b7337d2e79
def remove_special_characters(word):

    # Unicode normalize transforma um caracter em seu equivalente em latin.
    nfkd = unicodedata.normalize('NFKD', word)
    plain_word = u"".join([c for c in nfkd if not unicodedata.combining(c)])

    # return the word with only numbers, letters and spaces
    return re.sub('[^a-zA-Z0-9 \\\]', '', plain_word)

In [3]:
# Adapted from https://medium.com/@rqaiserr/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f
def extract_keywords(path_pdf,
                     pdf_file,
                     path_output_raw_text,
                     stop_words_language='portuguese', 
                     path_extra_stop_words='/Users/hmg/Dropbox/veve e heitor/Projeto_tese',
                     file_extra_stop_words='extra_stop_words.txt'):
    
    # Check if the raw text is not already availabe in the path_output_raw_text
    text = ''
    try:
        candidate_raw = open(os.path.join(path_output_raw_text, pdf_file.replace('.pdf', '.txt')), 'r')
        text = candidate_raw.read()
#         print('successfully read the raw text! ')
    except IOError:
#         print('gotta scan the pdf...')
        # open allows you to read the file
        pdfFileObj = open(os.path.join(path_pdf, pdf_file), 'rb')
        # The pdfReader variable is a readable object that will be parsed
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # discerning the number of pages will allow us to parse through all the pages
        num_pages = pdfReader.numPages
        # Just to show info about creator and creation time. 
        # print(str(pdfReader.getDocumentInfo())
        count = 0

        # The while loop will read each page
        while count < num_pages:
            pageObj = pdfReader.getPage(count)
            count += 1
            text += pageObj.extractText()
        # This if statement exists to check if the above library returned #words. It's done because PyPDF2 cannot read scanned
        # files.
        if text != "":
            text = text
        # If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text
        else: # fileurl
            text = textract.process(os.path.join(path_pdf, pdf_file), method='tesseract', language='eng', encoding='utf8')

        file_raw_text = open(os.path.join(path_output_raw_text, pdf_file.replace('.pdf', '.txt')), 'w', encoding='utf8')
        file_raw_text.write(text)
    
#     text = remove_special_characters(text)
    # The word_tokenize() function will break our text phrases into #individual words
    tokens = word_tokenize(text)
    punctuations = ['(', ')', ';', ':', '[', ']', ',', '%', '-', '.', '|', '']
    stop_words = stopwords.words(stop_words_language)
    extra_stop_words = open(os.path.join(path_extra_stop_words, file_extra_stop_words), 'r', encoding='utf8').read().split('\n')
    
    # We create a list comprehension which only returns a list of words #that are NOT IN stop_words and NOT IN
    # punctuations.
    keywords = [word.lower() for word in tokens 
                if 
                not word.lower() in stop_words and 
                not word.lower() in punctuations and 
                not word.lower() in extra_stop_words and 
                word.isalpha() and
                len(word) >= 2]
    
    return np.asarray(keywords)

In [4]:
def word_cloud_from_keywords_frequency(keywords_frequency, file_name, path='./wordclouds/', show_image=False):
    wordcloud = WordCloud(width = 512, height = 512, background_color='white')
    fig = plt.figure(figsize=(20,16),facecolor = 'white', edgecolor='blue')
    plt.imshow(wordcloud.generate_from_frequencies(keywords_frequency), interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad=0)
    
    if show_image:
        plt.show()
    plt.savefig(path+file_name)
    plt.close(fig)

In [5]:
# def column_plot(keywords_frequency)
#     df = pd.DataFrame(lst, columns = ['Word', 'Count'])
#     df.plot.bar(x='Word',y='Count')

In [6]:
def calculate_statistics(keywords):
    total_words = 0
    keywords_dict = dict(collections.Counter(keywords).most_common())
    for k in keywords_dict:
        total_words += keywords_dict[k]

    stats = dict()
    for k in keywords_dict:
        stats[k] = {'count': keywords_dict[k], 'text_frequency': keywords_dict[k]/float(total_words)}    
#     stats['frequency'].most_common(10)
    
    return stats

### 1.1 Count occurrences of predefined phrases/words

**IN-DEVELOPMENT:** Count occurrences of predefined text using 'fuzzy string match', added ```word_match_count``` lambda object. 

In [7]:
def count_normal(text, word, confidence = 0, phrases = False):
    return text.count(word)

def count_fuzzy(text, word, confidence = 95, phrases=False):
    counter = 0
    if len(word) == 0:
        return 0
    # Phrases
    if phrases:
        # Sliding window strategy: create a text_word from text with length word.len 
        #  and by moving 1 character at a time
        #     If match, then skip the next word.len, just to avoid double counting!
        i = 0
        ## DEBUG
#         print('count_fuzzy - word = {0} and, len(text) = {1}, confidence = {2}'.format(word, len(text), confidence))
        print('PALAVRA = {0} - CONFIANCA = {1}'.format(word, confidence))
    
        while i < len(text):
            text_word_last_idx = i+len(word)
            match_confidence = 0
            if text_word_last_idx <= len(text):
                match_confidence = fuzz.ratio(text[i:text_word_last_idx], word)
#                 print('token_set_ratio({0},{1})={2}'.format(text[i:text_word_last_idx], word, match_confidence))
                if match_confidence > confidence:
                    counter += 1
                    print('{0} =~ {1} ({2})'.format(text[i:text_word_last_idx], word, match_confidence))
                    i += len(word)
                else:
                    i += 1
            else:
                break
            ## DEBUG - only about 100 characters
            if(i % int(len(text)/10) == 0):
                print('{} of {} = {:.2f}%'.format(i, len(text), i/len(text)*100))
    else:
        text_vec = text.split()
        for w in text_vec:
            match_confidence = fuzz.ratio(w, word)
            if match_confidence > confidence:
                counter += 1
    return counter

In [8]:
def filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Projeto_tese/', 
                            filter_list_file='lista_palavras_EN.txt', 
                            raw_text_path='./raw_texts/EN/',
                            output_path='./reports_special/EN/',
                            output_file='output_EN.csv',
                            word_match_count=count_normal,
                            confidence=95,
                            phrases=False):
    print('Starting filter_special_keywords')
    filter_words = ''
    try:
        filter_words = open(os.path.join(filter_list_path, filter_list_file), 'r', encoding='utf8').read().split('\n')
        print('successfully read the filter words at {0} named {1}'.format(filter_list_path, filter_list_file))
    except IOError:
        print('failed to read the filter words at {0} named {1}'.format(filter_list_path, filter_list_file))
    
    # assuming there are only the raw texts in the directory
    text_files = [f for f in listdir(raw_text_path) if isfile(join(raw_text_path, f))]
    text_files = list(filter(lambda text_file: text_file.find('.txt') != -1, text_files))
    
    stats_per_file = {}
    for text_file in text_files:
        try:
            text = open(os.path.join(raw_text_path, text_file), 'r').read()
        except IOError:
            print('Failed to open file at {0} named {1}'.format(raw_text_path, text_file))
#         print('Text file: {0}'.format(text_file))
        
        stats = {}
        for filter_word in filter_words:
            stats[filter_word] = word_match_count(text.lower(), filter_word.lower(), confidence, phrases)
            # text.lower().count(filter_word.lower())
#             line = u'%s,%d,%.5f\n' % (filter_word, counter, -1)

        stats_per_file[text_file.replace('.txt','')] = stats
        
    output = open(output_path+output_file, 'w', encoding='utf8')   
    header = u'word,' + u','.join(text_files)+u'\n'
    output.write(header)
    
    
    for filter_word in filter_words:
        output.write(filter_word + ',')
        for text_file in text_files:
#             line = u'%s,%d,%.5f\n' % (filter_word, counter, -1)
            output.write(str(stats_per_file[text_file.replace('.txt','')][filter_word]) + ',')
        output.write('\n')
    output.close()
    print('Finishing filter_special_keywords')

In [9]:
def write_report(path, output_file, keyword_stats):
    output = open(path+output_file, 'w', encoding='utf8')
    header = u'word,count,text_frequency\n'
    output.write(header)
    for k in keyword_stats:
        line = u'%s,%d,%.5f\n' % (k, keyword_stats[k]['count'], keyword_stats[k]['text_frequency'])
        output.write(line)
    output.close()

## 2. Process all PDFs in a given path

**TODO:** This version of the code still doesn't handle pdfs that are in PT and EN

In [10]:
def process_all_PDFs(path='/Users/hmg/Desktop/data/PDF_relatorios_sustentabilidade/', 
                     raw_text_path='./raw_texts/',
                     output_path='./reports/', 
                     wordclouds_path='./wordclouds/',
                     wordclouds=False):
    # We assume there are only pdfs in this directory
    PDFs = [f for f in listdir(path) if isfile(join(path, f))]
    PDFs = list(filter(lambda pdf: pdf.find('.pdf') != -1, PDFs))
    
    PDFs_PT = list(filter(lambda pdf: pdf.find('_EN.pdf') == -1, PDFs))
    PDFs_EN = list(filter(lambda pdf: pdf.find('_EN.pdf') != -1, PDFs))
    
#     counter = 0 # until 259 they process
#     for pdf in PDFs_PT[counter:]:
#         counter = counter + 1
#         print('Processing PDFs_PT {0} of {1} ({2})'.format(counter, len(PDFs_PT), pdf))
#         keywords = extract_keywords(path, pdf, path_output_raw_text=raw_text_path+'PT/')
#         keywords_statistics = calculate_statistics(keywords)
#         write_report(output_path+'PT/', pdf.replace('.pdf', '.csv'), keywords_statistics)
        
#         if wordclouds:
#             word_cloud_from_keywords_frequency(collections.Counter(keywords), 
#                                                pdf.replace('.pdf', '.png'), 
#                                                path=wordclouds_path+'PT/',
#                                                show_image=False)
        
    counter = 0 # until 84 they process
    for pdf in PDFs_EN[counter:]:
        
        print('Processing PDFs_EN {0} of {1} ({2})'.format(counter, len(PDFs_EN), pdf))
        keywords = extract_keywords(path, pdf, path_output_raw_text=raw_text_path+'EN/', stop_words_language='english')
        keywords_statistics = calculate_statistics(keywords)
        write_report(output_path+'EN/', pdf.replace('.pdf', '.csv'), keywords_statistics)
        counter = counter + 1
        if wordclouds:
            word_cloud_from_keywords_frequency(collections.Counter(keywords), 
                                               pdf.replace('.pdf', '.png'), 
                                               path=wordclouds_path+'EN/',
                                               show_image=False)

In [13]:
process_all_PDFs(path='/Users/hmg/Desktop/Data/pdfs_EN/',wordclouds=True)

Processing PDFs_PT 1 of 1 (B3SA_14_2014-EN.pdf)
Processing PDFs_EN 0 of 235 (ERN_13_Non_2012_EN.pdf)




ValueError: We need at least 1 word to plot a word cloud, got 0.

<matplotlib.figure.Figure at 0x1a1894ecf8>

## Sandbox

In [None]:
# print(len(PDFs))
# print(len(PDFs_PT))
# print(len(PDFs_EN))

In [16]:
# keywords = extract_keywords('/Users/hmg/Downloads/', '3M_RS2014.pdf', '/Users/hmg/Downloads/')
# keywords_frequency = collections.Counter(keywords)
# 
# keywords_frequency.most_common(10)

In [15]:
# keywords = extract_keywords('/Users/hmg/Desktop/data/PDF_relatorios_sustentabilidade/FIBR_7_2012_EN.pdf', 
#                             stop_words_language='english')
# keywords_frequency = collections.Counter(keywords)

# word_cloud_from_keywords_frequency(keywords_frequency, file_name='hey.png', plot=False)

# keywords_frequency.most_common(10)

In [None]:
# calculate_statistics(keywords)

In [86]:
# filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Projeto_tese/', 
#                             filter_list_file='lista_palavras_EN.txt', 
#                             raw_text_path='./raw_texts/EN/',
#                             output_path='./reports_special/EN/')

# filter_special_keywords(filter_list_path='/Users/hmg/Dropbox/veve e heitor/Projeto_tese/', 
filter_special_keywords(filter_list_path='./', 
                            filter_list_file='lista_frases_v3_dev.txt', 
                            raw_text_path='./raw_texts/PT_dev/',
                            output_path='./reports_special/PT/',
                            output_file='output_PT_dev_fuzzy_phrases95.csv',
                            word_match_count=count_fuzzy,
                            confidence=95,
                            phrases=True
                            )

Starting filter_special_keywords
successfully read the filter words at ./ named lista_frases_v3_dev.txt
PALAVRA = impactos na biodiversidade - CONFIANCA = 95
7174 of 71747 = 10.00%
14348 of 71747 = 20.00%
21522 of 71747 = 30.00%
28696 of 71747 = 40.00%
35870 of 71747 = 50.00%
43044 of 71747 = 59.99%
50218 of 71747 = 69.99%
57392 of 71747 = 79.99%
64566 of 71747 = 89.99%
PALAVRA = valor da biodiversidade - CONFIANCA = 95
7174 of 71747 = 10.00%
14348 of 71747 = 20.00%
21522 of 71747 = 30.00%
28696 of 71747 = 40.00%
35870 of 71747 = 50.00%
43044 of 71747 = 59.99%
50218 of 71747 = 69.99%
57392 of 71747 = 79.99%
64566 of 71747 = 89.99%
PALAVRA = gestão da biodiversidade - CONFIANCA = 95
7174 of 71747 = 10.00%
14348 of 71747 = 20.00%
21522 of 71747 = 30.00%
28696 of 71747 = 40.00%
35870 of 71747 = 50.00%
43044 of 71747 = 59.99%
50218 of 71747 = 69.99%
57392 of 71747 = 79.99%
64566 of 71747 = 89.99%
PALAVRA = estratégia de biodiversidade - CONFIANCA = 95
7174 of 71747 = 10.00%
14348 of 71747 

28696 of 71747 = 40.00%
35870 of 71747 = 50.00%
43044 of 71747 = 59.99%
50218 of 71747 = 69.99%
57392 of 71747 = 79.99%
64566 of 71747 = 89.99%
PALAVRA = animais selvagens - CONFIANCA = 95
7174 of 71747 = 10.00%
14348 of 71747 = 20.00%
21522 of 71747 = 30.00%
28696 of 71747 = 40.00%
35870 of 71747 = 50.00%
43044 of 71747 = 59.99%
50218 of 71747 = 69.99%
57392 of 71747 = 79.99%
64566 of 71747 = 89.99%
PALAVRA = árvores nativas - CONFIANCA = 95
7174 of 71747 = 10.00%
14348 of 71747 = 20.00%
21522 of 71747 = 30.00%
28696 of 71747 = 40.00%
35870 of 71747 = 50.00%
árvores nativas =~ árvores nativas (100)
43044 of 71747 = 59.99%
50218 of 71747 = 69.99%
57392 of 71747 = 79.99%
64566 of 71747 = 89.99%
PALAVRA = mata nativa - CONFIANCA = 95
7174 of 71747 = 10.00%
14348 of 71747 = 20.00%
21522 of 71747 = 30.00%
28696 of 71747 = 40.00%
35870 of 71747 = 50.00%
43044 of 71747 = 59.99%
50218 of 71747 = 69.99%
57392 of 71747 = 79.99%
64566 of 71747 = 89.99%
PALAVRA = inventário natural - CONFIANCA = 

In [18]:
# Tests with fuzzy string match
# fuzz.ratio(str.lower(u'Casarão'), str.lower(u'Caserão'))
# fuzz.token_set_ratio(str.lower(u'Casarão'), str.lower(u'Casârão'))

# normal_count = count_normal("mama mia this is a mama and there is no mia in this mama", "mãma")
# fuzzy_count = count_fuzzy("mama mia this is a mama and there is no mia in this mama", "mãma", 80)

# print('normal = {0}, fuzzy = {1}'.format(normal_count, fuzzy_count))

86

In [92]:
fuzz.ratio(str.lower(u'vacao da biodiversidade'), str.lower(u''))
# fuzz.token_set_ratio(str.lower(u'EN-13'), str.lower(u'EN-'))

92