In [1]:
import numpy as np
import pandas as pd
import PyPDF2
import torch
import time
import googletrans as gt
from tqdm import tqdm

In [2]:
import os

In [3]:
tqdm.pandas()

# Paths

In [4]:
datos             = '../Data/RAW/'
datos_es          = '../Data/RAW_ES/'
raw_pages         = '../Data/PREP/raw_pages.parquet'
raw_sentences     = '../Data/PREP/sentences.parquet'
raw_en_sentences  = '../Data/PREP/sentences_en.parquet'
raw_es_sentences  = '../Data/PREP/sentences_es.parquet'
data_sentiment    = '../Data/PREP/sentiment.parquet'
data_sent_full    = '../Data/PREP/sentiment_full.parquet'

In [12]:
datos_new          = '../Data/RAW_NEW/'
raw_new_sentences  = '../Data/PREP/sentences_new.parquet'
data_sent_new      = '../Data/PREP/sentiment_new.parquet'

# Datos en crudo

### New Data

In [9]:
pdf_paths_new = []

bancos_new = os.listdir(datos_new)

for b in bancos_new:
    pdfs = []
    if b != '.DS_Store':
        pdfs = os.listdir(datos_new+b)
        
    for pdf in pdfs:
        if pdf != '.DS_Store':
            pdf_paths_new.append(datos_new+b+'/'+pdf)

In [11]:
len(pdf_paths_new)

83

### Español

In [5]:
bancos_es = os.listdir(datos_es)
print(len(bancos_es))

3


In [6]:
pdf_paths_es = []

for b in bancos_es:
    pdfs = []
    if b != '.DS_Store':
        pdfs = os.listdir(datos_es+b)
        
    for pdf in pdfs:
        if pdf != '.DS_Store':
            pdf_paths_es.append(datos_es+b+'/'+pdf)

In [7]:
pdf_paths_es

['../Data/RAW_ES/ES_ABANCA/ABANCA_2014.pdf',
 '../Data/RAW_ES/ES_ABANCA/ABANCA_2015.pdf',
 '../Data/RAW_ES/ES_ABANCA/ABANCA_2016.pdf',
 '../Data/RAW_ES/ES_ABANCA/ABANCA_2017.pdf',
 '../Data/RAW_ES/ES_ABANCA/ABANCA_2018.pdf',
 '../Data/RAW_ES/ES_ABANCA/ABANCA_2019.pdf',
 '../Data/RAW_ES/ES_ABANCA/ABANCA_2020.pdf',
 '../Data/RAW_ES/ES_ABANCA/ABANCA_2021.pdf',
 '../Data/RAW_ES/ES_BANCO_VALENCIA/VALENCIA_2010.pdf',
 '../Data/RAW_ES/ES_BANCO_VALENCIA/VALENCIA_2011.pdf',
 '../Data/RAW_ES/ES_BANCO_VALENCIA/VALENCIA_2012.pdf',
 '../Data/RAW_ES/ES_CATALUNYA_BANC/CATALUNYA_BANC_2013.pdf',
 '../Data/RAW_ES/ES_CATALUNYA_BANC/CATALUNYA_BANC_2014.pdf',
 '../Data/RAW_ES/ES_CATALUNYA_BANC/CATALUNYA_BANC_2015.pdf']

### Inglés

In [8]:
bancos = os.listdir(datos)
print(len(bancos))

20


In [9]:
pdf_paths = []

for b in bancos:
    pdfs = []
    if b != '.DS_Store':
        pdfs = os.listdir(datos+b)
        
    for pdf in pdfs:
        if pdf != '.DS_Store':
            pdf_paths.append(datos+b+'/'+pdf)

In [10]:
pdf_paths[-5:]

['../Data/RAW/SANTANDER/SANT-annual-report-en-2020.pdf',
 '../Data/RAW/SANTANDER/SANT-annual-report-en-2021.pdf',
 '../Data/RAW/WGZ_BANK/WGZ_BANK_Annual-report-2013.pdf',
 '../Data/RAW/WGZ_BANK/WGZ_BANK_Annual-report-2014.pdf',
 '../Data/RAW/WGZ_BANK/WGZ_BANK_Annual-report-2015.pdf']

In [11]:
print(len(pdf_paths))

192


## Funciones de Preprocesado

In [6]:
def read_raw_pdf(path, verbose=0):
    import PyPDF2
    try:
        pdfReader = PyPDF2.PdfFileReader(path)
        #Discerning the number of pages will allow us to parse through all the pages.
        num_pages = pdfReader.numPages
    except:
        if verbose > 1: 
            print(f'Error de lectura de PDF en: {path}')
        raise
    count = 0
    pages = []
    if verbose > 1: print(f'\n{path.split("/")[-1]}')
    while count < num_pages:
        if verbose > 1: print(f'Page {count+1}/{num_pages}', end="\r")
        pageObj = pdfReader.getPage(count)
        pages.append(pageObj.extractText())
        count += 1
        
    pdf_dict = {
        'path':path,
        'pages':pages,
    }
    return pdf_dict

def prepare_PDF_dataframe(raw_texts):
    import pandas as pd
    data = pd.DataFrame(raw_texts)
    data['bank'] = data['path'].apply(lambda x: x.split('/')[3])
    data['year'] = data['path'].apply(lambda x: x[-8:-4])
    data = data.explode('pages').reset_index().reset_index()
    data['page'] = data.groupby('index').rank('max').rename(columns={'level_0':'page'})['page']
    data['page'] = data['page'].astype('int')
    data = (data.drop(columns=['level_0','index','path'])
               .set_index(['bank','year','page'])
               .rename(columns={'pages':'text'})
          )
    return data

def clean_text(text, esp=False):
    import re
    text = re.sub(r"\n\n", ".", text)
    text = re.sub(r"\n", " ", text)
    text = re.sub(":", " ", text)
    text = re.sub(r"[^a-zA-Z.\s:ñÑáéíóúÁÉÍÓÚ]", "", text)
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    if text == '':
        return None
    if esp:
        try:
            text=translate_ES(text)
        except:
            return None
    return list(filter(None, text.split('.')))

def tokenize_phrases(df_data, esp=False):
    tqdm.pandas()
    if esp: print('--- Realizando traducción al inglés...')
    df_data['sentences'] = df_data['text'].progress_apply(lambda x: clean_text(x, esp=esp))
    df_data = df_data.dropna()
    df_sentence = df_data.explode('sentences').dropna()
    df_sentence['sentence'] = df_sentence.groupby(df_sentence.index).rank('max')['sentences']
    df_sentence['sentence'] = df_sentence['sentence'].astype('int')
    df_sentence.set_index('sentence', append=True, inplace=True)
    df_sentence.drop(columns=['text'], inplace=True)
    return df_sentence

translator = gt.Translator()

def translate_ES(sentence):
    return translator.translate(sentence, dest='en', src='es').text

def process_PDFs(list_of_paths, 
                 timeit=True, 
                 verbose=0, 
                 esp=False,
                 save_path=None,
                 save_raw_text=None):
    import time
    total_times = dict()
    if timeit: 
        print('Cronometrando lectura de PDFs...')
        start_time = time.time()

    raw_texts = []
    for path in list_of_paths:
        try:
            pdf_dict = read_raw_pdf(path, verbose=verbose)
        except:
            continue
        raw_texts.append(pdf_dict)

    if timeit: 
        end_time = time.time()
        total_times["lectura"] = end_time - start_time
        if verbose: print(f'--- Tiempo de lectura (s): {end_time - start_time}')
        
    # Textos a DataFrame:
    if verbose: print(f'Preparando dataframe...')
    raw_text_data = prepare_PDF_dataframe(raw_texts)
    
    # if timeit: 
    #     end_time = time.time()
    #     total_times["creacion_dataframe"] = end_time - total_times["lectura"]
        
    # Guardado de texto RAW:
    if save_raw_text: 
        raw_text_data.to_parquet(save_raw_text)
    
    if verbose: print(f'Tokenizando frases...')
    df_phrases = tokenize_phrases(raw_text_data, esp=esp)
    
    # Guardando dataset:
    if save_path:
        print('Guardando dataset...')
        df_phrases.to_parquet(save_path)
        print('Dataset guardado!')
    
    return df_phrases, total_times

def filter_phrases_size(df_sentence, min_words=10, tensor_limit=512):
    df_sentence['length'] = df_sentence['sentences'].apply(lambda x: len(x.strip().split(' ')))
    return df_sentence[(df_sentence.length >= min_words) & (df_sentence.length <= tensor_limit)]

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x, axis=1)[:, None])
    return e_x / np.sum(e_x, axis=1)[:, None]

def sentimentAnalysis_full(text_payload, tokenizer, model, tensor_limit=512):
    inputs = tokenizer(text_payload, return_tensors="pt")["input_ids"][:,:tensor_limit]
    logits = model(inputs).logits
    return softmax(np.array(logits.detach()))[0]

def get_phrase_sentiment(dataset, min_words=10, tensor_limit=512, save_sentiment_path=None, timeit=True):
    import time
    import pandas as pd
    from tqdm import tqdm
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    MODEL_PATH = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
    pd.options.mode.chained_assignment = None #'warn'
    tqdm.pandas()
    total_times = dict()
    
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    
    if timeit:
        print('Cronometrando procesado de sentiment de frases...')
        start_sentiment_time = time.time()
        
    dataset = filter_phrases_size(dataset, min_words=min_words, tensor_limit=tensor_limit)
    dataset['sentiment'] = dataset.sentences.progress_apply(
        lambda x: sentimentAnalysis_full(x, tokenizer, model, tensor_limit=tensor_limit)
    )
    
    if timeit:
        end_sentiment_time = time.time()
        total_times['sentiment'] = end_sentiment_time - start_sentiment_time
    
    dataset['negative'] = dataset['sentiment'].progress_apply(lambda x: x[0])
    dataset['neutral']  = dataset['sentiment'].progress_apply(lambda x: x[1])
    dataset['positive'] = dataset['sentiment'].progress_apply(lambda x: x[2])
    df_export = dataset.drop(columns='sentiment')
    
    if save_sentiment_path:
        df_export.to_parquet(save_sentiment_path)
    
    return df_export, total_times

## Proceso de lectura

#### Dataset Nuevo:

In [13]:
data_new, total_times_new = process_PDFs(pdf_paths_new, esp=False, verbose=2, save_path=raw_new_sentences)

Cronometrando lectura de PDFs...

Barclays-PLC-Annual-Report-1998.pdf
Page 1/178
Barclays-PLC-Annual-Report-1999.pdf
Page 1/185Page 2/185Page 3/185Page 4/185Page 5/185Page 6/185Page 7/185Page 8/185Page 9/185Page 10/185Page 11/185Page 12/185Page 13/185



Page 185/185
Barclays-PLC-Annual-Report-2000.pdf
Page 184/184
Barclays-PLC-Annual-Report-2001.pdf
Page 215/215
Barclays-PLC-Annual-Report-2002.pdf
Page 54/248
Barclays-PLC-Annual-Report-2003.pdf
Page 232/232
Barclays-PLC-Annual-Report-2004.pdf
Page 256/256
Barclays-PLC-Annual-Report-2005.pdf
Page 320/320
Barclays-PLC-Annual-Report-2006.pdf
Page 310/310
Barclays-PLC-Annual-Report-2007.pdf
Page 296/296
Barclays-PLC-Annual-Report-2008.pdf
Page 330/330
Barclays-PLC-Annual-Report-2009.pdf
Page 348/348
Barclays-PLC-Annual-Report-2010.pdf
Page 288/288
Barclays-PLC-Annual-Report-2011.pdf
Page 286/286
Barclays-PLC-Annual-Report-2012.pdf
Page 356/356
Barclays-PLC-Annual-Report-2013.pdf
Page 436/436
Barclays-PLC-Annual-Report-2014.pdf
Page 348/348
Barclays-PLC-Annual-Report-2015.pdf
Page 356/356
Barclays-PLC-Annual-Report-2016.pdf
Page 380/380
Barclays-PLC-Annual-Report-2017.pdf
Page 328/328
Barclays-PLC-Annual-Report-2018.pdf
Page 364/364
Barclays-PLC-Annual-Report-2019.pdf
Page 344/344
Barclays

100%|██████████████████████████████████████████████████████████████████████████| 23845/23845 [00:04<00:00, 5842.47it/s]


Guardando dataset...
Dataset guardado!


#### Dataset Español:

In [13]:
data_es, total_times_es = process_PDFs(pdf_paths_es, esp=True, verbose=2, save_path=raw_es_sentences)

Cronometrando lectura de PDFs...

ABANCA_2014.pdf
Page 275/275
ABANCA_2015.pdf
Page 267/267
ABANCA_2016.pdf
Page 223/223
ABANCA_2017.pdf
Page 417/417
ABANCA_2018.pdf
Page 499/499
ABANCA_2019.pdf
Error de lectura de PDF en: ../Data/RAW_ES/ES_ABANCA/ABANCA_2020.pdf
Error de lectura de PDF en: ../Data/RAW_ES/ES_ABANCA/ABANCA_2021.pdf

VALENCIA_2010.pdf
Page 309/447



Page 447/447
VALENCIA_2011.pdf
Page 477/477
VALENCIA_2012.pdf
Page 454/454
CATALUNYA_BANC_2013.pdf
Page 204/204
CATALUNYA_BANC_2014.pdf
Page 197/197
CATALUNYA_BANC_2015.pdf
--- Tiempo de lectura (s): 154.37993121147156
Preparando dataframe...
Tokenizando frases...
--- Realizando traducción al inglés...


100%|██████████████████████████████████████████████████████████████████████████████| 4106/4106 [40:49<00:00,  1.68it/s]


Guardando dataset...
Dataset guardado!


#### Dataset Inglés:

In [14]:
data_en, total_times_en = process_PDFs(pdf_paths, esp=False, verbose=2, save_path=raw_en_sentences)

Cronometrando lectura de PDFs...

ABBEY_2003.pdf
Page 1/78Page 2/78Page 3/78Page 4/78Page 5/78Page 6/78Page 7/78Page 8/78



Page 78/78
ABBEY_2004.pdf
Page 127/127
ABBEY_2005.pdf
Page 164/164
ABBEY_2006.pdf
Page 177/177
ABBEY_2007.pdf
Page 135/135
ABBEY_2008.pdf
Page 155/155
annual-report-bankia-2014.pdf
Page 200/200
annual-report-bankia-2015.pdf
Page 200/200
annual-report-bankia-2016.pdf
Page 216/216
annual-report-bankia-2017.pdf
Page 232/232
annual-report-bankia-2018.pdf
Page 266/266
annual-report-bankia-2019.pdf
Page 78/78
annual-report-bankia-2020.pdf
Page 186/186
BANKINTER_2010.pdf
Page 155/155
BANKINTER_2011.pdf
Page 155/155
BANKINTER_2012.pdf
Page 136/136
BANKINTER_2013.pdf
Page 143/143
BANKINTER_2014.pdf
Page 209/209
BANKINTER_2015.pdf
Page 172/172
BANKINTER_2016.pdf
Page 179/179
BANKINTER_2017.pdf
Page 183/183
BANKINTER_2018.pdf
Page 157/157
BANKINTER_2019.pdf
Page 122/122
BANKINTER_2020.pdf
Page 134/134
BANKINTER_2021.pdf
Page 115/115
Roczny_2008.pdf
Page 112/112
Roczny_2009.pdf
Page 120/120
Roczny_2010.pdf
Page 106/106
Roczny_2011.pdf
Page 115/115
Roczny_2012.pdf
Page 116/116
Roczny_2013.pdf
Page 

100%|██████████████████████████████████████████████████████████████████████████| 41756/41756 [00:06<00:00, 6474.57it/s]


Guardando dataset...
Dataset guardado!


In [19]:
print('-- Tiempo de lectura de datos en ES (min): ',total_times_es['lectura']/60,'\n' 
      '-- Tiempo de lectura de datos en EN (min): ',total_times_en['lectura']/60)

-- Tiempo de lectura de datos en ES (min):  2.572998853524526 
-- Tiempo de lectura de datos en EN (min):  27.586885543664298


In [23]:
complete_dataset = pd.concat([data_en,data_es])
complete_dataset

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sentences
bank,year,page,sentence,Unnamed: 4_level_1
ABBEY_NATIONAL,2003,1,1,Full Year Financial Results FebruaryFebruaryFe...
ABBEY_NATIONAL,2003,2,28,Full Year Financial Results February February ...
ABBEY_NATIONAL,2003,2,20,Summarised consolidated profit and loss account
ABBEY_NATIONAL,2003,2,14,Personal Financial Services trading profit be...
ABBEY_NATIONAL,2003,2,15,Portfolio Business Unit profit loss before ta...
...,...,...,...,...
ES_CATALUNYA_BANC,2015,188,8,Apart from the complaints and claims managed ...
ES_CATALUNYA_BANC,2015,189,1,Bank of Spain CNMV Bank of Spain CNMV resolved...
ES_CATALUNYA_BANC,2015,190,2,Diligence to state that this document formed b...
ES_CATALUNYA_BANC,2015,190,1,Date of the formulation of the annual accounts...


In [27]:
complete_dataset.to_parquet(raw_sentences)

## Sentiment Analysis

In [55]:
df_sentiment, times = get_phrase_sentiment(complete_dataset, min_words=5, save_sentiment_path=data_sent_full, )

Cronometrando procesado de sentiment de frases...


  7%|████▉                                                                    | 50954/749695 [15:52<4:08:42, 46.82it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors
100%|████████████████████████████████████████████████████████████████████████| 749695/749695 [4:00:17<00:00, 52.00it/s]
100%|██████████████████████████████████████████████████████████████████████| 749695/749695 [00:00<00:00, 996269.82it/s]
100%|██████████████████████████████████████████████████████████████████████| 749695/749695 [00:00<00:00, 998281.11it/s]
100%|██████████████████████████████████████████████████████████████████████| 749695/749695 [00:00<00:00, 995480.69it/s]


In [57]:
print('-- Tiempo de procesado de sentiment de frases (min): ',times['sentiment']/60)

-- Tiempo de procesado de sentiment de frases (min):  240.30573924779893


In [14]:
df_sentiment_new, times_new = get_phrase_sentiment(data_new, min_words=5, save_sentiment_path=data_sent_new, )

Cronometrando procesado de sentiment de frases...


 12%|████████▉                                                                | 55507/452800 [18:53<3:48:51, 28.93it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
100%|████████████████████████████████████████████████████████████████████████| 452800/452800 [2:36:02<00:00, 48.36it/s]
100%|█████████████████████████████████████████████████████████████████████| 452800/452800 [00:00<00:00, 1004738.51it/s]
100%|█████████████████████████████████████████████████████████████████████| 452800/452800 [00:00<00:00, 1030705.95it/s]
100%|██████████████████████████████████████████████████████████████████████| 452800/452800 [00:00<00:00, 988082.67it/s]


In [16]:
df_sentiment = pd.read_parquet(data_sent_full)

In [19]:
df_sentiment_final = pd.concat([df_sentiment, df_sentiment_new]) 
df_sentiment_final

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sentences,length,negative,neutral,positive
bank,year,page,sentence,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ABBEY_NATIONAL,2003,1,1,Full Year Financial Results FebruaryFebruaryFe...,18,0.000068,0.999884,0.000049
ABBEY_NATIONAL,2003,2,28,Full Year Financial Results February February ...,53,0.000074,0.999874,0.000052
ABBEY_NATIONAL,2003,2,20,Summarised consolidated profit and loss account,6,0.000116,0.999831,0.000053
ABBEY_NATIONAL,2003,2,14,Personal Financial Services trading profit be...,9,0.000086,0.999869,0.000045
ABBEY_NATIONAL,2003,2,15,Portfolio Business Unit profit loss before ta...,9,0.997227,0.002284,0.000489
...,...,...,...,...,...,...,...,...
UNICREDIT,2021,840,14,VaR Value at Risk A measure of the risk of po...,31,0.102616,0.896958,0.000427
UNICREDIT,2021,840,18,assets for a certain period of time until it r...,20,0.000078,0.999863,0.000058
UNICREDIT,2021,843,1,Annual Report and Accounts Contacts UniCredit S,7,0.000072,0.999887,0.000041
UNICREDIT,2021,843,2,Head Office in Milan P iazza Gae Aulenti Towe...,11,0.000070,0.999874,0.000056


In [20]:
df_sentiment_final.to_parquet(data_sent_full)