In [1]:
import pandas as pd
import numpy as np

from PyPDF2 import PdfFileReader
from tika import parser
from datetime import datetime

import re 
import string

import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import os
cwd = os.getcwd().replace('\\', '/') + '/'

In [2]:
path_data = cwd + 'Data/'

#### Preprocessing

##### Function

In [3]:
def Preprocess_PDF(file, print_text=False) : 

    print(file)
    text = parser.from_file(path_data + file)['content']

    # get the text content
    text = text.split('Legal Information and Disclosures')[0]
    text = text.split('Re:')[-1]
    text = text.lower() # don't lower case names e.g. Warren Buffett

    # remove url
    text = re.sub(r'http\S+', '', text)

    # replace special characters
    text = re.sub('oaktree capital management l.p. all rights reserved', '', text)
    text = re.sub("(\n)\d{1}\s", '', text) # e.g. \n2 for page 2

    to_replace = ['\n', ':', '*', "'", "’", '“', '”', '.  .  .', '. . .', '. .', '[', ']', 
                 'l.p. all rights reserved', 'oaktree capital management', '\uf0b7',
                 'follow us', 'l.p.', 'all rights reserved',
                 ]
    for char in to_replace : text = text.replace(char, '') 
    text = text.replace('-', ' ').replace('—', ' ').replace('–', ' ')
    text = re.sub(r"[,\!\?\%\(\)\/\;\"]", " ", text)
    text = re.sub("(©)+\s\d{4}\s",'', text) # e.g. @2017
    text = text.replace('©', ' ')
    
    # remove white space
    text = " ".join(text.strip().split())

    # Memo date
    written_date = re.findall("(\w+\s\d+\s\d{4})", text)[-1]
    if file[:10].replace('-', '').isnumeric() : date = file[:10] # '-'.join(file.split('-')[:3])
    else : date = datetime.strptime(written_date, "%B %d %Y").strftime("%Y-%m-%d")
    text = text.replace(written_date, '')
    text = text.strip()
    
    full_text = text
    
    # remove special characters
    # text = re.sub(r'\W+', ' ', text)
    text = re.sub('[^.a-zA-Z0-9 \n\.]', '', text) # keep dots
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # remove stop words
    text = ' '.join([w for w in text.split() if w not in stopwords])
    
    # lemmatize
    text = ' '.join([WordNetLemmatizer().lemmatize(w) for w in text.split()])

    # remove 1 and 2-letter words
    text = re.sub(r'\b\w{1}\b', ' ', text)
    text = re.sub(r'\b\w{2}\b', ' ', text)
    
    # replace multiple dots
    text = re.sub("[.]{2,}", '.', text)
    text = re.sub("[.]\s[.]", '.', text)
    
    # remove white space
    text = " ".join(text.strip().split())
    
    if print_text : print(text)
        
    return text, full_text, date

In [5]:
files = os.listdir(path_data)
file = files[0]
text, full_text, date = Preprocess_PDF(file, print_text=True)

1990-10-12-the-route-to-performance.pdf
route performance seek investment performance average achieve remains major question. view subject come increasingly focus year gone two event late september especially juxtaposition made even clearer best pursue superior results. first article wall street journal prominent money management firm lagging performance. equity result basis point behind twelve month august result five year performance fallen behind well. president firm explained bold weighting werent wrong early. explanation strongly disagree want top money manager willing bottom too. call mind convertible mutual fund discussed second quarter letter convertible clients. fund held large amount common stock first eight month cash that. result return basis point better average convertible fund year . ahead second place fund. next half year tactic equally divergent . wrong time producing performance far enough behind negate majority achievement pull month result well back pack. observatio

In [6]:
# Count number of pages in total

num_pages = 0
num_memo = 0 # 132 in total
for file in files : 
    pdf = PdfFileReader(open(path_data + file, 'rb'))
    try : 
        num_pages += int(parser.from_file(path_data + file)['metadata']['xmpTPg:NPages'])
        num_memo +=1
    except : pass
    
print('From 1990 to 2020, Howard Mark wrote %i memos, totalling %i pages' % (num_memo, num_pages))



From 1990 to 2020, Howard Mark wrote 132 memos, totalling 1459 pages


In [7]:
# save results
preprocess_memos=False # take 35s
if preprocess_memos :

    df = pd.DataFrame()
    for file in files : 

        text1 = parser.from_file(path_data + file)['content']
        print(len(text1))

        text, full_text, date = Preprocess_PDF(file, print_text=False)
        df.loc[date, 'preprocessed_memo'] = text
        df.loc[date, 'full_sentences'] = full_text

        print(len(text), '\n')

    df = df.sort_index()
    
    save=False
    if save : 
        df.to_csv(cwd + 'Output/' + 'Preprocessed_Memos.csv')

else : 
    df = pd.read_csv(cwd + 'Output/' + 'Preprocessed_Memos.csv', index_col=0)
        
df

Unnamed: 0,preprocessed_memo,full_sentences
1990-10-12,route performance seek investment performance ...,the route to performance we all seek investmen...
1991-04-11,first quarter performance mood swing security ...,first quarter performance the mood swings of t...
1992-10-08,microeconomics supply demand convertible two p...,microeconomics 101 supply demand and convertib...
1993-02-15,value prediction whered rain come anyone clien...,the value of predictions or whered all this ra...
1994-01-24,addendum third quarter client letter howard ur...,addendum to third quarter client letter from h...
...,...,...
2020-03-19,latest update going provide information view t...,latest update im going to do all i can to prov...
2016-01-19,market know buddy sandy airline pilot. asked d...,what does the market know my buddy sandy was a...
2020-03-31,way last six week market seen best time worst ...,which way now in the last six weeks the market...
2017-09-07,yet july generated response year ive writing m...,yet again there they go again again of july 26...
