In [1]:
from PyPDF2 import PdfReader
import os
import pandas as pd
import glob

In [2]:
def get_pdfs_in_directory(directory_path):
    """Return the paths of all ``*.pdf`` files directly inside ``directory_path``.

    Parameters
    ----------
    directory_path : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        Matching paths, each prefixed with ``directory_path``, in sorted order.
        An empty list is returned when nothing matches or the directory does
        not exist.
    """
    # glob yields entries in whatever order the filesystem reports them;
    # sorting keeps downstream row order reproducible across machines and runs.
    return sorted(glob.glob(os.path.join(directory_path, '*.pdf')))
# Collect the PDF paths for each class; the folder a file sits in is what
# later determines its relevance label.
pdf_not_relevant = get_pdfs_in_directory('test_data/not_relevant')
pdf_relevant = get_pdfs_in_directory('test_data/relevant')

In [3]:
def text_extract(reader):
    """Concatenate the extracted text of every page in ``reader``.

    ``reader`` must expose a ``pages`` sequence whose items provide
    ``extract_text()``. Each page's text is followed by one space, so the
    result ends with a trailing space whenever there is at least one page,
    and is the empty string when there are none.
    """
    return "".join(page.extract_text() + " " for page in reader.pages)

def get_info(path,relevant):
    """Read one PDF and return its metadata, page count, text and label.

    Parameters
    ----------
    path : str
        Path of the PDF file to open.
    relevant : bool
        Whether the document belongs to the relevant class.

    Returns
    -------
    list
        ``[author, title, subject, keywords, number_of_pages, text, relevance]``.
        A metadata field is ``None`` when the PDF does not define it;
        ``relevance`` is ``1`` when ``relevant`` is truthy, otherwise ``0``.
    """
    with open(path, 'rb') as f:
        pdf = PdfReader(f)
        # `metadata` is None when the file has no document-information
        # dictionary; fall back to an empty mapping so the key checks below
        # yield None for every field instead of raising TypeError.
        info = pdf.metadata or {}
        author = info.author if '/Author' in info else None
        title = info.title if '/Title' in info else None
        subject = info.subject if '/Subject' in info else None
        keywords = info['/Keywords'] if '/Keywords' in info else None
        number_of_pages = len(pdf.pages)
        # Text must be extracted while the file handle is still open.
        text = text_extract(pdf)
        relevance = 1 if relevant else 0
    return [author, title, subject, keywords, number_of_pages, text, relevance]

In [5]:
# One record per PDF: the not-relevant folder first (label 0), then the
# relevant folder (label 1), so row order follows the two path lists.
all_info = []
for pdf_paths, is_relevant in ((pdf_not_relevant, False), (pdf_relevant, True)):
    for pdf in pdf_paths:
        all_info.append(get_info(pdf, relevant=is_relevant))

# Column order mirrors the list returned by get_info.
column_names = ['author','title','subject','keywords','num_pages','text','is_relavent']
harvest_nlp = pd.DataFrame(all_info, columns=column_names)

In [6]:
# Preview the first rows to sanity-check the extracted metadata and text.
harvest_nlp.head()

Unnamed: 0,author,title,subject,keywords,num_pages,text,is_relavent
0,ECDC,Effectiveness and cost-effectiveness of antena...,,,37,TECHNICAL REPORT\nEffectiveness and cost- \n...,0
1,,,,,45,SPECIAL REPORT\nImplementing the \nDublin Dec...,0
2,,,,,15,SPECIAL REPORT\nThematic report: \nSex worker...,0
3,,,,,36,TECHNICAL REPORT\nRisk assessment on HIV in G...,0
4,ECDC,Chlamydia - Annual epidemiological report,"Chlamydia, surveillance, monitoring","Chlamydia, surveillance, monitoring",9,\nSuggested citation: European Centre for Dis...,0


## Remove/Replace Nulls

In [7]:
# Count missing values per column before any cleaning.
harvest_nlp.isna().sum()

author         43
title          48
subject        56
keywords       50
num_pages       0
text            0
is_relavent     0
dtype: int64

In [54]:
# Total number of rows (one per PDF read), for comparison with the null counts.
len(harvest_nlp)

72

In [14]:
# List the current column names.
# NOTE(review): the recorded output already shows `text_modified`, which is only
# created in the Preprocessing/Cleaning cell further down, so this output appears
# to come from an out-of-order run. Re-check after Restart & Run All.
harvest_nlp.columns

Index(['author', 'title', 'subject', 'keywords', 'num_pages', 'text',
       'is_relavent', 'text_modified'],
      dtype='object')

In [23]:
# Normalise missing metadata: blank / whitespace-only strings and nulls in the
# four metadata columns all become the placeholder 'Unknown'.
# The replacement is deliberately limited to these columns so that an empty
# `text` value is never overwritten with the word 'Unknown' and then treated
# as document content. No rows are dropped here.
metadata_columns = ['author', 'title', 'subject', 'keywords']
harvest_nlp[metadata_columns] = (
    harvest_nlp[metadata_columns]
    .replace(r'^\s*$', 'Unknown', regex=True)
    .fillna('Unknown')
)

In [24]:
# Re-count missing values to confirm the metadata columns no longer contain nulls.
# NOTE(review): the recorded output includes `text_modified`, a column that is
# only created in the next section. Confirm the result on a fresh kernel.
harvest_nlp.isna().sum()

author           0
title            0
subject          0
keywords         0
num_pages        0
text             0
is_relavent      0
text_modified    0
dtype: int64

## Preprocessing/Cleaning

In [25]:
# Regex -> replacement pairs applied, in order, to the lower-cased text.
# Extend this mapping only after inspecting the text in context.
patterns = {
    # newline, carriage return and tab each become a single space
    r'\n|\r|\t': ' ',
}

# Work on a lower-cased copy so the raw `text` column stays untouched.
cleaned_text = harvest_nlp['text'].apply(lambda document: document.lower())
for pattern, replacement in patterns.items():
    cleaned_text = cleaned_text.str.replace(pattern, replacement, regex=True)
harvest_nlp['text_modified'] = cleaned_text

harvest_nlp

Unnamed: 0,author,title,subject,keywords,num_pages,text,is_relavent,text_modified
0,ECDC,Effectiveness and cost-effectiveness of antena...,Unknown,Unknown,37,TECHNICAL REPORT\nEffectiveness and cost- \n...,0,technical report effectiveness and cost- ef...
1,Unknown,Unknown,Unknown,Unknown,45,SPECIAL REPORT\nImplementing the \nDublin Dec...,0,special report implementing the dublin decla...
2,Unknown,Unknown,Unknown,Unknown,15,SPECIAL REPORT\nThematic report: \nSex worker...,0,special report thematic report: sex workers ...
3,Unknown,Unknown,Unknown,Unknown,36,TECHNICAL REPORT\nRisk assessment on HIV in G...,0,technical report risk assessment on hiv in gr...
4,ECDC,Chlamydia - Annual epidemiological report,"Chlamydia, surveillance, monitoring","Chlamydia, surveillance, monitoring",9,\nSuggested citation: European Centre for Dis...,0,suggested citation: european centre for dise...
...,...,...,...,...,...,...,...,...
67,ECDC,Dublin Declaration evidence brief: HIV testing...,"Dublin Declaration, HIV testing",HIV testing; Dublin Declaration; partnership t...,11,Suggested citation: HIV testing in Europe and ...,1,suggested citation: hiv testing in europe and ...
68,Fabrice Donguy,AER_template.docx,Unknown,Unknown,8,\n \n \nSuggested citation: European Centre f...,1,suggested citation: european centre for ...
69,Rumila Edward,2024-WCP-0017 Draft.docx,Not specified,Communicable disease threats report,13,\n \nEuropean Centre for Disease Prevention a...,1,european centre for disease prevention and...
70,ECDC,Continuum of HIV care: 2022 progress report,HIV,HIV; continuum of HIV care; continuum of care;...,69,SPECIAL REPORT\nContinuum of HIV care\n \nMoni...,1,special report continuum of hiv care monitor...


In [27]:
# Inspect the cleaned text of the row labelled 0 to eyeball the preprocessing result.
harvest_nlp.loc[0, 'text_modified']

'technical  report effectiveness and cost-   effectiveness of antenatal screening  for hiv, hepatitis b, syphilis and   rubella susceptibility   literature review www.ecdc.europa.eu ecdc  technical report   effectiveness and cost -effectiveness of  antenatal screening for hiv, hepatitis b,  syphilis and rubella susceptibility   literature  review                         ii this report was commissioned by the european centre for disease prevention and control (ecdc) and coordinated  by otilia mårdh, tarik derrough and andrew amato -gauci.   the report was produced under contract ecdc/2012/052 with the national institute for health and welfare (t hl)  by carita savolainen -kopra, mia kontio, marjukka mäkelä, kirsi liitsola, jukka lindeman, jaana isojärvi, heljä -marja  surcel, irja davidkin, henrikki brummer -korvenkontio, eija hiltunen -back, hanna nohynek, tuija leino, markku  kuusi, and mika salminen.   helena de carvalho gomes and ana -belen escriva are acknowledged for internal ecdc