In [1]:
from PyPDF2 import PdfReader
import os
import pandas as pd
import glob

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  
import re

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report

In [2]:
def get_pdfs_in_directory(directory_path):
    # Use glob to get all PDF files in the directory
    pdf_files = glob.glob(os.path.join(directory_path, '*.pdf'))
    return pdf_files
pdf_not_relevant = get_pdfs_in_directory('test_data/not_relevant')
pdf_relevant = get_pdfs_in_directory('test_data/relevant')

In [3]:
def text_extract(reader):
    doc_text = ""
    num_pages = len(reader.pages)
    for i in range(num_pages):
        page_text = reader.pages[i].extract_text() + " "
        doc_text += page_text
    return doc_text

def get_info(path,relevant):
    with open(path, 'rb') as f:
        pdf = PdfReader(f)
        info = pdf.metadata
        author = info.author if '/Author' in info else None
        title = info.title if '/Title' in info else None
        subject = info.subject if '/Subject' in info else None
        keywords = info['/Keywords'] if '/Keywords' in info else None
        number_of_pages = len(pdf.pages)
        text = text_extract(pdf)
        relevance = 1 if relevant else 0
    return [author, title, subject, keywords, number_of_pages, text, relevance]

In [4]:
all_info = []
for pdf in pdf_not_relevant:
	all_info.append(get_info(pdf,relevant = False))
for pdf in pdf_relevant:
	all_info.append(get_info(pdf,relevant = True))
harvest_nlp = pd.DataFrame(all_info,columns = ['author','title','subject','keywords','num_pages','text','is_relevant'])

In [5]:
harvest_nlp.head()

Unnamed: 0,author,title,subject,keywords,num_pages,text,is_relevant
0,ECDC,Effectiveness and cost-effectiveness of antena...,,,37,TECHNICAL REPORT\nEffectiveness and cost- \n...,0
1,,,,,45,SPECIAL REPORT\nImplementing the \nDublin Dec...,0
2,,,,,15,SPECIAL REPORT\nThematic report: \nSex worker...,0
3,,,,,36,TECHNICAL REPORT\nRisk assessment on HIV in G...,0
4,ECDC,Chlamydia - Annual epidemiological report,"Chlamydia, surveillance, monitoring","Chlamydia, surveillance, monitoring",9,\nSuggested citation: European Centre for Dis...,0


## Remove/Replace Nulls

In [6]:
harvest_nlp.isna().sum()

author         43
title          48
subject        56
keywords       50
num_pages       0
text            0
is_relevant     0
dtype: int64

In [7]:
len(harvest_nlp)

72

In [8]:
harvest_nlp.columns

Index(['author', 'title', 'subject', 'keywords', 'num_pages', 'text',
       'is_relevant'],
      dtype='object')

In [9]:
#replace empty strings with Unknown
harvest_nlp.replace('','Unknown',inplace=True)
harvest_nlp.replace(' ','Unknown',inplace=True)

#fill na values, drop ones that aren't necessary (only dropped one since there isn't too much data)
harvest_nlp.fillna({'author': 'Unknown',
						'title': 'Unknown',
						'subject': 'Unknown',
						'keywords': 'Unknown',
						},inplace=True)

In [10]:
harvest_nlp.isna().sum()

author         0
title          0
subject        0
keywords       0
num_pages      0
text           0
is_relevant    0
dtype: int64

In [11]:
harvest_nlp

Unnamed: 0,author,title,subject,keywords,num_pages,text,is_relevant
0,ECDC,Effectiveness and cost-effectiveness of antena...,Unknown,Unknown,37,TECHNICAL REPORT\nEffectiveness and cost- \n...,0
1,Unknown,Unknown,Unknown,Unknown,45,SPECIAL REPORT\nImplementing the \nDublin Dec...,0
2,Unknown,Unknown,Unknown,Unknown,15,SPECIAL REPORT\nThematic report: \nSex worker...,0
3,Unknown,Unknown,Unknown,Unknown,36,TECHNICAL REPORT\nRisk assessment on HIV in G...,0
4,ECDC,Chlamydia - Annual epidemiological report,"Chlamydia, surveillance, monitoring","Chlamydia, surveillance, monitoring",9,\nSuggested citation: European Centre for Dis...,0
...,...,...,...,...,...,...,...
67,ECDC,Dublin Declaration evidence brief: HIV testing...,"Dublin Declaration, HIV testing",HIV testing; Dublin Declaration; partnership t...,11,Suggested citation: HIV testing in Europe and ...,1
68,Fabrice Donguy,AER_template.docx,Unknown,Unknown,8,\n \n \nSuggested citation: European Centre f...,1
69,Rumila Edward,2024-WCP-0017 Draft.docx,Not specified,Communicable disease threats report,13,\n \nEuropean Centre for Disease Prevention a...,1
70,ECDC,Continuum of HIV care: 2022 progress report,HIV,HIV; continuum of HIV care; continuum of care;...,69,SPECIAL REPORT\nContinuum of HIV care\n \nMoni...,1


## Preprocessing/Cleaning

In [16]:
STOPWORDS = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def clean_text(text):

    text = text.lower()

    patterns = {
        ### UNDERSTAND CONTEXT FIRST BEFORE CHANGING, HELPS TO MAKE MORE INFORMED DECISIONS
        #remove \n and \r and \t
        r'\n|\r|\t': ' ',

        #replaces 4-digit numbers starting with 1 or 2 with "year"
        r'\b[12]\d{3}\b':'year',
        
        }
    for pattern, replacement in patterns.items():
        text = re.sub(pattern, replacement, text)

    # remove punctuations
    puncs = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for p in puncs:
        text = text.replace(p,'') 
    
    # remove stopwords and make sure word length is greater than 1
    text = [word.lower() for word in text.split() if word.lower() not in STOPWORDS and len(word)>1]

    text = [lemmatizer.lemmatize(word) for word in text]

    text = " ".join(text) 

    return text

harvest_nlp['text_modified'] = harvest_nlp['text'].apply(lambda x: clean_text(x))

In [17]:
from collections import Counter

all_text = ''.join(harvest_nlp['text_modified'])
all_text_tokenized = [word for word in all_text.split()]
word_counts = Counter(all_text_tokenized)

In [18]:
harvest_nlp['text_modified'][0]

'technical report effectiveness cost effectiveness antenatal screening hiv hepatitis syphilis rubella susceptibility literature review wwwecdceuropaeu ecdc technical report effectiveness cost effectiveness antenatal screening hiv hepatitis syphilis rubella susceptibility literature review ii report commissioned european centre disease prevention control ecdc coordinated otilia mårdh tarik derrough andrew amato gauci report produced contract ecdc2012052 national institute health welfare hl carita savolainen kopra mia kontio marjukka mäkelä kirsi liitsola jukka lindeman jaana isojärvi heljä marja surcel irja davidkin henrikki brummer korvenkontio eija hiltunen back hanna nohynek tuija leino markku kuusi mika salminen helena de carvalho gomes ana belen escriva acknowledged internal ecdc support suggested citation european centre disease prevention control effectiveness cost effectiveness antenatal screening hiv hepatitis syphilis rubella susceptibility stockholm ecdc 2017 stockholm march 

In [21]:
word_counts

Counter({'hiv': 15977,
         'country': 11649,
         'data': 6679,
         '00': 5573,
         'report': 5099,
         'reported': 4824,
         'case': 4685,
         'testing': 4402,
         'among': 3946,
         'sex': 3914,
         'population': 3623,
         'health': 3567,
         'europe': 3551,
         'migrant': 3534,
         '10': 3378,
         'european': 3349,
         'number': 3320,
         'men': 3303,
         'infection': 3076,
         'surveillance': 3074,
         'total': 2955,
         'people': 2851,
         'prevention': 2833,
         'aid': 2723,
         '11': 2722,
         '12': 2689,
         'rate': 2566,
         'reporting': 2564,
         'centre': 2538,
         '15': 2528,
         'year': 2498,
         'drug': 2488,
         'national': 2463,
         'eueea': 2407,
         'treatment': 2398,
         '2008': 2295,
         'region': 2294,
         '13': 2282,
         'study': 2271,
         'msm': 2249,
         '14': 2192,


In [38]:
pattern = r'11'
match = harvest_nlp['text_modified'].str.findall(pattern)
match

0     [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...
1              [11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
2     [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...
3     [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...
4     [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...
                            ...                        
67     [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
68    [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...
69                                 [11, 11, 11, 11, 11]
70    [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...
71    [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...
Name: text_modified, Length: 72, dtype: object

In [36]:
hi = []
for description in harvest_nlp['text_modified']:
	found = re.findall(pattern,description)
	if (len(found)!=0):
		hi.append(found)
hi = sum(hi,[])
hi

['2017',
 '2017',
 '2011',
 '2014',
 '2011',
 '2011',
 '1990',
 '2011',
 '2012',
 '2012',
 '2014',
 '2000',
 '2014',
 '2011',
 '2012',
 '2012',
 '2014',
 '2014',
 '1982',
 '2003',
 '1987',
 '1985',
 '1993',
 '1994',
 '1998',
 '1999',
 '2003',
 '2000',
 '2005',
 '2004',
 '1999',
 '2006',
 '1991',
 '2002',
 '2001',
 '1988',
 '2008',
 '1997',
 '2002',
 '1997',
 '2008',
 '1996',
 '2000',
 '2002',
 '2005',
 '1997',
 '2000',
 '2001',
 '2000',
 '2008',
 '2004',
 '2004',
 '2004',
 '1999',
 '2003',
 '2003',
 '2003',
 '2000',
 '2004',
 '2000',
 '1996',
 '2000',
 '1996',
 '1987',
 '2002',
 '1996',
 '1996',
 '2002',
 '1987',
 '1995',
 '1996',
 '2002',
 '1999',
 '1999',
 '2010',
 '2002',
 '2004',
 '2002',
 '2004',
 '2005',
 '1984',
 '2007',
 '1984',
 '1986',
 '1996',
 '1998',
 '2002',
 '2004',
 '1997',
 '2004',
 '2001',
 '2006',
 '2008',
 '2004',
 '2008',
 '2004',
 '2003',
 '1425',
 '1998',
 '2001',
 '2008',
 '2009',
 '2005',
 '1989',
 '1993',
 '1998',
 '2008',
 '1997',
 '1996',
 '1993',
 '2006',
 

## Modeling

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


X_train, X_test , y_train, y_test = train_test_split(harvest_nlp['text_modified'].values,harvest_nlp['is_relevant'].values,
													 test_size=0.25,
													 random_state=42,
													 stratify=harvest_nlp['is_relevant'].values)

In [99]:
tfidf_vectorizer = TfidfVectorizer(max_features=3000) #play around with this
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

In [101]:
tfidf_train_vectors.shape

(54, 3000)

In [102]:
tfidf_train_vectors

<54x3000 sparse matrix of type '<class 'numpy.float64'>'
	with 53248 stored elements in Compressed Sparse Row format>

In [103]:
classifier = LogisticRegression()
classifier.fit(tfidf_train_vectors,y_train)
y_pred = classifier.predict(tfidf_test_vectors)

In [104]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94        16
           1       0.00      0.00      0.00         2

    accuracy                           0.89        18
   macro avg       0.44      0.50      0.47        18
weighted avg       0.79      0.89      0.84        18



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
