In [1]:
import requests
from lxml import html
import pandas as pd
from datetime import date
import re

In [3]:
# Keywords that flag an opportunity as relevant. They are matched as plain
# substrings of lower-cased text, so every entry must be lower case and
# accents matter ('saude' alone would never match the usual spelling 'saúde').
words_to_look = [
    # Spanish
    'salud',
    'farmacoeconomía',
    'medicamentos',  # same spelling in Portuguese
    # English
    'health',
    'pharmacoeconomics',
    'medicines',
    # French
    'santé',
    'pharmacoéconomie',
    'médicaments',
    # Portuguese
    'saúde',
    'saude',
    'farmacoeconomia',
]

# Date of this run, formatted dd/mm/YYYY.
today = date.today().strftime("%d/%m/%Y")
# Tag written to the 'source' column of every scraped row.
source = 'wbg'
# Listing page that enumerates the open opportunities.
main_url = 'https://www.worldbank.org/en/about/corporate-procurement/business-opportunities/administrative-procurement'

### Generación del CSV

In [28]:
# Empty frame that only fixes the column layout of the dataset.
df = pd.DataFrame(
    columns=[
        'url_detail_id',
        'scrapped_day',
        'title',
        'opening_date',
        'closing_date',
        'source',
        'is_alert',
        'location',
    ]
)

In [29]:
# Write the header-only CSV that later runs add rows to.
# mode='x' (exclusive creation) refuses to overwrite an existing file, so
# re-running this cell cannot wipe rows that were already scraped and saved.
try:
    df.to_csv('./oportunidades_wbg.csv', index=False, encoding='utf-8', header=True, mode='x')
except FileExistsError:
    print('./oportunidades_wbg.csv already exists; leaving it untouched')

In [5]:
# Remove the frame from the kernel namespace.
# NOTE(review): the scraping loop further down reads `df`, so it has to be
# re-created or re-loaded before that cell runs; no cell doing so is visible here.
del df

### probando

In [4]:
def get_page(url, timeout=30):
    """Download ``url`` and return its body parsed as an lxml HTML tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without it a stalled connection would block the notebook forever.

    Returns
    -------
    The tree produced by ``lxml.html.fromstring`` for the response body.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, including the timeout expiring.

    Notes
    -----
    The HTTP status code is not checked: an error page is parsed like any
    other response.
    """
    page = requests.get(url, timeout=timeout)
    tree = html.fromstring(page.content)
    return tree

In [5]:
def get_by_xpath_and_clean(tree, xpath, i=0):
    """Evaluate ``xpath`` on ``tree`` and return the match as cleaned text.

    Cleaning removes newlines, tabs and non-breaking spaces, then strips
    leading and trailing whitespace.

    Parameters
    ----------
    tree : object with an ``xpath(expr)`` method
        Node to query (for example an lxml element).
    xpath : str
        Expression expected to yield strings (``text()`` or ``@attr``).
    i : int or 'join', optional
        Index of the match to return. The string ``'join'`` concatenates
        every match with a single space before cleaning.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when the lookup fails (no match at
        index ``i``, a non-text result, an invalid expression, ...).
    """
    try:
        matches = tree.xpath(xpath)
        raw = ' '.join(matches) if i == 'join' else matches[i]
        return re.sub(r'[\n\t\xa0]', '', raw).strip()
    except Exception:
        # Best-effort lookup: a missing field yields None. Catching Exception
        # instead of a bare except lets KeyboardInterrupt/SystemExit propagate.
        return None

In [6]:

# Fetch and parse the listing page; the cells below query this tree.
response = get_page(main_url)

In [7]:
# Today's date formatted like "15-Mar-2023".
# NOTE(review): this re-binds `today`, replacing the dd/mm/YYYY string set in
# the configuration cell; the scraping loop stores whatever `today` holds
# when it runs.
today = date.today().strftime("%d-%b-%Y")
today

'23-Mar-2023'

In [8]:
# Every <tr> inside a <tbody> of any table on the listing page, then the count.
jobs = response.xpath('//table/tbody/tr')
len(jobs)

45

In [9]:
# Take the first row as a sample for trying out the XPath expressions.
job = jobs[0]

In [10]:
# href of the first link in the row (a site-relative path to the detail page)
get_by_xpath_and_clean(job,'./td/a/@href')

'/en/about/corporate-procurement/business-opportunities/administrative-procurement/rfp-23-0348-cleaning-services-for-ibrd-office-in-brussels-belgium'

In [11]:
# Title: text of the first link in the row
get_by_xpath_and_clean(job,'./td/a/text()')

'RFP 23-0348 CLEANING SERVICES FOR IBRD OFFICE IN BRUSSELS, BELGIUM'

In [12]:
# Reference code: text of the second cell
get_by_xpath_and_clean(job,'./td[2]/text()')

'RFP 23-0348'

In [13]:
# Opening date: text of the third cell
get_by_xpath_and_clean(job,'./td[3]/text()')

'February 28,2023'

In [14]:
# Closing date: text of the fourth cell
get_by_xpath_and_clean(job,'./td[4]/text()')

'March 27,2023'

In [20]:
# Absolute detail URL: listing URL + "/" + last path segment of the row's href.
# Raises if the row has no link, because the helper then returns None.
detail_url = main_url + "/" + get_by_xpath_and_clean(job,'./td/a/@href').rsplit('/', 1)[1]
detail_url

'https://www.worldbank.org/en/about/corporate-procurement/business-opportunities/administrative-procurement/rfp-23-0348-cleaning-services-for-ibrd-office-in-brussels-belgium'

In [21]:
# Download and parse the detail page of the sample row.
detail_page = get_page(detail_url)

In [23]:
# Title of the sample row; it is prepended to the text scanned for keywords.
title = get_by_xpath_and_clean(job,'./td/a/text()')
title

'RFP 23-0348 CLEANING SERVICES FOR IBRD OFFICE IN BRUSSELS, BELGIUM'

In [24]:
# Text scanned for keywords: the title followed directly (no separator) by all
# text found under the detail page's "c14v1-body-text" div, lower-cased.
# Raises TypeError if either part is None.
text_for_alert =  ( title + get_by_xpath_and_clean(detail_page, 
                                        '//div[@class="c14v1-body-text"]/descendant::text()', i='join')\
                        ).strip().lower()

In [25]:
# Inspect the assembled text.
text_for_alert

"rfp 23-0348 cleaning services for ibrd office in brussels, belgiumenter here the text that needs to be part of your advertisement. you may use the following text: \r the solicitation may be obtained by sending an email to the designated category manager, referencing the following information: \r 1. solicitation number \r 2. company name \r 3. contact person name and title \r 4. address \r 5. telephone number \r 6. fax number \r 7. contact's email address \r a copy of the solicitation will be sent to organizations that have replied to and are eligible to receive this advertisement. all requests and questions regarding this solicitation shall be directed to the following designated corporate procurement category manager: \r stefan kotupov at skotupov@worldbank.org \r the wbg invites qualified firms to submit technical and financial proposals to provide services associated with the world bank group’s requirements. \r firms must meet the following requirements to be eligible for the techn

In [53]:
# Scrape every listing row and add the opportunities that `df` does not
# contain yet. `df` must already exist with a 'url_detail_id' column.

# `today` is re-bound higher up with a different display format, so the scrape
# date is formatted here explicitly as dd/mm/YYYY (the configuration format).
scrapped_day = date.today().strftime("%d/%m/%Y")

# A set gives constant-time lookups and also catches rows that repeat within
# the same listing page.
known_urls = set(df['url_detail_id'].dropna())
new_rows = []

for job in jobs:
    href = get_by_xpath_and_clean(job, './td/a/@href')
    if not href:
        # Row without a link: there is no detail page to visit.
        continue

    # Detail pages live directly under the listing URL; keep only the slug.
    detail_url = main_url + "/" + href.rsplit('/', 1)[-1]

    if detail_url in known_urls:
        print('this job is already in the dataset')
        continue

    print('nueva oportunidad encontrada')
    detail_page = get_page(detail_url)

    title = get_by_xpath_and_clean(job, './td/a/text()')
    opening_date = get_by_xpath_and_clean(job, './td[3]/text()')
    closing_date = get_by_xpath_and_clean(job, './td[4]/text()')

    # Title plus the whole body of the detail page; either may be missing,
    # in which case it contributes nothing instead of raising.
    body_text = get_by_xpath_and_clean(
        detail_page,
        '//div[@class="c14v1-body-text"]/descendant::text()',
        i='join',
    )
    text_for_alert = ((title or '') + (body_text or '')).strip().lower()

    # Alert as soon as one keyword appears anywhere in the text.
    is_alert = any(word in text_for_alert for word in words_to_look)

    new_rows.append({
        'url_detail_id': detail_url,
        'scrapped_day': scrapped_day,
        'title': title,
        'opening_date': opening_date,
        'closing_date': closing_date,
        'is_alert': is_alert,
        'source': source,
    })
    known_urls.add(detail_url)

# DataFrame.append was removed in pandas 2.0; build the new rows once and
# concatenate a single time. Columns absent from the new rows stay NaN.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada


In [70]:
# Show the last rows of the dataset.
df.tail()

Unnamed: 0,url_detail_id,scrapped_day,title,opening_date,closing_date,source,is_alert,location
25,https://www.ungm.org/Public/Notice/195176,18/03/2023,SDC-003-2023 Suministro de detectores acústico...,18-Mar-2023,31-Mar-2023 18:00 (GMT -5.00),ungm,False,Mexico
26,https://www.ungm.org/Public/Notice/195178,18/03/2023,Cash for Work Through the Construction of Inta...,18-Mar-2023,02-Apr-2023 07:00 (GMT -5.00),ungm,False,Afghanistan
27,https://www.ungm.org/Public/Notice/195179,18/03/2023,CONSULTANCY - TO DEVELOP POLICIES ON CHILD PRO...,18-Mar-2023,02-May-2023 10:00 (GMT -5.00),ungm,False,Zimbabwe
28,https://www.ungm.org/Public/Notice/195180,18/03/2023,RFQ-122-23 Lot 2: Furnishing of Tikrit Univers...,18-Mar-2023,04-Apr-2023 00:00 (GMT 3.00),ungm,False,Iraq
29,https://www.ungm.org/Public/Notice/195177,18/03/2023,Invitación a Licitar (ITB) Ref FAONI -06/2023-...,17-Mar-2023,30-Mar-2023 17:00 (GMT -6.00),ungm,False,Nicaragua
