In [1]:
import requests
from lxml import html
import pandas as pd
from datetime import date
import re

In [17]:
# Keywords (Spanish, English, Portuguese) that mark an opportunity as an alert.
# They are matched as plain substrings of the lower-cased title, so accented
# and unaccented spellings have to be listed separately.
words_to_look = [
    # Spanish
    'salud',
    'farmacoeconomía',
    'medicamentos',
    # English
    'health',
    'pharmacoeconomics',
    'medicines',
    # Portuguese ('medicamentos' is shared with Spanish and listed only once)
    'saude',
    'saúde',
    'farmacoeconomia',
]

# Scrape date stamped on every new row, formatted as dd/mm/YYYY.
today = date.today().strftime("%d/%m/%Y")
# Value written to the `source` column of the dataset.
source = 'unops'
# CSV file that holds the dataset of opportunities.
file_name = './unops_ops.csv'
# Listing page that is scraped for vacancies.
main_url = 'https://jobs.unops.org/Pages/ViewVacancy/VAListing.aspx'

### Generación del CSV

In [3]:
# Empty frame that only fixes the column layout of the opportunities dataset.
dataset_columns = [
    'url_detail_id', 'scrapped_day', 'title', 'opening_date',
    'closing_date', 'source', 'is_alert', 'location',
]
df = pd.DataFrame(columns=dataset_columns)

In [4]:
# Write the header-only CSV the first time the notebook runs.
# mode='x' (exclusive creation) refuses to overwrite an existing file, so
# re-running this cell cannot wipe whatever an earlier run stored in it.
try:
    df.to_csv(file_name, index=False, encoding='utf-8', header=True, mode='x')
except FileExistsError:
    print('csv already exists, keeping its contents')

In [5]:
# Discard the in-memory frame; the next cell loads the dataset from the CSV.
del df

### probando

In [6]:
# Load the stored dataset from the CSV file (file_name) into df.
df = pd.read_csv(file_name, encoding='utf-8')

In [7]:
# Display the dataset that was just loaded from the CSV.
df

Unnamed: 0,url_detail_id,scrapped_day,title,opening_date,closing_date,source,is_alert,location


In [8]:
def get_page(url, timeout=30):
    """Download a page and parse it into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        The root element produced by ``lxml.html.fromstring`` from the
        response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is not parsed as if it were the requested page.
        requests.RequestException: For connection errors and timeouts.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    page.raise_for_status()
    return html.fromstring(page.content)

In [9]:
def get_by_xpath_and_clean(tree, xpath, i=0):
    """Evaluate an XPath on ``tree`` and return the cleaned text result.

    Args:
        tree: Object exposing an ``xpath(expression)`` method (e.g. an lxml
            element).
        xpath: XPath expression expected to yield strings.
        i: Index of the match to return (default: the first match). Pass the
            string ``'join'`` to concatenate every match with single spaces.

    Returns:
        The selected text with newlines, tabs and non-breaking spaces removed
        and surrounding whitespace stripped, or ``None`` when the lookup
        fails (no match at index ``i``, ``tree`` cannot be queried, the
        result is not text, ...).
    """
    try:
        matches = tree.xpath(xpath)
        raw_text = ' '.join(matches) if i == 'join' else matches[i]
        return re.sub(r'[\n\t\xa0]', '', raw_text).strip()
    except Exception:
        # Best-effort lookup: any ordinary failure means "value not available".
        # Catching Exception (instead of a bare except) lets KeyboardInterrupt
        # and SystemExit propagate, so a running scrape can still be stopped.
        return None

In [18]:

# Fetch the listing page. Despite its name, `response` holds the parsed lxml
# tree returned by get_page, not a requests.Response object.
response = get_page(main_url)

In [28]:
# Collect the href value of every <a> whose href contains the substring "id".
# The result is a list of href strings, not link or row elements.
job_link_xpath = '//a[contains(@href,"id")]/@href'
jobs = response.xpath(job_link_xpath)
len(jobs)

15

In [29]:
# Display the collected hrefs.
jobs

['VADetails.aspx?id=25673',
 'VADetails.aspx?id=24682',
 'VADetails.aspx?id=24689',
 'VADetails.aspx?id=25670',
 'VADetails.aspx?id=25659',
 'VADetails.aspx?id=25608',
 'VADetails.aspx?id=25658',
 'VADetails.aspx?id=25674',
 'VADetails.aspx?id=25676',
 'VADetails.aspx?id=25677',
 'VADetails.aspx?id=25679',
 'VADetails.aspx?id=25667',
 'VADetails.aspx?id=25639',
 'VADetails.aspx?id=25619',
 'VADetails.aspx?id=25421']

In [31]:
# Build the absolute detail URL of the first collected vacancy link.
vacancy_base_url = 'https://jobs.unops.org/Pages/ViewVacancy/'
detail_url = '{}{}'.format(vacancy_base_url, jobs[0])
detail_url

'https://jobs.unops.org/Pages/ViewVacancy/VADetails.aspx?id=25673'

In [49]:
# Spot-check of the helper on a single job.
# NOTE(review): no earlier cell shown here defines `job`; it is first bound by
# the loop in the next cell, so on a fresh kernel this line raises NameError.
get_by_xpath_and_clean(job, './@data-noticeid')

'192182'

In [53]:
# Scan the scraped jobs and add the ones that are not stored yet to the dataset.
# NOTE(review): the XPaths below query each `job` as a row element carrying a
# `data-noticeid` attribute. Entries that cannot be queried that way (for
# example plain href strings) yield no notice id and are skipped with a message.
deadline_xpath = './/div[@data-description="Deadline"]'

# URLs already stored, kept in a set so every membership check is O(1).
known_urls = set(df['url_detail_id'].dropna())
# New rows are collected here and added in one step after the loop:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# is quadratic.
new_rows = []

for job in jobs:
    # Build the detail url; the helper returns None when the id is missing.
    notice_id = get_by_xpath_and_clean(job, './@data-noticeid')
    if notice_id is None:
        print('could not read the notice id, skipping job')
        continue
    detail_url = main_url + '/' + notice_id

    # Skip jobs already in the dataset or already collected during this run.
    if detail_url in known_urls:
        print('this job is already in the dataset')
        continue

    print('nueva oportunidad encontrada')
    # Title of the opportunity (None when the element is not found).
    title = get_by_xpath_and_clean(job, './/span[@class="ungm-title ungm-title--small"]/text()')
    # Location
    location = get_by_xpath_and_clean(job, deadline_xpath + '/following-sibling::div[5]/span/text()')
    # Opening date
    opening_date = get_by_xpath_and_clean(job, deadline_xpath + '/following-sibling::div/span/text()')
    # Closing date
    closing_date = get_by_xpath_and_clean(job, deadline_xpath + '/span/text()')

    # Flag the job when any keyword appears in its title. Only the title is
    # inspected; a missing title counts as empty text instead of raising.
    text_for_alert = (title or '').strip().lower()
    is_alert = any(word in text_for_alert for word in words_to_look)

    new_rows.append({'url_detail_id': detail_url, 'scrapped_day': today, 'title': title,
                     'opening_date': opening_date, 'closing_date': closing_date,
                     'location': location, 'is_alert': is_alert, 'source': source})
    known_urls.add(detail_url)

# Add every new row at once, keeping the column layout of the dataset.
if new_rows:
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    df = new_df if df.empty else pd.concat([df, new_df], ignore_index=True)

nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada
nueva oportunidad encontrada


In [70]:
# Show the last rows of the dataset.
df.tail()

Unnamed: 0,url_detail_id,scrapped_day,title,opening_date,closing_date,source,is_alert,location
25,https://www.ungm.org/Public/Notice/195176,18/03/2023,SDC-003-2023 Suministro de detectores acústico...,18-Mar-2023,31-Mar-2023 18:00 (GMT -5.00),ungm,False,Mexico
26,https://www.ungm.org/Public/Notice/195178,18/03/2023,Cash for Work Through the Construction of Inta...,18-Mar-2023,02-Apr-2023 07:00 (GMT -5.00),ungm,False,Afghanistan
27,https://www.ungm.org/Public/Notice/195179,18/03/2023,CONSULTANCY - TO DEVELOP POLICIES ON CHILD PRO...,18-Mar-2023,02-May-2023 10:00 (GMT -5.00),ungm,False,Zimbabwe
28,https://www.ungm.org/Public/Notice/195180,18/03/2023,RFQ-122-23 Lot 2: Furnishing of Tikrit Univers...,18-Mar-2023,04-Apr-2023 00:00 (GMT 3.00),ungm,False,Iraq
29,https://www.ungm.org/Public/Notice/195177,18/03/2023,Invitación a Licitar (ITB) Ref FAONI -06/2023-...,17-Mar-2023,30-Mar-2023 17:00 (GMT -6.00),ungm,False,Nicaragua
