In [11]:
import json
import os 
import requests
import time
from datetime import datetime,timedelta
from datetime import date
import re
import sys
import urllib, urllib.request, urllib.parse
import random
from scrawl import *
    
# --- Run timestamps ---------------------------------------------------------
# start_time feeds the elapsed-time line logged at the end of the run.
# current_time and created_on are handed to log_func below; created_on is also
# the date every feed item is compared against and stamped with.
start_time = time.time()
current_time = f'{datetime.now():%H-%M-%S}'
created_on = date.today().isoformat()

# --- Target identity --------------------------------------------------------
# The client id is hard-coded here; the command-line variant would be
# client_id = sys.argv[1]
client_id = '5f69d22ef472d6646f577fa6'  # Europe
site = 'collections_unu_edu_publication'
site_name = 'United Nations University (UNU) (Japan)'

# Crawler helper object (comes from the scrawl star-import).
c = Crawl()

# --- Working directories (logs, PDFs, images) -------------------------------
log_path = c.create_directories(project_path, client_id, site)
pdf_directory = c.create_pdf_directories(project_path, site)
image_directory = c.create_image_directories(project_path)

# --- Logger -----------------------------------------------------------------
logger = log_func(log_path, created_on, current_time)
logger.info("Process Started ...\n")

# Year used to build the listing URL requested below.
current_year = f'{date.today():%Y}'

# --- Counters, all starting at zero -----------------------------------------
skipped_due_to_headline = skipped_due_to_content = skipped_due_to_date = 0
missing_overall_tonality = 0
no_of_data = duplicate_data = 0
unable_to_fetch_article_url = unable_to_download_pdf = 0

# --- Values copied unchanged into every record ------------------------------
publish_source = 'unu.edu'
country = 'Japan'
language = 'English'
images_path = []
foot_fall = c.get_foot_fall(publish_source)

# --- Listing for the current year; split into <item> chunks by the main loop -
home_page = c.download_page(
    f'http://collections.unu.edu/list/year/{current_year}/?tpl=2'
)

# Walk the listing one <item> chunk at a time.  Element [0] of the split is
# whatever precedes the first <item>, so it is dropped.
#
# For each item the loop: skips links already stored in cl_data, downloads the
# article page, extracts headline / date / content / author, downloads any
# attached PDFs, and assembles the `clientdata` record.
for item in home_page.split('<item>')[1:]:
    # source_link
    source_link = c.scrap('<link>(.*?)</link>', item)
    if not source_link:
        continue
    # NOTE(review): hard-coded exclusion of every link containing '8127'; the
    # reason is not recorded in this file -- confirm it is still needed.
    if '8127' in source_link:
        continue

    # handle duplicates: skip links that already exist in the collection
    if cl_data.find_one({'source_link': source_link}, {'source_link': 1}):
        duplicate_data += 1
        continue

    # random 1-3 second pause before each article request
    time.sleep(random.randint(1, 3))

    logger.info(f'Fetching {source_link}\n')

    page = c.download_page(source_link)

    # download_page signals failure through a message starting with
    # 'Unable to fetch'; log it and move on to the next item
    if page.startswith('Unable to fetch'):
        logger.info(page)
        unable_to_fetch_article_url += 1
        continue

    # headline comes from the article page <title>, with HTML entities removed
    source_headline = c.scrap('<title>(.*?)</title>', page)
    source_headline = re.sub('&.*?;', '', source_headline)
    # skip if headline not found
    if not source_headline:
        logger.info(f'Skipping due to headline {source_link}\n')
        skipped_due_to_headline += 1
        continue

    # Date and time: <pubDate> looks like 2021-04-28T00:36:32Z.  Punctuation and
    # the trailing Z are stripped so it parses as %Y%m%dT%H%M%S.
    pub_date, publish_time = '', ''
    try:
        date_time_str = c.scrap('<pubDate>(.*?)</pubDate>', item)
        date_time_str = re.sub(r'[^\w+]', '', date_time_str)
        date_time_str = re.sub('Z', '', date_time_str)
        date_time_obj = datetime.strptime(date_time_str, '%Y%m%dT%H%M%S')
        # NOTE(review): the shift applied is +02:30.  If the intent is to
        # convert the 'Z' timestamp to IST that would be +05:30 -- confirm the
        # intended offset before changing it.
        shifted_dt = date_time_obj + timedelta(hours=2, minutes=30)
        pub_date = shifted_dt.strftime('%Y-%m-%d')
        publish_time = shifted_dt.strftime('%H:%M:%S')
    except (TypeError, ValueError, OverflowError):
        # Missing or unparseable pubDate: pub_date stays empty and the item is
        # reported and skipped by the check just below.
        pass

    # skip null date
    if not pub_date:
        logger.info(f'Skipping due to date {source_link}\n')
        skipped_due_to_date += 1
        continue

    # Stop the whole run at the first item not dated today.
    # NOTE(review): presumably relies on the listing being ordered newest
    # first -- confirm.
    if pub_date != created_on:
        break

    # source_content: feed <description>, falling back to the headline, then
    # stripped of <script> blocks, HTML entities and remaining markup
    source_content = c.scrap('<description>(.*?)</description>', item)
    if not source_content:
        source_content = source_headline
    source_content = re.sub('<script>(.*?)</script>', '', source_content)
    source_content = re.sub('&.*?;', '', source_content)
    source_content = c.strip_html(source_content)
    if not source_content:
        logger.info(f'Skipping due to content {source_link}\n')
        skipped_due_to_content += 1
        continue

    journalist = c.scrap("<author>(.*?)</author>", item)
    if not journalist:
        journalist = 'NA'

    # time at which this item was harvested
    harvest_time = datetime.now().strftime("%H:%M:%S")

    # temp link
    temp_link = source_link

    # headline and content
    headline = source_headline
    content = source_content

    # overall_tonality
    overall_tonality = ''

    # word count over headline + content
    word_count = len((source_headline + ' ' + source_content).split())

    html_content = ''

    # image_urls
    image_urls = []

    # PDFs: an item may carry several <media:content> entries.  Every one is
    # downloaded; the record keeps the string fields of the last PDF that
    # downloaded successfully, and all three stay '' when there is none.
    # A dedicated loop variable is used so the feed item is never shadowed.
    pdf_name, pdf_url, pdf_path = '', '', ''
    for media_url in re.findall('<media:content url="(.*?)"', item):
        candidate_name = c.scrap(r'.*\/(.*)', media_url)
        candidate_path = f'{pdf_directory}/{candidate_name}'
        # download pdf
        pdf = c.download_pdf(media_url, candidate_path)
        if pdf.startswith('Unable to fetch'):
            logger.info(pdf)
            unable_to_download_pdf += 1
            continue
        pdf_url, pdf_name, pdf_path = media_url, candidate_name, candidate_path

    # storing the above data in a dictionary
    clientdata = {
        "client_master": client_id,
        "articleid": client_id,
        "medium": 'Web',
        "searchkeyword": [],
        "entityname": [],
        "process_flage": "1",
        "na_flage": "0",
        "na_reason": "",
        "qc_by": "",
        "qc_on": "",
        "location": "",
        "spokeperson": "",
        "quota": "",
        "overall_topics": "",
        "person": "",
        "overall_entites": "",
        "overall_tonality": overall_tonality,
        "overall_wordcount": word_count,
        "article_subjectivity": "",
        "article_summary": "",
        "pub_date": pub_date,
        "publish_time": publish_time,
        "harvest_time": harvest_time,
        "temp_link": temp_link,
        "publish_source": publish_source,
        "programme": 'null',
        "feed_class": "News",
        "publishing_platform": "",
        "klout_score": "",
        "journalist": journalist,
        "headline": headline,
        "content": content,
        "source_headline": source_headline,
        "source_content": source_content,
        "language": language,
        "presence": 'null',
        "clip_type": 'null',
        "prog_slot": 'null',
        "op_ed": '0',
        "location_mention": '',
        "source_link": source_link,
        "author_contact": '',
        "author_emailid": '',
        "author_url": '',
        "city": '',
        "state": '',
        "country": country,
        "source": publish_source,
        "foot_fall": foot_fall,
        "created_on": created_on,
        "active": '1',
        'crawl_flag': 2,
        "images_path": images_path,
        "html_content": html_content,
        "pdf_url": pdf_url,
        "pdf_name": pdf_name,
        "pdf_path": pdf_path,
    }
    # NOTE(review): the insert is currently disabled, so the record is built
    # and counted but not stored.
#     cl_data.insert_one(clientdata)
    no_of_data += 1


# End-of-run report: one log line per counter, then the fixed country and
# language values, then the wall-clock duration since start_time.
logger.info('Iteration complete\n')
for _label, _value in (
    ('Number of data', no_of_data),
    ('Duplicate data', duplicate_data),
    ('Unable to fetch article url', unable_to_fetch_article_url),
    ('Skipped due to headline', skipped_due_to_headline),
    ('Skipped due to content', skipped_due_to_content),
    ('Skipped due to date', skipped_due_to_date),
    ('Unable to download pdf', unable_to_download_pdf),
    ('country', country),
    ('language', language),
):
    logger.info(f'{_label}: {_value}\n')
elapsed_seconds = time.time() - start_time
logger.info(f'Processing finished in {elapsed_seconds} seconds.\n')

http://collections.unu.edu/view/UNU:7956
http://collections.unu.edu/view/UNU:7985
http://collections.unu.edu/view/UNU:7930
http://collections.unu.edu/view/UNU:8009
http://collections.unu.edu/view/UNU:8039
http://collections.unu.edu/view/UNU:7960
http://collections.unu.edu/view/UNU:8075
http://collections.unu.edu/view/UNU:8128
http://collections.unu.edu/view/UNU:8090
http://collections.unu.edu/view/UNU:8210
http://collections.unu.edu/view/UNU:7958
http://collections.unu.edu/view/UNU:8126
http://collections.unu.edu/view/UNU:8008
http://collections.unu.edu/view/UNU:7977
http://collections.unu.edu/view/UNU:8083
http://collections.unu.edu/view/UNU:8154
http://collections.unu.edu/view/UNU:7934
http://collections.unu.edu/view/UNU:7967
http://collections.unu.edu/view/UNU:7936
http://collections.unu.edu/view/UNU:8033
http://collections.unu.edu/view/UNU:8036
http://collections.unu.edu/view/UNU:8040
http://collections.unu.edu/view/UNU:8065
http://collections.unu.edu/view/UNU:8091
http://collectio

# This is how re.findall is used to extract the PDF URLs from a feed item.

In [1]:
import re
# Sample feed <item> fragment used by the regex demo in the following cells:
# it holds a link, a description, a pubDate, an author list and three
# <media:content> entries pointing at PDF files.
i = '''
<link>http://collections.unu.edu/view/UNU:8065</link>
<description>Between April 2020 and March 2021, Delta 8.7 convened global expert Working Groups to produce three Policy Guides to address “what works” to achieve Target 8.7 in three broad domains: Justice, Crisis and Markets. Delta 8.7’s Crisis Policy Guide seeks to assist policymakers in assessing “what works” to end modern slavery in the context of markets. For the purposes of this Policy Guide, Crisis represents a critical threat to basic human rights of a community or other large group of people, usually over a wide area. It requires a unified response from multiple actors, which may involve an international or cross-border response. It can include conflict and natural disasters (including pandemics). The purpose of these Policy Guides is to provide a highly credible and current articulation of what we know about the global and national policies needed to accelerate progress towards SDG Target 8.7 in a format that is useful for policy actors, providing a snapshot of “what works”. The specific audience for these Guides is multilateral and national-level policymakers. The Guides do not seek to dictate to policymakers how they should organize to achieve Target 8.7 — because that requires an understanding of the specific challenges in each country, the available resources and other contextual factors. Instead, they aim to provide an evidence-based policy resource that is useful across contexts and to policy actors around the world, including those thinking about multilateral policy frameworks.</description>
<pubDate>2021-04-28T00:36:32Z</pubDate>
<author> Bryant, Katharine og Frame, John og Tabusca, Silvia og Davis, Tina og Hynes, Patricia og Otiende, Sophie </author>
<media:content url="http://collections.unu.edu/eserv/UNU:8065/Delta87_CrisisPolicyGuide.pdf" type="application/pdf"/>
<media:content url="http://collections.unu.edu/eserv/UNU:8065/FR_Crisis_Policy_Guide.pdf" type="application/pdf"/>
<media:content url="http://collections.unu.edu/eserv/UNU:8065/SP_Crisis_Policy_Guide.pdf" type="application/pdf"/>
</item>
'''

In [2]:
# Collect the url="..." attribute of every <media:content> tag in the sample
# item; findall with a single capture group returns the captured strings.
media_content_pattern = re.compile('<media:content url="(.*?)"')
url = media_content_pattern.findall(i)

In [3]:
# Print each extracted PDF URL on its own line.  The loop variable is
# deliberately not called `i`: that name holds the sample feed item, and
# overwriting it would make a re-run of the extraction cell return nothing.
for pdf_link in url:
    print(pdf_link)

http://collections.unu.edu/eserv/UNU:8065/Delta87_CrisisPolicyGuide.pdf
http://collections.unu.edu/eserv/UNU:8065/FR_Crisis_Policy_Guide.pdf
http://collections.unu.edu/eserv/UNU:8065/SP_Crisis_Policy_Guide.pdf
