In [None]:
# https://www.rite.or.jp/en/news/#scenario

In [1]:
import json
import os 
import requests
import time
from datetime import datetime,timedelta
from datetime import date
import re
import sys
import urllib, urllib.request, urllib.parse
import random
from scrawl import *
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.firefox.options import Options

# Timestamps for this run: a wall-clock start for the final duration report,
# plus the time and date strings handed to the logger factory below.
start_time = time.time()
current_time = f"{datetime.now():%H-%M-%S}"
created_on = f"{date.today():%Y-%m-%d}"

# Client and site identity (the client id can be taken from the command line
# instead: client_id = sys.argv[1]).
client_id = '5f69d22ef472d6646f577fa6'  # Europe
site = 'rite_or_jp_publication'
site_name = 'Research Institute of Innovative Technology for the Earth (RITE) (Japan)'

# Crawl helper from the project-local `scrawl` module; used for every
# directory, download and regex-extraction call in this script.
c = Crawl()

# Output locations, created in this order: logs, images, PDFs.
log_path = c.create_directories(project_path, client_id, site)
image_directory = c.create_image_directories(project_path)
pdf_directory = c.create_pdf_directories(project_path, site)

# Logger for this run, built from the log path and the run's date/time strings.
logger = log_func(log_path, created_on, current_time)
logger.info('Process Started ...\n')


# Driver connection: start Firefox without a visible window.
# The headless flag is passed as a command-line argument rather than through
# `options.headless = True`: that property was deprecated in Selenium 4.8 and
# removed in later 4.x releases, where the assignment would silently do
# nothing and a visible browser window would be opened instead.
options = Options()
options.add_argument('-headless')
browser = webdriver.Firefox(options=options)
browser.maximize_window()

# Run statistics; all start at zero and are reported in the closing log summary.
skipped_due_to_headline = skipped_due_to_content = skipped_due_to_date = 0
missing_overall_tonality = no_of_data = duplicate_data = 0
unable_to_fetch_article_url = unable_to_download_pdf = 0

# Source metadata copied into every stored record.
publish_source, country, language = 'rite.or.jp', 'Japan', 'English'
images_path = list()

# Foot-fall figure for this source, looked up once through the crawl helper.
foot_fall = c.get_foot_fall(publish_source)
# Load the news index page in the browser and capture its rendered HTML.
# The browser is not needed after this point, so it is always closed in the
# `finally` clause, on success and on failure alike, to avoid leaving a
# Firefox process running.
home_url = 'https://www.rite.or.jp/en/news/'
try:
    browser.get(home_url)
    time.sleep(3)  # fixed pause after navigation (presumably to let the page finish loading)
    home_page = browser.page_source
except Exception as exc:
    # Without the index page there is nothing to crawl: log and stop the run.
    logger.info('Unable to fetch home page\n')
    logger.info(f'{exc}\n')
    sys.exit()
finally:
    browser.quit()

# Keep only the part of the page captured between <tbody> and </tbody>
# (the news table); c.scrap is the project's regex-extraction helper.
home_page = c.scrap('<tbody>(.*?)</tbody>', home_page)

# Walk the news table row by row. For each row: resolve the article link,
# skip member-only and already-stored links, download the article page,
# extract headline / date / body text / linked PDF, and insert one record
# into the `cl_data` collection. Rows that lack a headline, date or content
# are skipped and counted.

# Patterns tried in order to locate the article body; the first one that
# yields a non-empty result wins.
CONTENT_PATTERNS = (
    r'<div\s*class="boxDocTit">(.*?)<div\s*id="ftPagetop">',
    r'<p\s*class="txt.*?>(.*?)<p\s*id="pagetop">',
    r'<h3\s*class=.*?>(.*?)<div\s*id="side">',
    r'<p\s*class="txt.*?>(.*?)<p\s*class="boxPagesEntry">',
)

# The first chunk (text before the first <tr>) and the final chunk are
# deliberately excluded by the [1:-1] slice.
for row in home_page.split('<tr>')[1:-1]:

    # source_link
    source_link = c.scrap('href="(.*?)"', row)
    if 'system/en/members/' in source_link:
        continue

    # handle duplicates: skip links that are already stored
    if cl_data.find_one({'source_link': source_link}, {'source_link': 1}):
        duplicate_data += 1
        continue

    # random 1-3 second pause between article requests
    time.sleep(random.randint(1, 3))

    logger.info(f'Fetching {source_link}\n')

    page = c.download_page(source_link)

    # download_page reports failure through a message starting with this prefix
    if page.startswith('Unable to fetch'):
        logger.info(page)
        unable_to_fetch_article_url += 1
        continue

    # Headline: text of <title> up to the first '|', falling back to the
    # link text of the index row.
    source_headline = c.scrap(r'<title>(.*?)\|', page)
    if not source_headline:
        source_headline = c.scrap('class="title">.*?<a.*?>(.*?)<', row)
    source_headline = re.sub('&quot;', '"', source_headline)
    source_headline = re.sub('&.*?;', '', source_headline)

    # skip if headline not found
    if not source_headline:
        logger.info(f'Skipping due to headline {source_link}\n')
        skipped_due_to_headline += 1
        continue

    # Date and time
    pub_date, publish_time = '', current_time

    try:
        date_time_str = c.scrap(r'<td\s*class="date">(.*?)</td>', row)
        date_time_str = re.sub('&.*?;', '', date_time_str)
        # Drop everything except word characters and '+' so the remaining
        # text can be parsed as month-abbreviation + day + year.
        date_time_str = re.sub(r'[^\w+]', '', date_time_str)
        date_time_obj = datetime.strptime(date_time_str, '%b%d%Y')
        # Zero offset: the parsed value is stored unchanged.
        ist_date_time = date_time_obj - timedelta(hours=0, minutes=0)
        pub_date = ist_date_time.strftime('%Y-%m-%d')
        publish_time = ist_date_time.strftime('%H:%M:%S')
    except Exception as exc:
        # Best effort: an unparseable date leaves pub_date empty and the row
        # is skipped below. Only ordinary errors are caught, so Ctrl-C and
        # interpreter exit still propagate.
        logger.info(f'Unable to parse date for {source_link}: {exc}\n')

    # skip null date
    if not pub_date:
        logger.info(f'Skipping due to date {source_link}\n')
        skipped_due_to_date += 1
        continue

    # break if date is not today's date
#     if pub_date != created_on:
#         break

    # source_content: first matching pattern, else fall back to the headline
    source_content = ''
    for content_pattern in CONTENT_PATTERNS:
        source_content = c.scrap(content_pattern, page)
        if source_content:
            break
    if not source_content:
        source_content = source_headline

    # Clean-up. `flags=` must be passed by keyword: as the fourth positional
    # argument of re.sub, re.S is taken as `count` (16), which disables
    # DOTALL and caps the number of replacements.
    source_content = re.sub(r'<div\s*class="text-muted\s*text-small">(.*?)</div>', '', source_content, flags=re.S)
    source_content = source_content.replace('Download', '')
    source_content = re.sub(r'<i\s*class="icon-salient-heart-2">(.*?)</a></span>', '', source_content, flags=re.S)
    # Entity stripping stays confined to a single line (no DOTALL), so a
    # stray '&' cannot swallow text up to a ';' several lines further on.
    source_content = re.sub('&.*?;', '', source_content)
    source_content = c.strip_html(source_content)

    if not source_content:
        logger.info(f'Skipping due to content {source_link}\n')
        skipped_due_to_content += 1
        continue

    journalist = c.scrap("'author-name':'(.*?)'", page) or 'NA'

    # current date and time
    harvest_time = datetime.now().strftime("%H:%M:%S")

    # overall_tonality
    overall_tonality = ''

    # word count over headline + content
    word_count = len((source_headline + ' ' + source_content).split())

    html_content = ''

    # Linked PDFs: every href ending in '.pdf' is downloaded. The record
    # keeps the URL, name and path of the last PDF that downloaded
    # successfully; all three stay empty when the page links to no PDF or
    # every download fails.
    pdf_url, pdf_name, pdf_path = '', '', ''
    for href_chunk in page.split('href=')[1:]:
        candidate_url = c.scrap('"(.*?)"', href_chunk)
        if not candidate_url.endswith('.pdf'):
            continue
        candidate_name = c.scrap(r'.*\/(.*)', candidate_url)
        candidate_path = f'{pdf_directory}/{candidate_name}'

        # download pdf
        pdf = c.download_pdf(candidate_url, candidate_path)
        if pdf.startswith('Unable to fetch'):
            logger.info(pdf)  # writes error message with error code
            unable_to_download_pdf += 1
            continue
        pdf_url, pdf_name, pdf_path = candidate_url, candidate_name, candidate_path

    # storing the above data in a dictionary
    clientdata = {
        "client_master": client_id,
        "articleid": client_id,
        "medium": 'Web',
        "searchkeyword": [],
        "entityname": [],
        "process_flage": "1",
        "na_flage": "0",
        "na_reason": "",
        "qc_by": "",
        "qc_on": "",
        "location": "",
        "spokeperson": "",
        "quota": "",
        "overall_topics": "",
        "person": "",
        "overall_entites": "",
        "overall_tonality": overall_tonality,
        "overall_wordcount": word_count,
        "article_subjectivity": "",
        "article_summary": "",
        "pub_date": pub_date,
        "publish_time": publish_time,
        "harvest_time": harvest_time,
        "temp_link": source_link,
        "publish_source": publish_source,
        "programme": 'null',
        "feed_class": "News",
        "publishing_platform": "",
        "klout_score": "",
        "journalist": journalist,
        "headline": source_headline,
        "content": source_content,
        "source_headline": source_headline,
        "source_content": source_content,
        "language": language,
        "presence": 'null',
        "clip_type": 'null',
        "prog_slot": 'null',
        "op_ed": '0',
        "location_mention": '',
        "source_link": source_link,
        "author_contact": '',
        "author_emailid": '',
        "author_url": '',
        "city": '',
        "state": '',
        "country": country,
        "source": publish_source,
        "foot_fall": foot_fall,
        "created_on": created_on,
        "active": '1',
        'crawl_flag': 2,
        "images_path": images_path,
        "html_content": html_content,
        "pdf_url": pdf_url,
        "pdf_name": pdf_name,
        "pdf_path": pdf_path,
    }

    cl_data.insert_one(clientdata)
    no_of_data += 1
# Closing report: one log entry per run statistic, in a fixed order.
_summary_lines = (
    'Iteration complete\n',
    f'Number of data: {no_of_data}\n',
    f'Duplicate data: {duplicate_data}\n',
    f'Unable to fetch article url: {unable_to_fetch_article_url}\n',
    f'Skipped due to headline: {skipped_due_to_headline}\n',
    f'Skipped due to content: {skipped_due_to_content}\n',
    f'Unable to download pdf: {unable_to_download_pdf}\n',
    f'Skipped due to date: {skipped_due_to_date}\n',
    f'country: {country}\n',
    f'language: {language}\n',
)
for _summary_line in _summary_lines:
    logger.info(_summary_line)

# Elapsed wall-clock time is measured last, against the start_time taken at launch.
logger.info(f'Processing finished in {time.time() - start_time} seconds.\n')

https://www.rite.or.jp/bio/en/naist/
http://www.rite.or.jp/system/en/system-group-papers/
https://www.rite.or.jp/system/en/latestanalysis/2021/06/2050carbonneutrality.html
https://www.rite.or.jp/chemical/en/works/
http://www.rite.or.jp/system/en/system-group-papers/
https://www.rite.or.jp/en/results/today/2021/
https://www.rite.or.jp/bio/en/works/


In [None]:
522839469