# Import Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import datetime
import csv
import openpyxl
from openpyxl import Workbook
import re

# Create Functions

## Scrape info

In [2]:
def NiproScraper(year,month,day):
    """Scrape the Nipro medical site's news list for items dated year/month/day.

    Opens Chrome in incognito mode, answers the initial "medical staff?"
    question on http://med.nipro.co.jp/index, activates the medical-news
    button and reads the ``dt`` (date) / ``dd`` (news) entries of the list.
    The browser is always closed before the function returns.

    Args:
        year, month, day: The date to look for. Each value is converted with
            ``int`` for the comparison and with ``str`` for the output, so
            ``8`` and ``'08'`` both match a page date of 8月 or 08月.

    Returns:
        A list of ``[date, title, link, new_product]`` lists, one per news
        item published on the requested date. ``date`` is
        ``str(year)+str(month)+str(day)`` with no separator, ``link`` is the
        ``href`` of the item's first anchor, and ``new_product`` is 1 when the
        title contains '新発売' and 0 otherwise. The list is empty when no
        item matches or when the landing page does not load within 10 seconds.
    """
    # Compare dates numerically so zero-padded page dates still match.
    target = (int(year), int(month), int(day))

    # Output date format: the parts concatenated without anything in between.
    # NOTE(review): this is unpadded, so e.g. 2020-1-11 and 2020-11-1 both
    # become '2020111'. Kept as-is because the storage functions deduplicate
    # on this exact string.
    date = str(year)+str(month)+str(day)

    date_pattern = re.compile(r'(\d{4})年(\d{1,})月(\d{1,})日')
    new_product_condition = '新発売'
    gate_xpath = '//*[@id="j_id0:j_id16"]/ul/li[1]/a'

    # open chrome in incognito mode
    options = webdriver.ChromeOptions()
    options.add_argument('--incognito')
    browser = webdriver.Chrome(options=options)

    try:
        # deal with the first "medical staff?" question
        browser.get('http://med.nipro.co.jp/index')
        timeout = 10
        try:
            # until() returns the located element once it is visible
            login_bt = WebDriverWait(browser, timeout).until(
                EC.visibility_of_element_located((By.XPATH, gate_xpath))
            )
        except TimeoutException:
            print('Timed Out Waiting for page to load')
            # Nothing can be scraped; the finally clause closes the browser.
            return []
        login_bt.click()
        browser.implicitly_wait(3)

        # activate medical news button (clicked through JavaScript)
        activate_bt = browser.find_element(
            By.XPATH,
            '/html/body/div[1]/div[4]/div[1]/div[1]/div[2]/ul/li[2]/span'
        )
        browser.execute_script('arguments[0].click();', activate_bt)

        # Go to production info section and collect the list entries
        top_info = browser.find_element(By.XPATH, '//*[@id="j_id0:j_id83"]/dl')
        date_list = top_info.find_elements(By.CSS_SELECTOR, 'dt')
        news_list = top_info.find_elements(By.CSS_SELECTOR, 'dd')

        # The n-th dt is paired with the n-th dd.
        result = []
        for date_element, news_element in zip(date_list, news_list):
            match = date_pattern.search(date_element.text)
            if match is None:
                # Entry without a recognisable date: skip instead of crashing
                continue
            if tuple(int(part) for part in match.groups()) != target:
                continue

            # Get link and title from the first anchor of the news entry
            anchor = news_element.find_element(By.CSS_SELECTOR, 'a')
            link = anchor.get_attribute('href')
            title = anchor.text

            # If the title contains "新発売", then flag it as a new product
            new_product = 1 if new_product_condition in title else 0

            result.append([date, title, link, new_product])
        return result
    finally:
        # Always release the browser, on success and on any error.
        browser.quit()

## Store info to CSV

In [3]:
def Nipro_to_csv(result, path='Nipro.csv', encoding=None):
    """Append scraped news rows to a CSV file, creating it with a header if needed.

    Args:
        result: Sequence of ``[date, title, link, new_product]`` rows. All
            rows are written; only the date of the first row is used for the
            duplicate check.
        path: CSV file to read and append to. Defaults to 'Nipro.csv'.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            uses the platform default encoding.

    Returns:
        None. Nothing is written when ``result`` is empty or when a row whose
        first column equals the date is already in the file; in the latter
        case 'Already added to csv' is printed.
    """
    # check if the result is empty
    if len(result) == 0:
        return

    # get date for checking its existence later
    date = result[0][0]

    header = ['日付',
              'カテゴリコード',
              'メーカーコード',
              'メーカー名称',
              '新着記事カテゴリ',
              '新着記事タイトル',
              '新着記事URL',
              '新製品記事'
              ]

    # try to open the csv file; newline='' is what the csv module expects and
    # avoids spurious blank rows on Windows
    try:
        with open(path, newline='', encoding=encoding) as csvfile:
            for row in csv.reader(csvfile):
                # blank lines come back as empty rows, so guard the index
                if row and row[0] == date:
                    # the date is already there, don't add anything
                    print('Already added to csv')
                    return
        need_header = False
    except FileNotFoundError:
        # if there's no such file, it is created below with a header
        need_header = True

    # add new data (append mode creates the file when it does not exist)
    with open(path, 'a', newline='', encoding=encoding) as csvfile:
        writer = csv.writer(csvfile)
        if need_header:
            writer.writerow(header)
        for item in result:
            writer.writerow([item[0],
                             '',
                             '',
                             'ニプロ',
                             '',
                             item[1],
                             item[2],
                             item[3]])

## Store info to Excel

In [4]:
def Nipro_to_excel(result, path='Nipro.xlsx'):
    """Append scraped news rows to sheet 'Sheet1' of an Excel workbook.

    Args:
        result: Sequence of ``[date, title, link, new_product]`` rows. All
            rows are written; only the date of the first row is used for the
            duplicate check.
        path: Workbook file to load and save. Defaults to 'Nipro.xlsx'.

    Returns:
        None. Nothing is written when ``result`` is empty or when a row whose
        first column equals the date is already in the sheet; in the latter
        case 'Already added to excel' is printed.
    """
    # check if the result is empty
    if len(result) == 0:
        return

    # get date for checking its existence later
    date = result[0][0]

    # try to open the workbook
    try:
        wb = openpyxl.load_workbook(path)
        ws = wb['Sheet1']
        for row in ws.iter_rows(values_only=True):
            # the date is already there, don't add anything
            if row[0] == date:
                print('Already added to excel')
                return
    # if there is no such file, we create a new one
    except FileNotFoundError:
        wb = Workbook()
        # Reuse the sheet a new workbook starts with, so the saved file does
        # not carry an extra empty sheet next to 'Sheet1'.
        ws = wb.active
        ws.title = 'Sheet1'
        ws.append(['日付',
                   'カテゴリコード',
                   'メーカーコード',
                   'メーカー名称',
                   '新着記事カテゴリ',
                   '新着記事タイトル',
                   '新着記事URL',
                   '新製品記事'])

    # New rows go directly below the last used row; every item in result is
    # written.
    first_free_row = ws.max_row + 1
    for row_index, item in enumerate(result, start=first_free_row):
        ws.cell(row=row_index, column=1, value=item[0])   # 日付
        # columns 2-3 (カテゴリコード、メーカーコード) are left empty
        ws.cell(row=row_index, column=4, value='ニプロ')   # メーカー名称
        # column 5 (新着記事カテゴリ) is left empty
        ws.cell(row=row_index, column=6, value=item[1])   # 新着記事タイトル
        ws.cell(row=row_index, column=7, value=item[2])   # 新着記事URL
        ws.cell(row=row_index, column=8, value=item[3])   # 新製品記事

    wb.save(path)

In [5]:
if __name__=='__main__':
    # Date whose news items should be collected
    year, month, day = 2020, 8, 19

    # Scrape once, show what was found, then store it in both output files
    result = NiproScraper(year, month, day)
    print(result)
    Nipro_to_excel(result)
    Nipro_to_csv(result)

WebDriverException: Message: unknown error: cannot determine loading status
from unknown error: cannot determine loading status
from disconnected: Unable to receive message from renderer
  (Session info: chrome=85.0.4183.83)
