# Selenium for 법무법인_태평양_법률자료

### Import Files

In [1]:
import requests
import lxml.html
import sqlite3 as sq3
from pandas.io import sql
import os
import re
import string
import pandas as pd
from tabulate import tabulate
from selenium.webdriver import Chrome
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from selenium import webdriver
from selenium.webdriver import ActionChains
import csv

## get_urls()
- gets the list of urls to detail pages

In [2]:
def get_urls():
    
    page = 1
    url_list = []
    
    response = requests.get('https://www.bkl.co.kr/law/insight/legalDataList?searchCondition=&searchKeyword=&searchDateFrom=&searchDateTo=&orderBy=orderByNew&pageIndex={}&whichOne=NEWSLETTER&menuType=law&lawNo=&expertNo=&newsletterNo=&memberNo=&fieldNo=&lang=ko'.format(page))
    root = lxml.html.fromstring(response.content)

    # 페이지 번호 중에서 max 페이지 가져오기    
    a = root.xpath('//*[@id="searchForm"]/div/div[3]/div[4]/div/div/div[4]/a')[0]
    onclick = a.get('onclick')
    max_page = int(onclick.split("'")[1])
    
    while(True):
        
        response = requests.get('https://www.bkl.co.kr/law/insight/legalDataList?searchCondition=&searchKeyword=&searchDateFrom=&searchDateTo=&orderBy=orderByNew&pageIndex={}&whichOne=NEWSLETTER&menuType=law&lawNo=&expertNo=&newsletterNo=&memberNo=&fieldNo=&lang=ko'.format(page))
        root = lxml.html.fromstring(response.content)
        for li in root.xpath('//*[@id="searchForm"]/div/div[3]/div[3]/ul/li') :
            a = li.xpath('div/a')[0]
            url_1 = a.get('href')
            url_2 = int(url_1.split("'")[1])
            url_3 = 'https://www.bkl.co.kr/law/insight/newsletter/detail?searchCondition=&searchKeyword=&searchDateFrom=&searchDateTo=&orderBy=orderByNew&pageIndex=1&whichOne=NEWSLETTER&menuType=law&lawNo=&expertNo=&newsletterNo={}&memberNo=&fieldNo=&lang=ko'.format(url_2)
            url_list.append(url_3)

        if (page >= max_page):
            break
        else:
            page = page + 1

        time.sleep(1)
        
    return url_list

## Save URL list into a file

In [3]:
def saveURLs(url_list, file_name):
    # open file in write mode
    with open(file_name, 'w') as fp:
        for url in url_list:
            # write each item on a new line
            fp.write("%s\n" % url)
        print('URLs saved!')

In [4]:
def readURLs(file_name):
    # empty list to read list from a file
    urls = []
    # open file and read the content in a list
    with open(file_name, 'r') as fp:
        for line in fp:
            # remove linebreak from a current name
            # linebreak is the last character of each line
            x = line[:-1]

            # add current item to the list
            urls.append(x)
    return(urls)

## remove_punc()
- removes punctuations from string

In [5]:
def remove_punc(data):

    punc = '[!"#$%&\'()*+,-./:;<=>?[\]^_`{|}~“”·「」△《》•‘’○※▷【】『』·-“”)I]'
    new_string = re.sub(punc, '', data) # 특수문자 제거
    new_string = re.sub('\n', ' ', new_string) # newline 제거
    new_string = re.sub('\\s+', ' ', new_string) # multiple spaces 제거
    return new_string

## db_save()
- saves data as db file

In [6]:
def db_save(ARTICLE_LIST, db_name, table_name):
    with sq3.connect(os.path.join('.',db_name)) as con: # sqlite DB 파일이 존재하지 않는 경우 파일생성
        try:
            ARTICLE_LIST.to_sql(name = table_name, con = con, index = False, if_exists='replace') 
            #if_exists : {'fail', 'replace', 'append'} default : fail
        except Exception as e:
            print(str(e))
        print(len(ARTICLE_LIST), '건 저장완료..')

## db_select()
- read data from db file

In [7]:
def db_select(db_name, table_name):
    with sq3.connect(db_name) as con: 
        try:
            query = 'SELECT * FROM {}'.format(table_name)
            df = pd.read_sql(query, con = con)
        except Exception as e:
            print(str(e)) 
        return df

In [8]:
def db_delete(db_name, table_name):
    with sq3.connect(db_name) as con: 
        try:
            cur = con.cursor()
            sql = 'DELETE FROM {}'.format(table_name)
            cur.execute(sql)
        except Exception as e:
            print(str(e))

## get_info()
- returns detailed information on the article pages as list

In [9]:
def get_info(url):
    details = []
    article = []
    topic = []
    topicString = ""
    articleString = ""
    
    browser.get(url)
    time.sleep(2)
    
    title = browser.find_element('xpath','//*[@id="listConditionVO"]/div/div[1]/div/div[2]').text
    date = browser.find_element('xpath','//*[@id="listConditionVO"]/div/div[1]/div/div[1]').text
    topics = browser.find_elements('xpath','//*[@id="listConditionVO"]/div/div[2]/div/div[2]/ul/li')
    
    # click expand button
    if (browser.find_element('xpath','//*[@id="listConditionVO"]/div/div[1]/div/button')):
        click_expand = browser.find_element('xpath','//*[@id="listConditionVO"]/div/div[1]/div/button').click()
    text_box = browser.find_element('xpath','//div[@class="txt"]')

    for eachTopic in topics:
        t1 = eachTopic.find_element('xpath','a').text
        topic.append(t1)
    topicString = ",".join(topic)
    
    child_el = text_box.find_elements('xpath','.//*')
    for eachText in child_el:
        article.append(eachText.text)
        browser.implicitly_wait(5)
    articleString = ' '.join(article)
    
    browser.implicitly_wait(5)
    
    words = remove_punc(articleString)
    details.append(title)
    details.append(date)
    details.append(topicString)
    details.append(words)
    
    return details

## db_save_as_csv()
- saves dataframe as csv

In [10]:
def db_save_as_csv(data):
    data.to_csv("법무법인_태평양_법률자료.csv", index=False, encoding='utf-8')

### ---------------------------------------------------------------------------------------------------------------------
# Main
This is the main part which uses functions to read articles and saves the data as csv file
### ---------------------------------------------------------------------------------------------------------------------

In [11]:
from selenium.webdriver.chrome.options import Options

options = Options()

#바이너리 위치 설정
#options.binary_location= 'C:\Program Files\Google\Chrome\Application\chrome.exe'

#urls = get_urls()
df_list = []
URL_FILE = "url_list.txt"
TABLE_NAME = "태평양"

#saveURLs(urls, URL_FILE)          # save urls to txt file
url_list = readURLs(URL_FILE)     # read urls from txt file

browser = webdriver.Chrome('C:/Users/user/.wdm/drivers/chromedriver/win32/105.0.5195/chromedriver.exe', chrome_options = options) #드라이버경로 지정

for url in url_list:
    details = get_info(url)
    
    df = pd.DataFrame([{
        "제목": details[0],
        "날짜": details[1],
        "관련주제": details[2],
        "본문내용": details[3]
    }])
    
    df_list.append(df)
    
    browser.implicitly_wait(4)

ARTICLE_LIST = pd.concat(df_list)
db_save(ARTICLE_LIST, '법무법인_태평양_법률자료.db',TABLE_NAME)
db_save_as_csv(ARTICLE_LIST)

browser.close()
browser.quit()

  browser = webdriver.Chrome('C:/Users/user/.wdm/drivers/chromedriver/win32/105.0.5195/chromedriver.exe', chrome_options = options) #드라이버경로 지정
  browser = webdriver.Chrome('C:/Users/user/.wdm/drivers/chromedriver/win32/105.0.5195/chromedriver.exe', chrome_options = options) #드라이버경로 지정


405 건 저장완료..


In [12]:
df1 = db_select('법무법인_태평양_법률자료.db', TABLE_NAME)
df1

Unnamed: 0,제목,날짜,관련주제,본문내용
0,한국-인도네시아 포괄적 경제동반자협정(CEPA) 비준,2022.09.13,"기업법무,해외투자,국제쟁송,관세·국제통상,동남아시아",인도네시아 국회의 한인니 CEPA 비준 및 진행 상황 인도네시아 국회의 한인니 C...
1,지역주택조합이 조합총회 결의 없이 체결한 계약의 효력에 관한 대법원판결(대법원 20...,2022.09.06,"국내소송,건설·부동산소송,건설",주택법 시행규칙은 지역주택조합이 예산으로 정한 사항 외에 조합원에게 부담이 될 계약...
2,금융투자업자 간 업무용으로 지정되지 않은 정보통신수단을 사용하여 이루어진 매매 교섭...,2022.09.05,"자본시장,국내소송,증권금융소송,금융회사,은행,증권,자산운용,비은행 금융기관",배경 배경 배경 1 금융투자업자의 주문기록 유지의무 및 업무용 정보통신수단 지정 ...
3,도심복합사업 개편 – 정부의 국민 주거안정 실현방안 발표 및 ‘도심 복합개발 지원에...,2022.08.23,"국내소송,건설·부동산소송,건설",정부와 국회는 도심 내 주택을 신속히 공급하기 위해 공공주택 특별법을 개정하여 도심...
4,2022년도 지방세입 관계법률 개정안 주요 사항,2022.08.12,조세,행정안전부는 2022 8 11 지방세발전위원회를 개최하여 경제 활력 제고와 민생안정...
...,...,...,...,...
400,BKL Newsletter - Spring 2013,2013.04.10,"인사·노무,공정거래 조사·소송,국제쟁송,기업인수합병,보험,일반민사소송,자본시장,조세...",
401,BKL Legal Update - 3. 20. 방송사 전산망 마비사태와 관련하여 말...,2013.03.22,정보보호·프라이버시,
402,BKL Legal Update - 3. 20. 금융기관 전산망 마비사태와 관련하여 ...,2013.03.22,정보보호·프라이버시,
403,BKL Legal Update - 환경부의 특정수질유해물질 배출 전면조사 관련 예상...,2013.02.27,환경,


In [21]:
df1.iloc[400,3]

''

In [14]:
df1.isna().any()

제목      False
날짜      False
관련주제    False
본문내용    False
dtype: bool

In [15]:
index_no_art = df1.loc[df1['본문내용'] == ''].index.values
index_no_art

array([281, 286, 291, 301, 307, 316, 324, 333, 340, 347, 351, 359, 368,
       378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390,
       391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403,
       404], dtype=int64)

In [19]:
len(index_no_art)

40