## 데이터를 CSV 파일로 저장

In [1]:
import csv

# Write a small demo table (n, n+2, (n+2)^2 for n in 0..9) to test.csv.
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' on write end up with a blank line after
# every row. The `with` block closes the file even if a write fails.
with open('test.csv', 'w', encoding='UTF-8', newline='') as csvFile:
    writer = csv.writer(csvFile)
    try:
        writer.writerow(('number', 'number+2', '(number+2)^2'))

        for i in range(10):
            writer.writerow((i, i + 2, pow(i + 2, 2)))
    except (OSError, csv.Error) as e:
        # Report write/formatting failures instead of aborting the cell.
        print(e)

## 예제 1: 테이블 데이터를 CSV로 저장

In [2]:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download and parse the Wikipedia comparison page; the response is closed
# as soon as BeautifulSoup has consumed it.
with urlopen('https://en.wikipedia.org/wiki/Comparison_of_text_editors') as html:
    bs = BeautifulSoup(html, 'html.parser')
# Use the first 'wikitable' table on the page.
table = bs.find_all('table', {'class' : 'wikitable'})[0]
rows = table.find_all('tr')

# newline='' lets the csv module emit its own line endings (avoids blank
# lines between rows on platforms that translate '\n').
with open('editors.csv', 'wt', encoding='utf-8', newline='') as csvFile:
    writer = csv.writer(csvFile)

    for row in rows:
        csvRow = []
        for cell in row.find_all(['th', 'td']):
            text = cell.text.strip()
            print(text)
            csvRow.append(text)
        # Write exactly one CSV row per <tr>, after all of its cells have
        # been collected (writing inside the cell loop emitted a growing
        # partial row for every cell).
        writer.writerow(csvRow)

Name
Developer
Initial release
Latest release
Program­ming language
Cost (US$)
License
GUI
TUI or CLI
Version
Date
Acme
Rob Pike
1993
Plan 9 and Inferno

C
No cost
MITGPL-2.0-onlyLPL-1.02


AkelPad
Alexey KuznetsovAlexander Shengalts
2003


C
No cost
BSD-2-Clause


Alphatk
Vince Darley
1999
8.3.3[1]
2004-12-10

$40
Proprietary, with BSD components


Atom
GitHub
2014
1.63.1[2]
2022-11-23
HTML, CSS, JavaScript, C++
No cost
MIT


BBEdit
Rich Siegel
1992
15.0.3[3]
2024-04-08
Objective-C, Objective-C++
No cost for most features, $49.99 for full version
Proprietary


Bluefish
Bluefish Development Team
1999
2.2.15[4]
2024-03-17
C
No cost
GPL-3.0-or-later


Brackets
Adobe Systems
2012
2.2.1[5]
2023-03-22
HTML, CSS, JavaScript, C++
No cost
MIT


Coda
Panic
2007
2.7.7[6]
2020-11-05
Objective-C
$99
Proprietary


ConTEXT
ConTEXT Project Ltd
1999


Object Pascal (Delphi)
No cost
BSD-3-Clause


Crimson Editor
Ingyu Kang
1999
3.72-r286m[7]
2011-10-01
C++
No cost
Proprietary


CudaText
UVViewSoft[a]
2

### 테이블 데이터를 CSV로 저장: html_table_parser 사용 예제 #1

In [3]:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
from html_table_parser import parser_functions as parse
import pandas as pd
import collections

# Download and parse the text-editor comparison article.
page_url = 'https://en.wikipedia.org/wiki/Comparison_of_text_editors'
html = urlopen(page_url)
bs = BeautifulSoup(html, 'html.parser')

# Take the first element matching <table class="wikitable"> and convert it
# into a 2-D list.
table = bs.find('table', attrs={'class': 'wikitable'})
table_data = parse.make2d(table)

# Print the first two rows of the table, each prefixed with its index label.
for row_index, label in enumerate(('[0]:', '[1]:')):
    print(label, table_data[row_index])

[0]: ['Name', 'Developer', 'Initial release', 'Latest release', 'Latest release', 'Program\xadming language', 'Cost (US$)', 'License', 'GUI', 'TUI or CLI']
[1]: ['Name', 'Developer', 'Initial release', 'Version', 'Date', 'Program\xadming language', 'Cost (US$)', 'License', 'GUI', 'TUI or CLI']


### 테이블 데이터를 CSV로 저장: html_table_parser 사용 예제 #2

In [9]:
# Build a pandas DataFrame: rows from index 2 onward are the data, and
# row 1 supplies the column names.
df = pd.DataFrame(table_data[2:], columns=table_data[1])
print(df.head())

# Save every row of table_data (both header rows included) as a CSV file.
# 'utf-8-sig' writes a UTF-8 BOM at the start of the file; newline='' lets
# the csv module control line endings. The `with` block guarantees the file
# is closed even if a row fails to serialize.
with open('editors1.csv', 'w', encoding='utf-8-sig', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(table_data)

      Name                            Developer Initial release  \
0     Acme                             Rob Pike            1993   
1  AkelPad  Alexey KuznetsovAlexander Shengalts            2003   
2  Alphatk                         Vince Darley            1999   
3     Atom                               GitHub            2014   
4   BBEdit                          Rich Siegel            1992   

              Version        Date       Program­ming language  \
0  Plan 9 and Inferno                                       C   
1                                                           C   
2            8.3.3[1]  2004-12-10                               
3           1.63.1[2]  2022-11-23  HTML, CSS, JavaScript, C++   
4           15.0.3[3]  2024-04-08  Objective-C, Objective-C++   

                                          Cost (US$)  \
0                                            No cost   
1                                            No cost   
2                                     

## 파이썬과 통합: 위키피디아 자료를 MySQL에 저장

In [12]:
from urllib.request import urlopen
from bs4 import BeautifulStoneSoup
import random
import pymysql
import re
import collections

# Compatibility shim: the `collections.Callable` alias was removed in
# Python 3.10, so re-create it from `collections.abc`.
# NOTE(review): presumably needed by an older third-party library that still
# references the alias -- confirm which one and drop this once it is updated.
import collections.abc  # ensure the `abc` submodule is actually loaded

collections.Callable = collections.abc.Callable

def store(conn, cur, title, content):
    """Insert one (title, content) row into the ``pages`` table and commit.

    Args:
        conn: Open database connection; ``commit()`` is called on it.
        cur: Cursor belonging to ``conn``; used to execute the INSERT.
        title: Page title to store.
        content: Page text to store.
    """
    # The driver quotes and escapes bound parameters itself, so the
    # placeholders must be bare %s. Wrapping them in double quotes makes the
    # driver's own quote characters part of the stored value.
    cur.execute('insert into pages (title, content) values (%s, %s)', (title, content))
    conn.commit()

# Matches internal article links: paths starting with /wiki/ that contain
# no colon anywhere.
_ARTICLE_LINK_RE = re.compile(r'^(/wiki/)((?!:).)*$')


def get_links(conn, cur, articleUrl):
    """Fetch a Wikipedia article, store its title and first paragraph, return its article links.

    Args:
        conn: Open database connection, passed through to store().
        cur: Cursor for ``conn``, passed through to store().
        articleUrl: Site-relative article path such as '/wiki/Kevin_Bacon'.

    Returns:
        The list of <a> tags inside div#bodyContent whose href matches the
        article-link pattern, or an empty list when that div is missing.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen('https://en.wikipedia.org' + articleUrl) as html:
        bs = BeautifulSoup(html, 'html.parser')

    # Any of these elements may be absent on an unusual page; fall back to
    # an empty string instead of raising AttributeError on None.
    title_tag = bs.find('h1')
    title = title_tag.text if title_tag is not None else ''

    body = bs.find('div', {'id': 'mw-content-text'})
    first_paragraph = body.find('p') if body is not None else None
    content = first_paragraph.text if first_paragraph is not None else ''
    print(title, content)

    # Store the data located with find() in the database.
    store(conn, cur, title, content)

    body_content = bs.find('div', {'id': 'bodyContent'})
    if body_content is None:
        return []
    return body_content.find_all('a', href=_ARTICLE_LINK_RE)

def main():
    """Random-walk through Wikipedia articles from Kevin Bacon, storing each page.

    Opens a MySQL connection, then repeatedly picks a random article link
    from the current page and follows it via get_links() until a page yields
    no links. The cursor and connection are closed on exit, including when
    an exception is raised.
    """
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file kept out of version control.
    # 'utf8mb4' is required for characters that need 4 bytes in UTF-8;
    # MySQL's 'utf8' charset is limited to 3 bytes per character and rejects
    # them with error 1366 ("Incorrect string value"). The `pages` table and
    # its text columns must use utf8mb4 as well.
    conn = pymysql.connect(host='localhost', user='joo9810',
                           password='signcity114', db='scraping', charset='utf8mb4')
    try:
        cur = conn.cursor()
        try:
            random.seed(None)

            # The initial fetch lives inside the try block so that a failure
            # here still releases the cursor and connection.
            links = get_links(conn, cur, '/wiki/Kevin_Bacon')
            while links:
                newArticle = random.choice(links).attrs['href']
                print(newArticle)
                links = get_links(conn, cur, newArticle)
        finally:
            cur.close()
    finally:
        conn.close()

# Run the crawl; main() starts from the '/wiki/Kevin_Bacon' article.
main()

Kevin Bacon 

/wiki/Gary_Sinise
Gary Sinise 

/wiki/Joe_Pesci
Joe Pesci 

/wiki/Daniel_Stern_(actor)
Daniel Stern (actor) Daniel Jacob Stern (born August 28, 1957)[1] is an American actor, artist, director, comedian, and screenwriter. He is best known for his roles as Marv Murchins in Home Alone (1990) and Home Alone 2: Lost in New York (1992), Phil Berquist in City Slickers (1991) and City Slickers II: The Legend of Curly's Gold (1994), the voice of adult Kevin Arnold on the television series The Wonder Years, and the voice of Dilbert on the animated series of the same name. Other notable films of his include Breaking Away (1979), Stardust Memories (1980), Diner (1982), Blue Thunder (1983), Hannah and Her Sisters (1986), The Milagro Beanfield War (1988), Coupe de Ville (1990), and Very Bad Things (1998). He made his feature-film directorial debut with Rookie of the Year (1993).

/wiki/I%27m_Dancing_as_Fast_as_I_Can
I'm Dancing as Fast as I Can I'm Dancing as Fast as I Can is a 1982 Am

DataError: (1366, "Incorrect string value: '\\xF0\\x90\\x9E\\x92\\xC7\\x80...' for column 'content' at row 1")