BeautifulSoup 사용하여 HTML 파싱

In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Build a soup from a string literal.
soup1 = BeautifulSoup("<HTML><HEAD><<headers>></HEAD><<body>></HTML>")

# Alternatively, build a soup from a local file.
#with open("myDoc.html") as infile:
#  soup2 = BeautifulSoup(infile)

# Alternatively, build a soup from a web page. The response is opened in a
# `with` block so the HTTP connection is closed as soon as parsing is done.
with urlopen("http://www.networksciencelab.com/") as doc:
  soup3 = BeautifulSoup(doc)


In [None]:
# Show the tree that was parsed from the string literal.
soup1

<html><head></head><body><p>&lt;<headers>&gt;&lt;&gt;</headers></p></body></html>

In [None]:
# NOTE(review): the only assignment to soup2 visible in this notebook is
# commented out, so that line must be enabled (and myDoc.html must exist)
# before this cell can run.
soup2

<html><head></head><body><p>&lt;<headers>&gt;&lt;&gt;</headers></p></body></html>

In [None]:
# Show the tree that was parsed from the downloaded web page.
soup3

<html>
<head>
<title>My Little Network Science Lab</title>
<link href="style.css" rel="stylesheet" type="text/css"/>
<meta charset="utf-8"/>
</head>
<body>
<h1>My Little Network Science Lab</h1>
<h2>By Dmitry Zinoviev</h2>
<p>
</p><table class="hdr"><tr><td><h3 class="nomargin">Books</h3></td></tr></table>
<p>
<a href="https://pragprog.com/book/dzpyds/data-science-essentials-in-python"><img align="left" border="1" src="https://imagery.pragprog.com/products/490/dzpyds_xlargecover.jpg?1468006361"/></a>
<a href="https://pragprog.com/book/dzcnapy/complex-network-analysis-in-python"><img align="left" border="1" src="https://imagery.pragprog.com/products/541/dzcnapy_xlargecover.jpg?1508250011"/></a>


I am excited to announce my books, "Data Science Essentials in Python. Collect →  Organize →  Explore →  Predict →  Value" (a.k.a. DZPYDS) and "Complex Network Analysis in Python. Recognize → Construct → Visualize → Analyze → Interpret" (a.k.a. DZCNAPY), published by the Pragmatic Bookshelf.
</

find(), find_all() 특정 속성 (href = 하이퍼링크 정보) 값 찾기

In [None]:
# Fetch http://www.networksciencelab.com/ and collect all of its hyperlinks.

with urlopen("http://www.networksciencelab.com/") as doc:
  soup = BeautifulSoup(doc)

# Build (link text, target URL) pairs for every <a> tag carrying an href.
links = []
for anchor in soup.find_all("a"):
  if anchor.has_attr("href"):
    links.append((anchor.string, anchor["href"]))
links

[(None, 'https://pragprog.com/book/dzpyds/data-science-essentials-in-python'),
 (None,
  'https://pragprog.com/book/dzcnapy/complex-network-analysis-in-python'),
 ('DZPYDS', 'https://www.amazon.com/gp/product/1680501844'),
 ('DZCNAPY', 'https://www.amazon.com/gp/product/1680502697'),
 ('Networks of Music Groups as Success Predictors',
  'http://www.slideshare.net/DmitryZinoviev/networks-of-music-groups-as-success-predictors'),
 ('Network Science Workshop',
  'http://www.slideshare.net/DmitryZinoviev/workshop-20212296'),
 ('Resilience in Transaction-Oriented Networks',
  'http://www.slideshare.net/DmitryZinoviev/resilience-in-transactional-networks'),
 ('Peer Ratings in Massive Online Social Networks',
  'http://www.slideshare.net/DmitryZinoviev/peer-ratings-in-massive-online-social-networks'),
 ('Semantic Networks of Interests in Online NSSI Communities',
  'http://www.slideshare.net/DmitryZinoviev/presentation-31680572'),
 ('Towards an Ideal Store',
  'http://www.slideshare.net/Dmitry

csv 파일 다루기

statistics 모듈을 사용해서 COUNT PARTICIPANTS 변수의 평균과 표준편차 구하기

In [None]:
#데이터 가져오기


In [None]:
import csv 
# newline="" hands line-ending handling to the csv module, so quoted fields
# that contain line breaks are parsed correctly.
with open("/content/Demographic_Statistics_By_Zip_Code.csv", newline="") as infile:
  data = list(csv.reader(infile))

In [None]:
# Check that the rows were loaded as a list.
data

In [None]:
# Look up the position of the "COUNT PARTICIPANTS" column in the first
# record (data[0]), which is searched here as the row of column names.
cp_index = data[0].index("COUNT PARTICIPANTS")
cp_index

1

In [None]:
# Pull the COUNT PARTICIPANTS value out of every record after the first
# one and convert it to an integer.
countParticipants = []
for row in data[1:]:
  countParticipants.append(int(row[cp_index]))
countParticipants

In [None]:
import statistics
# Mean and sample standard deviation of the participant counts.
mean_participants = statistics.mean(countParticipants)
stdev_participants = statistics.stdev(countParticipants)
print(mean_participants, stdev_participants)

17.661016949152543 43.27973735299687


자연어 처리하기

In [None]:
pip install nltk



In [None]:
import nltk
# Show which NLTK version is installed.
nltk.__version__

'3.2.5'

nltk 에서 영단어 온톨로지(wordnet) 사용하기

In [None]:
# Reference: https://frhyme.github.io/python-lib/nltk-wordnet/
# Download the WordNet corpus.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Bind the WordNet corpus reader to a short name.
wn = nltk.corpus.wordnet

# A synset is a set of synonyms; its name has the form
# word.part-of-speech.sequence-number (e.g. cat.n.01).
wn.synsets("cat")

[Synset('cat.n.01'),
 Synset('guy.n.01'),
 Synset('cat.n.03'),
 Synset('kat.n.01'),
 Synset('cat-o'-nine-tails.n.01'),
 Synset('caterpillar.n.02'),
 Synset('big_cat.n.01'),
 Synset('computerized_tomography.n.01'),
 Synset('cat.v.01'),
 Synset('vomit.v.01')]

In [None]:
# hypernyms: more general (parent) synsets
# hyponyms: more specific (child) synsets
# A notebook cell only echoes its last expression, so both results are
# printed explicitly; otherwise the hypernyms are computed and thrown away.
cat_synset = wn.synset('cat.n.01')
print(cat_synset.hypernyms())
print(cat_synset.hyponyms())

[Synset('domestic_cat.n.01'), Synset('wildcat.n.03')]

WordNet 사용해서 synset 간 의미론적 유사도 계산하기 (0-1사이 실수)<br>
0 이면 두 단어 서로 관계 없음 <br>
1 이면 완전한 유의어

In [None]:
# Path similarity between "cat" and "lynx".

x = wn.synset('cat.n.01') # cat
y = wn.synset('lynx.n.01') # lynx

# Similarity of x and y.

x.path_similarity(y)


0.04

In [None]:
# Path similarity between the two hyponyms of cat.n.01: domestic cat and wildcat.

a = wn.synset('domestic_cat.n.01') # domestic cat
b = wn.synset('wildcat.n.03') # wildcat

# Similarity of a and b.

a.path_similarity(b)


0.3333333333333333

In [None]:
# Path similarity between "cat" and "dog".
c = wn.synset('cat.n.01') # cat
d = wn.synset('dog.n.01') # dog

# Similarity of c and d.

c.path_similarity(d)

0.2

In [None]:
# 1. Tokenization (split the text into words).

from nltk.tokenize import WordPunctTokenizer
word_punct = WordPunctTokenizer()
text = "}Help! :))) :[ ..... :D{"

# This tokenizer splits on punctuation marks, which is useful when the
# structure of a sentence matters, e.g. emoticon-based sentiment analysis.

word_punct.tokenize(text)

['}', 'Help', '!', ':)))', ':[', '.....', ':', 'D', '{']

In [None]:
# Word tokenizer.
# NOTE(review): the original note said this is good for extracting words only
# because it ignores emoticons; in the recorded output the emoticons are not
# dropped but broken up into individual punctuation tokens.
nltk.download('punkt')
nltk.word_tokenize(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['}', 'Help', '!', ':', ')', ')', ')', ':', '[', '...', '..', ':', 'D', '{']

In [None]:
# 2. Normalize case (make everything upper case or everything lower case).
# 3. Remove stop words (words such as "the", taken from a stop-word list).
# 4. Stemming: reduce each word to its stem.

# Porter stemmer (conservative).
pstemmer = nltk.PorterStemmer()
pstemmer.stem("wonderful")

'wonder'

In [None]:
# Lancaster stemmer (aggressive): tends to map more distinct words onto the same stem.
lstemmer = nltk.LancasterStemmer()
lstemmer.stem("wonderful")

'wond'

In [None]:
# 5. Lemmatization (reduce a word to its dictionary form).
# NOTE(review): no part of speech is passed here, so the lemmatizer's default
# is used; confirm that is intended for an adjective such as "wonderful".
lemmatizer = nltk.WordNetLemmatizer()
lemmatizer.lemmatize("wonderful")

'wonderful'

In [None]:
# Additionally: part-of-speech tagging.
nltk.download('averaged_perceptron_tagger')
nltk.pos_tag(["beautiful", "world"]) # JJ: adjective, NN: noun

# Tag set reference: https://happygrammer.github.io/nlp/postag-set/


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('beautiful', 'JJ'), ('world', 'NN')]

index.html 파일 만들기: 사이트의 하이퍼링크 텍스트를 모아 index.html 파일로 저장하기

In [None]:
# Collect the text of every hyperlink on http://www.networksciencelab.com/
# and join it into one string.
from bs4 import BeautifulSoup
from urllib.request import urlopen

with urlopen("http://www.networksciencelab.com/") as doc:
  soup = BeautifulSoup(doc)

# (link text, target URL) for every <a> tag that has an href.
links = [(link.string, link["href"]) for link in soup.find_all("a") if link.has_attr("href")]

# Keep only the link text. link.string is None for some anchors (the first two
# entries on this page), and str.join raises TypeError on None, so drop those
# entries by value instead of assuming they sit at fixed positions.
html_text = " ".join(text for text, _ in links if text is not None)
html_text

'DZPYDS DZCNAPY Networks of Music Groups as Success Predictors Network Science Workshop Resilience in Transaction-Oriented Networks Peer Ratings in Massive Online Social Networks Semantic Networks of Interests in Online NSSI Communities Towards an Ideal Store D.Zinoviev, "Analyzing Cultural Domains with Python," D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities," D.Zinoviev, "The Pain of Complexity," D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks," D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities," D.Zinoviev and S.Llewelyn, "Co-Evolution of Friendship and Publishing in Online

In [None]:
# Save the collected text as index.html. The `with` block closes the file
# even if the write fails.
with open("index.html", "w") as html_file:
  html_file.write(html_text)

Index.html 파일에서(불용어 제외) 가장 많이 등장한 단어 원형 찾아보기

In [None]:
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from nltk import LancasterStemmer

# Create the stemmer.
ls = nltk.LancasterStemmer()

# Read the file and build a soup from it. The path is the same relative
# "index.html" that the earlier cell writes, so the two cells agree regardless
# of the notebook's working directory.
with open("index.html") as infile:
  soup = BeautifulSoup(infile)

In [None]:
# Show the tree parsed from index.html.
soup

<html><body><p>DZPYDS DZCNAPY Networks of Music Groups as Success Predictors Network Science Workshop Resilience in Transaction-Oriented Networks Peer Ratings in Massive Online Social Networks Semantic Networks of Interests in Online NSSI Communities Towards an Ideal Store D.Zinoviev, "Analyzing Cultural Domains with Python," D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities," D.Zinoviev, "The Pain of Complexity," D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks," D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities," D.Zinoviev and S.Llewelyn, "Co-Evolution of Friendship and Publis

In [None]:
# Show only the text content of the parsed document, without markup.
soup.text

'DZPYDS DZCNAPY Networks of Music Groups as Success Predictors Network Science Workshop Resilience in Transaction-Oriented Networks Peer Ratings in Massive Online Social Networks Semantic Networks of Interests in Online NSSI Communities Towards an Ideal Store D.Zinoviev, "Analyzing Cultural Domains with Python," D. Zinoviev, D. Stefanescu, G. Fireman, and L. Swenson, "Semantic networks of interests in online non-suicidal self-injury communities," D.Zinoviev, "The Pain of Complexity," D.Zinoviev, Z.Zhu, and K.Li, "Building mini-categories in product networks," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Mitigation of delayed management costs in transaction-oriented systems," D.Zinoviev, H.Benbrahim, G.Meszoely, and D.Stefanescu, "Simulating resilience in transaction-oriented networks," D.Zinoviev, D.Stefanescu, L.Swenson, and G.Fireman, "Semantic networks of interests in online NSSI communities," D.Zinoviev and S.Llewelyn, "Co-Evolution of Friendship and Publishing in Online

In [None]:
# 1) Extract the text and tokenize it.
words = nltk.word_tokenize(soup.text)
words

['DZPYDS',
 'DZCNAPY',
 'Networks',
 'of',
 'Music',
 'Groups',
 'as',
 'Success',
 'Predictors',
 'Network',
 'Science',
 'Workshop',
 'Resilience',
 'in',
 'Transaction-Oriented',
 'Networks',
 'Peer',
 'Ratings',
 'in',
 'Massive',
 'Online',
 'Social',
 'Networks',
 'Semantic',
 'Networks',
 'of',
 'Interests',
 'in',
 'Online',
 'NSSI',
 'Communities',
 'Towards',
 'an',
 'Ideal',
 'Store',
 'D.Zinoviev',
 ',',
 '``',
 'Analyzing',
 'Cultural',
 'Domains',
 'with',
 'Python',
 ',',
 "''",
 'D.',
 'Zinoviev',
 ',',
 'D.',
 'Stefanescu',
 ',',
 'G.',
 'Fireman',
 ',',
 'and',
 'L.',
 'Swenson',
 ',',
 '``',
 'Semantic',
 'networks',
 'of',
 'interests',
 'in',
 'online',
 'non-suicidal',
 'self-injury',
 'communities',
 ',',
 "''",
 'D.Zinoviev',
 ',',
 '``',
 'The',
 'Pain',
 'of',
 'Complexity',
 ',',
 "''",
 'D.Zinoviev',
 ',',
 'Z.Zhu',
 ',',
 'and',
 'K.Li',
 ',',
 '``',
 'Building',
 'mini-categories',
 'in',
 'product',
 'networks',
 ',',
 "''",
 'D.Zinoviev',
 ',',
 'H.Benbr

In [None]:
# 2) Convert every token to lower case.
words = list(map(str.lower, words))
words

['dzpyds',
 'dzcnapy',
 'networks',
 'of',
 'music',
 'groups',
 'as',
 'success',
 'predictors',
 'network',
 'science',
 'workshop',
 'resilience',
 'in',
 'transaction-oriented',
 'networks',
 'peer',
 'ratings',
 'in',
 'massive',
 'online',
 'social',
 'networks',
 'semantic',
 'networks',
 'of',
 'interests',
 'in',
 'online',
 'nssi',
 'communities',
 'towards',
 'an',
 'ideal',
 'store',
 'd.zinoviev',
 ',',
 '``',
 'analyzing',
 'cultural',
 'domains',
 'with',
 'python',
 ',',
 "''",
 'd.',
 'zinoviev',
 ',',
 'd.',
 'stefanescu',
 ',',
 'g.',
 'fireman',
 ',',
 'and',
 'l.',
 'swenson',
 ',',
 '``',
 'semantic',
 'networks',
 'of',
 'interests',
 'in',
 'online',
 'non-suicidal',
 'self-injury',
 'communities',
 ',',
 "''",
 'd.zinoviev',
 ',',
 '``',
 'the',
 'pain',
 'of',
 'complexity',
 ',',
 "''",
 'd.zinoviev',
 ',',
 'z.zhu',
 ',',
 'and',
 'k.li',
 ',',
 '``',
 'building',
 'mini-categories',
 'in',
 'product',
 'networks',
 ',',
 "''",
 'd.zinoviev',
 ',',
 'h.benbr

In [None]:
import nltk
# Download the stop-word corpus.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# 3) Drop stop words and non-word tokens, then stem what is left.
# 3-1) Stop words: stopwords.words("english")
# 3-2) Symbols: str.isalnum() is True only for strings made up entirely of
#      letters and digits, so punctuation tokens are filtered out.

# Load the stop-word list once and keep it as a set. Calling
# stopwords.words() inside the comprehension would rebuild the list for every
# token and search it linearly.
english_stopwords = set(stopwords.words("english"))
words = [ls.stem(w) for w in words if w not in english_stopwords and w.isalnum()]
words

In [None]:
# 4) Count the stems and print the 10 most frequent ones.
freqs = Counter(words)
print(freqs.most_common(10))

[('network', 16), ('soc', 8), ('onlin', 7), ('inform', 4), ('gam', 4), ('sem', 3), ('interest', 3), ('commun', 3), ('theoret', 3), ('approach', 3)]
