## 安裝套件

In [None]:
!pip install requests

In [None]:
!pip install feedparser

In [None]:
!pip install lxml

In [None]:
!pip install jieba

### 下載繁體字的辭典檔

In [None]:
!wget https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big -O jieba_data/dict.txt.big

In [None]:
!pip install wordcloud

## 解析 RSS

In [None]:
import feedparser

In [None]:
# URL of the udn.com RSS feed that the rest of the notebook parses and crawls.
rss_url = "https://udn.com/rssfeed/news/2/7225?ch=news"

In [None]:
# Download and parse the RSS feed; the bare name on the last line makes the
# notebook display the parsed result.
newsFeed = feedparser.parse(rss_url)
newsFeed

### 轉成 json 格式，查看 RSS 的內容

In [None]:
import json

# Pretty-print the parsed feed as JSON to inspect its structure.
# ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes.
# default=str is deliberate for this inspection dump: a parsed feed can carry
# values json cannot encode natively (e.g. an exception object stored under
# 'bozo_exception' when the feed is malformed); stringify them instead of
# aborting the whole dump with a TypeError.
print(json.dumps(newsFeed, indent=1, ensure_ascii=False, default=str))

### 查看每個 entry 的 title, url 

In [None]:
# Print "index, title, url" for every feed entry, numbering from 1.
# enumerate() replaces the hand-maintained counter.
for i, e in enumerate(newsFeed['entries'], start=1):
    title = e['title']
    link_url = e['links'][0]['href']
    print(f"{i}, {title}, {link_url}")

### 解析html可以參考 https://lxml.de/lxmlhtml.html

## 開始取得文章

### 載入 library

In [None]:
import requests
from lxml import etree
from io import StringIO
import jieba
import pandas as pd
import time
import random
from http import cookiejar

### 使用繁體辭典

In [None]:
# Tell jieba which dictionary file to use for segmentation.
jieba.set_dictionary('jieba_data/dict.txt.big')

### 設定 request header

In [None]:
# Request headers that make the crawler look like a desktop Chrome browser.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
my_headers = {
    'User-Agent': user_agent,
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    # Advertise only encodings that requests can always decode. Brotli ("br")
    # is decoded only when an optional brotli package is installed; without
    # it a br-compressed page would come back as undecoded bytes in r.text.
    "accept-encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "cache-control": "no-cache",
    "Accept-Charset": "UTF8,utf-8;q=0.7,*;q=0.7"
}

### 設定 cookie

In [None]:
# Create an (initially empty) cookie jar; it is passed as `cookies=` to the
# requests.get calls in the crawl cells.
my_cookie = cookiejar.CookieJar()

### 先爬文

In [None]:
# Step-by-step version: crawl first, segment in a later cell.

# Fetch every article linked from the feed and collect the text of its
# <article> paragraphs.
each_article_text_list = []
for e in newsFeed['entries']:
    url = e['links'][0]['href']
    print(url)
    # timeout: requests waits indefinitely by default, so one stalled server
    # would otherwise hang the whole cell.
    r = requests.get(url, headers = my_headers, cookies = my_cookie, timeout = 30)
    if r.status_code == 200:
        parse_tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        article_elements = parse_tree.xpath('//article//p')
        for a_part in article_elements:
            # Only paragraphs whose .text is a string contribute text.
            if isinstance(a_part.text, str):
                each_article_text_list.append(a_part.text.strip())
        # Random pause so the site is not hit with back-to-back requests.
        sleep_time = random.randint(3,10)
        print("sleep time: %s sec"%(sleep_time))
        time.sleep(sleep_time)
# Join once, after the loop: the result is the same as re-joining on every
# iteration, and all_article_text is defined even when the feed has no entries.
all_article_text = ''.join(each_article_text_list)
all_article_text

### 做分詞

In [None]:
# Word segmentation: split the crawled text into a list of terms with jieba.
seg_words_list = jieba.lcut(all_article_text)
seg_words_list

In [None]:
!ls jieba_data

### 準備 stop word

In [None]:
# Load the stop-word file in one read; every line of the file becomes one
# element of stop_words.
with open(file='jieba_data/stop_words.txt', mode='r', encoding='utf-8') as stop_word_file:
    raw_stop_words = stop_word_file.read()
stop_words = raw_stop_words.split('\n')
stop_words

### 準備一個利用stop word過濾後的分詞結果

In [None]:
# Segment the crawled text, then keep only the terms that are not stop words.
seg_words_list = jieba.lcut(all_article_text)
# A set gives O(1) membership tests; checking against the list directly scans
# the whole stop-word list once per term.
stop_word_set = set(stop_words)
seg_stop_words_list = [term for term in seg_words_list if term not in stop_word_set]
seg_stop_words_list

In [None]:
# Combined version: crawl, filter and segment in a single cell.
each_article_text_list = []
for e in newsFeed['entries']:
    url = e['links'][0]['href']
    print(url)
    # timeout: requests waits indefinitely by default, so one stalled server
    # would otherwise hang the whole cell.
    r = requests.get(url, headers = my_headers, cookies = my_cookie, timeout = 30)
    if r.status_code == 200:
        parse_tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        article_elements = parse_tree.xpath('//article//p')
        for a_part in article_elements:
            if isinstance(a_part.text, str):
                term = a_part.text.strip()
                # NOTE(review): this tests a whole paragraph against the
                # stop-word list, not individual words - confirm that is
                # the intended filter.
                if term not in stop_words:
                    each_article_text_list.append(term)
        # Random pause so the site is not hit with back-to-back requests.
        sleep_time = random.randint(3,10)
        print("sleep time: %s sec"%(sleep_time))
        time.sleep(sleep_time)
# Build the text once, from scratch, after the loop. Appending the join of the
# ever-growing list on each iteration repeated every earlier article again
# and stacked the result on top of text left over from a previous cell.
all_article_text = ''.join(each_article_text_list)
seg_words_list = jieba.lcut(all_article_text)
seg_df = pd.DataFrame(seg_words_list, columns=['seg_word'])
print(seg_df)

### 載入繪圖相關的 library

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
print(matplotlib.matplotlib_fname())

### 安裝中文字型

In [None]:
!ls fonts/*.ttf

In [None]:
!cat /opt/conda/lib/python3.7/site-packages/matplotlib/mpl-data/matplotlibrc

#### 字型檔案是放在 /opt/conda/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf 我們先列出來看一下

In [None]:
!ls /opt/conda/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf

#### 將中文字型檔案放到字型收容的路徑

In [None]:
!cp fonts/*.ttf /opt/conda/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf 

#### 檢查一下結果

In [None]:
!ls /opt/conda/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf

#### 清除 matplotlib 快取

In [None]:
print(matplotlib.get_configdir())

In [None]:
!ls -la /home/jovyan/.config/matplotlib

In [None]:
print(matplotlib.get_cachedir())

In [None]:
!rm -rf /home/jovyan/.cache/matplotlib

In [None]:
# Font settings for matplotlib: SimHei so Chinese text can be displayed, and
# unicode_minus off so the minus sign can be displayed.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

#### 其他 FAQ 可以參考 https://matplotlib.org/3.1.1/faq/troubleshooting_faq.html

### 繪製文字雲 ( wordcloud.generate )

In [None]:
# WordCloud.generate takes the text as one string in which words are separated
# by whitespace, so join the segmented terms with spaces.
seg_words = ' '.join(seg_words_list)
#seg_words

In [None]:
# Word cloud from the space-joined text, drawn with the SimHei font.
cloud_maker = WordCloud(font_path='fonts/SimHei.ttf')
wordcloud = cloud_maker.generate(seg_words)
# Show the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Same word cloud, drawn with the Taipei Sans TC font instead.
cloud_maker = WordCloud(font_path='fonts/TaipeiSansTCBeta-Regular.ttf')
wordcloud = cloud_maker.generate(seg_words)
# Show the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### 繪製文字雲 ( wordcloud.generate_from_frequencies )

In [None]:
# https://docs.python.org/zh-tw/3/library/collections.html
# Term-frequency counter built from the list that has NOT been through the
# stop-word filter.
from collections import Counter
seg_counter = Counter(seg_words_list)
seg_counter

In [None]:
# Term-frequency counter built from the list that HAS been through the
# stop-word filter.
from collections import Counter
seg_stop_counter = Counter(seg_stop_words_list)
#seg_stop_counter

In [None]:
# Word cloud built from the seg_counter frequencies.
cloud_maker = WordCloud(font_path='fonts/TaipeiSansTCBeta-Regular.ttf')
wordcloud = cloud_maker.generate_from_frequencies(seg_counter)
# Show the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud built from the seg_stop_counter frequencies.
cloud_maker = WordCloud(font_path='fonts/TaipeiSansTCBeta-Regular.ttf')
wordcloud = cloud_maker.generate_from_frequencies(seg_stop_counter)
# Show the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### [其他] 用dataframe 處理數據

In [None]:
# One-column DataFrame holding every segmented term, one term per row.
seg_df = pd.DataFrame({'seg_word': seg_words_list})
seg_df

In [None]:
# Give every row a count of 1, then add the counts up per distinct word.
seg_df['count'] = 1
rows_by_word = seg_df.groupby('seg_word')
seg_freq_df = rows_by_word.sum()
seg_freq_df

In [None]:
# Bar chart of the per-word totals, with slanted x tick labels.
word_totals = seg_df.groupby('seg_word').sum()
word_totals.plot.bar()
plt.xticks(rotation=50)
plt.show()

### [其他] parse html 小筆記

In [None]:
import requests
from lxml import etree
import jieba
from io import StringIO
import pandas as pd

# Minimal per-article example: fetch the page, take the text of the <p>
# elements under <article>, segment it with jieba and print the terms as a
# DataFrame with a count column of 1 per row.
for e in newsFeed['entries']:
    url = e['links'][0]['href']
    print(url)
    # timeout: requests waits indefinitely by default, so one stalled server
    # would otherwise hang the whole cell.
    r = requests.get(url, timeout=30)
    if r.status_code == 200:
        html_tree = etree.HTML(r.text)
        print(html_tree)
        html_ele = html_tree.xpath('//article//p')
        # Only paragraphs whose .text is a string contribute text.
        all_ele = [ele.text.strip() for ele in html_ele if isinstance(ele.text, str)]
        all_ele_string = ''.join(all_ele)
        seg_words = jieba.lcut(all_ele_string)
        seg_df = pd.DataFrame(seg_words, columns=['seg_word'])
        seg_df['count'] = 1
        print(seg_df)