# Full text search and keyword association analysis

全文檢索與關聯分析

# Load preprocessed news dataset

In [213]:
import pandas as pd

In [214]:
df = pd.read_csv('news.csv',sep='|')

In [215]:
df.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,_20250327_1,2025-03-27,焦點,台股重挫308點 失守22000關卡,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,暫無,暫無,"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...","['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...","['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...


# Filter data by searching keywords from "content" column

## "in" is very powerful

In [216]:
df.content[0]

'美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%。台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％。凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸。凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股。'

In [217]:
qk = '美國總統川普'
text = df.content[0]
qk in text

True

In [218]:
qk = '個股'
text = df.content[0]
qk in text

True

In [219]:
qk = '上漲趨勢'
text = df.content[0]
qk in text

False

In [220]:
qk = '上漲'
text = df.content[0]
qk in text

True

### all() any()

In [221]:
user_keywords = ['上漲','下跌']
text = df.content[0]
all((qk in text) for qk in user_keywords)

True

In [222]:
user_keywords = ['台股下跌','指數']
text = df.content[0]
any((qk in text) for qk in user_keywords)

True

In [223]:
user_keywords = ['美股','上漲']
text = df.content[0]
any((qk in text) for qk in user_keywords)

True

### Using apply() and lambda function

In [224]:
# Use apply() and lambda function
user_keywords = ['下跌','上漲']
df.content.apply(lambda text: all((qk in text) for qk in user_keywords))

0       True
1      False
2      False
3      False
4      False
       ...  
211    False
212    False
213    False
214    False
215    False
Name: content, Length: 216, dtype: bool

# Filter data using the following function

In [225]:
from datetime import datetime, timedelta

In [226]:
from datetime import datetime, timedelta
# Searching keywords from "content" column
# Here this function uses df.content column, while filter_dataFrame() uses df.tokens_v2
def filter_dataFrame_fullText(user_keywords, cond, cate, weeks):
    """Filter the notebook-level news DataFrame ``df`` by period, category and keywords.

    Keywords are matched as plain substrings of the raw ``content`` column
    (full-text search), not against the tokenised columns.

    Parameters
    ----------
    user_keywords : list of str
        Keywords to look for in each article's content.
    cond : str
        'and' keeps articles containing every keyword, 'or' keeps articles
        containing at least one. Any other value skips the keyword filter.
    cate : str
        News category to keep; "全部" (all) disables the category filter.
    weeks : int
        Length of the time window, counted back from the newest article.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that satisfy all active filters.
    """
    # end date: the date of the latest record of news
    end_date = df.date.max()

    # start date: `weeks` weeks before the end date, in the same 'YYYY-MM-DD' text form
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # (1) period filter (dates are compared as 'YYYY-MM-DD' strings)
    condition = (df.date >= start_date) & (df.date <= end_date)

    # (2) category filter; "全部" means every category is accepted
    if cate != "全部":
        condition = condition & (df.category == cate)

    # (3) keyword filter: all() for 'and', any() for 'or'
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        combine = None  # unrecognised mode: no keyword filtering is applied

    if combine is not None:
        def contains_keywords(text):
            # An empty CSV cell is loaded as NaN (a float); `qk in NaN` would
            # raise TypeError, so missing content is treated as empty text.
            if not isinstance(text, str):
                text = ''
            return combine((qk in text) for qk in user_keywords)

        condition = condition & df.content.apply(contains_keywords)

    # condition is a boolean Series with one True/False value per row of df
    df_query = df[condition]

    return df_query


In [227]:
# Searching keywords from "content" column
# Here this function uses df.content column, while filter_dataFrame() uses df.tokens_v2
def filter_dataFrame_fullText_v0(user_keywords, cond, cate, weeks):
    """Earlier, branch-per-case version of the full-text filter (kept for reference).

    Reads the notebook-level DataFrame ``df`` and keeps rows inside the last
    ``weeks`` weeks, optionally restricted to category ``cate`` ("全部" = all),
    whose ``content`` contains all ('and') or any ('or') of ``user_keywords``.
    Any other ``cond`` applies only the period and category filters, the same
    outcome filter_dataFrame_fullText() produces for an unrecognised mode.

    Returns the filtered pandas.DataFrame.
    """
    # end date: the date of the latest record of news
    end_date = df.date.max()

    # start date
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # proceed filtering
    # Plain Python booleans are combined with `and`; `&` is reserved for the
    # element-wise Series conditions inside df[...].
    if (cate == "全部") and (cond == 'and'):
        df_query = df[(df.date >= start_date) & (df.date <= end_date)
            & df.content.apply(lambda text: all((qk in text) for qk in user_keywords))]
    elif (cate == "全部") and (cond == 'or'):
        df_query = df[(df['date'] >= start_date) & (df['date'] <= end_date)
            & df.content.apply(lambda text: any((qk in text) for qk in user_keywords))]
    elif (cond == 'and'):
        df_query = df[(df.category == cate)
            & (df.date >= start_date) & (df.date <= end_date)
            & df.content.apply(lambda text: all((qk in text) for qk in user_keywords))]
    elif (cond == 'or'):
        df_query = df[(df.category == cate)
            & (df['date'] >= start_date) & (df['date'] <= end_date)
            & df.content.apply(lambda text: any((qk in text) for qk in user_keywords))]
    elif (cate == "全部"):
        # Unrecognised cond: previously df_query was never assigned and the
        # return raised UnboundLocalError. Fall back to the period filter only.
        df_query = df[(df.date >= start_date) & (df.date <= end_date)]
    else:
        # Unrecognised cond with a specific category: period + category only.
        df_query = df[(df.category == cate)
            & (df.date >= start_date) & (df.date <= end_date)]

    return df_query

In [228]:
user_keywords = ['台股','美股']
cond='and'
cate='全部'
weeks=2

df_query = filter_dataFrame_fullText(user_keywords, cond, cate,weeks)
df_query.shape

(3, 14)

In [229]:
user_keywords = ['台灣']
cond='and'
cate='全部'
weeks=2

df_query = filter_dataFrame_fullText(user_keywords, cond, cate,weeks)
df_query.shape

(106, 14)

# Get news title, category, and link

In [230]:
user_keywords = ['股票','指數']
cond='and'
cate='全部'
weeks=2
df_query = filter_dataFrame_fullText(user_keywords, cond, cate, weeks)
len(df_query)

3

In [231]:
df_query

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,_20250327_1,2025-03-27,焦點,台股重挫308點 失守22000關卡,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,暫無,暫無,"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...","['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...","['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...
126,finance_20250324_7,2025-03-24,財經,股市一直跌誰最慘？「1類人」恐成海嘯第一排　專家曝3方法降低風險三立新聞網 setn.com...,財經中心／彭淇昀報導市場對川普政府政策可能導致經濟衰退的擔憂加劇，標普500指數在2月19日...,暫無,暫無,"[('市場', 4), ('股市', 4), ('嬰兒潮', 4), ('世代', 4), ...","['財經', '中心', '／', '彭淇昀', '報導', '市場', '對', '川普'...","['財經', '中心', '彭淇昀', '市場', '川普', '政府', '政策', '經...","[NerToken(word='彭淇昀', ner='PERSON', idx=(5, 8)...","[('財經', 'Na'), ('中心', 'Nc'), ('／', 'FW'), ('彭淇...",https://tw.news.yahoo.com//%E6%94%94%E6%9F%A5%...,https://s.yimg.com/ny/api/res/1.2/nMW1g5vNXghG...
132,finance_20250327_13,2025-03-27,財經,【清明節變盤2】 川普關稅風暴避風港！ 5名師點名3族群抱股過節Yahoo奇摩股市15 則留...,4月3日至6日為清明連假，適逢美國總統川普預定4月2日實施對等關稅，雖然他近日鬆口對於進口商...,暫無,暫無,"[('關稅', 33), ('台股', 25), ('美國', 17), ('市場', 16...","['4月', '3日', '至', '6日', '為', '清明', '連假', '，', ...","['美國', '總統', '川普', '關稅', '商品', '關稅', '國家', '汽車...","[NerToken(word='4月3日至6日', ner='DATE', idx=(0, ...","[('4月', 'Nd'), ('3日', 'Nd'), ('至', 'Caa'), ('6...",https://tw.news.yahoo.com//%E4%B8%AD%E5%9C%8B%...,https://s.yimg.com/ny/api/res/1.2/1FcXAGsHow02...


In [232]:
# Show category, title, link and photo link of at most the first three hits.
# Bounding the loop by len(df_query) avoids an IndexError when the query
# returns fewer than three articles.
for i in range(min(3, len(df_query))):
    row = df_query.iloc[i]
    print(i)
    print(row['category'])
    print(row['title'])
    print(row.link)
    print(row.photo_link)

0
焦點
台股重挫308點 失守22000關卡
https://tw.news.yahoo.com/https://tw.stock.yahoo.com/news/%E5%B7%9D%E6%99%AE%E8%A6%81%E6%94%B6%E6%B1%BD%E8%BB%8A%E9%97%9C%E7%A8%85%EF%BC%81-%E5%8F%B0%E8%82%A1%E6%94%B6%E8%B7%8C308%E9%BB%9E%E3%80%81%E5%8F%B0%E7%A9%8D%E9%9B%BB%E6%8C%AB22%E5%85%83%E8%87%B3958%E5%85%83-010329005.html
https://s.yimg.com/ny/api/res/1.2/3qEveVGKp0705iucq1UPTw--/YXBwaWQ9aGlnaGxhbmRlcjt3PTY0MDtoPTQyNw--/https://s.yimg.com/os/creatr-uploaded-images/2024-10/83267440-8445-11ef-b675-1135712e37b5
1
財經
股市一直跌誰最慘？「1類人」恐成海嘯第一排　專家曝3方法降低風險三立新聞網 setn.com15 則留言3 天前
https://tw.news.yahoo.com//%E6%94%94%E6%9F%A5%E9%81%95%E5%81%9C%E6%84%8F%E5%A4%96%E8%B5%B7%E7%8D%B2%E6%A7%8D%E6%AF%92-%E6%96%B0%E8%8E%8A%E8%AD%A6%E9%80%AE1%E7%94%B7%E9%80%81%E8%BE%A6-064723625.html
https://s.yimg.com/ny/api/res/1.2/nMW1g5vNXghGOuG_fqWFWw--/YXBwaWQ9aGlnaGxhbmRlcjt3PTY0MDtoPTM2MA--/https://media.zenfs.com/zh-tw/setn.com.tw/e351d8b5dc1a1b283dcff2fe94ef2b01
2
財經
【清明節變盤2】 川普關稅風暴避風港！ 5名師點名3族群抱股過節Yahoo奇摩股市15 則留言5 小時前
https://tw.n

In [233]:
df_query.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,_20250327_1,2025-03-27,焦點,台股重挫308點 失守22000關卡,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,暫無,暫無,"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...","['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...","['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...


## All-in-one function to return category, title, link, and photo_link

"photo_link" will be used in the next app

In [234]:
user_keywords = ['上漲','下跌']
cond='and'
cate='全部'
weeks=2
df_query = filter_dataFrame_fullText(user_keywords, cond, cate, weeks)

In [235]:
len(df_query)

5

In [236]:
# get titles and links from k pieces of news 
def get_title_link_topk(df_query, k=5):
    """Return category, title, link and photo link of the first ``k`` articles.

    Parameters
    ----------
    df_query : pandas.DataFrame
        Query result with 'category', 'title', 'link' and 'photo_link' columns.
    k : int, default 5
        Maximum number of articles to return.

    Returns
    -------
    list of dict
        One dict per article with the keys 'category', 'title', 'link' and
        'photo_link'. A missing (NaN) photo link is returned as ''.
    """
    items = []
    for _, row in df_query[0:k].iterrows():  # only the first k rows
        photo_link = row['photo_link']
        items.append({
            'category': row['category'],
            'title': row['title'],
            'link': row['link'],
            # NaN is replaced by an empty string so the front-end web page can
            # parse the value once it is converted to JSON.
            'photo_link': '' if pd.isna(photo_link) else photo_link,
        })
    return items

In [237]:
get_title_link_topk(df_query, 3)

[{'category': '焦點',
  'title': '台股重挫308點 失守22000關卡',
  'link': 'https://tw.news.yahoo.com/https://tw.stock.yahoo.com/news/%E5%B7%9D%E6%99%AE%E8%A6%81%E6%94%B6%E6%B1%BD%E8%BB%8A%E9%97%9C%E7%A8%85%EF%BC%81-%E5%8F%B0%E8%82%A1%E6%94%B6%E8%B7%8C308%E9%BB%9E%E3%80%81%E5%8F%B0%E7%A9%8D%E9%9B%BB%E6%8C%AB22%E5%85%83%E8%87%B3958%E5%85%83-010329005.html',
  'photo_link': 'https://s.yimg.com/ny/api/res/1.2/3qEveVGKp0705iucq1UPTw--/YXBwaWQ9aGlnaGxhbmRlcjt3PTY0MDtoPTQyNw--/https://s.yimg.com/os/creatr-uploaded-images/2024-10/83267440-8445-11ef-b675-1135712e37b5'},
 {'category': '焦點',
  'title': '美國經濟衰退危機將至？巴菲特示警散戶快做5件事：當別人貪婪時要恐懼',
  'link': 'https://tw.news.yahoo.com/%E5%8F%B0%E5%8D%97%E9%80%A3%E9%8E%96%E7%89%9B%E6%8E%92%E5%BA%97%E7%94%A8%E9%A4%90-5%E6%AD%B2%E7%AB%A5-%E6%80%8E%E9%BA%BC%E7%A1%AC%E7%A1%AC%E7%9A%84-%E4%BB%96%E5%90%90%E5%87%BA-%E7%99%BC%E9%BB%91%E8%9E%BA%E7%B5%B2-065600607.html',
  'photo_link': 'https://s.yimg.com/ny/api/res/1.2/eaVvc8.Hhr6Wk7SVnme59Q--/YXBwaWQ9aGlnaGxhbmRlcjt3PTY0MDto

### Some photo_links are "NaN". 

If the photo_link value is 'nan', it will be a problem when converted to json format on Django!

    "NaN" cannot be converted to a JSON string. 

In [238]:
df_query.iloc[0]['photo_link']

'https://s.yimg.com/ny/api/res/1.2/3qEveVGKp0705iucq1UPTw--/YXBwaWQ9aGlnaGxhbmRlcjt3PTY0MDtoPTQyNw--/https://s.yimg.com/os/creatr-uploaded-images/2024-10/83267440-8445-11ef-b675-1135712e37b5'

### Test the photo_link is NaN or not and replace it with empty string
    How to do?
    You can test a variable is "NaN" or not by using pd, np or math.

In [239]:
import pandas as pd
import numpy as np
import math

# we can use pandas, numpy or math to check if photo_link is NaN
x = float("nan")

print(f"It's pd.isna  : {pd.isna(x)}")
print(f"It's np.isnan  : {np.isnan(x)}")
print(f"It's math.isnan : {math.isnan(x)}")

It's pd.isna  : True
It's np.isnan  : True
It's math.isnan : True


In [240]:
df_query.iloc[0]['photo_link']

'https://s.yimg.com/ny/api/res/1.2/3qEveVGKp0705iucq1UPTw--/YXBwaWQ9aGlnaGxhbmRlcjt3PTY0MDtoPTQyNw--/https://s.yimg.com/os/creatr-uploaded-images/2024-10/83267440-8445-11ef-b675-1135712e37b5'

In [241]:
photo_link = df_query.iloc[0]['photo_link']
# if photo_link value is NaN, replace it with empty string 
if pd.isna(photo_link):
    photo_link='' # 

In [242]:
photo_link

'https://s.yimg.com/ny/api/res/1.2/3qEveVGKp0705iucq1UPTw--/YXBwaWQ9aGlnaGxhbmRlcjt3PTY0MDtoPTQyNw--/https://s.yimg.com/os/creatr-uploaded-images/2024-10/83267440-8445-11ef-b675-1135712e37b5'

# Find some related keywords

    Find related words from the top_key_freq column
    相關詞有哪一些? 找出各篇文章的topk關鍵詞?

## All-in-one function: Get related keywords

In [243]:
from collections import Counter
# 相關詞有哪一些?找出各篇文章的topk關鍵詞加以彙整計算
# 不能用 "get_related_keys"當函數名稱，因為這是Django系統用的名稱
def get_related_words(df_query):
    """Sum the per-article top keyword frequencies and return the 20 most frequent.

    Each ``top_key_freq`` cell is the text form of a list of (word, frequency)
    pairs, e.g. "[('關稅', 33), ('台股', 25)]".

    Parameters
    ----------
    df_query : pandas.DataFrame
        Query result containing a 'top_key_freq' column.

    Returns
    -------
    list of (str, int)
        Up to 20 (word, total frequency) pairs, most frequent first.
    """
    # Local import so this cell does not depend on another import cell.
    import ast

    counter = Counter()  # accumulates frequencies over all articles
    for raw_pairs in df_query['top_key_freq']:
        # The cell text comes from a CSV file, so parse it with
        # ast.literal_eval (literals only) rather than eval(), which would
        # execute arbitrary code embedded in the data.
        pair_dict = dict(ast.literal_eval(raw_pairs))
        counter += Counter(pair_dict)
    return counter.most_common(20)  # return list format

In [244]:
# This version is for reference
def get_related_words_v0(df_query):
    """Reference version of get_related_words() that sums frequencies in a plain dict.

    Parameters
    ----------
    df_query : pandas.DataFrame
        Query result whose 'top_key_freq' cells hold the text form of a list
        of (word, frequency) pairs.

    Returns
    -------
    list of (str, int)
        Up to 20 (word, total frequency) pairs, most frequent first.
    """
    # Local import so this cell does not depend on another import cell.
    import ast

    all_pairs = {}
    for raw_pairs in df_query['top_key_freq']:
        # Parse the CSV text with ast.literal_eval (literals only) rather than
        # eval(), which would execute arbitrary code embedded in the data.
        for w, f in ast.literal_eval(raw_pairs):
            all_pairs[w] = all_pairs.get(w, 0) + f

    counter = Counter(all_pairs)
    return counter.most_common(20) #return list format
    #return dict(counter.most_common(20)) #return dict format

In [245]:
user_keywords = ['美股','台股']
cond='and'
cate='全部'
weeks=2
df_query = filter_dataFrame_fullText(user_keywords, cond, cate, weeks)

In [246]:
df_query.shape

(3, 14)

In [247]:
get_related_words(df_query)

[('關稅', 34),
 ('台股', 32),
 ('市場', 23),
 ('美國', 21),
 ('指數', 18),
 ('川普', 17),
 ('汽車', 14),
 ('族群', 12),
 ('政策', 10),
 ('全球', 8),
 ('股市', 7),
 ('新聞網', 6),
 ('房屋', 6),
 ('股價', 6),
 ('終場', 5),
 ('機會', 5),
 ('資金', 5),
 ('記者', 5),
 ('建築', 5),
 ('工廠', 5)]

In [248]:
# get_related_words_v1(df_query)

## A Step by step demonstration (do it yourself)

In [249]:
user_keywords = ['台股','美股']
cond='and'
cate='全部'
weeks=2
df_query = filter_dataFrame_fullText(user_keywords, cond, cate, weeks)

In [250]:
df_query.head(2)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,_20250327_1,2025-03-27,焦點,台股重挫308點 失守22000關卡,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,暫無,暫無,"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...","['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...","['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...
132,finance_20250327_13,2025-03-27,財經,【清明節變盤2】 川普關稅風暴避風港！ 5名師點名3族群抱股過節Yahoo奇摩股市15 則留...,4月3日至6日為清明連假，適逢美國總統川普預定4月2日實施對等關稅，雖然他近日鬆口對於進口商...,暫無,暫無,"[('關稅', 33), ('台股', 25), ('美國', 17), ('市場', 16...","['4月', '3日', '至', '6日', '為', '清明', '連假', '，', ...","['美國', '總統', '川普', '關稅', '商品', '關稅', '國家', '汽車...","[NerToken(word='4月3日至6日', ner='DATE', idx=(0, ...","[('4月', 'Nd'), ('3日', 'Nd'), ('至', 'Caa'), ('6...",https://tw.news.yahoo.com//%E4%B8%AD%E5%9C%8B%...,https://s.yimg.com/ny/api/res/1.2/1FcXAGsHow02...


In [251]:
df_query.top_key_freq

0      [('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...
132    [('關稅', 33), ('台股', 25), ('美國', 17), ('市場', 16...
137    [('市場', 7), ('股價', 6), ('台股', 5), ('指數', 4), (...
Name: top_key_freq, dtype: object

### From the word_freq pairs, we sum frequency for each word

    How? 
    The best way is to use dict. Here is a simple example.

In [252]:
[('關稅', 33), ('台股', 25), ('美國', 17), ('市場', 16)]

[('關稅', 33), ('台股', 25), ('美國', 17), ('市場', 16)]

In [253]:
'''
{'陳時中': 7,
 '解封': 5,
 '疫情': 10,
 '檢疫期': 3}

 (key, value) key can't be duplicated.鍵值不能重複，轉成dict時不處理重複的，會被丟棄
'''

dict([('關稅', 33), ('台股', 25), ('美國', 17), ('市場', 16)])

{'關稅': 33, '台股': 25, '美國': 17, '市場': 16}

In [254]:
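# dict keys must be unique: when a list of pairs repeats a key, the later pair
# overwrites the earlier one (here ('美股', 30) replaces ('美股', 33)).
# 鍵值不能重複，轉成dict時，後面的值會覆蓋前面相同鍵的值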
dict([('美股', 33), ('台股', 25), ('美股', 30)])

{'美股': 30, '台股': 25}

In [255]:
c1 = Counter(dict([('關稅', 33), ('台股', 25), ('美國', 17), ('市場', 16)]))

In [256]:
c1

Counter({'關稅': 33, '台股': 25, '美國': 17, '市場': 16})

In [257]:
c2 = Counter(dict([('市場', 7), ('股價', 6), ('台股', 5)]))

In [258]:
c2

Counter({'市場': 7, '股價': 6, '台股': 5})

In [259]:
c1+c2

Counter({'關稅': 33, '台股': 30, '市場': 23, '美國': 17, '股價': 6})

In [260]:
### Now we can start to count frequency from our word_freq pairs
df_query.iloc[0].top_key_freq

"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ('川普', 2), ('汽車', 2), ('美股', 2), ('台北', 2), ('終場', 2), ('台股', 2), ('關卡', 2), ('大盤', 2), ('壓力', 2), ('均線', 2), ('型態', 2), ('個股', 2), ('總統', 1), ('關稅', 1), ('現股', 1), ('早盤', 1)]"

In [261]:
# Convert to dictionary
dict(eval(df_query.iloc[0].top_key_freq))

{'台積電': 4,
 '指數': 3,
 '月線': 3,
 '美國': 2,
 '川普': 2,
 '汽車': 2,
 '美股': 2,
 '台北': 2,
 '終場': 2,
 '台股': 2,
 '關卡': 2,
 '大盤': 2,
 '壓力': 2,
 '均線': 2,
 '型態': 2,
 '個股': 2,
 '總統': 1,
 '關稅': 1,
 '現股': 1,
 '早盤': 1}

In [262]:
c1 = Counter(dict(eval(df_query.iloc[0].top_key_freq)))
c1.most_common(10)

[('台積電', 4),
 ('指數', 3),
 ('月線', 3),
 ('美國', 2),
 ('川普', 2),
 ('汽車', 2),
 ('美股', 2),
 ('台北', 2),
 ('終場', 2),
 ('台股', 2)]

In [263]:
c2 = Counter(dict(eval(df_query.iloc[1].top_key_freq)))
c2.most_common(10)

[('關稅', 33),
 ('台股', 25),
 ('美國', 17),
 ('市場', 16),
 ('川普', 15),
 ('汽車', 12),
 ('指數', 11),
 ('政策', 10),
 ('族群', 8),
 ('全球', 8)]

In [264]:
counter = c1+c2
counter.most_common(10)

[('關稅', 34),
 ('台股', 27),
 ('美國', 19),
 ('川普', 17),
 ('市場', 16),
 ('指數', 14),
 ('汽車', 14),
 ('政策', 10),
 ('族群', 8),
 ('全球', 8)]

In [265]:
# Advance operation for reference
# Using the itertools.groupby approach, you are summing the sorted groups based on first tuple elements.

# from itertools import groupby
# from operator import itemgetter

# my_list = [('a',2),('a',3),('b',3),('c',2),('b',4)]
# first = itemgetter(0)
# sums = [(k, sum(item[1] for item in tups_to_sum))
#         for k, tups_to_sum in groupby(sorted(my_list, key=first), key=first)]
# Outputs:

# [('a', 5), ('b', 7), ('c', 2)]

## Prepare wordcloud data

In [266]:
# Get related keywords by counting the top keywords of each news.
# Notice:  do not name function as  "get_related_keys",
# because this name is used in Django
def get_related_word_clouddata(df_query):
    """Build word-cloud data from the related keywords of a query result.

    Parameters
    ----------
    df_query : pandas.DataFrame
        Query result passed on to get_related_words().

    Returns
    -------
    tuple (wf_pairs, clouddata)
        wf_pairs  : list of (word, frequency) pairs from get_related_words().
        clouddata : list of {'text': word, 'size': int} dicts, with frequencies
                    scaled linearly into the font-size range 20..120.
                    Both lists are empty when there are no related words.
    """
    # (1) Get wf_pairs by calling get_related_words().
    wf_pairs = get_related_words(df_query)

    # No matching articles / keywords: nothing to scale (indexing below would
    # raise IndexError on an empty list).
    if not wf_pairs:
        return wf_pairs, []

    # (2) cloud chart data
    # wf_pairs is ordered by descending frequency, so the last pair holds the
    # minimum and the first pair the maximum.
    min_ = wf_pairs[-1][1]
    max_ = wf_pairs[0][1]
    # text size based on the value of word frequency for drawing cloud chart
    textSizeMin = 20   # smallest font size
    textSizeMax = 120  # largest font size

    if max_ == min_:
        # All words share one frequency (e.g. a single keyword); the linear
        # scaling would divide by zero, so every word gets the middle size.
        same_size = (textSizeMin + textSizeMax) // 2
        clouddata = [{'text': w, 'size': same_size} for w, f in wf_pairs]
    else:
        # Scaling frequency value into an interval of from 20 to 120.
        clouddata = [{'text': w, 'size': int(textSizeMin + (f - min_) / (max_ - min_) * (textSizeMax - textSizeMin))}
                     for w, f in wf_pairs]

    return wf_pairs, clouddata

In [267]:
user_keywords = ['台股','美股']
cond='and'
cate='全部'
weeks=2
df_query = filter_dataFrame_fullText(user_keywords, cond, cate, weeks)

get_related_word_clouddata(df_query)

([('關稅', 34),
  ('台股', 32),
  ('市場', 23),
  ('美國', 21),
  ('指數', 18),
  ('川普', 17),
  ('汽車', 14),
  ('族群', 12),
  ('政策', 10),
  ('全球', 8),
  ('股市', 7),
  ('新聞網', 6),
  ('房屋', 6),
  ('股價', 6),
  ('終場', 5),
  ('機會', 5),
  ('資金', 5),
  ('記者', 5),
  ('建築', 5),
  ('工廠', 5)],
 [{'text': '關稅', 'size': 120},
  {'text': '台股', 'size': 113},
  {'text': '市場', 'size': 82},
  {'text': '美國', 'size': 75},
  {'text': '指數', 'size': 64},
  {'text': '川普', 'size': 61},
  {'text': '汽車', 'size': 51},
  {'text': '族群', 'size': 44},
  {'text': '政策', 'size': 37},
  {'text': '全球', 'size': 30},
  {'text': '股市', 'size': 26},
  {'text': '新聞網', 'size': 23},
  {'text': '房屋', 'size': 23},
  {'text': '股價', 'size': 23},
  {'text': '終場', 'size': 20},
  {'text': '機會', 'size': 20},
  {'text': '資金', 'size': 20},
  {'text': '記者', 'size': 20},
  {'text': '建築', 'size': 20},
  {'text': '工廠', 'size': 20}])

# Find paragraphs containing the keywords. 

    There may be too many related paragraphs, so we display only some of them on our Django website
    一一比對文章段落，找出關鍵詞所在的段落

## All-in-one function: Find related paragraphs

In [268]:
import re

# Step1: split paragraphs in text 先將文章切成一個段落一個段落
def cut_paragraph(text):
    """Split ``text`` at every Chinese full stop '。' and drop empty pieces.

    The returned pieces do not include the '。' separator.
    Alternative: re.split('[。！!？?]', text) would also cut at question and
    exclamation marks.
    """
    return [piece for piece in text.split('。') if piece]


import re
# Find out all paragraphs where multiple keywords occur.
def get_same_para(df_query, user_keywords, cond, k=30):
    """Return up to ``k`` sentences of the matched articles that contain the keywords.

    Every article in ``df_query.content`` is split with cut_paragraph(); each
    piece gets its '。' back and is kept when it contains all ('and') or any
    ('or') of ``user_keywords`` as plain substrings. Any other ``cond``
    selects nothing.
    """
    # Choose the combining rule once: all() for 'and', any() for 'or'.
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        combine = None

    matched = []
    for article in df_query.content:
        for sentence in cut_paragraph(article):
            sentence = sentence + "。"  # restore the full stop removed by the split
            if combine is not None and combine([kw in sentence for kw in user_keywords]):
                matched.append(sentence)  # keep the sentences that satisfy the condition
    return matched[:k]



# Step2: Select all paragraphs where multiple keywords occur.
def get_same_para(df_query, user_keywords, cond, k=30):
    """Return up to ``k`` sentences of the matched articles that contain the keywords.

    Every article in ``df_query.content`` is split with cut_paragraph(); each
    piece gets its '。' back and is kept when it contains all ('and') or any
    ('or') of ``user_keywords``. Any other ``cond`` selects nothing.

    Keywords are matched as literal substrings. The earlier re.search(kw, para)
    treated each keyword as a regular expression, so "C++" raised re.error and
    "3.27" also matched "3827".
    """
    same_para = []
    for text in df_query.content:
        paragraphs = cut_paragraph(text)
        for para in paragraphs:
            para += "。"  # restore the full stop removed by the split
            if cond == 'and':
                if all(kw in para for kw in user_keywords):
                    same_para.append(para)
            elif cond == 'or':
                if any(kw in para for kw in user_keywords):
                    same_para.append(para)
    return same_para[0:k]


In [269]:
user_keywords = ['台灣','美國','股市']
cond='and'
cate='全部'
weeks=2
df_query = filter_dataFrame_fullText(user_keywords, cond, cate, weeks)

In [270]:
len(df_query)

9

In [271]:
get_same_para(df_query, user_keywords, 'and', k=10)

['李鴻基表示，台積電目前占台灣加權指數權值達35%，自2016年8月以來，在台積電驅動下，台灣股市總市值與GDP佔比達284%，遠高於美國(194%)與日本(150%)等地資本市場，突顯台積電在全球半導體市場中的主導地位，以及對台灣經濟穩定增長貢獻。']

## A Step by step demonstration

## Step1: cut_paragraph() function

In [272]:
df.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,_20250327_1,2025-03-27,焦點,台股重挫308點 失守22000關卡,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,暫無,暫無,"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...","['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...","['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...


In [273]:
text = df.content[0]
text

'美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%。台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％。凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸。凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股。'

In [274]:
# Step1: split paragraphs in text 先將文章切成一個段落一個段落
def cut_paragraph(text):
    """Cut an article into pieces at each '。' and return the non-empty pieces.

    The '。' itself is not part of the returned strings. To cut at question and
    exclamation marks as well, re.split('[。！!？?]', text) can be used instead
    of str.split.
    """
    pieces = []
    for piece in text.split('。'):  # cut at every full stop
        if piece:                   # skip empty strings, e.g. after a trailing '。'
            pieces.append(piece)
    return pieces


In [275]:
paragraphs = cut_paragraph(text)
paragraphs

['美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點',
 '川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%',
 '台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％',
 '凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸',
 '凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股']

In [276]:
text

'美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%。台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％。凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸。凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股。'

In [277]:
text.split('。')


['美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點',
 '川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%',
 '台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％',
 '凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸',
 '凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股',
 '']

In [278]:
paragraphs = text.split('。')
list(filter(None, paragraphs))


['美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點',
 '川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%',
 '台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％',
 '凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸',
 '凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股']

In [279]:
re.split('[。！!？?]', text) # regular expression 正規式 正則式

['美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點',
 '川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%',
 '台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％',
 '凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸',
 '凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股',
 '']

### How to cut paragraph? 

    (1) Approach 1:
    string.split() (it does not support regex)

    (2)Approach 2:
    Use regular expression正規式 re.split() 
    re.split() works fine

Simple example to demonstrate the usage of re.split()

In [280]:
# Use string split().  It does not support regex.
# Split the sentence using the delimiter (separator) '。'

text = '這是第1句話。這是第2句話?這是第3句話。'
text.split('。')

['這是第1句話', '這是第2句話?這是第3句話', '']

In [281]:
# It doesn't work. string split() method does not support regex
text.split('[。?]')

['這是第1句話。這是第2句話?這是第3句話。']

In [282]:
import re

In [283]:
re.split('。', "這是第1句話。這是，第2句話?這是--第3句話。")

['這是第1句話', '這是，第2句話?這是--第3句話', '']

In [284]:
# 如果納入多個符號去切割，必須用regular expression
# Here, [abc] will match if the string you are trying to match contains any of the a, b or c . 
# You can also specify a range of characters using - inside square brackets. [a-e] is the same as [abcde] . [1-4] is the same as [1234] .


In [285]:
# separator:。 ?
re.split('[。?]', "這是第1句話。這是，第2句話?這是--第3句話。")

['這是第1句話', '這是，第2句話', '這是--第3句話', '']

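In [286]:
# Caution: inside a character class [...] the "|" is a literal pipe character, not "or".
# [。|?] therefore splits on 。, | or ? — the result equals that of [。?] only because the text has no "|".
# For real alternation (適用於切割符號由多個字組成時), drop the brackets, e.g. r'。|\?'.
# separator:。 ?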
re.split(r'[。|?]', "這是第1句話。這是，第2句話?這是--第3句話。")

['這是第1句話', '這是，第2句話', '這是--第3句話', '']

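In [287]:
# [句話|是] is a character class: it splits on any ONE of the characters 句, 話, | or 是,
# not on the two-character word "句話" (note the empty strings between 句 and 話 in the output).
# To split on "句話" or "是", use alternation without brackets: '句話|是'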
re.split('[句話|是]', "這是第1句話。這是，第2句話?這是--第3句話。")

['這', '第1', '', '。這', '，第2', '', '?這', '--第3', '', '。']

#### How to remove the empty elements?

In [288]:
# Do you notice the last element is an empty string?

In [289]:
result = re.split(r'[。?]', "這是第1句話。這是，第2句話?這是--第3句話。")

In [290]:
# Using Python filter function
filter(None, result)

<filter at 0x2ab85d7c310>

In [291]:
list(filter(None, result))

['這是第1句話', '這是，第2句話', '這是--第3句話']

In [292]:
# An alternative way
[item for item in result if item]

['這是第1句話', '這是，第2句話', '這是--第3句話']

## Step 2: Find paragraphs containing keywords

### All-in-one function

In [293]:
import re
# Find out all paragraphs where multiple keywords occur.
def get_same_para(df_query, user_keywords, cond, k=30):
    """Return up to ``k`` paragraphs of the queried articles that contain the keywords.

    Each article in ``df_query.content`` is cut with ``cut_paragraph``; every piece
    gets a trailing "。" appended and is kept when it contains all keywords
    (``cond == 'and'``) or at least one keyword (``cond == 'or'``), using a literal
    substring test. Any other ``cond`` keeps nothing.
    """
    # Pick the combiner once instead of branching inside the loop.
    combine = all if cond == 'and' else any if cond == 'or' else None

    hits = []
    for article in df_query.content:
        for piece in cut_paragraph(article):
            if combine is None:
                continue
            sentence = piece + "。"
            if combine([kw in sentence for kw in user_keywords]):
                hits.append(sentence)
    return hits[:k]


In [294]:
user_keywords = ['美股','台股']
cond='and'
cate='全部'
weeks=2
df_query = filter_dataFrame_fullText(user_keywords, cond, cate, weeks)

In [295]:
len(df_query)

3

In [296]:
get_same_para(df_query, user_keywords, 'and', k=10)

['美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。',
 '蘇皓毅強調，川普政策反覆，投資人不要輕易受市場過度悲觀或樂觀情緒左右，在經濟維持一定韌性下，股市回檔，反而是入場門票，而台股自3月11日跌破年線後，目前仍在挑戰年線關卡，短期台股亦將受美股牽動而呈現震盪徘徊走勢。',
 '蘇皓毅提到，今年投資主流仍是受惠AI題材的台股、美股兩大市場，台股不論是上游晶片、中游組裝到下游應用等AI相關供應鏈皆持續看好；而由輝達領軍帶動AI革命的美股，則預期標普500將由以往的七巨頭領漲擴散至更多相關產業個股。',
 '摩爾證券投顧分析師林漢偉表示，今日台股跳空下跌反映美股昨日重挫，算是合理表現，量能因賣壓回溫，約落在2700億元，不過今日開低走高難度高，若能留住150點的下影線就非常強勢了。']

### Step by step demonstration

In [297]:
df_query.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,_20250327_1,2025-03-27,焦點,台股重挫308點 失守22000關卡,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,暫無,暫無,"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...","['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...","['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...


In [298]:
text = df_query.content.iloc[0]
text

'美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%。台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％。凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸。凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股。'

In [299]:
paragraphs = cut_paragraph(text)
paragraphs

['美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點',
 '川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%',
 '台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％',
 '凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸',
 '凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股']

In [314]:
# Find out all paragraphs where multiple keywords occur.
user_keywords = ['美股','台股']
cond='and'
same_para=[] # holds the paragraphs that contain the keywords
for para in paragraphs:
    para += "。" # append a full stop to each paragraph
    # check whether the paragraph contains the keywords; 'and' and 'or' are handled separately
    if cond=='and': 
        if all([kw in para for kw in user_keywords]):
            same_para.append(para) # keep the paragraph that satisfies the condition
    elif cond=='or':
        if any([kw in para for kw in user_keywords]):
            same_para.append(para)  # keep the paragraph that satisfies the condition
same_para


['美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。']

### multiple words in text (easier way)

In [321]:
para = '美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。'

In [322]:
user_keywords = ['美股','台股']

In [323]:
any([kw in para for kw in user_keywords])


True

In [324]:
all([kw in para for kw in user_keywords])


True

In [325]:
user_keywords = ['美國','川普']

In [326]:
any([kw in para for kw in user_keywords])


True

In [327]:
all([kw in para for kw in user_keywords])


True

### re.search(): An alternative way

In [328]:
# Alternative for advanced users: the same filter written with re.search().
# re.search() treats its first argument as a regex pattern, so each keyword goes through
# re.escape() to be matched as literal text (otherwise a keyword such as "C++" or "("
# would raise re.error, and "." would match any character).
user_keywords = ['美股','台股']
cond='and'
same_para=[] # holds the paragraphs that contain the keywords
for para in paragraphs:
    para += "。"  # put back the full stop that the split removed
    if cond=='and':
        if all([re.search(re.escape(kw), para) for kw in user_keywords]):
            same_para.append(para)
    elif cond=='or':
        if any([re.search(re.escape(kw), para) for kw in user_keywords]):
            same_para.append(para)
same_para


['美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。']

In [332]:
# NOTE(review): the next two cells read `user_keywords`, not `key`, so this assignment has
# no effect on them. Their recorded False outputs also disagree with the `user_keywords`
# and `text` shown above (both keywords occur in `text`) — presumably stale kernel state;
# confirm with Restart Kernel -> Run All.
key = ['美股','台股']

In [333]:
any([re.search(kw, text) for kw in user_keywords])

False

In [312]:
all([re.search(kw, text) for kw in user_keywords])

False

# views.py in Django website

To save memory, we import the already-loaded `df` from the other app instead of reading the CSV file again:

`from app_user_keyword.views import df`

In [313]:
from django.shortcuts import render
from django.views.decorators.csrf import csrf_exempt
from django.http import JsonResponse

from datetime import datetime, timedelta
import pandas as pd
import math
import re
from collections import Counter

# (1) we can load data using read_csv() 自己app的csv檔案
# global variable
# df = pd.read_csv('dataset/cna_news_preprocessed.csv', sep='|')


# (2) we can load data using reload_df_data() function 隔壁app的csv檔案
# global variable
def load_df_data_v1():
    """Load the neighbouring app's preprocessed news CSV into the module-level ``df``."""
    global df
    # The dataset is pipe-separated, not comma-separated.
    csv_path = 'app_user_keyword/dataset/cna_news_preprocessed.csv'
    df = pd.read_csv(csv_path, sep='|')

# (3) df can be import from app_user_keyword 隔壁app的變數
# To save memory, we just import df from the other app as follows.
# from app_user_keyword.views import df

# (4) df can be import from app_user_keyword  隔壁app的變數
import app_user_keyword.views as userkeyword_views
def load_df_data():
    """Bind this module's global ``df`` to the ``df`` held by ``app_user_keyword.views``."""
    global df
    # Plain attribute read: the name is rebound to the same object, nothing is copied here.
    shared_frame = userkeyword_views.df
    df = shared_frame

load_df_data()


# For the key association analysis
def home(request):
    """Render the home page template of the keyword-association app."""
    template_name = 'app_user_keyword_association/home.html'
    return render(request, template_name)

# df_query is created per request inside the view; it does not need to be global.
@csrf_exempt
def api_get_userkey_associate(request):
    """Return the keyword-association results for the posted query as JSON.

    POST fields read:
        userkey: whitespace-separated keywords typed by the user.
        cate:    news category ("全部" means every category).
        cond:    'and' or 'or' — how the keywords are combined.
        weeks:   integer number of weeks to look back.

    Returns:
        JsonResponse with ``num_articles``, ``newslinks``, ``related_words``,
        ``same_paragraph`` and ``clouddata``; all are empty / zero when no
        article matches. A missing field or a non-integer ``weeks`` yields a
        400 response carrying an ``error`` message instead of an unhandled
        exception.
    """
    required_fields = ('userkey', 'cate', 'cond', 'weeks')
    missing = [name for name in required_fields if request.POST.get(name) is None]
    if missing:
        return JsonResponse(
            {'error': 'missing POST field(s): ' + ', '.join(missing)}, status=400)

    userkey = request.POST.get('userkey')
    cate = request.POST.get('cate')
    cond = request.POST.get('cond')
    try:
        weeks = int(request.POST.get('weeks'))
    except ValueError:
        return JsonResponse({'error': 'weeks must be an integer'}, status=400)

    # Keywords are separated by whitespace in the search box.
    key = userkey.split()

    df_query = filter_dataFrame_fullText(key, cond, cate, weeks)

    if len(df_query) != 0:  # at least one article matched
        newslinks = get_title_link_topk(df_query, k=25)
        related_words, clouddata = get_related_word_clouddata(df_query)
        same_paragraph = get_same_para(
            df_query, key, cond, k=30)  # multiple keywords
        num_articles = len(df_query)  # total number of matching articles
    else:
        newslinks = []
        related_words = []
        same_paragraph = []
        clouddata = []
        num_articles = 0

    response = {
        'num_articles': num_articles,
        'newslinks': newslinks,
        'related_words': related_words,
        'same_paragraph': same_paragraph,
        'clouddata': clouddata,
    }
    return JsonResponse(response)


# Searching keywords from "content" column
# Here this function uses df.content column, while filter_dataFrame() uses df.tokens_v2
def filter_dataFrame_fullText(user_keywords, cond, cate, weeks):
    """Select the rows of the global ``df`` whose ``content`` matches the query.

    Three filters are combined:
      1. period   — ``date`` within the last ``weeks`` weeks, counted back from
                    the newest date in ``df`` (dates are 'YYYY-MM-DD' strings,
                    compared as strings);
      2. category — ``category == cate`` unless ``cate`` is "全部" (all);
      3. keywords — literal substring search in ``content``: every keyword for
                    ``cond == 'and'``, at least one for ``cond == 'or'``.
                    Any other ``cond`` applies no keyword filter.

    Rows whose ``content`` is missing (NaN) are treated as empty text instead
    of raising ``TypeError``; an empty ``df`` is returned unchanged.

    Args:
        user_keywords: list of keyword strings.
        cond: 'and' or 'or'.
        cate: category name, or "全部" for all categories.
        weeks: number of weeks to look back.

    Returns:
        The filtered DataFrame (a boolean-mask selection of ``df``).
    """
    # Nothing to filter, and df.date.max() would not be a parseable date.
    if len(df) == 0:
        return df

    # end date: the date of the latest record of news
    end_date = df.date.max()

    # start date: `weeks` weeks before the end date, formatted like the column
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() -
                  timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # (1) period filter
    condition = (df.date >= start_date) & (df.date <= end_date)

    # (2) category filter ("全部" means no category restriction)
    if cate != "全部":
        condition = condition & (df.category == cate)

    # (3) keyword filter on the full text
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        combine = None

    if combine is not None:
        # Missing content would make `qk in text` fail on a float NaN.
        content = df.content.fillna('').astype(str)
        condition = condition & content.apply(
            lambda text: combine((qk in text) for qk in user_keywords))

    # condition is a boolean Series aligned with df
    df_query = df[condition]

    return df_query


# get titles and links from k pieces of news 
def get_title_link_topk(df_query, k=25):
    """Build a list of dicts (category, title, link, photo_link) for the first ``k`` rows.

    A missing (NaN) ``photo_link`` is replaced by an empty string.
    """
    news_items = []
    for _, row in df_query[0:k].iterrows():
        photo = row['photo_link']
        news_items.append({
            'category': row['category'],
            'title': row['title'],
            'link': row['link'],
            'photo_link': '' if pd.isna(photo) else photo,
        })
    return news_items

# Get related keywords by counting the top keywords of each news.
# Notice:  do not name function as  "get_related_keys",
# because this name is used in Django
def get_related_word_clouddata(df_query):
    """Aggregate the per-article top keywords and build word-cloud data.

    Each ``top_key_freq`` cell holds the text of a list of ``(word, freq)``
    pairs. The pairs of all rows are summed and the 20 most frequent words kept.

    Returns:
        (wf_pairs, clouddata):
          wf_pairs  — list of ``(word, total_freq)``, most frequent first;
          clouddata — list of ``{'text': word, 'size': int}`` where the size is
                      the frequency scaled linearly into 20..120.
        Both lists are empty when no keyword pairs are present. When every kept
        word has the same frequency the scale is undefined, so all words get the
        mid size (70) instead of dividing by zero.
    """
    import ast  # local import: only needed to parse the stored literals

    # Sum the (word, freq) pairs of every article.
    # literal_eval only accepts Python literals, unlike eval, which would
    # execute arbitrary expressions found in the data file.
    counter = Counter()
    for raw_pairs in df_query.top_key_freq:
        counter.update(dict(ast.literal_eval(raw_pairs)))
    wf_pairs = counter.most_common(20)  # list of (word, freq)

    if not wf_pairs:
        return wf_pairs, []

    # most_common() sorts descending: first is the maximum, last the minimum.
    min_ = wf_pairs[-1][1]
    max_ = wf_pairs[0][1]
    # text size range used by the cloud chart
    textSizeMin = 20
    textSizeMax = 120

    if max_ == min_:
        uniform_size = (textSizeMin + textSizeMax) // 2
        clouddata = [{'text': w, 'size': uniform_size} for w, f in wf_pairs]
    else:
        # Scale each frequency linearly into [textSizeMin, textSizeMax].
        clouddata = [{'text': w, 'size': int(textSizeMin + (f - min_) / (max_ - min_) * (textSizeMax - textSizeMin))}
                     for w, f in wf_pairs]

    return wf_pairs, clouddata


# Step1: split paragraphs in text 先將文章切成一個段落一個段落
def cut_paragraph(text):
    """Split ``text`` at every full-width period '。' and drop the empty pieces.

    The returned pieces no longer carry the '。' itself. Only '。' is a
    separator here; splitting on more marks (！!？?) would need re.split()
    with a character class.
    """
    return [piece for piece in text.split('。') if piece]

# Step2: Select all paragraphs where multiple keywords occur.


def get_same_para(df_query, user_keywords, cond, k=30):
    """Return up to ``k`` paragraphs of the queried articles that contain the keywords.

    Each article in ``df_query.content`` is cut with ``cut_paragraph``; every
    piece gets its trailing "。" back and is kept when it contains all keywords
    (``cond == 'and'``) or at least one keyword (``cond == 'or'``). Any other
    ``cond`` keeps nothing.

    Keywords are matched as literal substrings, the same way
    ``filter_dataFrame_fullText`` selects the articles. They come from user
    input, so they must not be interpreted as regular expressions: a keyword
    such as "C++" or "(" would raise ``re.error`` and "." would match anything.

    Args:
        df_query: DataFrame with a ``content`` column of article texts.
        user_keywords: list of keyword strings.
        cond: 'and' or 'or'.
        k: maximum number of paragraphs returned.

    Returns:
        List of matching paragraphs in article order, at most ``k`` long.
    """
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        combine = None

    same_para = []
    for text in df_query.content:
        for para in cut_paragraph(text):
            if combine is None:
                continue
            para += "。"  # put back the full stop that the split removed
            if combine(kw in para for kw in user_keywords):
                same_para.append(para)
    return same_para[0:k]


    
print("app_user_keyword_association was loaded!")


ModuleNotFoundError: No module named 'app_user_keyword'

# For reference