# User keyword query

    Input 1: user query keywords
    Input 2: condition--and, or

    Output: frequency of the keywords
    (1): How many times are the keywords mentioned? 這個關鍵字被提到多少次?
    (2): How many pieces of news contain (mention) the keywords?  有幾篇新聞提到這個關鍵字?

    First article: ['肺炎','疫情', '肺炎']
    Second article:['陳時中','指揮中心','肺炎','陳時中']

    肺炎: 
    (1) '肺炎' is mentioned three times.  ==> frequency is 3
    (2) Two pieces of news mention '肺炎'. ==> occurrence is 2

    陳時中: 
    (1) '陳時中' is mentioned two times.  ==> frequency is 2
    (2) One piece of news mentions '陳時中'. ==> occurrence is 1
    


# Step 0: Load preprocessed news dataset

In [475]:
import ast
from datetime import datetime, timedelta

import pandas as pd

In [476]:
df = pd.read_csv('./yahoo_news_preprocessed.csv',sep='|')

In [477]:
df.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,_20250327_1,2025-03-27,焦點,台股重挫308點 失守22000關卡,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,暫無,暫無,"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...","['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...","['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...


# Step 1: Select news with user input keywords, category, duration

## (1)Improved Search from "content" column

In [478]:
from datetime import datetime, timedelta
# Searches the raw text of the "content" column;
# filter_dataFrame_v0() searches the tokens_v2 column instead.
def filter_dataFrame(user_keywords, cond, cate, weeks, data=None):
    """Select news by time window, category and keywords found in ``content``.

    Args:
        user_keywords: list of keyword strings entered by the user.
        cond: 'and' keeps articles containing every keyword, 'or' keeps
            articles containing at least one. Any other value applies no
            keyword filter at all (unchanged legacy behaviour).
        cate: category to keep, or "全部" to keep every category.
        weeks: size of the look-back window in weeks, counted back from the
            latest date found in the data.
        data: DataFrame to search. Defaults to the module-level ``df``.

    Returns:
        DataFrame holding the rows that satisfy every condition.
    """
    news = df if data is None else data

    # With no rows, date.max() is NaN and cannot be parsed: nothing to filter.
    if news.empty:
        return news.copy()

    # The window ends at the latest article and starts `weeks` weeks earlier.
    end_date = news.date.max()
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date()
                  - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # (1) period: both bounds use the '%Y-%m-%d' layout, so comparing the
    # date strings orders them chronologically
    condition = (news.date >= start_date) & (news.date <= end_date)

    # (2) category: "全部" means no category filtering
    if cate != "全部":
        condition = condition & (news.category == cate)

    # (3) keywords: all() implements 'and', any() implements 'or'
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        combine = None

    if combine is not None:
        def mentions(text):
            # A missing article body (None/NaN) is treated as empty text
            # instead of raising TypeError on the `in` test.
            if text is None or isinstance(text, float):
                text = ''
            return combine(qk in text for qk in user_keywords)

        condition = condition & news.content.apply(mentions)

    # condition is a boolean Series used as a row mask
    return news[condition]


## (2)Search keywords from "content" column (Another way)

In [479]:
# Earlier variant that searches the raw text of the "content" column;
# filter_dataFrame_v0() searches the tokens_v2 column instead.
def filter_dataFrame_fullText_v0(user_keywords, cond, cate, weeks, data=None):
    """Select news by time window, category and keywords found in ``content``.

    Args:
        user_keywords: list of keyword strings entered by the user.
        cond: 'and' (every keyword must appear) or 'or' (at least one must).
        cate: category to keep, or "全部" to keep every category.
        weeks: size of the look-back window in weeks, counted back from the
            latest date found in the data.
        data: DataFrame to search. Defaults to the module-level ``df``.

    Returns:
        DataFrame holding the rows that satisfy every condition.

    Raises:
        ValueError: if ``cond`` is neither 'and' nor 'or'.
    """
    # all() implements 'and', any() implements 'or'
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        raise ValueError("cond must be 'and' or 'or', got {!r}".format(cond))

    news = df if data is None else data

    # With no rows, date.max() is NaN and cannot be parsed: nothing to filter.
    if news.empty:
        return news.copy()

    # The window ends at the latest article and starts `weeks` weeks earlier.
    end_date = news.date.max()
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date()
                  - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    def mentions(text):
        # A missing article body (None/NaN) is treated as empty text
        # instead of raising TypeError on the `in` test.
        if text is None or isinstance(text, float):
            text = ''
        return combine(qk in text for qk in user_keywords)

    mask = (news.date >= start_date) & (news.date <= end_date)
    if cate != "全部":  # "全部" means no category filtering
        mask = mask & (news.category == cate)
    mask = mask & news.content.apply(mentions)

    return news[mask]


## (3) Search from tokens_v2 (Another way)

In [480]:
# Searching keywords in the "tokens_v2" column
def filter_dataFrame_v0(user_keywords, cond, cate, weeks, data=None):
    """Select news by time window, category and keywords found in ``tokens_v2``.

    Args:
        user_keywords: list of keyword strings entered by the user.
        cond: 'and' (every keyword must appear) or 'or' (at least one must).
        cate: category to keep, or "全部" to keep every category.
        weeks: size of the look-back window in weeks, counted back from the
            latest date found in the data.
        data: DataFrame to search. Defaults to the module-level ``df``.

    Returns:
        DataFrame holding the rows that satisfy every condition.

    Raises:
        ValueError: if ``cond`` is neither 'and' nor 'or'.
    """
    # all() implements 'and', any() implements 'or'
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        raise ValueError("cond must be 'and' or 'or', got {!r}".format(cond))

    news = df if data is None else data

    # With no rows, date.max() is NaN and cannot be parsed: nothing to filter.
    if news.empty:
        return news.copy()

    # The window ends at the latest article and starts `weeks` weeks earlier.
    end_date = news.date.max()
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date()
                  - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    def mentions(tokens):
        # NOTE(review): when tokens_v2 holds the token list as text, `in` is
        # a substring test, so a keyword also matches inside a longer token.
        # Missing values (None/NaN) are treated as empty text.
        if tokens is None or isinstance(tokens, float):
            tokens = ''
        return combine(qk in tokens for qk in user_keywords)

    mask = (news.date >= start_date) & (news.date <= end_date)
    if cate != "全部":  # "全部" means no category filtering
        mask = mask & (news.category == cate)
    mask = mask & news.tokens_v2.apply(mentions)

    return news[mask]

In [481]:
user_keywords=['總統','川普']
cond='and'
cate='全部'
weeks=4
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)
len(df_query)

23

In [482]:
user_keywords=['總統','川普']
cond='and'
cate='國際'
weeks=4
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)
len(df_query)

3

In [483]:
user_keywords=['總統','川普']
cond='or'
cate='政治'
weeks=4
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)
len(df_query)

8

In [484]:
df_query.head()

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
86,politics_20250327_1,2025-03-27,政治,北一女教師登《央視》大罵賴清德！教育局出手了民視472 則留言6 小時前,政治中心／楊佩怡報導北一女教師區桂芝過去因不滿108課綱刪除顧炎武的「廉恥」等古文，而批10...,暫無,暫無,"[('區桂芝', 8), ('教育局', 8), ('北一女', 7), ('中國', 6)...","['政治', '中心', '／', '楊佩怡', '報導', '北一女', '教師', '區...","['政治', '中心', '楊佩怡', '北一女', '教師', '區桂芝', '課綱', ...","[NerToken(word='楊佩', ner='PERSON', idx=(5, 7))...","[('政治', 'Na'), ('中心', 'Nc'), ('／', 'FW'), ('楊佩...",https://tw.news.yahoo.com//%E5%8C%97%E4%BA%AC%...,https://s.yimg.com/ny/api/res/1.2/XH2fYGZl08No...
89,politics_20250327_4,2025-03-27,政治,75學者挺亞亞！律師翻出「3年前反戰聲明」　比對連署名單驚人結果曝光三立新聞網 setn.c...,政治中心／陳慈鈴報導【3/27 09:25 發稿｜09:50 更新：完整連署名單】▲亞亞遭限...,暫無,暫無,"[('台灣', 15), ('言論', 7), ('名單', 5), ('陳培哲', 5),...","['政治', '中心', '／', '陳慈鈴', '報導', '【3/27 09:25 ',...","['政治', '中心', '陳慈鈴', '名單', '亞亞', '松山', '機場', '中...","[NerToken(word='亞亞', ner='PERSON', idx=(42, 44...","[('政治', 'Na'), ('中心', 'Nc'), ('／', 'FW'), ('陳慈...",https://tw.news.yahoo.com//%E9%82%81%E9%98%BF%...,https://s.yimg.com/ny/api/res/1.2/wdF8W4i_dhn0...
90,politics_20250327_5,2025-03-27,政治,躲不過川普「汽車關稅」！新車價格喊漲　驚人數字曝光壹蘋新聞網2 則留言30 分鐘前,【國際中心／綜合報導】美國總統川普今天宣布對所有海外進口車輛一律課徵25%關稅，4月2日正式...,暫無,暫無,"[('汽車', 12), ('美國', 7), ('關稅', 6), ('日本', 4), ...","['【', '國際', '中心', '／', '綜合', '報導', '】', '美國', ...","['國際', '中心', '美國', '總統', '川普', '海外', '車輛', '關稅...","[NerToken(word='美國', ner='GPE', idx=(11, 13)),...","[('【', 'PARENTHESISCATEGORY'), ('國際', 'Nc'), (...",https://tw.news.yahoo.com//%E7%9A%87%E5%90%8E%...,https://s.yimg.com/ny/api/res/1.2/HJyhfoBkXApj...
91,politics_20250327_6,2025-03-27,政治,堅持挺亞亞！律師怒：國家安全這麼脆弱？今日新聞NOWnews283 則留言6 小時前,[NOWnews今日新聞] 中配「亞亞」因發布支持武統言論，遭內政部廢止居留許可，她雖不斷喊...,暫無,暫無,"[('言論', 27), ('台灣', 24), ('中國', 10), ('亞亞', 8)...","['[NOWnews', '今日', '新聞', '] ', '中', '配', '「', ...","['新聞', '亞亞', '武統', '言論', '內政部', '期限', '亞亞', '律...","[NerToken(word='內政部', ner='ORG', idx=(31, 34))...","[('[NOWnews', 'PARENTHESISCATEGORY'), ('今日', '...",https://tw.news.yahoo.com//%E5%85%A5%E5%A2%83%...,https://s.yimg.com/ny/api/res/1.2/Gf_DKwCILDHM...
98,politics_20250326_13,2025-03-26,政治,民進黨與罷團首次開會出現裂痕？與會人士還原經過　曹興誠將親操刀2計畫風傳媒189 則留言23...,民進黨宣布投入大罷免，民進黨政高層24日與全台35個罷團首次會面，加上線上至少有4、50個人...,暫無,暫無,"[('民進黨', 17), ('會議', 8), ('人士', 7), ('縣市', 7),...","['民進黨', '宣布', '投入', '大罷免', '，', '民進黨政', '高層', ...","['民進黨', '民進黨政', '高層', '人士', '會議', '現場', '民進黨',...","[NerToken(word='民進黨', ner='ORG', idx=(0, 3)), ...","[('民進黨', 'Nb'), ('宣布', 'VC'), ('投入', 'VC'), ('...",https://tw.news.yahoo.com//%E7%BE%8E%E5%9C%8B4...,https://s.yimg.com/ny/api/res/1.2/9LY79yyfj6wi...


# Step 2: calculate frequency and occurrence

In [485]:
# For the query result, count the occurrence and frequency for each category.
#
# (1) cate_occurence: how many pieces of news contain the keywords
# (2) cate_freq:      how many times the keywords are mentioned


news_categories=['全部','焦點','娛樂影劇','國際','政治','社會地方','財經','運動','玩樂','品味','遊戲3C']

def count_keyword(query_df, user_keywords):
    """Count keyword mentions and matching articles per news category.

    Args:
        query_df: DataFrame of already-filtered news with ``category`` and
            ``tokens_v2`` columns; ``tokens_v2`` is either a list of tokens
            or the text form of such a list.
        user_keywords: list of keyword strings to count.

    Returns:
        Tuple ``(cate_freq, cate_occurence)``: two dicts keyed by the entries
        of ``news_categories``. ``cate_freq`` holds how many tokens equal one
        of the keywords, ``cate_occurence`` how many articles were counted.
        The "全部" entry aggregates every row.

    Raises:
        ValueError, SyntaxError: if a ``tokens_v2`` string is not a plain
            Python literal.
    """
    cate_occurence = dict.fromkeys(news_categories, 0)
    cate_freq = dict.fromkeys(news_categories, 0)

    if query_df.empty:
        return cate_freq, cate_occurence

    for category, raw_tokens in zip(query_df.category, query_df.tokens_v2):
        if isinstance(raw_tokens, str):
            # literal_eval parses the stored list without executing code,
            # unlike eval(), which would run whatever the CSV cell contains.
            tokens = ast.literal_eval(raw_tokens)
        elif raw_tokens is None or isinstance(raw_tokens, float):
            tokens = []  # missing token list: the article still counts once
        else:
            tokens = raw_tokens

        # exact token matches only
        freq = sum(1 for word in tokens if word in user_keywords)

        cate_occurence['全部'] += 1
        cate_freq['全部'] += freq
        # A category absent from news_categories only feeds the "全部" totals
        # instead of raising KeyError.
        if category in cate_freq:
            cate_occurence[category] += 1
            cate_freq[category] += freq

    return cate_freq, cate_occurence

In [486]:
user_keywords=['川普','總統']
cond='or'
cate='全部'
weeks=4
# Step 1: filtering data
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)
len(df_query)

# Step 2: calculating frequency and occurrence
count_keyword(df_query, user_keywords)

({'全部': 202,
  '焦點': 67,
  '娛樂影劇': 0,
  '國際': 46,
  '政治': 16,
  '社會地方': 7,
  '財經': 49,
  '運動': 9,
  '玩樂': 2,
  '品味': 1,
  '遊戲3C': 5},
 {'全部': 58,
  '焦點': 24,
  '娛樂影劇': 0,
  '國際': 9,
  '政治': 8,
  '社會地方': 2,
  '財經': 8,
  '運動': 3,
  '玩樂': 1,
  '品味': 1,
  '遊戲3C': 2})

In [487]:
len(df_query)

58

# Demonstrate step by step

How many news are related to "烏克蘭" ?

    How many pieces of news mentioned "烏克蘭"
    How many pieces of news are related to "烏克蘭"?

    You can calculate and get the answer from the following fields: tokens, tokens_v2, or content. (Get very similar results)

    We use "tokens_v2" because it contains only some important keywords which were selected in the pre-process step.

    
A flexible approach for AND / OR conditions

        Use all() and any()
        df = pd.DataFrame({'col': ["apple is delicious",
                                "banana is delicious",
                                "apple and banana both are delicious"]})

        targets = ['apple', 'banana']

        # Any word from `targets` are present in sentence.
        >>> df.col.apply(lambda sentence: any(word in sentence for word in targets))
        0    True
        1    True
        2    True
        Name: col, dtype: bool

        # All words from `targets` are present in sentence.
        >>> df.col.apply(lambda sentence: all(word in sentence for word in targets))
        0    False
        1    False
        2     True
        Name: col, dtype: bool

## and &, or | 

In [488]:
True & True

True

In [489]:
True & False

False

In [490]:
True | True

True

In [491]:
True | False

True

## "in" is very powerful in Python!

#### in a string

In [492]:
text = '武漢烏克蘭疫情全球延燒，國防部2月針對29個疫情高風險國家地區勸阻官兵前往（包括過境）。'

In [493]:
'勸阻官兵' in text

True

In [494]:
'延燒，國防部' in text

True

In [495]:
'台灣' in text

False

In [496]:
'烏克蘭' in text

True

In [497]:
# & and
('台灣' in text)  & ('烏克蘭' in text)

False

In [498]:
# & and
('台灣' in text) and ('烏克蘭' in text)

False

In [499]:
('台灣' in text)  | ('烏克蘭' in text)

True

In [500]:
('台灣' in text)  or ('烏克蘭' in text)

True

In [501]:
# This is also a string.
text = "['武漢', '烏克蘭', '疫情', '全球', '延燒', '國防部', '疫情', '高風險', '國家', '地區', '官兵', '過境', '國防部', '政策', '全球', '國家', '地區', '轄下', '單位']"

In [502]:
'烏克蘭' in text

True

In [503]:
'台灣' in text

False

In [504]:
('台灣' in text)  & ('烏克蘭' in text)

False

In [505]:
('台灣' in text)  | ('烏克蘭' in text)

True

#### in a list

In [506]:
user_keyword=['烏克蘭','台灣']
'烏克蘭' in user_keyword

True

In [507]:
# Check out the first news
df.content[0]

'美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%。台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％。凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸。凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股。'

In [508]:
qk = '川普'
text = df.content[0]
qk in text

True

In [509]:
qk = '外交部'
text = df.content[0]
qk in text

False

In [510]:
qk = '台灣'
text = df.content[0]
qk in text

False

In [511]:
text = df.content[0]
('川普' in text) & ('總統' in text )

True

### Another "in" in Python. It is used for "for" loop.

In [512]:
user_keywords=['川普','總統']
text = df.content[0]
[(qk in text) for qk in user_keywords]

[True, True]

In [513]:
user_keywords=['川普','總統']
text = df.content[0]
[(qk in text) for qk in user_keywords]

[True, True]

### all() any() 

    How to perform a logical operation with several conditions? 如何針對很多項去做邏輯運算?

    all(): perform the "and" logical operation
    any(): perform the "or" logical operation

In [514]:
all( [True, True, True] ) #　True & True

True

In [515]:
any( [True, True] )

True

In [516]:
all( [True, False] )

False

In [517]:
any( [True, False] )

True

In [518]:
user_keywords=['川普','總統']
[word for word in user_keywords]

['川普', '總統']

In [519]:
user_keywords=['川普','總統']
text = '美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%'
[(word in text) for word in user_keywords]

[True, True]

In [520]:
user_keywords=['川普','總統']
text = '美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%'
[word in text for word in user_keywords] # () can be removed

[True, True]

In [521]:
user_keywords=['川普','總統']
text = '美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%'
all([word in text for word in user_keywords])

True

In [522]:
user_keywords=['川普','總統']
text = '美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%'
all(word in text for word in user_keywords) # square brackets [] can be removed

True

In [523]:
# Check out the first news
user_keywords=['川普','總統']
text = df.content[0]
print([(qk in text) for qk in user_keywords])
all((qk in text) for qk in user_keywords)

[True, True]


True

In [524]:
user_keywords=['川普','總統']
text = df.content[0]
any((qk in text) for qk in user_keywords)

True

In [525]:
user_keywords=['川普','總統']
text = df.content[0]
[(qk in text) for qk in user_keywords]

[True, True]

In [526]:
user_keywords=['川普','總統']
text = df.content[0]
all((qk in text) for qk in user_keywords)

True

In [527]:
user_keywords=['川普','總統']
text = df.content[0]
any((qk in text) for qk in user_keywords)

True

## Using apply() and lambda function

How do we check keyword occurrence for every news article?

In [528]:
# Use apply() and lambda function
user_keywords=['川普','總統']
df.content.apply(lambda text: all([(qk in text) for qk in user_keywords]))

0       True
1       True
2      False
3      False
4      False
       ...  
211    False
212    False
213    False
214    False
215    False
Name: content, Length: 216, dtype: bool

In [529]:
user_keywords=['川普','總統']

In [530]:
[qk for qk in user_keywords]

['川普', '總統']

In [531]:
text = df.content[0]
text

'美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日早盤以961元開出後失守960價位，終場收在958元，下跌22元或2.24%；台股以22093點開出後失守22000關卡，收盤以21951點作收，下跌308點。川普已經正式宣布，將對所有不在美國製造的汽車徵收25%的進口稅，美股四大指數下跌，費城半導體指數下滑3.27%，台積電ADR收盤大跌4.09%，報每股173.50美元，換算並折合台幣後是每股1,148.15元，較台北交易股票溢價率為17.16%。台股今早盤以22093點開出後，盤中失守22000點關卡，最低觸及21919點，收盤以21951點作收，下跌308點；台積電以961元開出，盤中最低觸及958元，終場以958元作收，下跌22元或2.24％。凱基投顧報告指出，周三大盤高點持續逼近月線壓力，但買盤仍無力追價，成交量急凍至近年新低水準，導致再度挑戰月線未果，不過，盤面上漲家數約占三分之二，有別前一日下跌家數約占七成，顯示盤面籌碼趨於穩定，而且盤中拉回尚能力守短期均線，技術面呈現價穩量縮走勢，有利短線持續挑戰月線，站上月線後就有機會進一步往上挑戰3月13日長黑高點22552點，站上22552點後就能完成底部型態，使反彈走勢進一步延伸。凱基投顧表示，目前盤面類股輪動快速，因此，操作上仍不宜過度追價，但大盤底部型態持續醞釀，部分個股開始率先表態強攻，選股可優先聚焦具利多題材以及股價轉強收復均線壓力之強勢個股。'

In [532]:
[(qk in text) for qk in user_keywords]

[True, True]

In [533]:
# 
all([(qk in text) for qk in user_keywords])

True

In [534]:
# Square brackets can be removed
all((qk in text) for qk in user_keywords)

True

In [535]:
# NOTE(review): this reads `user_keyword` (singular), last assigned in an
# earlier cell to ['烏克蘭','台灣'], not the `user_keywords` list defined
# above -- confirm which list was intended.
df[df.tokens_v2.apply(lambda text: all([word in text for word in user_keyword]))]

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
89,politics_20250327_4,2025-03-27,政治,75學者挺亞亞！律師翻出「3年前反戰聲明」　比對連署名單驚人結果曝光三立新聞網 setn.c...,政治中心／陳慈鈴報導【3/27 09:25 發稿｜09:50 更新：完整連署名單】▲亞亞遭限...,暫無,暫無,"[('台灣', 15), ('言論', 7), ('名單', 5), ('陳培哲', 5),...","['政治', '中心', '／', '陳慈鈴', '報導', '【3/27 09:25 ',...","['政治', '中心', '陳慈鈴', '名單', '亞亞', '松山', '機場', '中...","[NerToken(word='亞亞', ner='PERSON', idx=(42, 44...","[('政治', 'Na'), ('中心', 'Nc'), ('／', 'FW'), ('陳慈...",https://tw.news.yahoo.com//%E9%82%81%E9%98%BF%...,https://s.yimg.com/ny/api/res/1.2/wdF8W4i_dhn0...


In [536]:
# NOTE(review): counts rows matching `user_keyword` (singular, from an
# earlier cell), not `user_keywords` -- confirm which list was intended.
len(df[df.tokens_v2.apply(lambda text: all([word in text for word in user_keyword]))])

1

### lambda: how does it work?
A lambda is a small anonymous function whose body is a single expression.

In [537]:
func = lambda x: x+5  # x is the function's parameter; func(x) returns x+5

In [538]:
func(2)

7

In [539]:
numbers=[1,2,3]

In [540]:
[func(x) for x in numbers]

[6, 7, 8]

In [541]:
numbers=[1,2,3]
list(map(func, numbers))

[6, 7, 8]

In [542]:
numbers=[1,2,3]
list(map(lambda x: x+5, numbers))

[6, 7, 8]

In [543]:
def func2(x):
    """Return ``x`` increased by 10 (the named-function twin of a lambda)."""
    result = x + 10
    return result

numbers=[1,2,3]
list(map(func2, numbers))

[11, 12, 13]

### How to use pandas apply()?

In [544]:
df_test = pd.DataFrame([1,2,3], columns=['number'])

In [545]:
df_test

Unnamed: 0,number
0,1
1,2
2,3


In [546]:
df_test.number.apply(lambda x: x+5) # add 5 for each element in df_test.number

0    6
1    7
2    8
Name: number, dtype: int64

In [547]:
user_keyword=['川普','總統']

In [548]:
df.tokens_v2.apply(lambda text: all([word in text for word in user_keyword]))

0       True
1       True
2      False
3      False
4      False
       ...  
211    False
212    False
213    False
214    False
215    False
Name: tokens_v2, Length: 216, dtype: bool

In [549]:
df.content.apply(lambda text: all([word in text for word in user_keyword]))

0       True
1       True
2      False
3      False
4      False
       ...  
211    False
212    False
213    False
214    False
215    False
Name: content, Length: 216, dtype: bool

In [550]:
user_keywords = ['川普','總統']
def func3(text):
    """Return True when every keyword in ``user_keywords`` occurs in ``text``."""
    # Reads the list assigned on the line above, so the cell does not depend
    # on the similarly named `user_keyword` left over from an earlier cell.
    return all(word in text for word in user_keywords)

df.content.apply(func3)

0       True
1       True
2      False
3      False
4      False
       ...  
211    False
212    False
213    False
214    False
215    False
Name: content, Length: 216, dtype: bool

# Function for filtering news

Put them all together!

In [551]:
# Searching keywords in the "tokens_v2" column
def filter_dataFrame(user_keywords, cond, cate, weeks, data=None):
    """Select news by time window, category and keywords found in ``tokens_v2``.

    Args:
        user_keywords: list of keyword strings entered by the user.
        cond: 'and' (every keyword must appear) or 'or' (at least one must).
        cate: category to keep, or "全部" to keep every category.
        weeks: size of the look-back window in weeks, counted back from the
            latest date found in the data.
        data: DataFrame to search. Defaults to the module-level ``df``.

    Returns:
        DataFrame holding the rows that satisfy every condition.

    Raises:
        ValueError: if ``cond`` is neither 'and' nor 'or'.
    """
    # all() implements 'and', any() implements 'or'
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        raise ValueError("cond must be 'and' or 'or', got {!r}".format(cond))

    news = df if data is None else data

    # With no rows, date.max() is NaN and cannot be parsed: nothing to filter.
    if news.empty:
        return news.copy()

    # The window ends at the latest article and starts `weeks` weeks earlier.
    end_date = news.date.max()
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date()
                  - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    def mentions(tokens):
        # NOTE(review): when tokens_v2 holds the token list as text, `in` is
        # a substring test, so a keyword also matches inside a longer token.
        # Missing values (None/NaN) are treated as empty text.
        if tokens is None or isinstance(tokens, float):
            tokens = ''
        return combine(qk in tokens for qk in user_keywords)

    mask = (news.date >= start_date) & (news.date <= end_date)
    if cate != "全部":  # "全部" means no category filtering
        mask = mask & (news.category == cate)
    mask = mask & news.tokens_v2.apply(mentions)

    return news[mask]

### For "politics" category 政治類

In [552]:
# Query the "政治" (politics) category for the last week.
# The list is bound to `user_keywords`, the name actually passed to
# filter_dataFrame() below.
user_keywords=['川普','總統']
cond='and'
cate='政治'
weeks=1
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)

In [553]:
# end date: the date of the latest record of news
end_date = df.date.max()

# start date
start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')
start_date


'2025-03-20'

In [554]:
user_keyword=['川普','總統']
cond='and'
cate='政治'
weeks=1

In [555]:
user_keyword

['川普', '總統']

In [556]:
qk

'台灣'

In [557]:
(df.category == cate) & ( (df['date'] >= start_date) & (df['date'] <= end_date)  ) & df.tokens_v2.apply(lambda text: any((qk in text) for qk in user_keywords))

0      False
1      False
2      False
3      False
4      False
       ...  
211    False
212    False
213    False
214    False
215    False
Length: 216, dtype: bool

In [558]:
query_df = df[(
    df.category == cate)   # category
    & (df.date >= start_date) & (df.date <= end_date) # duration
    & df.tokens_v2.apply(lambda row: all((qk in row) for qk in user_keywords)) # user keywords
    ]
len(query_df)

1

In [559]:
query_df

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
90,politics_20250327_5,2025-03-27,政治,躲不過川普「汽車關稅」！新車價格喊漲　驚人數字曝光壹蘋新聞網2 則留言30 分鐘前,【國際中心／綜合報導】美國總統川普今天宣布對所有海外進口車輛一律課徵25%關稅，4月2日正式...,暫無,暫無,"[('汽車', 12), ('美國', 7), ('關稅', 6), ('日本', 4), ...","['【', '國際', '中心', '／', '綜合', '報導', '】', '美國', ...","['國際', '中心', '美國', '總統', '川普', '海外', '車輛', '關稅...","[NerToken(word='美國', ner='GPE', idx=(11, 13)),...","[('【', 'PARENTHESISCATEGORY'), ('國際', 'Nc'), (...",https://tw.news.yahoo.com//%E7%9A%87%E5%90%8E%...,https://s.yimg.com/ny/api/res/1.2/HJyhfoBkXApj...


### For the "全部" ("All") category (不必篩選類別! No category filtering needed)

In [560]:
user_keywords=['川普','美國']
cond='and'
cate='全部'
weeks=2

In [561]:
# For the "全部" (all) category only the period and the keywords are filtered.
# NOTE(review): start_date / end_date are reused from an earlier cell where
# they were computed with weeks=1; the weeks=2 assigned just above is not
# applied here -- confirm whether the window should be recomputed.
if (cate == "全部") & (cond == 'and'):
    query_df = df[
        (df.date >= start_date) & (df.date <= end_date) 
        & df.tokens_v2.apply(lambda row: all((qk in row) for qk in user_keywords))
        ]
elif (cate == "全部") & (cond == 'or'):
    query_df = df[
        (df['date'] >= start_date) & (df['date'] <= end_date) 
        & df.tokens_v2.apply(lambda row: any((qk in row) for qk in user_keywords))]

In [562]:
len(query_df)

32

# Function count_keyword() to calculate frequency and occurrence

In [563]:
# For the query result, count the occurrence and frequency for each category.
#
# (1) cate_occurence: how many pieces of news contain the keywords
# (2) cate_freq:      how many times the keywords are mentioned


news_categories=['全部','焦點','娛樂影劇','國際','政治','社會地方','財經','運動','玩樂','品味','遊戲3C']

def count_keyword(query_df, user_keywords):
    """Count keyword mentions and matching articles per news category.

    Args:
        query_df: DataFrame of already-filtered news with ``category`` and
            ``tokens_v2`` columns; ``tokens_v2`` is either a list of tokens
            or the text form of such a list.
        user_keywords: list of keyword strings to count.

    Returns:
        Tuple ``(cate_freq, cate_occurence)``: two dicts keyed by the entries
        of ``news_categories``. ``cate_freq`` holds how many tokens equal one
        of the keywords, ``cate_occurence`` how many articles were counted.
        The "全部" entry aggregates every row.

    Raises:
        ValueError, SyntaxError: if a ``tokens_v2`` string is not a plain
            Python literal.
    """
    cate_occurence = dict.fromkeys(news_categories, 0)
    cate_freq = dict.fromkeys(news_categories, 0)

    if query_df.empty:
        return cate_freq, cate_occurence

    for category, raw_tokens in zip(query_df.category, query_df.tokens_v2):
        if isinstance(raw_tokens, str):
            # literal_eval parses the stored list without executing code,
            # unlike eval(), which would run whatever the CSV cell contains.
            tokens = ast.literal_eval(raw_tokens)
        elif raw_tokens is None or isinstance(raw_tokens, float):
            tokens = []  # missing token list: the article still counts once
        else:
            tokens = raw_tokens

        # how many times? -- exact token matches only
        freq = sum(1 for word in tokens if word in user_keywords)

        # every article feeds the "全部" totals
        cate_occurence['全部'] += 1
        cate_freq['全部'] += freq
        # A category absent from news_categories only feeds the "全部" totals
        # instead of raising KeyError.
        if category in cate_freq:
            cate_occurence[category] += 1  # add 1 to its category's occurrence
            cate_freq[category] += freq

    return cate_freq, cate_occurence

In [564]:
user_keywords=['川普','美國']
# NOTE(review): this passes `df_query` (the result of an earlier
# filter_dataFrame() call), not the `query_df` built in the cells just
# above -- confirm which frame was intended.
count_keyword(df_query, user_keywords)

({'全部': 9,
  '焦點': 0,
  '娛樂影劇': 0,
  '國際': 0,
  '政治': 9,
  '社會地方': 0,
  '財經': 0,
  '運動': 0,
  '玩樂': 0,
  '品味': 0,
  '遊戲3C': 0},
 {'全部': 1,
  '焦點': 0,
  '娛樂影劇': 0,
  '國際': 0,
  '政治': 1,
  '社會地方': 0,
  '財經': 0,
  '運動': 0,
  '玩樂': 0,
  '品味': 0,
  '遊戲3C': 0})

## Show it step by step by yourself

In [565]:
user_keywords=['川普','美國']
cond='and'
cate='全部'
weeks=2
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)

# Count user keyword frequency by checking every word in tokens_v2 of the
# first matching article. .iloc[0] selects by position; tokens_v2[0] would
# look up row label 0 and fail whenever that row is filtered out.
# NOTE(review): eval() executes the cell text as code; only use it on
# trusted data.
tokens = eval(df_query.tokens_v2.iloc[0]) # parse the stored token list
freq =  len([word for word in tokens if (word in user_keywords)]) # how many times?

In [566]:
df_query

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,_20250327_1,2025-03-27,焦點,台股重挫308點 失守22000關卡,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,暫無,暫無,"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...","['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...","['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...
1,_20250327_2,2025-03-27,焦點,賴士葆撞人畫面外流 警被起訴,去年7月間，國民黨立委賴士葆駕車行駛在台北市大安區時，不慎撞傷走在斑馬線上2位女性行人，沒想...,暫無,暫無,"[('台灣', 10), ('賴士葆', 9), ('警員', 9), ('照片', 7),...","['去年', '7月', '間', '，', '國民黨', '立委', '賴士葆', '駕車...","['國民黨', '立委', '賴士葆', '台北市', '大安區', '斑馬線', '女性'...","[NerToken(word='去年7月', ner='DATE', idx=(0, 4))...","[('去年', 'Nd'), ('7月', 'Nd'), ('間', 'Ng'), ('，'...",https://tw.news.yahoo.com/%E8%B3%B4%E5%A3%AB%E...,https://s.yimg.com/ny/api/res/1.2/PPzJSVLdFBTm...
8,_20250327_9,2025-03-27,焦點,川普：對非美製汽車徵收25%關稅,美國總統川普26日宣布，將對所有在外國製造、進口到美國的車輛徵收25%關稅，於4月2日生效。...,暫無,暫無,"[('美國', 18), ('汽車', 15), ('關稅', 13), ('川普', 9)...","['美國', '總統', '川普', '26日', '宣布', '，', '將', '對',...","['美國', '總統', '川普', '外國', '美國', '車輛', '關稅', '川普...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('2...",https://tw.news.yahoo.com/%E5%B7%9D%E6%99%AE%E...,https://s.yimg.com/ny/api/res/1.2/a3q8Ypi2GpcT...
9,_20250327_10,2025-03-27,焦點,管制武統言論？法務部：有學者建議訂專法,大陸配偶「亞亞」劉振亞被內政部移民署認定在台發表「武統」言論，遭廢止其居留許可，驅逐出境。移...,暫無,暫無,"[('言論', 14), ('法務部', 10), ('刑法', 8), ('仇恨', 7)...","['大陸', '配偶', '「', '亞亞', '」', '劉振亞', '被', '內政部'...","['大陸', '配偶', '亞亞', '劉振亞', '內政部', '移民署', '武統', ...","[NerToken(word='大陸', ner='GPE', idx=(0, 2)), N...","[('大陸', 'Nc'), ('配偶', 'Na'), ('「', 'PARENTHESI...",https://tw.news.yahoo.com/%E6%B3%95%E5%8B%99%E...,https://s.yimg.com/ny/api/res/1.2/N.rOQJyRrtcu...
22,_20250327_23,2025-03-27,焦點,北京正在主導國際經貿體系！前貿易代表坦承：美國已生活在中國建構的世界裡,[周刊王CTWANT] 川普今年1月20日上台後隨即對鋼鐵和鋁材徵收高額關稅，並限制對中國的...,暫無,暫無,"[('中國', 47), ('美國', 40), ('經濟', 20), ('國家', 19...","['[周刊王', 'CTWANT', ']', ' ', '川普', '今年', '1月',...","['川普', '鋼鐵', '鋁材', '高額', '關稅', '中國', '投資', '歐巴...","[NerToken(word='川普', ner='PERSON', idx=(12, 14...","[('[周刊王', 'PARENTHESISCATEGORY'), ('CTWANT', '...",https://tw.news.yahoo.com/%E9%99%B3%E5%AD%9D%E...,https://s.yimg.com/ny/api/res/1.2/yCZUHuGPWx4S...
26,_20250327_27,2025-03-27,焦點,男子不停打嗝「持續2年多」！累到住院　血液檢查意外揪出病因,[周刊王CTWANT] 黎巴嫩一名93歲男子，從2年前開始就出現不斷打嗝的情況，不管嘗試什麼...,暫無,暫無,"[('男子', 12), ('網友', 10), ('情況', 8), ('女兒', 5),...","['[', '周刊王', 'CTWANT', ']', ' ', '黎巴嫩', '一名', ...","['周刊王', '黎巴嫩', '男子', '情況', '情況', '醫院', '血液', '...","[NerToken(word='黎巴嫩', ner='GPE', idx=(12, 15))...","[('[', 'PARENTHESISCATEGORY'), ('周刊王', 'Nb'), ...",https://tw.news.yahoo.com/%E6%90%AD%E9%A3%9B%E...,https://s.yimg.com/ny/api/res/1.2/MIlRskTBfzno...
33,_20250327_34,2025-03-27,焦點,No title,美國政府的財政狀況持續惡化，預算赤字擴大、國債規模不斷攀升。國際信評機構穆迪在25日發布的報...,暫無,暫無,"[('美國', 14), ('財政', 12), ('穆迪', 8), ('政府', 7),...","['美國', '政府', '的', '財政', '狀況', '持續', '惡化', '，',...","['美國', '政府', '財政', '狀況', '預算', '赤字', '國債', '規模...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('政府', 'Na'), ('的', 'DE'), ('財政...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/Snpin_i4HNIQ...
35,_20250327_36,2025-03-27,焦點,信評機構警告美國財政惡化　利息支出占收入的比率將從9％升至30％,美國政府的財政狀況持續惡化，預算赤字擴大、國債規模不斷攀升。國際信評機構穆迪在25日發布的報...,暫無,暫無,"[('美國', 14), ('財政', 12), ('穆迪', 8), ('政府', 7),...","['美國', '政府', '的', '財政', '狀況', '持續', '惡化', '，',...","['美國', '政府', '財政', '狀況', '預算', '赤字', '國債', '規模...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('政府', 'Na'), ('的', 'DE'), ('財政...",https://tw.news.yahoo.com//%E9%BB%83%E5%AD%90%...,https://s.yimg.com/ny/api/res/1.2/Snpin_i4HNIQ...
36,_20250327_37,2025-03-27,焦點,保健食品關稅30%　財長：部分分年逐步調降,[NOWnews今日新聞] 立法院財政委員會今日邀請財政部、經濟部就「因應美國川普政府對等關...,暫無,暫無,"[('關稅', 12), ('經濟部', 6), ('保健', 6), ('食品', 6),...","['[NO', 'Wnews', '今日', '新聞', ']', ' ', '立法院', ...","['新聞', '立法院', '財政', '委員會', '財政部', '經濟部', '美國',...","[NerToken(word='立法院財政委員會', ner='ORG', idx=(14,...","[('[NO', 'PARENTHESISCATEGORY'), ('Wnews', 'FW...",https://tw.news.yahoo.com//%E4%BF%A1%E8%A9%95%...,https://s.yimg.com/ny/api/res/1.2/xYsfze59VEId...
40,_20250327_41,2025-03-27,焦點,美國經濟衰退危機將至？巴菲特示警散戶快做5件事：當別人貪婪時要恐懼,美國總統川普上任後接連祭出關稅政策，讓全球經濟陷入重大隱憂，甚至有不少華爾街專家認為，美國經...,暫無,暫無,"[('經濟', 13), ('巴菲特', 6), ('美國', 4), ('投資人', 4)...","['美國', '總統', '川普', '上任', '後', '接連', '祭出', '關稅'...","['美國', '總統', '川普', '關稅', '政策', '全球', '經濟', '隱憂...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('上...",https://tw.news.yahoo.com/%E5%8F%B0%E5%8D%97%E...,https://s.yimg.com/ny/api/res/1.2/eaVvc8.Hhr6W...


In [567]:
df_query.iloc[0]

item_id                                               _20250327_1
date                                                   2025-03-27
category                                                       焦點
title                                          台股重挫308點 失守22000關卡
content         美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...
sentiment                                                      暫無
summary                                                        暫無
top_key_freq    [('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...
tokens          ['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...
tokens_v2       ['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...
entities        [NerToken(word='美國', ner='GPE', idx=(0, 2)), N...
token_pos       [('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...
link            https://tw.news.yahoo.com/https://tw.stock.yah...
photo_link      https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...
Name: 0, dtype: object

In [568]:
row = df_query.iloc[0]

In [569]:
row.tokens_v2

"['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積電', '台積電', '台北', '現股', '早盤', '終場', '台股', '關卡', '川普', '美國', '汽車', '進口稅', '美股', '指數', '費城', '半導體', '指數', '台積電', '台幣', '台北', '交易', '股票', '溢價率', '台股', '關卡', '台積電', '終場', '凱基投顧', '報告', '周三', '大盤', '高點', '月線', '壓力', '買盤', '成交量', '水準', '線未果', '盤面上漲家數', '跌家數', '籌碼', '均線', '技術面', '月線', '月線', '機會', '月13日', '型態', '走勢', '凱基', '盤面', '類股', '大盤', '型態', '個股', '題材', '股價', '均線', '壓力', '強勢', '個股']"

In [570]:
eval(row.tokens_v2)

['美國',
 '總統',
 '川普',
 '汽車',
 '關稅',
 '美股',
 '指數',
 '台積電',
 '台積電',
 '台北',
 '現股',
 '早盤',
 '終場',
 '台股',
 '關卡',
 '川普',
 '美國',
 '汽車',
 '進口稅',
 '美股',
 '指數',
 '費城',
 '半導體',
 '指數',
 '台積電',
 '台幣',
 '台北',
 '交易',
 '股票',
 '溢價率',
 '台股',
 '關卡',
 '台積電',
 '終場',
 '凱基投顧',
 '報告',
 '周三',
 '大盤',
 '高點',
 '月線',
 '壓力',
 '買盤',
 '成交量',
 '水準',
 '線未果',
 '盤面上漲家數',
 '跌家數',
 '籌碼',
 '均線',
 '技術面',
 '月線',
 '月線',
 '機會',
 '月13日',
 '型態',
 '走勢',
 '凱基',
 '盤面',
 '類股',
 '大盤',
 '型態',
 '個股',
 '題材',
 '股價',
 '均線',
 '壓力',
 '強勢',
 '個股']

In [571]:
tokens

['美國',
 '總統',
 '川普',
 '汽車',
 '關稅',
 '美股',
 '指數',
 '台積電',
 '台積電',
 '台北',
 '現股',
 '早盤',
 '終場',
 '台股',
 '關卡',
 '川普',
 '美國',
 '汽車',
 '進口稅',
 '美股',
 '指數',
 '費城',
 '半導體',
 '指數',
 '台積電',
 '台幣',
 '台北',
 '交易',
 '股票',
 '溢價率',
 '台股',
 '關卡',
 '台積電',
 '終場',
 '凱基投顧',
 '報告',
 '周三',
 '大盤',
 '高點',
 '月線',
 '壓力',
 '買盤',
 '成交量',
 '水準',
 '線未果',
 '盤面上漲家數',
 '跌家數',
 '籌碼',
 '均線',
 '技術面',
 '月線',
 '月線',
 '機會',
 '月13日',
 '型態',
 '走勢',
 '凱基',
 '盤面',
 '類股',
 '大盤',
 '型態',
 '個股',
 '題材',
 '股價',
 '均線',
 '壓力',
 '強勢',
 '個股']

In [572]:
[word for word in tokens if (word in user_keywords)]

['美國', '川普', '川普', '美國']

In [573]:
len([word for word in tokens if (word in user_keywords)])

4

In [574]:
freq

4

In [575]:
cate_occurence={}
cate_freq={}

In [576]:
cate_occurence

{}

In [577]:
for cate in news_categories:
    cate_occurence[cate]=0
    cate_freq[cate]=0

In [578]:
cate_occurence

{'全部': 0,
 '焦點': 0,
 '娛樂影劇': 0,
 '國際': 0,
 '政治': 0,
 '社會地方': 0,
 '財經': 0,
 '運動': 0,
 '玩樂': 0,
 '品味': 0,
 '遊戲3C': 0}

In [579]:
cate_freq

{'全部': 0,
 '焦點': 0,
 '娛樂影劇': 0,
 '國際': 0,
 '政治': 0,
 '社會地方': 0,
 '財經': 0,
 '運動': 0,
 '玩樂': 0,
 '品味': 0,
 '遊戲3C': 0}

In [580]:
cate_freq[row.category] += freq
cate_freq['全部'] += freq

In [581]:
cate_freq

{'全部': 4,
 '焦點': 4,
 '娛樂影劇': 0,
 '國際': 0,
 '政治': 0,
 '社會地方': 0,
 '財經': 0,
 '運動': 0,
 '玩樂': 0,
 '品味': 0,
 '遊戲3C': 0}

In [582]:
# count number of news
cate_occurence[row.category] += 1 # add 1 to its category's occurence
cate_occurence['全部'] += 1

In [583]:
cate_occurence

{'全部': 1,
 '焦點': 1,
 '娛樂影劇': 0,
 '國際': 0,
 '政治': 0,
 '社會地方': 0,
 '財經': 0,
 '運動': 0,
 '玩樂': 0,
 '品味': 0,
 '遊戲3C': 0}

## Now we can conveniently call those functions

In [584]:
user_keywords=['川普','美國']
cond='or'
cate='全部'
weeks=2
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)

In [585]:
len(df_query)

67

In [586]:
count_keyword(df_query, user_keywords)

({'全部': 422,
  '焦點': 156,
  '娛樂影劇': 5,
  '國際': 86,
  '政治': 14,
  '社會地方': 14,
  '財經': 103,
  '運動': 19,
  '玩樂': 8,
  '品味': 4,
  '遊戲3C': 13},
 {'全部': 67,
  '焦點': 22,
  '娛樂影劇': 3,
  '國際': 10,
  '政治': 3,
  '社會地方': 2,
  '財經': 9,
  '運動': 8,
  '玩樂': 2,
  '品味': 3,
  '遊戲3C': 5})

In [587]:
user_keywords=['川普','美國']
cond='and'
cate='政治'
weeks=2
df_query = filter_dataFrame(user_keywords, cond, cate, weeks)

In [588]:
len(df_query)

1

In [589]:
# NOTE(review): passes `user_keyword` (singular, ['川普','總統'] from an
# earlier cell) rather than the `user_keywords` list used to build
# df_query -- confirm which list was intended.
count_keyword(df_query, user_keyword)

({'全部': 3,
  '焦點': 0,
  '娛樂影劇': 0,
  '國際': 0,
  '政治': 3,
  '社會地方': 0,
  '財經': 0,
  '運動': 0,
  '玩樂': 0,
  '品味': 0,
  '遊戲3C': 0},
 {'全部': 1,
  '焦點': 0,
  '娛樂影劇': 0,
  '國際': 0,
  '政治': 1,
  '社會地方': 0,
  '財經': 0,
  '運動': 0,
  '玩樂': 0,
  '品味': 0,
  '遊戲3C': 0})

## Usage of apply, map (For reference)

In [590]:
# apply usage
import pandas as pd

sample_df = pd.DataFrame({
    'Col 1': [3,4,5,6],
    'Col 2': [2,3,6,4],
    'Col 3': [8,8,9,8],

},index=["A","B","C","D"])
sample_df

Unnamed: 0,Col 1,Col 2,Col 3
A,3,2,8
B,4,3,8
C,5,6,9
D,6,4,8


In [591]:
sample_df=sample_df.apply(lambda x: x+10)
sample_df

Unnamed: 0,Col 1,Col 2,Col 3
A,13,12,18
B,14,13,18
C,15,16,19
D,16,14,18


In [592]:
sample_df["Col 1"]=sample_df["Col 1"].apply(lambda x: x-10)
sample_df

Unnamed: 0,Col 1,Col 2,Col 3
A,3,12,18
B,4,13,18
C,5,16,19
D,6,14,18
