# Pandas supports full text search for Chinese and English.

pandas supports full text search for both English and Chinese

pandas can search news content which is not tokenized.

You can input any text and pandas will search it for you. That's awesome!

# Load preprocessed news dataset

In [41]:
import pandas as pd
from datetime import datetime

In [42]:
df = pd.read_csv('./news.csv',sep='|')

In [43]:
df.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,_20250327_1,2025-03-27,焦點,台股重挫308點 失守22000關卡,美國總統川普準備徵收汽車關稅，美股主要指數全數下跌，台積電重挫4.09%，台積電台北現股今日...,暫無,暫無,"[('台積電', 4), ('指數', 3), ('月線', 3), ('美國', 2), ...","['美國', '總統', '川普', '準備', '徵收', '汽車', '關稅', '，'...","['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...","[NerToken(word='美國', ner='GPE', idx=(0, 2)), N...","[('美國', 'Nc'), ('總統', 'Na'), ('川普', 'Nb'), ('準...",https://tw.news.yahoo.com/https://tw.stock.yah...,https://s.yimg.com/ny/api/res/1.2/3qEveVGKp070...


# Pandas contains()

In [44]:
df['tokens_v2']

0      ['美國', '總統', '川普', '汽車', '關稅', '美股', '指數', '台積...
1      ['國民黨', '立委', '賴士葆', '台北市', '大安區', '斑馬線', '女性'...
2      ['台灣祭', '民眾', '社群', '平台', '活動', '期間', '墾丁', '地...
3      ['海軍', '中和', '軍艦', '台中港', '外海', '中國籍', '漁船', '...
4      ['北一女中', '教師', '區桂芝', '大陸', '官媒', '總統', '賴清德',...
                             ...                        
211    ['三星', '平板', '消息', '諜照', '厚度', '圖裡', '螢幕', '尺寸...
212    ['會員', '遊戲', '陣容', '作品', '機器', '戰警', '惡棍', '城市...
213    ['神話組', '生活', '新作', '角色', '功能', '動作', '觀眾', '觀...
214    ['日期', '主角', '爆料', '設計', '情報', '短片', '視覺', '風格...
215    ['火報', '記者', '陳銳', '路透', '美國', '技術', '團隊', '核心...
Name: tokens_v2, Length: 216, dtype: object

In [45]:
type(df['tokens_v2'])

pandas.core.series.Series

In [46]:
type(df['tokens_v2'][0])

str

In [47]:
df['tokens_v2'].str

<pandas.core.strings.accessor.StringMethods at 0x1ef82e35520>

In [48]:
df['tokens_v2'].str.contains("台股")

0       True
1      False
2      False
3      False
4      False
       ...  
211    False
212    False
213    False
214    False
215    False
Name: tokens_v2, Length: 216, dtype: bool

In [49]:
len(df[df.tokens_v2.str.contains('台股')])

13

In [50]:
df['tokens_v2'].str.contains('台灣')

0      False
1       True
2       True
3       True
4       True
       ...  
211    False
212    False
213    False
214    False
215    False
Name: tokens_v2, Length: 216, dtype: bool

In [51]:
len(df[df['tokens_v2'].str.contains('台灣')])

106

In [52]:
df[df['tokens_v2'].str.contains('台灣')].head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
1,_20250327_2,2025-03-27,焦點,賴士葆撞人畫面外流 警被起訴,去年7月間，國民黨立委賴士葆駕車行駛在台北市大安區時，不慎撞傷走在斑馬線上2位女性行人，沒想...,暫無,暫無,"[('台灣', 10), ('賴士葆', 9), ('警員', 9), ('照片', 7),...","['去年', '7月', '間', '，', '國民黨', '立委', '賴士葆', '駕車...","['國民黨', '立委', '賴士葆', '台北市', '大安區', '斑馬線', '女性'...","[NerToken(word='去年7月', ner='DATE', idx=(0, 4))...","[('去年', 'Nd'), ('7月', 'Nd'), ('間', 'Ng'), ('，'...",https://tw.news.yahoo.com/%E8%B3%B4%E5%A3%AB%E...,https://s.yimg.com/ny/api/res/1.2/PPzJSVLdFBTm...


## Use query() to get the same result

In [53]:
# Use query() to get the same result
df_query = df.query("tokens_v2.str.contains('台灣')")

In [54]:
len(df_query)

106

# AND / OR conditions: multiple user keywords
## or condition

    In pandas, use '|' for an element-wise OR; don't use 'or'.

In [55]:
# Approach 1: write the alternatives directly in one regex; inside the pattern '|' means OR.
df_query = df[df['tokens_v2'].str.contains('美國|台灣')]
len(df_query)

132

In [56]:
# Approach 2: build the same OR pattern by joining a keyword list with '|'.
queryKey=['美國','台灣']
# More flexible than approach 1: the keyword list can have any length.
df_query = df[df['tokens_v2'].str.contains('|'.join(queryKey))]
len(df_query)

132

In [57]:
# Approach 3: call contains() once per keyword and combine the two boolean Series with '|'.
df_query = df[(df.tokens_v2.str.contains('美國')) | (df.tokens_v2.str.contains('台灣'))]
len(df_query)

132

In [58]:
queryKey=['美國','台灣']
# Wrap every keyword in a lookahead '(?=.*keyword)' and join the groups with '|' (OR).
condstring = '|'.join('(?=.*{})'.format(word) for word in queryKey)
# Display the generated pattern.
condstring

'(?=.*美國)|(?=.*台灣)'

In [59]:

# Search the untokenized 'content' column with the OR pattern held in condstring.
df_query = df[df.content.str.contains(condstring, regex=True)]
len(df_query)

132

## and condition

    In pandas, use '&' for an element-wise AND; don't use 'and'.

    Why can't we use the 'and' operator in pandas?
    In pandas the & operator is overloaded (功能取代) to perform the logical operation element by element.
    Therefore, we should use the '&' operator, instead of the 'and' operator.
    Please refer to the following link:
    https://stackoverflow.com/questions/21415661/logical-operators-for-boolean-indexing-in-pandas

    For example, if there are 216 items in df, the '&' operator in the following command performs 216 element-wise logical operations for us. The Python 'and' operator cannot do this, because it cannot be overloaded.

    (df.tokens_v2.str.contains('烏克蘭')) & (df.tokens_v2.str.contains('台灣'))


In [60]:
df_query = df[(df.content.str.contains('美國'))&(df.content.str.contains('台灣'))]
len(df_query)

36

In [61]:
df_query = df[df.content.str.contains(r'(?=.*美國)(?=.*台灣)')]
len(df_query)

36

In [62]:
df_query = df[df.content.str.contains('(?=.*美國)(?=.*台灣)')]
len(df_query)

36

In [63]:
df_query = df[df.content.str.contains('(?=.*台灣)(?=.*美國)')]
len(df_query)

36

In [64]:
# Not an AND search: '(美國)(台灣)' only matches the two words written back to back ("美國台灣").
df_query = df[df.content.str.contains('(美國)(台灣)')] # does not work
len(df_query)

  df_query = df[df.content.str.contains('(美國)(台灣)')] # 不管用


0

In [65]:
# '&' is not a regex operator, so this pattern looks for the literal text "美國 & 台灣".
df_query = df[df.content.str.contains('(美國) & (台灣)')]# regex has no '&' (AND) syntax
len(df_query)

  df_query = df[df.content.str.contains('(美國) & (台灣)')]# 沒有&的用法!!??


0

In [66]:
user_keywords = ['捐贈烏克蘭','外交部']
# Concatenate one lookahead '(?=.*keyword)' per keyword with no separator: every lookahead must succeed (AND).
''.join('(?=.*{})'.format(word) for word in user_keywords)

'(?=.*捐贈烏克蘭)(?=.*外交部)'

# Filter data using the following function

In [67]:
from datetime import datetime, timedelta
# Searching keywords from "content" column
# Here this function uses df.content column, while filter_dataFrame() uses df.tokens_v2
def filter_dataFrame_fullText(user_keywords, cond, cate, weeks):
    """Filter the module-level ``df`` by date window, category and keywords.

    Keywords are matched as literal text inside the untokenized ``content``
    column. Rows whose content is missing never match a keyword filter.

    Args:
        user_keywords: Iterable of keyword strings entered by the user.
        cond: ``'and'`` keeps rows containing every keyword, ``'or'`` keeps
            rows containing at least one keyword. Any other value skips
            the keyword filter.
        cate: News category to keep, or ``"全部"`` to keep every category.
        weeks: Length of the time window in weeks, counted back from the
            newest value in ``df.date`` (dates are ``'%Y-%m-%d'`` strings).

    Returns:
        The rows of ``df`` that satisfy all requested conditions.
    """
    import re  # local import: nothing else in this cell needs it

    # Escape the keywords so regex metacharacters typed by the user
    # (e.g. "C++", "(", ".") are searched literally instead of being
    # interpreted as -- or breaking -- the pattern.
    escaped = [re.escape(word) for word in user_keywords]

    # End date: the date of the latest news record.
    end_date = df.date.max()

    # Start date: `weeks` weeks before the end date, in the same string format.
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # Time-window condition.
    condition = (df.date >= start_date) & (df.date <= end_date)

    # Category condition; "全部" (all) needs no category filter.
    if cate != "全部":
        condition = condition & (df.category == cate)

    if cond == 'and':
        # One lookahead per keyword: all of them must succeed.
        pattern = ''.join('(?=.*{})'.format(word) for word in escaped)
    elif cond == 'or':
        pattern = '|'.join(escaped)
    else:
        pattern = None

    if pattern is not None:
        # DOTALL lets '.' cross line breaks, so keywords located on different
        # lines of the same article still satisfy the AND lookaheads.
        # na=False turns missing content into "no match" instead of NaN.
        condition = condition & df.content.str.contains(pattern, flags=re.DOTALL, na=False)

    # `condition` is a boolean Series aligned with df.
    df_query = df[condition]

    return df_query

In [68]:
from datetime import datetime, timedelta
# Searching keywords from "content" column
# Here this function uses df.content column, while filter_dataFrame() uses df.tokens_v2
def filter_dataFrame_fullText_v0(user_keywords, cond, cate, weeks):
    """Earlier variant: narrow the module-level ``df`` step by step.

    The frame is reduced first by date window, then by category, then by
    keywords searched as literal text in the ``content`` column.

    Args:
        user_keywords: Iterable of keyword strings entered by the user.
        cond: ``'and'`` requires every keyword, ``'or'`` requires at least
            one. Any other value returns the rows selected by date and
            category only (the original code raised ``UnboundLocalError``
            in that case because no branch assigned the result).
        cate: News category to keep, or ``"全部"`` to keep every category.
        weeks: Length of the time window in weeks, counted back from the
            newest value in ``df.date`` (dates are ``'%Y-%m-%d'`` strings).

    Returns:
        The matching rows of ``df``.
    """
    import re  # local import: nothing else in this cell needs it

    # End date: the date of the latest news record.
    end_date = df.date.max()

    # Start date: `weeks` weeks before the end date.
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # Step 1: time window (shared by every former branch).
    df_query = df[(df.date >= start_date) & (df.date <= end_date)]

    # Step 2: category, unless the caller asked for all categories.
    if cate != "全部":
        df_query = df_query[df_query.category == cate]

    # Step 3: keywords. Escaping keeps regex metacharacters in user input literal.
    keywords = [re.escape(word) for word in user_keywords]
    if cond == 'and':
        pattern = ''.join('(?=.*{})'.format(word) for word in keywords)
    elif cond == 'or':
        pattern = '|'.join(keywords)
    else:
        # Unknown condition: no keyword filter.
        return df_query

    # DOTALL lets the lookaheads see past line breaks; na=False treats
    # missing content as "no match".
    return df_query[df_query.content.str.contains(pattern, flags=re.DOTALL, na=False)]

In [69]:
from datetime import datetime, timedelta
# Searching keywords from "content" column
# Here this function uses df.content column, while filter_dataFrame() uses df.tokens_v2
def filter_dataFrame_fullText_previous(user_keywords, cond, cate, weeks):
    """Filter the module-level ``df`` using plain substring tests.

    Instead of a regular expression, every article is checked with the
    Python ``in`` operator and the per-keyword results are folded with
    ``all()`` (cond ``'and'``) or ``any()`` (cond ``'or'``).

    Args:
        user_keywords: Keyword strings entered by the user.
        cond: ``'and'`` or ``'or'``; any other value skips the keyword test.
        cate: News category to keep, or ``"全部"`` to keep every category.
        weeks: Length of the time window in weeks, counted back from the
            newest value in ``df.date``.

    Returns:
        The rows of ``df`` that pass every active condition.
    """
    # Window bounds: newest date in the data and `weeks` weeks before it.
    newest = df['date'].max()
    newest_day = datetime.strptime(newest, '%Y-%m-%d').date()
    oldest = (newest_day - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # Time-window condition.
    mask = (df['date'] >= oldest) & (df['date'] <= newest)

    # Category condition; "全部" (all) needs no category filter.
    if cate != "全部":
        mask = mask & (df['category'] == cate)

    # Choose how the per-keyword hits are combined.
    if cond == 'and':
        combine = all
    elif cond == 'or':
        combine = any
    else:
        combine = None

    if combine is not None:
        def has_keywords(text):
            # True/False for one article, depending on the chosen combiner.
            return combine((keyword in text) for keyword in user_keywords)

        mask = mask & df['content'].apply(has_keywords)

    # `mask` is a boolean Series aligned with df.
    return df[mask]

In [70]:
# Searching keywords from "content" column
# Here this function uses df.content column, while filter_dataFrame() uses df.tokens_v2
def filter_dataFrame_fullText_previous_v0(user_keywords, cond, cate, weeks):
    """Earlier substring-based variant of the full-text filter on ``df``.

    Articles are tested with the Python ``in`` operator against the
    ``content`` column of the module-level ``df``.

    Args:
        user_keywords: Keyword strings entered by the user.
        cond: ``'and'`` requires every keyword (``all``), ``'or'`` requires
            at least one (``any``). Any other value applies no keyword
            test; the original code raised ``UnboundLocalError`` there
            because no branch assigned the result.
        cate: News category to keep, or ``"全部"`` to keep every category.
        weeks: Length of the time window in weeks, counted back from the
            newest value in ``df.date``.

    Returns:
        The rows of ``df`` that pass every active condition.
    """
    # End date: the date of the latest news record.
    end_date = df.date.max()

    # Start date: `weeks` weeks before the end date.
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    # The time window was repeated in all four original branches; build it once.
    selected = (df.date >= start_date) & (df.date <= end_date)

    # `cate` is a plain Python value, so an ordinary `if` is used here;
    # '&' is reserved for the element-wise Series operations.
    if cate != "全部":
        selected = selected & (df.category == cate)

    if cond == 'and':
        selected = selected & df.content.apply(lambda text: all((qk in text) for qk in user_keywords))
    elif cond == 'or':
        selected = selected & df.content.apply(lambda text: any((qk in text) for qk in user_keywords))
    # Any other `cond`: keep the date/category selection unchanged.

    df_query = df[selected]

    return df_query

In [74]:
user_keywords = ['美國總統','股市']
cond='and'
cate='全部'
weeks=2

df_query = filter_dataFrame_fullText(user_keywords, cond, cate,weeks)
df_query.shape

(4, 14)

In [75]:
user_keywords = ['美國']
cond='and'
cate='全部'
weeks=2

df_query = filter_dataFrame_fullText(user_keywords, cond, cate,weeks)
df_query.shape

(62, 14)