In [2]:
import pandas as pd
from pytrends.request import TrendReq
import logging
import time

In [43]:
# Configure the root logger so every record at DEBUG level and above is emitted.
logging.basicConfig(level=logging.DEBUG)
# Restrict the 'urllib3.connectionpool' logger to ERROR and above so its
# lower-level records do not show up in the notebook output.
logging.getLogger('urllib3.connectionpool').setLevel(logging.ERROR)

In [44]:
def series_of_list_of_keywords_to_single_list(data):
    """Flatten an iterable of keyword collections into one flat list.

    Args:
        data: An iterable whose items are themselves iterable (for example a
            pandas Series in which every value is a list of keywords).

    Returns:
        A new list holding every inner element, in encounter order.
    """
    flattened = []
    for group in data:
        flattened.extend(group)
    return flattened

In [45]:
def split_dataframe(_df, chunk_size=5):
    """Split a sliceable collection into consecutive chunks.

    Despite the name, this works on anything that supports ``len()`` and
    slicing (a list, a DataFrame, ...); ``google_trends`` calls it with a
    plain list of keywords.

    Args:
        _df: The collection to split.
        chunk_size: Maximum number of items per chunk. Must be >= 1.

    Returns:
        A list of slices of ``_df``, each holding at most ``chunk_size``
        items. When ``_df`` is shorter than ``chunk_size`` (including when it
        is empty) the list contains ``_df`` itself as its only element.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Previously a negative
            value silently produced an empty result (dropping every item) and
            zero raised an opaque ZeroDivisionError.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    size_df = len(_df)
    if size_df < chunk_size:
        logging.info("trending: split_dataframe(): df size is smaller than chunk_size.")
        logging.info(f"trending: split_dataframe(): size_df = {size_df}, chunk_size = {chunk_size}")
        return [_df]

    # Ceiling division written with floor division on the negated value.
    num_chunks = -(-size_df // chunk_size)
    logging.info(f"trending: split_dataframe(): Split into {num_chunks} chunks.")

    # Stepping by chunk_size yields exactly num_chunks start offsets.
    return [_df[start:start + chunk_size] for start in range(0, size_df, chunk_size)]


In [46]:

def google_trends(keywords: list, num: int = 1, keep_duplicate_only: bool = True):
    """Narrow a keyword list down to at most ``num`` keywords, tournament style.

    Each round splits the remaining keywords into chunks with
    ``split_dataframe`` and keeps the single winner that ``looping`` reports
    for every chunk. Rounds repeat until no more than ``num`` keywords remain.
    Because every round keeps one keyword per chunk, the result can contain
    fewer than ``num`` keywords.

    Args:
        keywords: Candidate keywords.
        num: Stop once this many keywords (or fewer) remain. Must be >= 1.
        keep_duplicate_only: When True, first reduce ``keywords`` with
            ``keywords_preprocess``; when False, use ``keywords`` as given.

    Returns:
        The list of surviving keywords. If no round is needed the starting
        list is returned as is.

    Raises:
        ValueError: If ``num`` is smaller than 1. A round always leaves at
            least one keyword, so such a target could never be reached and the
            loop would run (and issue requests) forever.
    """
    if num < 1:
        raise ValueError(f"num must be at least 1, got {num}")

    if keep_duplicate_only:
        _keywords = keywords_preprocess(keywords)
    else:
        _keywords = keywords
    print("len(_keywords): "+str(len(_keywords)))

    while len(_keywords) > num:
        logging.info(f"trending: google_trends(): Dealing with {len(_keywords)} keywords.")
        chunks = split_dataframe(_keywords)
        # One winner per chunk survives into the next round.
        _keywords = looping(chunks)
        logging.info("----------------------------------------")
        logging.info(f"Keywords: {str(_keywords)}")
        logging.info("----------------------------------------")

    return _keywords


In [47]:
def keywords_preprocess(keywords):
    """Keep only the keywords that occur more than once, each listed once.

    Args:
        keywords: Keywords, possibly containing repeats.

    Returns:
        A list with one entry per keyword that appears at least twice in
        ``keywords``, ordered by where each keyword is first repeated.
    """
    logging.info(f"trending: keywords_preprocess(): {len(keywords)} unprocessed keywords")
    all_keywords = pd.Series(keywords)

    # duplicated() flags every occurrence after the first, so this selection
    # holds exactly the repeated entries.
    repeats = all_keywords[all_keywords.duplicated()]

    return repeats.drop_duplicates(keep="first").reset_index(drop=True).tolist()


In [48]:

def looping(chunks: list):
    """Score every chunk with ``run`` and collect each chunk's top keyword.

    Args:
        chunks: A list of keyword chunks; each chunk is passed to ``run``.

    Returns:
        A list with one entry per chunk: the index label at which the series
        returned by ``run`` reaches its maximum.
    """
    print(chunks)
    logging.info("trending: looping()")
    chunks_size = len(chunks)
    logging.info(f"{chunks_size}")
    logging.info(f"{str(chunks)}")

    winners = []
    for position, chunk in enumerate(chunks, start=1):
        scores = run(chunk)
        # Shared prefix for the three progress messages of this chunk.
        prefix = f"trending: looping(chunk{position}/{chunks_size})"
        logging.info(f"{prefix}: series: \n{str(scores)}")
        best = scores.idxmax()
        logging.info(f"{prefix}: max is {best}")
        logging.info(f"{prefix}: ===========================")
        winners.append(best)

    return winners


In [52]:
# ok
def run(_keywords: list, sort: bool = False,
        timeframe: str = '2022-03-28 2022-04-04', geo: str = 'TW',
        retry_delay: float = 60) -> pd.Series:
    """Score up to five keywords by their summed Google Trends interest.

    Requests interest-over-time data for ``_keywords``, discards rows flagged
    as partial, keeps the seven most recent remaining rows and sums every
    keyword column.

    Args:
        _keywords: Keywords to compare in one request (at most five).
        sort: When True, return the scores sorted from highest to lowest.
        timeframe: Timeframe string handed to ``build_payload``. The default
            is the fixed week this notebook was developed against.
        geo: Region code handed to ``build_payload``.
        retry_delay: Seconds to wait before the single retry that follows a
            ConnectionError.

    Returns:
        A Series indexed by keyword holding the summed interest values. If
        the response contains no rows, every keyword scores 0.

    Raises:
        ValueError: If more than five keywords are given.
        ConnectionError: If the retry fails as well.
    """
    # Validate before creating the client so invalid input costs nothing.
    if len(_keywords) > 5:
        raise ValueError("length of keyword list must be at most 5")

    # 'zh-TW' is the Traditional Chinese (Taiwan) language tag; the former
    # 'zn-TW' was a typo.
    # NOTE(review): tz=-480 presumably encodes UTC+8 in pytrends' minute-offset
    # convention - confirm against the pytrends documentation.
    pytrends = TrendReq(hl='zh-TW', tz=-480)

    def _fetch() -> pd.DataFrame:
        """Issue one request for the configured keywords and timeframe."""
        pytrends.build_payload(_keywords, cat=0, timeframe=timeframe, geo=geo, gprop='')
        return pytrends.interest_over_time()

    try:
        df = _fetch()
    # NOTE(review): this is the built-in ConnectionError; confirm that the HTTP
    # layer used by pytrends raises a subclass of it, otherwise the retry
    # below never triggers.
    except ConnectionError as e:
        # exc_info attaches the real traceback instead of an object repr.
        logging.warning("ConnectionError: %s", e, exc_info=True)
        time.sleep(retry_delay)
        # Retry with the same timeframe so the scores stay comparable; the
        # earlier code silently switched to 'today 1-m' here.
        df = _fetch()
    logging.debug("trending: run(): interest over time:\n%s", df)

    if df.empty:
        # Without rows there is no 'isPartial' column to filter on; treat the
        # keywords as having no measurable interest instead of crashing.
        logging.warning("trending: run(): no data returned for %s", _keywords)
        ranking_series = pd.Series(0, index=list(_keywords), dtype='int64')
    else:
        # Newest rows first.
        df = df.sort_index(ascending=False)

        # Keep only rows whose statistics are complete (isPartial is False);
        # this usually removes the most recent day. The comparison is written
        # with == so it also works if the column has object dtype.
        df = df.loc[df['isPartial'] == False]

        # Use the seven most recent complete rows.
        df = df[:7]

        # Total interest per keyword, without the bookkeeping column.
        ranking_series = df.drop(columns='isPartial').sum()

    if sort:
        ranking_series = ranking_series.sort_values(ascending=False)

    return ranking_series

In [50]:
# Load the keyword database and transpose it so that fields such as
# "keywords" and "date" can be selected as columns.
database = pd.read_json("../../assets/json/keyword/test444_keyword.json").transpose()

# Flatten the per-entry keyword lists into a single flat list.
keywords = series_of_list_of_keywords_to_single_list(database["keywords"])

In [26]:
# Take the "date" value stored under key 0 (presumably the first entry - confirm
# against the database index); a later cell interprets it as epoch seconds.
timestamp = database["date"][0]



In [42]:
# Interpret the stored value as epoch seconds, localized to Asia/Taipei, and
# print its calendar date.
ts = pd.Timestamp(timestamp, unit='s', tz='Asia/Taipei')
print(ts.strftime('%Y-%m-%d'))

# The date exactly seven days earlier.
ts2 = ts-pd.Timedelta(days=7)
print(ts2.strftime('%Y-%m-%d'))

# Bare string shown as the cell result: a placeholder for the
# "start-date end-date" pattern the two printed dates fit into.
'''YYYY-MM-DD YYYY-MM-DD'''

2022-04-04
2022-03-28


'YYYY-MM-DD YYYY-MM-DD'

In [39]:
# Scratch check: display the repr of a seven-day Timedelta.
pd.Timedelta(days=7)

Timedelta('7 days 00:00:00')

In [10]:
# Keep only the first 20 keywords for quick experiments, then display them.
keywords_short = keywords[:20]
keywords_short

['措施',
 '觀光客',
 '疫情',
 '政府',
 '主管',
 '疫苗',
 '月',
 '民眾',
 '疾病',
 '程度',
 '查維斯',
 '費蓋雷斯',
 '醜聞',
 '經濟',
 '總統',
 '威力彩',
 '頭獎',
 '額外',
 '資訊',
 '台彩']

In [53]:
# Hand-picked sample keywords, laid out five per row.
kwtest = [
    '疫苗', '平台', '月', '科技', '價格',
    '電信', '台積電', '城', '股價', '資訊',
    '交易', '股票', '基金', '政策', '特斯拉',
    '社交', '國家', '汽車', '美元', '電子',
    '稅', '房', '股東', '油價', '房子',
    '車', '卡', '高鐵',
]
# keep_duplicate_only=False skips the duplicate filter, so every keyword in
# the sample takes part in the comparison.
result = google_trends(
    kwtest,
    keep_duplicate_only=False,
)

INFO:root:trending: google_trends(): Dealing with 28 keywords.
INFO:root:trending: split_dataframe(): Split into 6 chunks.
INFO:root:trending: looping()
INFO:root:6
INFO:root:[['疫苗', '平台', '月', '科技', '價格'], ['電信', '台積電', '城', '股價', '資訊'], ['交易', '股票', '基金', '政策', '特斯拉'], ['社交', '國家', '汽車', '美元', '電子'], ['稅', '房', '股東', '油價', '房子'], ['車', '卡', '高鐵']]


len(_keywords): 28
[['疫苗', '平台', '月', '科技', '價格'], ['電信', '台積電', '城', '股價', '資訊'], ['交易', '股票', '基金', '政策', '特斯拉'], ['社交', '國家', '汽車', '美元', '電子'], ['稅', '房', '股東', '油價', '房子'], ['車', '卡', '高鐵']]


INFO:root:trending: looping(chunk1/6): series: 
疫苗    247
平台     72
月     562
科技    218
價格    356
dtype: int64
INFO:root:trending: looping(chunk1/6): max is 月


            疫苗  平台    月  科技  價格  isPartial
date                                      
2022-03-28  38  12   74  37  52      False
2022-03-29  34  12   65  40  50      False
2022-03-30  35  12   72  43  51      False
2022-03-31  40  12   71  45  50      False
2022-04-01  40  10   77  36  52      False
2022-04-02  33   8  100  20  50      False
2022-04-03  29   8   90  17  51      False
2022-04-04  36  10   87  17  52      False


INFO:root:trending: looping(chunk2/6): series: 
電信      91
台積電    110
城      278
股價     446
資訊      71
dtype: int64
INFO:root:trending: looping(chunk2/6): max is 股價


            電信  台積電   城   股價  資訊  isPartial
date                                       
2022-03-28  14   23  34   95  13      False
2022-03-29  14   22  29   92  13      False
2022-03-30  14   25  30  100  14      False
2022-03-31  14   22  37   96  12      False
2022-04-01  15   23  37   93  12      False
2022-04-02  12    7  44   23   7      False
2022-04-03  11    5  53   16   6      False
2022-04-04  11    6  48   26   7      False


INFO:root:trending: looping(chunk3/6): series: 
交易     140
股票     464
基金     254
政策      34
特斯拉    111
dtype: int64
INFO:root:trending: looping(chunk3/6): max is 股票


            交易   股票  基金  政策  特斯拉  isPartial
date                                       
2022-03-28  20   92  43   6   19      False
2022-03-29  25  100  52   5   21      False
2022-03-30  24   96  47   7   21      False
2022-03-31  25   86  46   6   16      False
2022-04-01  20   90  44   5   18      False
2022-04-02  21   32  26   3   10      False
2022-04-03  12   31  18   4   10      False
2022-04-04  13   29  21   4   15      False


INFO:root:trending: looping(chunk4/6): series: 
社交     12
國家    188
汽車    593
美元    205
電子    590
dtype: int64
INFO:root:trending: looping(chunk4/6): max is 汽車


            社交  國家  汽車  美元   電子  isPartial
date                                      
2022-03-28   2  30  77  48   93      False
2022-03-29   1  26  79  44   85      False
2022-03-30   2  28  81  39   97      False
2022-03-31   2  27  79  37  100      False
2022-04-01   1  27  84  38   92      False
2022-04-02   2  24  94  14   72      False
2022-04-03   3  29  87  13   72      False
2022-04-04   1  27  89  20   72      False


INFO:root:trending: looping(chunk5/6): series: 
稅     529
房     320
股東    117
油價    353
房子    146
dtype: int64
INFO:root:trending: looping(chunk5/6): max is 稅


              稅   房  股東  油價  房子  isPartial
date                                      
2022-03-28   89  50  16  31  23      False
2022-03-29  100  45  20  26  21      False
2022-03-30  100  48  26  34  23      False
2022-03-31   88  46  22  42  16      False
2022-04-01   82  35  15  59  16      False
2022-04-02   56  49  14  65  26      False
2022-04-03   56  48  10  93  21      False
2022-04-04   47  49  10  34  23      False


INFO:root:trending: looping(chunk6/6): series: 
車     496
卡     474
高鐵    259
dtype: int64
INFO:root:trending: looping(chunk6/6): max is 車
INFO:root:----------------------------------------
INFO:root:Keywords: ['月', '股價', '股票', '汽車', '稅', '車']
INFO:root:----------------------------------------
INFO:root:trending: google_trends(): Dealing with 6 keywords.
INFO:root:trending: split_dataframe(): Split into 2 chunks.
INFO:root:trending: looping()
INFO:root:2
INFO:root:[['月', '股價', '股票', '汽車', '稅'], ['車']]


             車   卡   高鐵  isPartial
date                              
2022-03-28  64  65   22      False
2022-03-29  67  61   23      False
2022-03-30  70  69   24      False
2022-03-31  67  67   27      False
2022-04-01  68  68  100      False
2022-04-02  73  73   37      False
2022-04-03  71  65   25      False
2022-04-04  80  71   23      False
[['月', '股價', '股票', '汽車', '稅'], ['車']]


INFO:root:trending: looping(chunk1/2): series: 
月     478
股價    446
股票    133
汽車    163
稅      95
dtype: int64
INFO:root:trending: looping(chunk1/2): max is 月


             月   股價  股票  汽車   稅  isPartial
date                                      
2022-03-28  63   95  26  21  16      False
2022-03-29  56   92  29  22  18      False
2022-03-30  61  100  27  22  18      False
2022-03-31  60   96  25  22  16      False
2022-04-01  66   93  26  23  15      False
2022-04-02  85   23   9  26  10      False
2022-04-03  76   16   9  24  10      False
2022-04-04  74   26   8  24   8      False


INFO:root:trending: looping(chunk2/2): series: 
車    618
dtype: int64
INFO:root:trending: looping(chunk2/2): max is 車
INFO:root:----------------------------------------
INFO:root:Keywords: ['月', '車']
INFO:root:----------------------------------------
INFO:root:trending: google_trends(): Dealing with 2 keywords.
INFO:root:trending: split_dataframe(): df size is smaller than chunk_size.
INFO:root:trending: split_dataframe(): size_df = 2, chunk_size = 5
INFO:root:trending: looping()
INFO:root:1
INFO:root:[['月', '車']]


              車  isPartial
date                      
2022-03-28   79      False
2022-03-29   83      False
2022-03-30   88      False
2022-03-31   83      False
2022-04-01   85      False
2022-04-02   91      False
2022-04-03   88      False
2022-04-04  100      False
[['月', '車']]


INFO:root:trending: looping(chunk1/1): series: 
月    562
車    481
dtype: int64
INFO:root:trending: looping(chunk1/1): max is 月
INFO:root:----------------------------------------
INFO:root:Keywords: ['月']
INFO:root:----------------------------------------


              月   車  isPartial
date                          
2022-03-28   74  62      False
2022-03-29   65  64      False
2022-03-30   72  68      False
2022-03-31   71  65      False
2022-04-01   77  66      False
2022-04-02  100  71      False
2022-04-03   90  69      False
2022-04-04   87  78      False


In [54]:
# Display the keyword list that google_trends() returned for kwtest.
result

['月']

In [102]:
# Score the first five short-list keywords in a single run() call
# (run() rejects lists longer than five).
test_series = run(keywords_short[:5])

In [93]:
# Show the scores as plain text (print applies str() to its argument).
print(test_series)

措施      6
優惠    151
疫情    296
觀光     27
實施      5
dtype: int64


ValueError: Shape of passed values is (5, 1), indices imply (5, 2)

In [56]:

# Column-alignment demo: each label is left-aligned in 15 characters and each
# value right-aligned in 10. Rows are joined with " \n", which is the text the
# earlier multi-argument print produced (a separating space, then a newline).
print(
    " \n".join(
        f"{label:<15}{value:>10}"
        for label, value in (
            ("Trades:", 123),
            ("Wi中文s:", 12345),
            ("Losses:", 1),
            ("Breakeven:", 46354),
            ("Win/Loss Ratio:", 4564546),
            ("Mean Win:", 45.2),
        )
    )
)

Trades:               123 
Wi中文s:              12345 
Losses:                 1 
Breakeven:          46354 
Win/Loss Ratio:   4564546 
Mean Win:            45.2
