## 1. Read Key Data

In [23]:
import os

dir_path = './keys/'
all_files = os.listdir(dir_path)

key_dic = {}

for file in all_files:
    file_path = os.path.join(dir_path, file)
    with open(file_path, "r") as f:
        filename = file_path.split('/')[-1]
        category = filename.split('-')[0]
        tag = filename.split('-')[1][:-4]
        
        keys = f.readlines()
        keys = list(map(lambda x: x.strip(), keys))

        for key in keys:
            k = key.lower()
            if k not in key_dic:
                key_dic[k] = {"cnt" : 1, "category" : {category}, "original" : {key}, "tags" : {tag}}
            else:
                key_dic[k]["cnt"] += 1
                key_dic[k]["category"].add(category)
                key_dic[k]["original"].add(key)
                key_dic[k]["tags"].add(tag)

In [15]:
from copy import deepcopy
old_key_dic = deepcopy(key_dic)

In [58]:
len(old_key_dic)

14870

## 2. Read from news data

In [24]:
import os
from tqdm import tqdm

dir_path = './docsutf8/'
all_files = os.listdir(dir_path)

for file in tqdm(all_files):
    file_path = os.path.join(dir_path, file)
    with open(file_path, "r") as f:
        filename = file_path.split('/')[-1]
        category = filename.split('-')[0]
        tag = filename.split('-')[1][:-4]
        
        article = f.read().strip()
        lowered = article.lower()

        for key in key_dic:
            index = lowered.find(key)
            if index != -1 and tag not in key_dic[key]["tags"]:
                key_dic[key]["cnt"] += 1
                key_dic[key]["original"].add(article[index:index+len(key)]) # original word not lowered
                key_dic[key]["category"].add(category)
                key_dic[key]["tags"].add(tag)

100%|██████████| 500/500 [00:14<00:00, 34.81it/s]


In [26]:
key_dic

{'concert': {'cnt': 11,
  'category': {'art_and_culture', 'fashion', 'science', 'tech'},
  'original': {'concert'},
  'tags': {'20880868',
   '20893614',
   '20920470',
   '20920830',
   '20922861',
   '20927516',
   '20937808',
   '20944183',
   '20946417',
   '20955083',
   '20956483'}},
 'big announcement': {'cnt': 1,
  'category': {'art_and_culture'},
  'original': {'Big announcement'},
  'tags': {'20880868'}},
 '3-day phestival': {'cnt': 1,
  'category': {'art_and_culture'},
  'original': {'3-Day Phestival'},
  'tags': {'20880868'}},
 'phish phavors new york': {'cnt': 1,
  'category': {'art_and_culture'},
  'original': {'Phish Phavors New York'},
  'tags': {'20880868'}},
 'traffic nightmare': {'cnt': 1,
  'category': {'art_and_culture'},
  'original': {'traffic nightmare'},
  'tags': {'20880868'}},
 'hq': {'cnt': 13,
  'category': {'art_and_culture',
   'business',
   'crime',
   'fashion',
   'politics_world',
   'science',
   'sports',
   'tech'},
  'original': {'HQ', 'hq'},
  '

In [33]:
import nltk

In [38]:
filtered = {}

for key in key_dic:
    pos = nltk.pos_tag([key])[0][1]
    nouns = {'NN', 'NNS', 'NNP', 'NNPS'}
    if pos in nouns:#key_dic[key]['cnt'] >= 5:
        filtered[key] = deepcopy(key_dic[key])

print(len(filtered))

11503


In [45]:
filtered_by_cnt = {}

for key in filtered:
    if 10 <= filtered[key]['cnt'] <= 20:
        filtered_by_cnt[key] = deepcopy(filtered[key])

print(len(filtered_by_cnt))

764


In [46]:
filtered_by_cnt

{'concert': {'cnt': 11,
  'category': {'art_and_culture', 'fashion', 'science', 'tech'},
  'original': {'concert'},
  'tags': {'20880868',
   '20893614',
   '20920470',
   '20920830',
   '20922861',
   '20927516',
   '20937808',
   '20944183',
   '20946417',
   '20955083',
   '20956483'}},
 'hq': {'cnt': 13,
  'category': {'art_and_culture',
   'business',
   'crime',
   'fashion',
   'politics_world',
   'science',
   'sports',
   'tech'},
  'original': {'HQ', 'hq'},
  'tags': {'20834886',
   '20865957',
   '20880868',
   '20898322',
   '20912151',
   '20935665',
   '20938183',
   '20939122',
   '20946555',
   '20946632',
   '20948151',
   '20948907',
   '20949665'}},
 'authorities': {'cnt': 14,
  'category': {'art_and_culture',
   'business',
   'crime',
   'health',
   'politics_world'},
  'original': {'Authorities', 'authorities'},
  'tags': {'20881913',
   '20891485',
   '20905216',
   '20906848',
   '20914483',
   '20931384',
   '20938886',
   '20941789',
   '20943676',
   '20944

In [60]:
compute = {}

for id, key in enumerate(filtered_by_cnt):
    filtered_by_cnt[key]['id'] = id
    compute[id] = filtered_by_cnt[key]['tags']

print(compute)

{0: {'20927516', '20937808', '20944183', '20880868', '20955083', '20893614', '20922861', '20920470', '20946417', '20920830', '20956483'}, 1: {'20912151', '20946632', '20949665', '20938183', '20834886', '20939122', '20880868', '20935665', '20948151', '20948907', '20898322', '20946555', '20865957'}, 2: {'20881913', '20941789', '20974900', '20948990', '20944018', '20906848', '20943676', '20905216', '20938886', '20951365', '20891485', '20931384', '20914483', '20950837'}, 3: {'20934361', '20881913', '20904327', '20974900', '20925205', '20929719', '20951365', '20945190', '20952544', '20943847', '20955365', '20932825'}, 4: {'20938145', '20881913', '20904327', '20945906', '20954255', '20951338', '20950002', '20944536', '20935297', '20936162', '20927497'}, 5: {'20946323', '20937005', '20881913', '20938615', '20927884', '20943847', '20900269', '20932870', '20949287', '20940241', '20952055', '20921578', '20939222', '20943327', '20951992', '20932665', '20890245', '20860321'}, 6: {'20933348', '2093

In [68]:
key_cnt = len(filtered_by_cnt)

matrix = [[1] * key_cnt for _ in range(key_cnt)]
for row in tqdm(range(key_cnt)):
    for col in range(row):
        r = compute[row]
        c = compute[col]
        inter = r & c
        matrix[row][col] = len(inter) / len(r)
        matrix[col][row] = len(inter) / len(c)


100%|██████████| 764/764 [00:00<00:00, 1944.34it/s]


In [64]:
{3, 5} & {4, 5}

{5}

In [57]:
from pytrends.request import TrendReq

pytrends = TrendReq()

keywords = ['Python', 'Java', 'C++', 'JavaScript', 'Ruby']
pytrends.build_payload(keywords, timeframe='2023-05-15 2023-06-14')

data = pytrends.interest_over_time()
print(data)

            Python  Java  C++  JavaScript  Ruby  isPartial
date                                                      
2023-05-15      34    23   94          11     7      False
2023-05-16      36    23   91          11     7      False
2023-05-17      34    24   90          12     7      False
2023-05-18      32    23   95          11     7      False
2023-05-19      32    22   85          10     7      False
2023-05-20      19    15   83           5     7      False
2023-05-21      22    14   89           6     7      False
2023-05-22      35    21   97          10     6      False
2023-05-23      36    23   97          12     6      False
2023-05-24      36    23   95          11     7      False
2023-05-25      35    23   91          11     7      False
2023-05-26      32    22   88          10     6      False
2023-05-27      21    15   87           6     6      False
2023-05-28      19    15   89           5     7      False
2023-05-29      29    21   93           9     6      Fal