#### Import Packages

In [1]:
import pandas as pd
import pickle
import json
import numpy as np

from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

from utils.system import *
from class_data.data import Data
from class_generate.generate import Generate

import warnings
warnings.filterwarnings('ignore')

#### Load Data

In [2]:
# Load tfidf
# Unpickle the object stored at <root>/exec_bow/tfidf.pkl; it is later handed
# to Generate(tfidf=...) in the plotting cells.
# NOTE(review): presumably the fitted TF-IDF vectorizer — confirm against the
# code that wrote this file.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load pickle files that were produced by this project.
path = get_root_dir() / 'exec_bow' / 'tfidf.pkl'
with open(path, 'rb') as file:
    tfidf_vectorizer = pickle.load(file)

In [3]:
# Load tfidf data
# Point Data at the 'tfidf' folder with the 'wsj_tfidf_*' pattern and keep
# only the result of concat_files(); the loader object itself is not reused.
wsj_tfidf = Data(
    folder_path=get_format_data() / 'tfidf',
    file_pattern='wsj_tfidf_*',
).concat_files()

In [4]:
# Load articles
# Point Data at the 'token' folder with the 'wsj_tokens_*' pattern and keep
# only the result of concat_files(); the loader object itself is not reused.
wsj_art = Data(
    folder_path=get_format_data() / 'token',
    file_pattern='wsj_tokens_*',
).concat_files()

#### TFIDF Plot

In [5]:
# Params
# Shared settings forwarded unchanged to every Generate(...) call in this
# notebook.
# NOTE(review): `type` shadows the builtin type() for the rest of the kernel
# session; renaming it requires updating every cell that passes `type=type`.
type = 'tfidf'
vector_column = 'tfidf'  # forwarded as Generate(vector_column=...); presumably the column holding the vectors — confirm
method = 'cos_sim'  # presumably cosine similarity — confirm in Generate
interval = 'M'  # presumably a monthly aggregation interval — confirm in Generate

In [7]:
# ESG index: pass the natural-language request to Generate together with the
# TF-IDF vectors, the articles and the unpickled tfidf object, then plot
# whatever generate_tfidf() returns.
query = 'Generate an index with label ESG from 1984-01-02 to 2021-12-31'
generate = Generate(
    query=query,
    type=type,
    vector_data=wsj_tfidf,
    vector_column=vector_column,
    article_data=wsj_art,
    tfidf=tfidf_vectorizer,
    method=method,
    interval=interval,
)
esg = generate.generate_tfidf()
esg.plot(figsize=(30, 10))

Here is the query: 
{'unigram': ['esg', 'environmental', 'governance'], 'bigram': ['environmental social', 'social governance', 'esg stands'], 'trigram': ['environmental social governance', 'esg stands for', 'financial performance and'], 'start_date': '1984-01-02', 'end_date': '2021-12-31', 'label': 'ESG stands for Environmental, Social, and Governance. These are the three key factors when measuring the sustainability and ethical impact of an investment in a company or business. ESG factors are a subset of non-financial performance indicators which include sustainable, ethical and corporate governance issues such as managing the company’s carbon footprint and ensuring there are systems in place to ensure accountability.'}
------------------------------------------------------------
Computing score...
------------------------------------------------------------
Processing ngram: esg



KeyboardInterrupt



In [None]:
# Economic Policy Uncertainty index: same Generate configuration as the other
# plotting cells, only the label in the query differs.
query = 'Generate an index with label Economic Policy Uncertainty from 1984-01-02 to 2021-12-31'
generate = Generate(
    query=query,
    type=type,
    vector_data=wsj_tfidf,
    vector_column=vector_column,
    article_data=wsj_art,
    tfidf=tfidf_vectorizer,
    method=method,
    interval=interval,
)
epu = generate.generate_tfidf()
epu.plot(figsize=(30, 10))

In [None]:
# US-China trade war index: same Generate configuration as the other plotting
# cells, only the label in the query differs.
query = 'Generate an index with label US-China trade war from 1984-01-02 to 2021-12-31'
generate = Generate(
    query=query,
    type=type,
    vector_data=wsj_tfidf,
    vector_column=vector_column,
    article_data=wsj_art,
    tfidf=tfidf_vectorizer,
    method=method,
    interval=interval,
)
trade_war = generate.generate_tfidf()
trade_war.plot(figsize=(30, 10))

In [None]:
# Artificial intelligence index: same Generate configuration as the other
# plotting cells, only the label in the query differs.
# The label previously read 'artficial intelligence'; the misspelling was
# passed on verbatim as the index label, so it is corrected here.
query = 'Generate an index with label artificial intelligence from 1984-01-02 to 2021-12-31'
generate = Generate(
    query=query,
    type=type,
    vector_data=wsj_tfidf,
    vector_column=vector_column,
    article_data=wsj_art,
    tfidf=tfidf_vectorizer,
    method=method,
    interval=interval,
)
ai = generate.generate_tfidf()
ai.plot(figsize=(30, 10))

In [None]:
# Blockchain index: same Generate configuration as the other plotting cells,
# only the label in the query differs.
query = 'Generate an index with label blockchain from 1984-01-02 to 2021-12-31'
generate = Generate(
    query=query,
    type=type,
    vector_data=wsj_tfidf,
    vector_column=vector_column,
    article_data=wsj_art,
    tfidf=tfidf_vectorizer,
    method=method,
    interval=interval,
)
blockchain = generate.generate_tfidf()
blockchain.plot(figsize=(30, 10))

In [None]:
# Covid index: same Generate configuration as the other plotting cells, only
# the label in the query differs.
query = 'Generate an index with label covid from 1984-01-02 to 2021-12-31'
generate = Generate(
    query=query,
    type=type,
    vector_data=wsj_tfidf,
    vector_column=vector_column,
    article_data=wsj_art,
    tfidf=tfidf_vectorizer,
    method=method,
    interval=interval,
)
covid = generate.generate_tfidf()
covid.plot(figsize=(30, 10))

In [None]:
# Economic recession index: same Generate configuration as the other plotting
# cells, only the label in the query differs.
query = 'Generate an index with label economic recession from 1984-01-02 to 2021-12-31'
generate = Generate(
    query=query,
    type=type,
    vector_data=wsj_tfidf,
    vector_column=vector_column,
    article_data=wsj_art,
    tfidf=tfidf_vectorizer,
    method=method,
    interval=interval,
)
recess = generate.generate_tfidf()
recess.plot(figsize=(30, 10))