#### Import Packages

In [1]:
import pandas as pd
import pickle
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates

from openai import OpenAI
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

from utils.system import *
from class_data.data import Data
from class_generate.gen_emb import GenEmb
from class_eval.eval_index import EvalIndex

import warnings
warnings.filterwarnings('ignore')

#### Load Data

In [None]:
# Load OpenAI embeddings: collect every file under <format_data>/openai that
# matches the pattern and merge them with Data.concat_files().
wsj_openai = Data(
    folder_path=get_format_data() / 'openai',
    file_pattern='wsj_emb_openai_*',
).concat_files()

In [None]:
# Load articles: collect every file under <format_data>/token that matches
# the pattern and merge them with Data.concat_files().
wsj_art = Data(
    folder_path=get_format_data() / 'token',
    file_pattern='wsj_tokens_*',
).concat_files()

#### Embedding Plot

In [5]:
# Params

# Settings forwarded to GenEmb when the index is generated.
vector_column = 'ada_embedding'  # name of the embedding column in the vector data
interval = 'M'                   # NOTE(review): presumably a monthly frequency alias; confirm in GenEmb
threshold = 0.77                 # NOTE(review): presumably a similarity cut-off; confirm in GenEmb

# Settings forwarded to EvalIndex when the index is evaluated.
art_col = 'body_txt'
eval_col = 'eval'

In [None]:
# Natural-language request handed to GenEmb.
query = 'Generate an index with label ESG from January 1st, 1984, to December 31st, 2021.'

# Build the generator from the embeddings/articles loaded earlier and the
# parameters defined in the Params cell, then produce the index.
generate = GenEmb(
    query=query,
    vector_data=wsj_openai,
    vector_column=vector_column,
    article_data=wsj_art,
    interval=interval,
    threshold=threshold,
)
esg = generate.generate_emb()

# generate.query is indexed by 'label' below, i.e. GenEmb exposes the query
# as a mapping; the label is reused for both evaluation and plotting.
label = generate.query['label']

# Evaluate the articles behind the index (batch_size=1) and run the count step.
eval_index = EvalIndex(
    index=esg,
    label=label,
    art_col=art_col,
    eval_col=eval_col,
    batch_size=1,
)
eval_esg = eval_index.eval_articles()
eval_index.count(eval_esg)

# Persist the evaluation result as a brotli-compressed parquet file.
eval_path = get_format_data() / 'eval' / 'eval_esg.parquet.brotli'
eval_esg.to_parquet(eval_path, compression='brotli')

# Plot the 'score' column of the index under the name 'ESG'.
generate.exec_plot(query, label, esg[['score']], ['ESG'], 'esg_index')