### Import Packages

In [1]:
import pandas as pd
import os
import time
import requests 
import tiktoken
import numpy as np
import ray
import matplotlib.pyplot as plt
import plotly.express as px
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize
from jupyter_dash import JupyterDash
from dash import dcc, html
from dash.dependencies import Input, Output
from openai import OpenAI
from sklearn.metrics import PrecisionRecallDisplay

from utils.system import *
from class_data.data import Data
from class_model.model import Model

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weigfan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!





#### Data

In [2]:
# Multiple Articles per Day Data
# Gather every file matching 'wsj_tokens_*' under the 'token' folder into one table
# (concat_files() is a project helper; presumably it concatenates the matched files).
wsj_multiple = Data(
    folder_path=get_format_data() / 'token',
    file_pattern='wsj_tokens_*',
).concat_files()

# Minimum number of articles an index value (date) needs in order to be kept.
# This must be the exact same value used in embedding_similarity.ipynb so the
# filtered rows stay aligned with the saved cosine-similarity index.
limit = 30

# Non-null 'accession_number' entries per index value, i.e. articles per date.
count = wsj_multiple.groupby(wsj_multiple.index)['accession_number'].count()
valid_dates_mask = count >= limit

# Keep only the rows whose index value reached the article minimum.
has_enough_articles = wsj_multiple.index.isin(count.loc[valid_dates_mask].index)
wsj_multiple = wsj_multiple.loc[has_enough_articles]
print(wsj_multiple.shape)

(830899, 4)


In [3]:
# Read the stored cosine similarities from the brotli-compressed parquet file.
# Its row count is expected to match wsj_multiple (compare the two printed shapes).
cosine_sim = pd.read_parquet(
    get_format_data() / 'cosine_sim' / 'wsj_cosine_sim.parquet.brotli'
)
print(cosine_sim.shape)

(830899, 1)


#### Retrieve Largest Cosine Similarity Article Per Date

In [16]:
# Column names, one pair per label: the raw similarity and its thresholded version.
# NOTE(review): `labels` is not defined in the cells visible here — confirm it is
# created earlier in the notebook so a fresh "Restart & Run All" still works.
cosine_sim_label = ['cosine_sim_' + str(idx) for idx, _ in enumerate(labels)]
cosine_sim_change = ['relu_cosine_sim_' + str(idx) for idx, _ in enumerate(labels)]

# Put the similarity scores and the article data side by side (column-wise concat).
combine = pd.concat([cosine_sim, wsj_multiple], axis=1)

# Subtract 0.75 and floor at zero, so only similarity in excess of 0.75 counts.
excess_similarity = cosine_sim[cosine_sim_label] - 0.75
combine[cosine_sim_change] = np.maximum(0, excess_similarity)

# Per-article total of the thresholded similarities across all labels.
combine['cosine_sim_sum'] = combine[cosine_sim_change].sum(axis=1)

# Name the index so later cells can group by 'date'.
combine.index.names = ['date']

In [17]:
# Highest 'cosine_sim_sum' on each date, broadcast back onto every article row.
max_cosine_sim_sum = combine.groupby('date')['cosine_sim_sum'].transform('max')

# True for each article whose score equals its date's maximum.
mask = combine['cosine_sim_sum'] == max_cosine_sim_sum

# Several articles can tie for a date's maximum. In particular, when no article on a
# date exceeds the similarity threshold, every score is 0 and *all* of that date's
# rows match the mask. Keep only the first tied row (in the existing row order) so
# that max_article holds exactly one article per date; otherwise the time series
# gets multiple points per date and a `.loc[date, column]` lookup returns a Series
# rather than a single value.
max_article = combine[mask]
max_article = max_article[~max_article.index.duplicated(keep='first')]

#### Interactive Plot

In [18]:
# Dash application hosting the interactive view.
app = JupyterDash(__name__)

# Line chart of each selected article's summed similarity, with the frame's index on x.
timeseries_figure = px.line(
    max_article,
    x=max_article.index,
    y='cosine_sim_sum',
    title='Cosine Similarity Timeseries',
)

# Page layout: the chart on top, and below it a container that the click callback
# fills with the selected article.
app.layout = html.Div([
    dcc.Graph(id='timeseries-plot', figure=timeseries_figure),
    html.Div(id='text-output'),
])

@app.callback(
    Output('text-output', 'children'),
    [Input('timeseries-plot', 'clickData')]
)
def display_click_data(clickData):
    """Show the article behind the point clicked on the time-series plot.

    Parameters
    ----------
    clickData : dict or None
        The graph's ``clickData`` property. It is ``None`` until a point has been
        clicked; otherwise the clicked x value is read from
        ``clickData['points'][0]['x']`` and used to look up ``max_article``.

    Returns
    -------
    html.Div or str
        A block with the date, headline and body text of the article, or a plain
        message when nothing has been clicked yet or the date cannot be found.
    """
    if clickData is None:
        return "Click on a point to see the article details."

    date_str = clickData['points'][0]['x']

    # Look the date up on its own (not together with a column) so a KeyError here can
    # only mean "date not in the index", never "column missing".
    try:
        selection = max_article.loc[date_str]
    except KeyError:
        return f"No article found for {date_str}."

    # `.loc` yields a DataFrame when several rows share the date (e.g. tied articles).
    # Reduce it to a single row so the headline and body below are scalar values
    # rather than whole Series.
    if isinstance(selection, pd.DataFrame):
        if selection.empty:
            return f"No article found for {date_str}."
        selection = selection.iloc[0]

    article_headline = selection['headline']
    article_text = selection['body_txt']
    return html.Div([
        html.H4(f"Date: {date_str}"),
        html.H4(f"Headline: {article_headline}", style={'font-weight': 'bold'}),
        html.P(article_text)
    ])

# Start serving the Dash app. With mode='external' the app is presumably meant to be
# opened in a separate browser tab at the printed URL rather than rendered inline
# (the recorded output shows http://127.0.0.1:8050/) — confirm against jupyter_dash docs.
app.run_server(mode='external')

Dash app running on http://127.0.0.1:8050/
