#### Import Packages

In [80]:
import pandas as pd
import os
import time
import requests 
import tiktoken
import numpy as np
import ray
import matplotlib.pyplot as plt
import plotly.express as px
import nltk
import openai
nltk.download('punkt')

from nltk.tokenize import sent_tokenize
from jupyter_dash import JupyterDash
from dash import dcc, html
from dash.dependencies import Input, Output
from openai import OpenAI
from sklearn.metrics import PrecisionRecallDisplay
from scipy.cluster import hierarchy as sch
from bertopic import BERTopic

from utils.system import *
from class_data.data import Data
from class_model.model import Model

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\weigfan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Data

In [2]:
# Multiple Articles per Day Data
# Read every file matching the token pattern and stack them into one frame.
wsj_multiple = Data(
    folder_path=get_format_data() / 'token',
    file_pattern='wsj_tokens_*',
).concat_files()

# Minimum number of articles an index value (date) needs in order to be kept.
# Set limit to the exact same value used in embedding_similarity.ipynb to align indexes
limit = 30

# Number of non-null accession numbers per index value.
count = wsj_multiple.groupby(wsj_multiple.index)['accession_number'].count()
valid_dates_mask = count >= limit

# Keep only the rows whose index value met the minimum count.
wsj_multiple = wsj_multiple.loc[wsj_multiple.index.isin(count.loc[valid_dates_mask].index)]
print(wsj_multiple.shape)

(830899, 4)


In [3]:
# Load the previously saved cosine-similarity scores from the formatted-data folder.
# The printed shape is a quick check that the row count matches wsj_multiple,
# because the two frames are later joined row-for-row.
cosine_sim = pd.read_parquet(get_format_data() / 'cosine_sim' / 'wsj_cosine_sim.parquet.brotli')
print(cosine_sim.shape)

(830899, 1)


#### Retrieve Top N% Cosine Similarity Articles Per Date

In [9]:
# Merge cosine_sim with matching article
# The column-wise concat below pairs scores with articles purely by index, so
# the two frames must carry the same index in the same order. Fail fast with a
# clear message instead of silently mis-pairing rows or hitting an opaque
# reindexing error.
if not cosine_sim.index.equals(wsj_multiple.index):
    raise ValueError(
        "cosine_sim and wsj_multiple are not index-aligned; make sure `limit` "
        "matches the value used in embedding_similarity.ipynb"
    )

# Similarity level subtracted before the ReLU: scores at or below it become 0.
relu_threshold = 0.75

num_label = len(cosine_sim.columns)
# Expected names of the score columns in cosine_sim, and of their ReLU-transformed copies.
cosine_sim_label = [f'cosine_sim_{i}' for i in range(num_label)]
cosine_sim_change = [f'relu_cosine_sim_{i}' for i in range(num_label)]
combine = pd.concat([cosine_sim, wsj_multiple], axis=1)
# Apply Relu Transformation to each column
combine[cosine_sim_change] = np.maximum(0, cosine_sim[cosine_sim_label] - relu_threshold)
# Aggregate across label columns (row-wise mean of the transformed scores)
combine['cosine_sim_mean'] = combine[cosine_sim_change].mean(axis=1)
combine.index.names = ['date']

In [15]:
# Select top N% of each group
def top_n_per(group, N_percent):
    """Return the first ``N_percent`` fraction of the rows of ``group``.

    The rows are taken from the top with ``head``, so ``group`` must already
    be ordered with the most relevant rows first.

    Parameters
    ----------
    group : object supporting ``len()`` and ``.head(n)`` (e.g. a DataFrame)
        The rows of one group.
    N_percent : float
        Fraction of rows to keep, e.g. ``0.10`` for the top 10%.

    Returns
    -------
    The result of ``group.head(n)`` where ``n`` is ``len(group) * N_percent``
    rounded down (empty when the group is too small to yield one row).
    """
    # A tiny epsilon absorbs binary floating-point error before truncating:
    # e.g. 100 * 0.29 evaluates to 28.999999999999996, which would otherwise
    # be cut to 28 instead of the intended 29.
    n = int(len(group) * N_percent + 1e-9)
    return group.head(n)

In [16]:
# Fraction of each date's articles to keep.
N_percent = 0.10

# Make Sure to Sort
# top_n_per takes rows from the top of each group, so the frame is ordered by
# mean similarity (highest first) before grouping.
combine = combine.sort_values(by='cosine_sim_mean', ascending=False)

top_n = (
    combine
    .groupby('date')
    .apply(top_n_per, N_percent)
    # Drop index level 0 of the apply() result.
    .reset_index(level=0, drop=True)
)

#### Parallelized: ChatGPT Summarization

In [68]:
@ray.remote
def summarize_article(article_text):
    """Ray task: ask the OpenAI chat API for a two-sentence summary of an article.

    Parameters
    ----------
    article_text : str
        Article body that is inserted into the prompt.

    Returns
    -------
    str
        The model's reply with surrounding whitespace stripped and
        non-breaking spaces replaced by regular spaces.

    Notes
    -----
    The API key is read from ``api.json`` in the config directory on every
    call, and a new client is created per call.
    """
    # Imported locally so the task does not rely on `json` happening to be in
    # the notebook namespace (it is not imported explicitly at the top).
    import json

    # Context manager closes the config file instead of leaking the handle.
    with open(get_config() / 'api.json') as config_file:
        api_key = json.load(config_file)['openai_api_key']

    client = OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "user", "content": f"Summarize the following article in two sentences:\n\n{article_text}"}
        ]
    )
    summary = response.choices[0].message.content.strip()
    # Replace non-breaking spaces with regular spaces.
    summary = summary.replace('\xa0', ' ')
    return summary

def summarize_in_batches(df, new_column_name, column_name, batch_size):
    """Summarize every article in ``df`` with Ray, ``batch_size`` rows at a time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the articles. It is modified in place: the summaries
        are written to ``new_column_name``.
    new_column_name : str
        Name of the column that receives the summaries.
    column_name : str
        Name of the column holding the article text.
    batch_size : int
        Number of articles submitted to Ray per batch; must be positive.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``new_column_name`` added.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    num_batches = int(np.ceil(len(df) / batch_size))
    articles = df[column_name]  # looked up once instead of on every iteration
    all_summary = []
    print(f"Number of batches: {num_batches}")
    for i in range(num_batches):
        print(f"Processing batch: {i + 1}/{num_batches}")
        start_index = i * batch_size
        end_index = min(start_index + batch_size, len(df))
        # Explicit positional slice: rows are selected by position, not by index label.
        batch = articles.iloc[start_index:end_index]

        # Start asynchronous tasks for the batch
        futures = [summarize_article.remote(text) for text in batch]
        # ray.get blocks until the whole batch is done and returns the results
        # in the order of `futures`, so summaries stay aligned with the rows.
        batch_summaries = ray.get(futures)

        # Update lists
        all_summary.extend(batch_summaries)

    df[new_column_name] = all_summary
    return df

In [69]:
# First 5 rows of top_n: a small sample passed to the summarization run below
# (presumably to keep API usage small while trying the pipeline out).
test = top_n.head(5)

In [72]:
# Parameters
batch_size = 5             # articles submitted to Ray per batch
new_col = 'gpt_summary'    # column that receives the summaries
art_col = 'body_txt'       # column holding the article text

# Process articles in batches
ray.init(num_cpus=16, ignore_reinit_error=True)

try:
    start_time = time.time()
    summary = summarize_in_batches(test, new_col, art_col, batch_size)
    elapsed_time = time.time() - start_time
    print(f"Total time to get all summaries: {round(elapsed_time)} seconds")
finally:
    # Shutdown Ray
    # In a finally block so the local Ray instance is released even when a
    # summarization task raises.
    ray.shutdown()

2024-01-09 16:19:50,267	INFO worker.py:1673 -- Started a local Ray instance.


Number of batches: 1
Processing batch: 1/1
Total time to get all embeddings: 6 seconds


In [77]:
# Show the original text of the second article (position 1).
# .iloc selects by position explicitly; a plain [1] on a Series whose index is
# not integer-based relies on a positional fallback that pandas has deprecated.
# NOTE(review): assumes the index holds dates rather than integers - confirm.
summary['body_txt'].iloc[1]

'CANTON Mass.  Ovation Technologies a closely held yearold computersoftware company which some observers think will be the next hot company in the booming industry said it raised 5.5 million from venture capitalists to develop and market its product. Computersoftware companies are currently appealing to venture capitalists because their role in the personalcomputer industry is booming and appears somewhat less risky than the computerhardware market which is undergoing a shakeout. Ovations financing is among the largest for software firms according to Stanley Pratt publisher of Venture Capital Journal. Ovation has yet to market its product an integrated software package that combines wordprocessing financial analysis and graphics in a single 795 program. However industry analysts who have seen it say it surpasses current products. I wasnt prepared to be impressed but I was said Esther Dyson president of Rosen Research Inc. a New York City firm that follows the personalcomputer industry.

In [76]:
# Show the generated summary of the second article (position 1).
# .iloc selects by position explicitly; a plain [1] on a Series whose index is
# not integer-based relies on a positional fallback that pandas has deprecated.
# NOTE(review): assumes the index holds dates rather than integers - confirm.
summary['gpt_summary'].iloc[1]

"Ovation Technologies, a privately-held computer software company focused on an integrated software package, has raised $5.5m\xa0from venture capitalists to develop and market its product. Ovation's package, which includes word-processing, financial analysis, and graphics all in a single program, is set to surpass current products on the market by featuring an easy-to-use command system."

#### BERTopic Hierarchical Clustering

In [None]:
# Extract Summaries
# The generated summaries, as a plain Python list of documents.
docs = summary['gpt_summary'].to_list()

# BERTopic
# Note: Must install https://visualstudio.microsoft.com/visual-cpp-build-tools/ --> C++ Build Tool --> for hdbscan package within bertopic
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)


# Define linkage function for hierarchical topic modeling
def linkage_function(x):
    """Single-linkage hierarchical clustering of ``x`` with optimal leaf ordering."""
    return sch.linkage(x, 'single', optimal_ordering=True)


# Generate hierarchical topics
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

In [None]:
# Display Tree
# Build the topic tree from the hierarchical topics computed above and print it.
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)