In [3]:
from sklearn.cluster import OPTICS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def convet_to_float_array(input: str) -> np.array:
    values_str = input.strip('[]')
    values_list = values_str.split(',')
    # 将字符串列表转换为浮点数数组
    float_array = np.array([float(value) for value in values_list])
    return float_array

# 读取title embedding
title_embedding = np.array(pd.read_csv("../azure_title_test_2_embedding.csv", encoding='utf-8')["title_embedding"].apply(lambda x: convet_to_float_array(x)).tolist())

from sklearn.metrics.pairwise import pairwise_distances
# 计算相似度矩阵
similarity_matrix = 1 - pairwise_distances(title_embedding, metric='cosine')


from sklearn.cluster import OPTICS

# 创建OPTICS对象并拟合相似度矩阵
optics = OPTICS(min_samples=15, xi=0.005, min_cluster_size=15)
clusters = optics.fit_predict(similarity_matrix)

# print(clusters)


# import matplotlib.pyplot as plt

# # 将聚类结果可视化
# colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
# for cluster_id in np.unique(clusters):
#     if cluster_id == -1:
#         # 如果是噪声点，则用黑色表示
#         color = 'k'
#     else:
#         # 否则，使用预定义的颜色
#         color = colors[cluster_id % len(colors)]
#     # 获取属于该聚类簇的样本的索引
#     cluster_indices = np.where(clusters == cluster_id)[0]
#     # 绘制该聚类簇的样本
#     plt.scatter(title_embedding[cluster_indices, 0], title_embedding[cluster_indices, 1], c=color, marker='o')
# plt.title('OPTICS clustering')
# plt.xlabel('Axis X[0]')
# plt.ylabel('Axis X[1]')
# plt.show()

In [4]:
# 对cluster进行分类，以元素值为key，index List作为value的字典
index_cluster_map = {}
for i, cluster_id in enumerate(clusters):
    # -1的元素作为噪声点，丢弃
    if cluster_id != -1:
        if cluster_id not in index_cluster_map:
            index_cluster_map[cluster_id] = []
        index_cluster_map[cluster_id].append(i)

print(index_cluster_map)

{21: [14, 61, 62, 279, 3651, 3654, 3661, 3702, 3834, 4535, 4731, 7783, 8086, 8333, 8379, 8532, 8920, 9067, 9267, 9399], 10: [33, 625, 1180, 1317, 1473, 1833, 2044, 2060, 2216, 2517, 2804, 5563, 5666, 7208, 7988, 8057, 8136, 8139, 8327, 9037, 9074, 9906], 1: [74, 94, 481, 637, 875, 1170, 1245, 1292, 1432, 1433, 1436, 1500, 1501, 1721, 1722, 1761, 1901, 1966, 1968, 2001, 2043, 2077, 2101, 2188, 2278, 2280, 2282, 2359, 2681, 2760, 3297, 5847, 6594, 6704, 6803, 6879, 6985, 7496, 7639, 7720, 8200, 8524, 8638, 8641, 8642, 8648, 8653, 8687, 8934, 9169, 9280, 9392, 9676, 9881, 9884, 9984, 9985, 9987], 6: [96, 100, 103, 104, 106, 109, 112, 114, 115, 118, 120, 121, 122, 123, 128, 130, 131, 133, 134, 139, 141, 145, 153, 154, 157, 290], 13: [129, 260, 1231, 1423, 1677, 1959, 1993, 2027, 2056, 2170, 2601, 2650, 2722, 2809, 3433, 3592, 4103, 5562, 5984, 7045, 7627, 8144, 8409, 8969, 8988, 9114, 9115, 9143], 14: [200, 209, 230, 444, 1464, 1527, 1933, 2073, 2202, 2250, 2299, 2512, 2521, 2549, 2567, 25

In [5]:
source_title_df = pd.read_csv("../azure_title_test_2_embedding.csv")
# 根据cluster_map 里面内容，转化为对应event title的map
title_cluster_map = {}

import pandas as pd

# 将索引列表转换为标题列表
for cluster_id, index_list in index_cluster_map.items():
    title_list = source_title_df.iloc[index_list]['title'].tolist()
    title_cluster_map[cluster_id] = title_list

print(title_cluster_map)

{21: ["Enact Holdings' Q2 Adjusted Earnings Fall, Revenue Rises", 'Axis Capital Holdings Q2 Operating Income, Revenue Rise', 'Axis Capital Holdings Q2 Operating Income, Revenue Rise', "Alight's Q2 Adjusted Earnings, Revenue Increase", 'Agree Realty Q2 Adjusted Funds From Operations, Revenue Rise', 'Agree Realty Q2 Adjusted Funds From Operations, Revenue Rise', 'CCC Intelligent Solutions Q2 Adjusted Earnings, Sales Increase', "Assurant's Q2 Adjusted Earnings, Revenue Rise", 'STERIS Fiscal Q1 Adjusted Net Income, Revenue Rise; Fiscal 2024 Guidance Lifted', 'Unum Group Q2 Profit Increases, Beats Estimates', "Credit Acceptance's Q2 Adjusted Earnings Fall, Revenue Rises", 'MakeMyTrip Fiscal Q1 Adjusted Earnings, Revenue Rise', 'Leidos Q2: 7% Revenue Growth, Higher Demand, Raised FY23 Topline Guidance & More', 'Altria Group Q2: Inline EPS, Reaffirmed FY23 Earnings Guidance & More', "Illinois Tool Works' Q2 Adjusted Earnings, Revenue Rise; 2023 Outlook Raised", 'Allegro MicroSystems Fiscal Q1

In [6]:
title_list_str_map = {}
# 将标题列表拼接成一个字符串
for cluster_id, title_list in title_cluster_map.items():
    title_str = '\n'.join([f'{i+1}. {title}' for i, title in enumerate(title_list)])
    title_list_str_map[cluster_id] = title_str

print(title_list_str_map)

{21: "1. Enact Holdings' Q2 Adjusted Earnings Fall, Revenue Rises\n2. Axis Capital Holdings Q2 Operating Income, Revenue Rise\n3. Axis Capital Holdings Q2 Operating Income, Revenue Rise\n4. Alight's Q2 Adjusted Earnings, Revenue Increase\n5. Agree Realty Q2 Adjusted Funds From Operations, Revenue Rise\n6. Agree Realty Q2 Adjusted Funds From Operations, Revenue Rise\n7. CCC Intelligent Solutions Q2 Adjusted Earnings, Sales Increase\n8. Assurant's Q2 Adjusted Earnings, Revenue Rise\n9. STERIS Fiscal Q1 Adjusted Net Income, Revenue Rise; Fiscal 2024 Guidance Lifted\n10. Unum Group Q2 Profit Increases, Beats Estimates\n11. Credit Acceptance's Q2 Adjusted Earnings Fall, Revenue Rises\n12. MakeMyTrip Fiscal Q1 Adjusted Earnings, Revenue Rise\n13. Leidos Q2: 7% Revenue Growth, Higher Demand, Raised FY23 Topline Guidance & More\n14. Altria Group Q2: Inline EPS, Reaffirmed FY23 Earnings Guidance & More\n15. Illinois Tool Works' Q2 Adjusted Earnings, Revenue Rise; 2023 Outlook Raised\n16. Allegr

In [15]:
import sys
sys.path.append("..")
from openAIRoundRobin import  eventCreationFromTitle

event_map={}
# 遍历cluster_map字典，通过gpt4生成event
for cluster_id, title_str in title_list_str_map.items():
    eventFinal = eventCreationFromTitle.eventCreateByTitle(title_str)
    event_map[cluster_id]= eventFinal

openai index: 0
Q2 Earnings Mixed, Revenues Rise; Companies Raise FY23 Outlook
Q2 Earnings Mixed, Revenues Rise; Companies Raise FY23 Outlook
openai index: 1
瑞幸咖啡Q2净收入增88%，优步实现盈利，小米调整净利润增93%。
瑞幸咖啡Q2净收入增88%，优步实现盈利，小米调整净利润增93%。
openai index: 2
七月底各大公司股东人数公布，高管减持股份成热点。
七月底各大公司股东人数公布，高管减持股份成热点。
openai index: 3
8月2日股市波动：涨跌互现，基金重仓关注个股。
8月2日股市波动：涨跌互现，基金重仓关注个股。
openai index: 4
梅西代言中国白酒，OPPO IoT负责人离职，百胜中国变革。
梅西代言中国白酒，OPPO IoT负责人离职，百胜中国变革。
openai index: 5
Changyang Tech, Guoxin Tech, and Huafeng Tech thrive in R&D, driving future business and expanding markets.
Changyang Tech, Guoxin Tech, and Huafeng Tech thrive in R&D, driving future business and expanding markets.
openai index: 0
Q2 2023 Earnings Calls: Top Companies Report Financial Results
Q2 2023 Earnings Calls: Top Companies Report Financial Results
openai index: 1
日本央行调整YCC政策，维持宽松，关注通胀与经济。
日本央行调整YCC政策，维持宽松，关注通胀与经济。
openai index: 2
全球股市普跌，纳指、标普500下滑，日经225跌2.3%。
全球股市普跌，纳指、标普500下滑，日经225跌2.3%。
openai index: 3
1Q earnings mixed: Aozora Bank, 

In [16]:
cluster_records_df_map ={}

# 把index list的值，转化为csv满足内容的dataframe
for cluster_id, index_list in index_cluster_map.items():
    record_cluster_df = pd.DataFrame(columns=source_title_df.columns)
    record_cluster_df = source_title_df.iloc[index_list]
    cluster_records_df_map[cluster_id] = record_cluster_df

# 把每个clustered dataframe插入event列，同时第一行赋值为相应的gpt4生成的event内容
for cluster_id, record_cluster_df in cluster_records_df_map.items():
    first_index = record_cluster_df.index[0]
    col_event = pd.Series(event_map[cluster_id])        
    record_cluster_df.insert(0, 'event',col_event)
    record_cluster_df.loc[int(first_index),"event"] = event_map[cluster_id]

# 将值列表合并到一个大列表中
merged_cluster_records_df = pd.concat(cluster_records_df_map.values(), axis=0)

print(merged_cluster_records_df)

# 保存到csv文件中
merged_cluster_records_df.to_csv('final_event_optics_result.csv', index=False)

                                                  event   post_id  \
14    Q2 Earnings Mixed, Revenues Rise; Companies Ra...  25648506   
61                                                  NaN  25648005   
62                                                  NaN  25648002   
279                                                 NaN  25647107   
3651                                                NaN  25630638   
...                                                 ...       ...   
9960                                                NaN  25602826   
9965                                                NaN  25602818   
9966                                                NaN  25602815   
9968                                                NaN  25602809   
9969                                                NaN  25602806   

                                                  title     news_time  \
14    Enact Holdings' Q2 Adjusted Earnings Fall, Rev...  1.690960e+12   
61    Axis Capital Holdin