In [32]:
import pandas as pd
import tensorflow_hub as hub
import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

import warnings

# Suppress all warnings for the rest of the session (no category or module
# filter is given, so deprecation and convergence warnings are hidden too).
warnings.filterwarnings('ignore')

In [33]:
# Load the dataset; the path is relative to the current working directory.
df = pd.read_csv('analyst_ratings_processed.csv')

In [34]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,stock
0,0.0,Stocks That Hit 52-Week Highs On Friday,2020-06-05 10:30:00-04:00,A
1,1.0,Stocks That Hit 52-Week Highs On Wednesday,2020-06-03 10:45:00-04:00,A
2,2.0,71 Biggest Movers From Friday,2020-05-26 04:30:00-04:00,A
3,3.0,46 Stocks Moving In Friday's Mid-Day Session,2020-05-22 12:45:00-04:00,A
4,4.0,B of A Securities Maintains Neutral on Agilent...,2020-05-22 11:38:00-04:00,A


In [35]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400469 entries, 0 to 1400468
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   Unnamed: 0  1399180 non-null  float64
 1   title       1400469 non-null  object 
 2   date        1399180 non-null  object 
 3   stock       1397891 non-null  object 
dtypes: float64(1), object(3)
memory usage: 42.7+ MB


In [36]:
# Number of missing values per column.
df.isna().sum()

Unnamed: 0    1289
title            0
date          1289
stock         2578
dtype: int64

In [37]:
# Work on a fixed-seed random subset of the rows.
# Note: this rebinds `df`, so the full frame is no longer available afterwards
# and re-running this cell samples from the already-sampled frame.
sample_size = 30000
df = df.sample(n=sample_size, random_state=42)

In [38]:
# Drop rows with a missing title *before* extracting the texts, so that
# `texts[i]` corresponds to the i-th row (by position) of `df`.
df.dropna(subset=['title'], inplace=True)
texts = df['title'].tolist()

In [39]:
# Load the sentence-encoder model from the TF Hub URL below.
# NOTE(review): presumably fetched over the network on first use and cached
# locally by tensorflow_hub — confirm for offline runs.
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
use_model = hub.load(model_url)

In [40]:
def get_embeddings(texts, batch_size=None):
  """Encode texts with the module-level ``use_model`` and return a NumPy array.

  Args:
    texts: Sequence of strings to encode. When ``batch_size`` is given it must
      support ``len()`` and slicing (e.g. a list).
    batch_size: Optional maximum number of texts sent to the model per call.
      ``None`` (the default) encodes everything in a single call, which is the
      original behaviour; a positive integer bounds peak memory on large
      corpora by encoding chunk by chunk.

  Returns:
    ``np.ndarray`` with one row per input text, in input order.

  Raises:
    ValueError: If ``batch_size`` is not ``None`` and is not positive.
  """
  if batch_size is not None and batch_size <= 0:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

  # Single call when batching is disabled or would produce only one chunk
  # anyway (this also covers empty input without stacking zero chunks).
  if batch_size is None or len(texts) <= batch_size:
    return np.array(use_model(texts))

  chunks = [
      np.array(use_model(texts[start:start + batch_size]))
      for start in range(0, len(texts), batch_size)
  ]
  # Chunks are produced in input order, so stacking keeps row i aligned with texts[i].
  return np.vstack(chunks)

In [41]:
# Embed every sampled title; the trailing expression displays the array shape
# as a quick sanity check (rows should match the number of texts).
text_embeddings = get_embeddings(texts)
text_embeddings.shape

(30000, 512)

In [42]:
# Scale each embedding row to unit L2 norm (sklearn's `normalize` works
# row-wise by default), so dot products between rows equal cosine similarity.
normalized_embeddings = normalize(text_embeddings, norm='l2')

In [43]:
# Partition the normalised embeddings into `num_clusters` groups.
# `random_state` makes the centroid initialisation reproducible; `n_init=10`
# runs ten initialisations and keeps the best one.
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(normalized_embeddings)

In [44]:
# Attach each row's cluster id to the frame. The label array is assigned by
# position, which matches because the embeddings were built from `df` in row
# order. The trailing expression shows the cluster sizes.
cluster_labels = kmeans.labels_
df['cluster'] = cluster_labels
df['cluster'].value_counts()

cluster
8    6670
1    4204
0    4151
2    4145
3    3039
4    2782
6    2720
9     890
7     829
5     570
Name: count, dtype: int64

In [45]:
def create_knn_classifier(k, embeddings, labels):
  """Build a cosine-distance k-NN classifier and fit it on the given data.

  Args:
    k: Number of neighbours consulted for each prediction.
    embeddings: Training vectors, one row per sample.
    labels: Target label for each row of ``embeddings``.

  Returns:
    The fitted ``KNeighborsClassifier``.
  """
  # `fit` returns the estimator itself, so construction and fitting can be chained.
  return KNeighborsClassifier(metric='cosine', n_neighbors=k).fit(embeddings, labels)

In [46]:
# Classifier that maps an embedding to one of the K-Means clusters by looking
# at its `k_value` nearest training embeddings.
k_value = 5
knn_model = create_knn_classifier(k_value, normalized_embeddings, cluster_labels)

k-NN с k=5'.


In [47]:
def semantic_search_all(query_text, model, all_embeddings, all_texts_df, top_n=5):
  """Return the ``top_n`` rows most similar to ``query_text`` across all rows.

  Args:
    query_text: The search query (a single string).
    model: Callable that maps a list of strings to their embeddings.
    all_embeddings: 2-D array with one embedding per row of ``all_texts_df``.
    all_texts_df: DataFrame with ``'title'`` and ``'cluster'`` columns. Rows are
      read by position, so row ``i`` must correspond to ``all_embeddings[i]``.
    top_n: Maximum number of hits to return.

  Returns:
    A list of dicts with keys ``'text'``, ``'similarity'`` and ``'cluster'``,
    ordered from most to least similar.
  """
  query_vec = normalize(model([query_text]), norm='l2')
  scores = cosine_similarity(query_vec, all_embeddings)[0]

  # Ascending argsort, reversed, then truncated: best matches first.
  best_first = np.argsort(scores)[::-1][:top_n]

  def _hit(pos):
    # One positional row lookup serves both the title and the cluster id.
    row = all_texts_df.iloc[pos]
    return {
        'text': row['title'],
        'similarity': scores[pos],
        'cluster': row['cluster'],
    }

  return [_hit(pos) for pos in best_first]

In [48]:
# Baseline: search the whole sample (no cluster restriction) and print each
# hit with its similarity score and the cluster it belongs to.
search_query = "Big tech stocks rally on positive earnings news"
search_results_all = semantic_search_all(search_query, use_model, normalized_embeddings, df, top_n=5)
print(f"\nРезультаты поиска по всему датасету для запроса: '{search_query}'")
for res in search_results_all:
  print(f"- Сходство: {res['similarity']:.4f}, Кластер: {res['cluster']}, Текст: {res['text']}")


Результаты поиска по всему датасету для запроса: 'Big tech stocks rally on positive earnings news'
- Сходство: 0.6333, Кластер: 4, Текст: Strong Bullish Stocks to Watch next week
- Сходство: 0.6315, Кластер: 4, Текст: NASDAQ Stocks Hitting 52-Week Lows
- Сходство: 0.6308, Кластер: 1, Текст: U.S. Stocks Fall; iRobot Shares Gain On Upbeat Results
- Сходство: 0.6260, Кластер: 2, Текст: The Week Ahead In Biotech: COVID-19 Stocks In The Spotlight, Earnings Taper Off
- Сходство: 0.6208, Кластер: 1, Текст: US Stock Futures Rise Ahead Of Earnings, Consumer Confidence Data


In [52]:
def semantic_search_within_cluster(query_text, model, knn_classifier_instance, all_embeddings, all_texts_df, top_n=5):
    """Search only inside the cluster that the classifier predicts for the query.

    The query is embedded and L2-normalised, assigned to a cluster by
    ``knn_classifier_instance``, and then ranked by cosine similarity against
    the rows of that cluster only.

    Args:
        query_text: The search query (a single string).
        model: Callable that maps a list of strings to their embeddings.
        knn_classifier_instance: Fitted classifier exposing ``predict`` and
            ``n_neighbors``.
        all_embeddings: 2-D array with one embedding per row of ``all_texts_df``.
        all_texts_df: DataFrame with ``'title'`` and ``'cluster'`` columns. Rows
            are matched to ``all_embeddings`` by position.
        top_n: Maximum number of hits to return.

    Returns:
        A list of dicts with keys ``'text'``, ``'similarity'`` and ``'cluster'``,
        best match first; an empty list when the predicted cluster has no rows.
        The predicted cluster is also printed to stdout.
    """
    query_vec = normalize(model([query_text]), norm='l2')

    predicted_cluster = knn_classifier_instance.predict(query_vec)[0]
    print(f"Запрос '{query_text}' отнесен k-NN (k={knn_classifier_instance.n_neighbors}) к кластеру: {predicted_cluster}")

    # Positional (not label-based) indices of the rows in the predicted cluster.
    member_positions = np.where(all_texts_df['cluster'] == predicted_cluster)[0]
    if len(member_positions) == 0:
        print(f"В кластере {predicted_cluster} нет текстов. Поиск невозможен.")
        return []

    members = all_texts_df.iloc[member_positions]
    scores = cosine_similarity(query_vec, all_embeddings[member_positions])[0]

    # Indices below are local to the cluster subset, not to the full frame.
    limit = min(top_n, len(scores))
    hits = []
    for pos in np.argsort(scores)[::-1][:limit]:
        hits.append({
            'text': members.iloc[pos]['title'],
            'similarity': scores[pos],
            'cluster': predicted_cluster,
        })
    return hits

In [53]:
# Compare how the choice of k affects the cluster predicted for the query and
# therefore the results of the cluster-restricted search. A fresh classifier is
# fitted on the same embeddings and labels for every k.
for k_val in [3, 7, 15]:
    print(f"\nКластер с k-NN (k={k_val})")
    temp_knn_model = create_knn_classifier(k_val, normalized_embeddings, cluster_labels)
    search_results_cluster = semantic_search_within_cluster(search_query, use_model, temp_knn_model, normalized_embeddings, df, top_n=5)
    # An empty list means the predicted cluster had no rows; nothing to print then.
    if search_results_cluster:
        for res in search_results_cluster:
          print(f"- Сходство: {res['similarity']:.4f}, Текст: {res['text']}")


Кластер с k-NN (k=3)
k-NN с k=3'.
Запрос 'Big tech stocks rally on positive earnings news' отнесен k-NN (k=3) к кластеру: 4
- Сходство: 0.6333, Текст: Strong Bullish Stocks to Watch next week
- Сходство: 0.6315, Текст: NASDAQ Stocks Hitting 52-Week Lows
- Сходство: 0.6168, Текст: Big Financial Stocks Nearing Intraday Lows
- Сходство: 0.6033, Текст: Market Wrap for Wednesday, May 29: Stocks Fall on Profit Taking After Big Rally 
- Сходство: 0.6033, Текст: Stocks That Hit 52-Week Lows On Friday

Кластер с k-NN (k=7)
k-NN с k=7'.
Запрос 'Big tech stocks rally on positive earnings news' отнесен k-NN (k=7) к кластеру: 4
- Сходство: 0.6333, Текст: Strong Bullish Stocks to Watch next week
- Сходство: 0.6315, Текст: NASDAQ Stocks Hitting 52-Week Lows
- Сходство: 0.6168, Текст: Big Financial Stocks Nearing Intraday Lows
- Сходство: 0.6033, Текст: Market Wrap for Wednesday, May 29: Stocks Fall on Profit Taking After Big Rally 
- Сходство: 0.6033, Текст: Stocks That Hit 52-Week Lows On Friday

К

In [51]:
# Run the search over the whole sample again and print the cluster of each hit
# last, for side-by-side comparison with the cluster-restricted results.
results = semantic_search_all(search_query, use_model, normalized_embeddings, df)
for res in results:
    # One f-string with double outer quotes: reusing the outer quote character
    # inside a replacement field (f'...{res['cluster']}') is a SyntaxError on
    # Python versions before 3.12. The printed text is unchanged.
    print(f"- Сходство: {res['similarity']:.4f},| Текст: {res['text']} | Кластер:{res['cluster']}")

- Сходство: 0.6333,| Текст: Strong Bullish Stocks to Watch next week | Кластер:4
- Сходство: 0.6315,| Текст: NASDAQ Stocks Hitting 52-Week Lows | Кластер:4
- Сходство: 0.6308,| Текст: U.S. Stocks Fall; iRobot Shares Gain On Upbeat Results | Кластер:1
- Сходство: 0.6260,| Текст: The Week Ahead In Biotech: COVID-19 Stocks In The Spotlight, Earnings Taper Off | Кластер:2
- Сходство: 0.6208,| Текст: US Stock Futures Rise Ahead Of Earnings, Consumer Confidence Data | Кластер:1
