In [2]:
import numpy as np

from functions.embeddings.context import generate_benchmark_embeddings, load_data
from functions.embeddings.embed import generate_embedding
from constants import SETTINGS


def fetch_benchmark_embeddings():
  """Generate the benchmark embeddings and return them as NumPy arrays.

  Passes the result of ``load_data(SETTINGS.rss_feed_file)`` to
  ``generate_benchmark_embeddings``, which is unpacked into three
  collections (content, title, summary). Each collection is converted with
  ``np.array`` and its shape is printed.

  Returns:
    dict: arrays stored under the keys ``'content_embeddings'``,
    ``'title_embeddings'`` and ``'summary_embeddings'``.
  """
  feed_items = load_data(SETTINGS.rss_feed_file)
  content_vecs, title_vecs, summary_vecs = generate_benchmark_embeddings(feed_items)

  # Insertion order (content, title, summary) drives the print order below.
  embeddings_by_key = {
    'content_embeddings': np.array(content_vecs),
    'title_embeddings': np.array(title_vecs),
    'summary_embeddings': np.array(summary_vecs),
  }

  print('Printing sizes')
  for matrix in embeddings_by_key.values():
    print(matrix.shape)

  return embeddings_by_key


# Build the benchmark embeddings once; `data` is passed to save_benchmark_embeddings in a later cell.
data = fetch_benchmark_embeddings()

Generating embeddings for example content...
Generating embeddings for:   AWS Glue now can detect 250 sensitive entity types from over 50 countries 
Generating embeddings for:   AWS Step Functions launches Versions and Aliases 
Generating embeddings for:   Announcing the AWS Amplify UI Builder Figma plugin 
Generating embeddings for:   AWS Lambda supports starting from timestamp for Kafka event sources 
Generating embeddings for:   Amazon CloudWatch Logs announces new Log Insights dedup command  
Generating embeddings for:   Amazon DynamoDB now simplifies and lowers the cost of handling failed conditional writes 
Generating embeddings for:   Amazon S3 provides restore status of S3 Glacier objects using the S3 LIST API 
Generating embeddings for:   AWS Database Migration Service now provides more comprehensive premigration assessments 
Generating embeddings for:   Amazon CloudWatch now supports dashboard variables 
Generating embeddings for:   Amazon Simple Email Service now supports me

In [5]:
from constants import SETTINGS
def save_benchmark_embeddings(data):
  """Write each benchmark embedding array in ``data`` to its own ``.npy`` file.

  Args:
    data: mapping with the keys ``'content_embeddings'``,
      ``'title_embeddings'`` and ``'summary_embeddings'``; each value is
      handed to ``np.save``.

  Files are written to
  ``{SETTINGS.embeddings_output_dir}/benchmark_{field}.npy`` and a progress
  line is printed before each save.
  """
  for name in ('content', 'title', 'summary'):
    vectors = data[f'{name}_embeddings']
    print(f'Saving {name} embeddings')
    np.save(f'{SETTINGS.embeddings_output_dir}/benchmark_{name}.npy', vectors)

# Persist the arrays held in `data` so the clustering cell can np.load them from disk.
save_benchmark_embeddings(data)

Saving content embeddings
Saving title embeddings
Saving summary embeddings


In [8]:
from sklearn import metrics
from sklearn.cluster import AffinityPropagation

def create_clusters(field, random_state=0):
  """Cluster the saved benchmark embeddings for ``field`` and print a summary.

  Loads ``{SETTINGS.embeddings_output_dir}/benchmark_{field}.npy``, fits
  scikit-learn's ``AffinityPropagation`` on it, and prints the number of
  clusters together with the silhouette score of the resulting labelling.

  Args:
    field: name of the embedded field; used to build the ``.npy`` file name
      and in the printed header.
    random_state: seed forwarded to ``AffinityPropagation`` so that repeated
      runs of the notebook produce the same clustering.

  Returns:
    None. The summary is printed.
  """
  embeddings = np.load(f'{SETTINGS.embeddings_output_dir}/benchmark_{field}.npy')
  clustering = AffinityPropagation(random_state=random_state).fit(embeddings)

  labels = clustering.labels_
  distinct_labels = len(set(labels.tolist()))

  # silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1,
  # which happens e.g. when the fit does not converge and every sample gets
  # the same label. Report that instead of crashing the whole cell.
  if 2 <= distinct_labels <= len(embeddings) - 1:
    silhouette = metrics.silhouette_score(embeddings, labels)
  else:
    silhouette = f'undefined ({distinct_labels} distinct label(s) for {len(embeddings)} samples)'

  print(f"""
  {field} Clustering
  Number of clusters: {len(clustering.cluster_centers_indices_)}
  Silhouette Score: {silhouette}
  """)

# Print the clustering summary for each embedded field, reading the arrays saved earlier.
for field in ['content', 'title', 'summary']:
  create_clusters(field)


  content Clustering
  Number of clusters: 5
  Silhouette Score: 0.06386707535949496
  

  title Clustering
  Number of clusters: 7
  Silhouette Score: 0.046894887377231464
  

  summary Clustering
  Number of clusters: 7
  Silhouette Score: 0.07047154709467535
  


In [4]:
import json
from constants import SETTINGS

# Read the raw JSON text first, then parse it once the file handle is closed.
with open(SETTINGS.dot_products_file, 'r') as f:
  file_data = f.read()

# Iterated below as a sequence of per-article records.
dot_product_data = json.loads(file_data)


def _summarize_dot_products(dot_products, top_n=5):
  """Return the max, top-``top_n`` mean and overall mean of ``dot_products``.

  Args:
    dot_products: non-empty list of numeric scores.
    top_n: how many of the largest scores go into the ``'max_5'`` mean.

  Returns:
    dict with the keys ``'max'``, ``'max_5'`` and ``'avg'``.

  Raises:
    ValueError: if ``dot_products`` is empty.
  """
  if not dot_products:
    raise ValueError('cannot summarize an empty list of dot products')

  top_scores = sorted(dot_products, reverse=True)[:top_n]
  return {
    'max': max(dot_products),
    # Divide by how many scores were actually taken: with fewer than top_n
    # scores a fixed divisor would understate the mean.
    'max_5': sum(top_scores) / len(top_scores),
    'avg': sum(dot_products) / len(dot_products),
  }


# Per-article statistics, keyed by article title and then by field name.
relevance_data = {}
for article in dot_product_data:
  title = article['title']
  relevance_data[title] = {}
  for field in ['content', 'title', 'summary']:
    relevance_data[title][field] = _summarize_dot_products(article[f'{field}_relevance'])

# Persist the per-article statistics in their original (unranked) order.
file_path = f'{SETTINGS.ranks_output_dir}/unranked.json'
with open(file_path, 'w') as f:
  json.dump(relevance_data, f, indent=2)


def _write_max_ranking(relevance, field, output_dir):
  """Rank articles by ``field``'s max relevance and write the ranking as JSON.

  Args:
    relevance: mapping of article title -> per-field statistics, where
      ``relevance[title][field]['max']`` is the sort key.
    field: which field's ``'max'`` value to rank by (e.g. ``'summary'``).
    output_dir: directory that receives ``{field}_max.json``.

  Returns:
    dict: ``relevance`` re-ordered from highest to lowest ``'max'``.
  """
  ranked = dict(sorted(relevance.items(), key=lambda item: item[1][field]['max'], reverse=True))

  # Integer rank -> title and score; json serializes the integer keys as strings.
  rows = {
    rank: {'title': article_title, f'{field}_max': stats[field]['max']}
    for rank, (article_title, stats) in enumerate(ranked.items())
  }

  with open(f'{output_dir}/{field}_max.json', 'w') as out:
    out.write(json.dumps(rows, indent=2))

  return ranked


# Order by summary max relevance
summary_relevance = _write_max_ranking(relevance_data, 'summary', SETTINGS.ranks_output_dir)

# Order by title max relevance
title_relevance = _write_max_ranking(relevance_data, 'title', SETTINGS.ranks_output_dir)

# Order by content max relevance
content_relevance = _write_max_ranking(relevance_data, 'content', SETTINGS.ranks_output_dir)
          
