In [1]:
!pip install git+https://github.com/impresso/impresso-py.git@embeddings-search

Collecting git+https://github.com/impresso/impresso-py.git@embeddings-search
  Cloning https://github.com/impresso/impresso-py.git (to revision embeddings-search) to /tmp/pip-req-build-6v31s20t
  Running command git clone --filter=blob:none --quiet https://github.com/impresso/impresso-py.git /tmp/pip-req-build-6v31s20t
  Running command git checkout -b embeddings-search --track origin/embeddings-search
  Switched to a new branch 'embeddings-search'
  Branch 'embeddings-search' set up to track remote branch 'embeddings-search' from 'origin'.
  Resolved https://github.com/impresso/impresso-py.git to commit a5fd1a1fbb4b130b3b96d7483e92e6eadf763f71
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from impresso import connect

# Endpoint of the Impresso public API used throughout this notebook.
IMPRESSO_API_URL = 'https://dev.impresso-project.ch/public-api/v1'

# Open the API session that every later cell reuses.
impresso_session = connect(IMPRESSO_API_URL)

🎉 You are now connected to the Impresso API!  🎉
🔗 Using API: https://dev.impresso-project.ch/public-api/v1


In [3]:
# ---embed and search---

# Natural-language question; reused later as the reranking query and in the LLM prompt.
question = "What happened in Zurich in the year 1950?"


# Ask the Impresso API to embed the question text.
# NOTE(review): target="text" presumably selects the embedding space used for text
# content items - confirm against the impresso-py documentation.
embedding = impresso_session.tools.embed_text(text=question, target="text")

# RETRIEVAL PREPARATION

In [4]:
# Search Impresso with the question embedding, keeping at most 20 hits.
result = impresso_session.search.find(embedding=embedding, limit=20)

# Collect the uid of every hit from the raw response payload, in result order.
result_uids = list(map(lambda hit: hit["uid"], result.raw["data"]))

# Fetch the full content item for each uid (one request per uid).
result_articles = list(map(impresso_session.content_items.get, result_uids))

In [5]:
# Display the search result object returned by `search.find` as the cell output.
result

Unnamed: 0_level_0,copyrightStatus,type,sourceMedium,title,locationEntities,personEntities,organisationEntities,newsAgenciesEntities,topics,transcriptLength,totalPages,languageCode,isOnFrontPage,publicationDate,issueUid,countryCode,providerCode,mediaUid,mediaType
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
excelsior-1919-06-16-a-i0030,in_cpy,ar,print,"EN SUISSE : A ZURICH, DES ÉMEUTIERS FONT METTR...","[{'uid': '2-54-Zurich', 'count': 2}, {'uid': '...",[],[],[],"[{'uid': 'tm-fr-all-v2.0_tp05_fr', 'relevance'...",265,1,fr,False,1919-06-16T00:00:00+00:00,excelsior-1919-06-16-a,FR,BNF,excelsior,newspaper
DTT-1953-10-26-a-i0089,in_cpy,ar,print,"""hreuie vor","[{'uid': '2-54-Schweiz', 'count': 1}, {'uid': ...","[{'uid': '2-50-Émile_Picard', 'count': 1}]",[],[],"[{'uid': 'tm-de-all-v2.0_tp57_de', 'relevance'...",122,1,de,False,1953-10-26T00:00:00+00:00,DTT-1953-10-26-a,CH,Migros,DTT,newspaper
IMP-1959-04-24-a-i0042,in_cpy,ar,print,L'ACTUALITÉ SUISSE,[],[],[],[],"[{'uid': 'tm-fr-all-v2.0_tp97_fr', 'relevance'...",169,1,fr,False,1959-04-24T00:00:00+00:00,IMP-1959-04-24-a,CH,SNL,IMP,newspaper


In [6]:
def format_and_display_articles(articles, fields=["title", "publicationDate", "languageCode", "transcript"], transcript_line_characters=100, max_transcript_characters=1500, return_string=False):
  """
  Formats a list of article objects as readable text and prints or returns it.

  Each article is rendered as a block starting with "--- Article N ---" and
  ending with a dashed line of the same width; blocks are separated by a
  blank line. Fields are emitted in the order given by `fields`.

  Args:
    articles: An iterable of article objects exposing a dict-like `.raw`
              attribute (only `.raw.get(field)` is used).
    fields: Names of the fields to include for each article. Recognised names
            are "title", "publicationDate", "languageCode" and "transcript";
            any other name is ignored. The default list is only iterated,
            never mutated.
    transcript_line_characters: Maximum number of characters per output line
                                when the transcript is wrapped. Must be > 0.
    max_transcript_characters: Maximum number of transcript characters to
                               include before appending "... [truncated]".
                               If None, the entire transcript is included.
    return_string: If True, return the formatted text instead of printing it.

  Returns:
    The formatted text if `return_string` is True, otherwise None (the text
    is printed).

  Raises:
    ValueError: If `transcript_line_characters` is not positive or
                `max_transcript_characters` is negative.
  """
  # A non-positive chunk size would either crash range() or silently drop the transcript.
  if transcript_line_characters <= 0:
    raise ValueError("transcript_line_characters must be a positive integer")
  # A negative limit would slice characters off the END of the transcript.
  if max_transcript_characters is not None and max_transcript_characters < 0:
    raise ValueError("max_transcript_characters must be None or a non-negative integer")

  formatted_output = []
  for i, article in enumerate(articles):
    header = f"--- Article {i+1} ---"
    article_output = [header]
    for field in fields:
      value = article.raw.get(field)
      if field == "title":
        # The title line is always present, with a placeholder when missing.
        article_output.append(f"Title: {value if value is not None else 'No Title Found'}")
      elif value is None:
        # Every other field is simply omitted when the article lacks it.
        continue
      elif field == "publicationDate":
        # Keep only the date part of an ISO timestamp; str() tolerates date objects.
        article_output.append(f"Publicationdate: {str(value).split('T')[0]}")
      elif field == "languageCode":
        article_output.append(f"Languagecode: {value}")
      elif field == "transcript":
        article_output.append(f"Transcript Length: {len(value)} characters")
        article_output.append(f"{field.capitalize()}:")
        transcript_to_print = value
        if max_transcript_characters is not None and len(value) > max_transcript_characters:
          transcript_to_print = value[:max_transcript_characters] + "... [truncated]"
        # Hard-wrap the transcript into fixed-width chunks for readability.
        for start in range(0, len(transcript_to_print), transcript_line_characters):
          article_output.append(transcript_to_print[start:start + transcript_line_characters])

    article_output.append("-" * len(header))
    formatted_output.append("\n".join(article_output))

  formatted_text = "\n\n".join(formatted_output)
  if return_string:
    return formatted_text
  print(formatted_text)


# Example usage with max_transcript_characters and returning string
# formatted_articles_string = format_and_display_articles(result_articles, max_transcript_characters=1000, return_string=True)
# print("\n--- Formatted String Output ---")
# print(formatted_articles_string)

# Example usage with printing: with the default return_string=False the formatted
# articles are printed, each transcript limited to 1500 characters.
format_and_display_articles(result_articles)

--- Article 1 ---
Title: EN SUISSE : A ZURICH, DES ÉMEUTIERS FONT METTRE EN LIBERTÉ UN PRISONNI[...]
Publicationdate: 1919-06-16
Languagecode: fr
Transcript Length: 1669 characters
Transcript:
ZURICH, -15 juin. — Des troubles assez sérieux ont éclaté à Zurich, où, comme on le sait, existe une
 minorité socialiste extrémiste des plus actives. Une grande manifestation avait été organisée, vend
redi soir, pair l'Union ouvrière zurichoise, pour honorer la mémoire de Posa Luxembourg. Le préfet d
e police de Zurich, le socialiste Greber, sur la promesse que la manifestation se déroulerait dans l
e calme, avait donné des instructions pour que la polioe.se tînt éloignée de la réunion. Celle-ci co
mmença bien; mais un orateur vint annoncer que le délégué socialiste suisse. Conrad Wyss venait, en 
rentrant d'Allemagne, d'être arrêté par les autorités fédérales, pour avoir introduit des brochures 
et des tracts de propagande. Aussi tôt la foule se porta vers la prison et réclama la mi.se en liber

In [8]:
from scipy.stats import spearmanr

def calculate_overlap_between_rank_and_reranking(original_articles, reranked_articles, top_k_values=(3, 5, 10)):
  """
  Calculates and prints ranking metrics between the original and reranked article lists.

  Prints the Spearman rank correlation between the two orderings, followed by
  one line per cutoff k with the number of articles shared by both top-k sets.

  Args:
    original_articles: The original list of article objects (each with `.raw["uid"]`).
    reranked_articles: The reranked list of article objects (each with `.raw["uid"]`).
    top_k_values: Cutoffs for which the top-k overlap is reported.

  Raises:
    ValueError: If a reranked article's uid does not occur in the original list.
  """
  # Get the UIDs of the articles in the original and reranked lists
  original_uids = [article.raw["uid"] for article in original_articles]
  reranked_uids = [article.raw["uid"] for article in reranked_articles]

  # Map uid -> position in the original ranking for O(1) lookups.
  # setdefault keeps the FIRST position of a repeated uid, like list.index does.
  original_position = {}
  for position, uid in enumerate(original_uids):
    original_position.setdefault(uid, position)

  missing_uids = [uid for uid in reranked_uids if uid not in original_position]
  if missing_uids:
    raise ValueError(f"Reranked uids not present in the original ranking: {missing_uids}")

  # For each reranked slot, the rank the same article had originally.
  original_ranking = [original_position[uid] for uid in reranked_uids]
  reranked_ranking = list(range(len(reranked_uids)))

  # Spearman correlation is undefined for fewer than two items.
  if len(reranked_uids) < 2:
    spearman_corr = float("nan")
  else:
    spearman_corr, _ = spearmanr(original_ranking, reranked_ranking)

  print(f"Spearman correlation between original and reranked rankings: {spearman_corr:.4f}")

  # Overlap between the top-k sets of both rankings, for each requested cutoff.
  for k in top_k_values:
    overlap = len(set(original_uids[:k]) & set(reranked_uids[:k]))
    print(f"Overlap in top {k} results: {overlap}")

In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

def rerank_articles_by_other_embedding_model(original_articles, question, embedding_model):
  """
  Reranks a list of articles based on their relevance to a given question using a different embedding model.

  Each article is represented by its title and transcript joined with a space;
  missing titles or transcripts are treated as empty text. Articles are ordered
  by cosine similarity between their embedding and the question embedding.
  Ranking metrics against the original order are printed as a side effect.

  Args:
    original_articles: A list of article objects (with a .raw attribute containing a dictionary).
    question: The question string.
    embedding_model: A model exposing `encode(...)` (e.g. a SentenceTransformer)
                     that returns one vector per text for a list of texts and a
                     single vector for a single string.

  Returns:
    A list of article objects, sorted by relevance to the question in descending order.
    An empty input yields an empty list.
  """
  original_articles = list(original_articles)
  if not original_articles:
    # Nothing to embed or compare; ranking metrics would be undefined.
    return []

  # Combine title and transcript for article text; `or ""` keeps a missing
  # field from crashing the concatenation or injecting the literal text "None".
  article_texts = []
  for article in original_articles:
    title = article.raw.get("title") or ""
    transcript = article.raw.get("transcript") or ""
    article_texts.append(f"{title} {transcript}")

  # Generate embeddings for articles and question
  article_embeddings = np.asarray(embedding_model.encode(article_texts))
  question_embedding = np.asarray(embedding_model.encode(question)).reshape(-1)

  # Cosine similarity; zero-norm vectors get a score of 0 instead of NaN.
  dot_products = article_embeddings @ question_embedding
  norms = np.linalg.norm(article_embeddings, axis=1) * np.linalg.norm(question_embedding)
  safe_norms = np.where(norms > 0, norms, 1.0)
  similarity_scores = np.where(norms > 0, dot_products / safe_norms, 0.0)

  # Pair articles with their scores and sort (stable: ties keep the original order)
  scored_articles = sorted(zip(original_articles, similarity_scores), key=lambda pair: pair[1], reverse=True)

  # Return only the sorted articles
  reranked_articles = [article for article, _ in scored_articles]

  calculate_overlap_between_rank_and_reranking(original_articles=original_articles, reranked_articles=reranked_articles)
  return reranked_articles

# Example Usage (requires loading a SentenceTransformer model)
# NOTE(review): trust_remote_code=True allows the loader to execute custom code
# shipped with the model repository - only use it with sources you trust.
embedding_model = SentenceTransformer('impresso-project/halloween_workshop_ocr_robust_preview', trust_remote_code=True)
# Rerank the retrieved articles against the question; ranking metrics are printed as a side effect.
reranked_articles = rerank_articles_by_other_embedding_model(result_articles, question, embedding_model=embedding_model)

Spearman correlation between original and reranked rankings: -0.4556
Overlap in top 3 results: 0
Overlap in top 5 results: 1
Overlap in top 10 results: 3


In [10]:
# Show the articles in their original retrieval order, for comparison with the reranked list.
format_and_display_articles(result_articles)

--- Article 1 ---
Title: EN SUISSE : A ZURICH, DES ÉMEUTIERS FONT METTRE EN LIBERTÉ UN PRISONNI[...]
Publicationdate: 1919-06-16
Languagecode: fr
Transcript Length: 1669 characters
Transcript:
ZURICH, -15 juin. — Des troubles assez sérieux ont éclaté à Zurich, où, comme on le sait, existe une
 minorité socialiste extrémiste des plus actives. Une grande manifestation avait été organisée, vend
redi soir, pair l'Union ouvrière zurichoise, pour honorer la mémoire de Posa Luxembourg. Le préfet d
e police de Zurich, le socialiste Greber, sur la promesse que la manifestation se déroulerait dans l
e calme, avait donné des instructions pour que la polioe.se tînt éloignée de la réunion. Celle-ci co
mmença bien; mais un orateur vint annoncer que le délégué socialiste suisse. Conrad Wyss venait, en 
rentrant d'Allemagne, d'être arrêté par les autorités fédérales, pour avoir introduit des brochures 
et des tracts de propagande. Aussi tôt la foule se porta vers la prison et réclama la mi.se en liber

In [11]:
# Show the same articles in the order produced by the reranking step.
format_and_display_articles(reranked_articles)

--- Article 1 ---
Title: d octobre 1950 BULLETIN DE BOURSE
Publicationdate: 1950-10-05
Languagecode: fr
Transcript Length: 2103 characters
Transcript:
d octobre 1950 BULLETIN DE BOURSE u E Zurich :. Cour _.', Obligations A 5 VA% Féd. 42 / ms 101-25 d'
«» 3%% Féd. 43 / av. 10675 d 1 t) 6-8° 3%% Féd. 44 / mal 1 ( wfid 10 < _"° 3% Fédéral 49.. _™ _' 9° 
104. 75 3% C. F. F. 38.. 1 t) 3 _- 65 m 50 Actions Swissair.... 220 215 d B. Com. de Bâle 263 265 Ba
nque Fédérale 178 179 Union B. Suisses 900 902 Société B. Suisse 786 786 Crédit Suisse.. 797 796 d C
onli Linoléum. 216 218 Electro Watt.. 715 717 interhandel... 668 667 Motor Colombus. 509 510 S. A. E
. G. Sér. 1 66 66 Indelec.... 267 274 Italo-Suisse prior. 83 83 Réassurances.. 5650 5710 Winterthour
 Ace. 5790 5800 Zurich Assuranc. 7900 d 7990 o Aar-Tessin i.. 1180 1180 Zurich : Cou. ra du Actions 
4 5 Saurer ¦ _<,, 880 890 Aluminium B ¦ s 2075 2098 Bally... a a -, 725 725 Brown-Boveri.. 925 930 F
. Mot. Suisse C. 1360 d 1360 Fischer.,, «

# Retrieval Augmented Generation

In [19]:
# 1. Install the 'openai' library
!pip install openai -q

## USE CASE 1 - AI Assisted Source Finding

In [12]:
# Build the context string for the LLM prompt from the top 5 retrieved articles,
# then print it for inspection. Formatting once avoids the throwaway assignment
# of None that a print-mode call would produce.
RAG_input_context = format_and_display_articles(result_articles[:5], return_string=True)
print(RAG_input_context)

--- Article 1 ---
Title: EN SUISSE : A ZURICH, DES ÉMEUTIERS FONT METTRE EN LIBERTÉ UN PRISONNI[...]
Publicationdate: 1919-06-16
Languagecode: fr
Transcript Length: 1669 characters
Transcript:
ZURICH, -15 juin. — Des troubles assez sérieux ont éclaté à Zurich, où, comme on le sait, existe une
 minorité socialiste extrémiste des plus actives. Une grande manifestation avait été organisée, vend
redi soir, pair l'Union ouvrière zurichoise, pour honorer la mémoire de Posa Luxembourg. Le préfet d
e police de Zurich, le socialiste Greber, sur la promesse que la manifestation se déroulerait dans l
e calme, avait donné des instructions pour que la polioe.se tînt éloignée de la réunion. Celle-ci co
mmença bien; mais un orateur vint annoncer que le délégué socialiste suisse. Conrad Wyss venait, en 
rentrant d'Allemagne, d'être arrêté par les autorités fédérales, pour avoir introduit des brochures 
et des tracts de propagande. Aussi tôt la foule se porta vers la prison et réclama la mi.se en liber

In [13]:
# Inspect the raw context string (newlines shown as \n escapes) that is inserted into the LLM prompt.
RAG_input_context

'--- Article 1 ---\nTitle: EN SUISSE : A ZURICH, DES ÉMEUTIERS FONT METTRE EN LIBERTÉ UN PRISONNI[...]\nPublicationdate: 1919-06-16\nLanguagecode: fr\nTranscript Length: 1669 characters\nTranscript:\nZURICH, -15 juin. — Des troubles assez sérieux ont éclaté à Zurich, où, comme on le sait, existe une\n minorité socialiste extrémiste des plus actives. Une grande manifestation avait été organisée, vend\nredi soir, pair l\'Union ouvrière zurichoise, pour honorer la mémoire de Posa Luxembourg. Le préfet d\ne police de Zurich, le socialiste Greber, sur la promesse que la manifestation se déroulerait dans l\ne calme, avait donné des instructions pour que la polioe.se tînt éloignée de la réunion. Celle-ci co\nmmença bien; mais un orateur vint annoncer que le délégué socialiste suisse. Conrad Wyss venait, en \nrentrant d\'Allemagne, d\'être arrêté par les autorités fédérales, pour avoir introduit des brochures \net des tracts de propagande. Aussi tôt la foule se porta vers la prison et réclama 

### OPENROUTER VERSION

In [24]:
import openai
import getpass
import os

# 2. Securely get your API key
# When you run this, a password-style box will appear.
# Paste your OpenRouter API key there and press Enter.
# The key is only requested if it is not already present in the environment.
if "OPENROUTER_API_KEY" not in os.environ:
  os.environ["OPENROUTER_API_KEY"] = getpass.getpass("Enter your OpenRouter API Key: ")

# 3. Configure the OpenAI client to use OpenRouter
client = openai.OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.environ["OPENROUTER_API_KEY"],
)

# 4. Set up your request
model_id = "google/gemma-3-27b-it:free"
core_input = "Summarize how the following articles that might contain OCR errors relate to the question of the user. \n"
# Full prompt: task instruction, the user's question, and the retrieved articles.
input_with_context = core_input + " Question: " + question + "\n Retrieved Context: " + RAG_input_context + "\n"

print(f"Sending request to model: {model_id}...")

# 5. Send the request
try:
  completion = client.chat.completions.create(
    model=model_id,
    messages=[
      {
        "role": "user",
        # Send the prompt built above (the previous `sample_prompt` was never
        # defined in this notebook and only worked through leftover kernel state).
        "content": input_with_context,
      },
    ],
  )

  # 6. Print the model's response
  response_text = completion.choices[0].message.content
  print("\n--- Core input of task ---")
  print(core_input)
  print("\n--- Model Response ---")
  print(response_text)
  print("----------------------")

except openai.AuthenticationError:
  print("AuthenticationError: Invalid API key. Please check your OpenRouter API key.")
except Exception as e:
  # Deliberate best-effort: report the failure without aborting the notebook run.
  print(f"An error occurred: {e}")

Sending request to model: google/gemma-3-27b-it:free...

--- Core input of task ---
Summarize how the following articles that might contain OCR errors relate to the question of the user. 


--- Model Response ---


Here's a summary of what the articles reveal about events in Zurich in 1950, and surrounding years, based on the provided text (with consideration for potential OCR errors):

*   **Article 1 (1919):** This article describes riots in Zurich in June 1919 following the arrest of a socialist delegate. While it concerns Zurich, it is *not* from 1950.

*   **Article 2 (1953):** This article lists historical events occurring in increments of 10, 20, 30, 40 and 50 years prior to October 26, 1953. It mentions that 50 years before 1953 (i.e. in 1903), Zizers (not Zurich) experienced a large fire.

*   **Article 3 (1959):** This article reports a murder in Zurich on April 24, 1959. Again, not 1950.

*   **Article 4 (1951):** This article details the financial results of the *Banque can