In [3]:
from google.colab import files

# Upload the file
uploaded = files.upload()

Saving 10q_citi_q32022.json to 10q_citi_q32022.json
Saving 10q_mg_0322.json to 10q_mg_0322.json


In [65]:
# Upload the file
uploaded_1 = files.upload()

Saving social_media.csv to social_media.csv


## 1. Preparation

In [14]:
# import relevant packages
import glob
import os
import json
import pandas as pd

# import third party packages
import torch

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain.retrievers import EnsembleRetriever
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings

In [47]:
embedding_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1",
                                  encode_kwargs={'normalize_embeddings': True}
)

In [59]:
dimensions = 512
transformer_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1", truncate_dim=dimensions)

## 2. Read json

In [17]:
def json_to_df(result_dict):
    # create df with all paragraphs
    idx_list = []
    role_list = []
    content_list = []
    # page_list = []
    for idx, paragraph in enumerate(result_dict["paragraphs"]):
        idx_list.append(idx)
        role_list.append(paragraph['role'])
        content_list.append(paragraph['content'])
        # page_list.append(paragraph['bounding_regions'][0]['page_number'])
    df_paragraph = pd.DataFrame({'idx':idx_list,
                                'role':role_list,
                                'content':content_list,
                                # 'page':page_list
                                })
    return df_paragraph

For the explaination purpose, this note book is run with 1 file '10q_citi_q32022' only

In [19]:
file_name = '10q_citi_q32022'
file_path = f'{file_name}.json'

# Open the JSON file
with open(file_path, 'r') as file:
    # Load JSON data
    data = json.load(file)

# Load json file to dataframe
df_paragraph = json_to_df(data)

# Filter data to keep relevant information only
df_filtered = df_paragraph[(df_paragraph['role'] == 'sectionHeading') | (df_paragraph['role'] == 'pageNumber') | (df_paragraph['role'].isna())]
df_filtered.head(5)

Unnamed: 0,idx,role,content
1,1,,(Mark One)
2,2,,☒ :selected:
3,3,,QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(...
4,4,,"For the quarterly period ended September 30, 2..."
5,5,,:unselected: ☐


In [41]:
# Initialize variables to track current page number and section heading
current_page = None
current_heading = None

# List to collect the dictionaries
content_list = []

# Iterate over each row in the DataFrame
for idx, row in df_filtered.iterrows():
  if len(row['content'].split()) > 10:
    role = row['role']
    content = row['content']

    # Update the current page number if role is 'pageNumber'
    if role == 'pageNumber':
        current_page = row['content']
    # Update the current heading if role is 'sectionHeading'
    elif role == 'sectionHeading':
        current_heading = row['content']
    # If the role is None, add the dictionary to the list with the current page and heading
    elif role is None:
        content_temp = Document(
            page_content = content,
            metadata={
                'source' : file_name,
                'heading' : current_heading,
                'page': current_page
            }
        )
        content_list.append(content_temp)

# Display the list of dictionaries
content_list[:10]


[Document(metadata={'source': '10q_citi_q32022', 'heading': None, 'page': None}, page_content='QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934'),
 Document(metadata={'source': '10q_citi_q32022', 'heading': None, 'page': None}, page_content='For the quarterly period ended September 30, 2022 OR TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the transition period from to Commission file number 1-9924'),
 Document(metadata={'source': '10q_citi_q32022', 'heading': None, 'page': None}, page_content='Citigroup Inc. (Exact name of registrant as specified in its charter)'),
 Document(metadata={'source': '10q_citi_q32022', 'heading': None, 'page': None}, page_content="(Address of principal executive offices) (212) 559-1000 (Registrant's telephone number, including area code)"),
 Document(metadata={'source': '10q_citi_q32022', 'heading': None, 'page': None}, page_content='Securities registered pursuant to Sectio

## 3. Create database

In [51]:
# initialize the bm25 retriever and faiss retriever
bm25_retriever = BM25Retriever.from_documents(content_list, k = 5)

In [50]:
faiss_retriever = FAISS.from_documents(
    documents = content_list, embedding = embedding_model
).as_retriever(search_kwargs = {'k':10})

In [104]:
# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.8, 0.2]
)

## 4. Semantic router

A Semantic Router is a mechanism or system that routes information or data based on the meaning (semantics) of the content rather than simply using syntactic or structural rules. For example, if the users input relevant to our context (in this example is financial context) then we can continue with ourknowledge base, if not then we should not continue.

In [72]:
social_media_data = pd.read_csv('social_media.csv')
social_media = social_media_data.Text.to_list()
social_media[:5]

[' Enjoying a beautiful day at the park!              ',
 ' Traffic was terrible this morning.                 ',
 ' Just finished an amazing workout! 💪               ',
 ' Excited about the upcoming weekend getaway!        ',
 ' Trying out a new recipe for dinner tonight.        ']

In [73]:
header_list = list(df_paragraph[(df_paragraph['role'] == 'sectionHeading')]['content'].unique())
header_list[:5]

['OVERVIEW',
 'Legacy Franchises',
 'Corporate/Other',
 '· Asia Consumer Banking (Asia Consumer)',
 'Citigroup Regions(2)']

In [94]:
query = "What happen with capital resources"

In [96]:
check_list = [query] + social_media

['What happen with capital resources',
 ' Enjoying a beautiful day at the park!              ',
 ' Traffic was terrible this morning.                 ',
 ' Just finished an amazing workout! 💪               ',
 ' Excited about the upcoming weekend getaway!        ']

In [100]:
def avg_similary(query: str, ctg_list: list):
  check_list = [query] + ctg_list
  embeddings = transformer_model.encode(check_list)
  similarities = cos_sim(embeddings[0], embeddings[1:])
  average = torch.mean(similarities[0])
  return average

In [101]:
avg_social_media = avg_similary(query, social_media)
avg_financial_list = avg_similary(query, header_list)

In [102]:
if avg_financial_list > avg_social_media :
  print(True)
else:
  print(False)

True


if result is True then we continue to query in knowledge base. if not then dont

In [None]:
# test with random query
test_query = 'What is the weather like today?'

# calculate similarity score
test_avg_social_media = avg_similary(test_query, social_media)
test_avg_financial_list = avg_similary(test_query, header_list)

In [106]:
# test query
if test_avg_financial_list > test_avg_social_media :
  print(True)
else:
  print(False)

False


## 5. Query and find answer

In [109]:
answers = ensemble_retriever.invoke(query)[:5] # take top 5 only
for ans in answers:
  print(ans.page_content)


For additional information about capital resources, including Citi's capital management, regulatory capital buffers, the stress testing component of capital planning and current regulatory capital standards and developments, see "Capital Resources" and "Risk Factors" in Citi's 2021 Form 10-K.
Moreover, Citigroup Global Markets Limited, a broker- dealer registered with the United Kingdom's Prudential Regulation Authority (PRA) that is also an indirect wholly owned subsidiary of Citigroup, had total regulatory capital of $27 billion at September 30, 2022, which exceeded the PRA's minimum regulatory capital requirements.
The operating segments are determined based on how management allocates resources and measures financial performance to make business decisions, and are reflective of the types of customers served and the products and services provided.
In addition, certain of Citi's other broker-dealer subsidiaries are subject to regulation in the countries in which they do business, inc