In [3]:
import os
import requests
import torch
import json
import faiss
import torch.nn.functional as F
from langchain.vectorstores import Chroma
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

In [4]:
api_key = os.getenv("hf_api_key")
headers = {"Authorization": f"Bearer {api_key}"}

# Data Reading and Data Chunking

In [5]:
# Reading in the list of ECT in order to answer the question
texts = []
counter = 20

for ect in os.listdir("Earning Call Transcript"):
    with open("Earning Call Transcript/" + ect, 'r') as file:
        transcript = file.read()
    texts.append(transcript)
    counter -= 1
    if counter <= 0:
        break

In [261]:
file_length = 0
for ect in os.listdir("Earning Call Transcript"):
    with open("Earning Call Transcript/" + ect, 'r', encoding='utf-8') as file:
        transcript = file.read()
    file_length += len(transcript)
print(file_length / 4713)

## Having an avergae of 41666 Characters from the Earning Call transcripts

0-article-4375758-ford-motor-company-f-management-presents-evercore-isi-virtual-new-mobility-ai-forum.txt
1-article-4375756-erytech-pharma-s-eryp-ceo-gil-beyen-on-q2-2020-results-earnings-call-transcript.txt
10-article-4375497-pintec-technology-holdings-limiteds-pt-ceo-victor-li-on-q2-2020-results-earnings-call.txt
100-article-4374466-advanced-micro-devices-inc-amd-presents-deutsche-bank-2020-virtual-technology-brokers.txt
1000-article-4368452-inspired-entertainment-inc-inse-on-q2-2020-results-earnings-call-transcript.txt
1001-article-4368451-futu-holdings-limited-futu-ceo-leaf-li-on-q2-2020-results-earnings-call-transcript.txt
1002-article-4368448-k-s-aktiengesellschaft-kpluy-management-on-q2-2020-results-earnings-call-transcript.txt
1003-article-4368447-tui-ag-tuiff-ceo-friedrich-joussen-on-q3-2020-results-earnings-call-transcript.txt
1004-article-4368445-northland-power-inc-npiff-ceo-mike-crawley-on-q2-2020-results-earnings-call-transcript.txt
1005-article-4368443-thyssenkrupp-ag-ty

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 800, chunk_overlap = 100)
docs = text_splitter.create_documents(texts)
chunks = list(map(lambda x:x.page_content, docs))

In [7]:
# From this, we can deduce that the text splitter would differentiate chunks from a different document into separate all together, ensuring independence
chunks[40:50]

["like this that are mature, where you can make -- as I said earlier, commercial customer, make every minute count. So, robotic delivery is something we believe will be in the future. Today, it's in its infancy but that's why we have to work on technology today with partners and push to mature that technology so that it will be ready and we will be ready to receive an integrated solution for our customers.  So I know that you've seen our partnership with Agility Robotics, where we've taken delivery of the first two next-gen Digit robots. This is a bipedal robot that actually we used in a demonstration to take a package out of a Ford van and walk to a doorstep and deliver the package. This is not production yet but this simple demonstration that we had at CES last year gives you a glimpse",
 "not production yet but this simple demonstration that we had at CES last year gives you a glimpse that the future is coming. And so, we're also working with a number of other robotic companies, as 

In [305]:
print(len("not production yet but this simple demonstration that we had at CES last year gives you a glimpse that the future is coming. And so, we're also working with a number of other robotic companies, as well as universities and other strategic partners. We will be -- the anchor tenants of a new research facility at the University of Michigan, opening this fall, called the Ford Motor Company robotics building on the University of Michigan campus. This is a really exciting partnership we have with the University of Michigan. We will use that facility and our partnership with the University of Michigan to help us accelerate the development of next-gen technologies like advanced robotics, drone technologies, bipedal robots, rolling robots. So all of this is coming and we're right in the middle of"))

797


In [309]:
print(len("bipedal robots, rolling robots. So all of this is coming and we're right in the middle of that mix, working in the far in parallel to working in the now and the near to deliver products today, as well as develop the possibilities for tomorrow. So thanks for that question.Alright. With that, I think we'll wrap it up here. Thank you very much to Ken and the whole team at Ford. I think it was a great conversation and thanks everyone for attending.Thank you."))

458


In [311]:
print(len("Ladies and gentlemen, thank you for standing by and welcome to the ERYTECH Business Update and Financial Highlights for the Second Quarter 2020 Conference Call. At this time, all participants are in a listen-only mode. [Operator Instructions] I would now like to hand the conference call over to your speaker today, Gil Beyen, Chief Executive Officer. Please go ahead, sir.Thank you. Good afternoon, good morning [Foreign Language]. Hoping you're all well and safe and thanks for joining us for our earnings call for the second quarter of 2020 and the first half year. We announced our business and financial update yesterday evening. You should be able to access the press release and our earnings presentation on the Investors page of our website under webcast and link to slide show or via the"))

796


# Input Embeddings and Data Storage

Using `all-MiniLM-L6-v2` model to embed the model, multiple embeddings have been used to make comparison




In [8]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


### Use the CLS token's embeddings for each of the document to represent the chunk


In [313]:
def get_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :]  # Take the [CLS] token's embeddings which is a summary of all the tokens 
    return cls_embeddings.numpy()

In [315]:
doc_embeddings_cls = get_embeddings(chunks)

d = doc_embeddings_cls.shape[1]
vector_cls = faiss.IndexFlatL2(d)
vector_cls.add(doc_embeddings_cls)

In [52]:
# Query embedding for retrieval
query = "Who is the CTO of the Ford company"
query_embedding = get_embeddings([query])

# Search in FAISS
k = 5  # number of relevant docs to retrieve
distances, indices = vector_cls.search(query_embedding, k)

# Display results
print("Top relevant documents:\n")
for i, idx in enumerate(indices[0]):
    print(f"{i + 1}. {chunks[idx]} (Distance: {distances[0][i]:.4f})\n")

Top relevant documents:

1. down to two to four modules with respect to the compute stack, and then shared memory across all of the modules. So that's what we see in the next, say, four or five years.Okay, this is John, again. Last question from us before we kick it over to some of the audience Q&A. We spoke earlier about partnerships, and talked to you about Ford's closest partnership, which is Argo AI, and Ford's AV LLC. Understand that those are not under your purview today. But what we really want to focus on is knowledge sharing. So how is Ford thinking about sharing the learnings between those groups? And if you could focus the discussion around data, this is an area where Ford should have some good overlap between the two programs.Yes, that's a really important question because the retail and commercial (Distance: 23.4028)

2. Welcome back everyone. My name is John Saager from Evercore ISI. I cover autos, alongside my bold amigo and mobility partner in crime, Chris McNally. We'r

In [316]:
def chat(query, k = 5, verbose = False):
    query_embedding = get_embeddings([query])
    distances, indices = vector_cls.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""Help me to determine if the following context provided, how many of them contains the information contributed to answer generation for the
    query. And calculate the Recall@50 for me which has the following equation: Recall@50 = Number of relevant documents / 46
    This is the context:\n{context}\nQuestion:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    if type(response.json()) != list:
        print(reponse.json())
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response

query = "Can you summarize for me the main content in the earning call trancript for AMD"
response = chat(query, k = 50, verbose = True)

The response is:
 Based on the provided contexts, here are summaries for the segments that pertain to AMD's earnings call:

1. Reportedly, AMD has a significant ramp coming into the second half of the year, spanning multiple product lines and businesses. They have strong demand and are performing as expected, despite not being able to comment on specific products or customers. The Data Center and Server CPU markets are growth drivers for AMD, especially with the adoption of the EPYC processor line (Context 1).

8. AMD reached up to a double-digit market share (10%) in the Server CPU market, as they had promised to hit that target in the June quarter. They expanded their presence despite not announcing a new market share target (Context 7).

17. The macroeconomic situation, such as the COVID-19 pandemic, has impacted AMD's business. However, there seems to be a pull-in from work-from-home and cloud sectors, which has helped offset some of the challenges. Overall, the company maintains h

### Use mean pooling (Take into consideration of the effects of attention mask)
Recommended by the hugging face model website

**The mean pooling process is splitted into 2 parts to verify the effects of normalization in retrieval accuracy**

In [321]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def embedding_mask(texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input['attention_mask'])

In [323]:
docs_embeddings_mask = embedding_mask(chunks)
normalized_embeddings_mask = F.normalize(docs_embeddings_mask, p=2, dim=1)

In [326]:
# Using the unnormallized embeddings for the query

d = docs_embeddings_mask.shape[1]
vector_mask = faiss.IndexFlatL2(d)
vector_mask.add(docs_embeddings_mask)

# Query embedding for retrieval
query = "Who are the C-level executives in all the company"
query_embedding = embedding_mask([query])

# Search in FAISS
k = 5  # number of relevant docs to retrieve
distances, indices = vector_mask.search(query_embedding, k)

# Display results
print("Top relevant documents:\n")
for i, idx in enumerate(indices[0]):
    print(f"{i + 1}. {chunks[idx]} (Distance: {distances[0][i]:.4f})\n")

Top relevant documents:

1. the Q&A session at the end of our prepared remarks later on. Since this is my first call as the CEO of SGL Carbon, I would like to take a few minutes to share with you some information about myself, my management style and what my first thoughts are after 10 weeks at SGL Carbon. I was born and raised in the northern German city of Bremen, where I also studied and completed my PhD in chemistry. I started my professional career at Bayer as a laboratory manager, and in total I spent 10 years at Bayer in various roles, followed by another 10 years at LANXESS in different operational and administrative positions. Most recently, I was CEO of SALTIGO, a subsidiary of LANXESS AG, which manufactures precursors for pharmaceuticals and agricultural products. Overall, I have 23 years of experience in (Distance: 16.6575)

2. many people in person over the phone and also via video conference. My first impression is that SGL Carbon has many strong points. Some of which are

In [331]:
def chat(query, k = 5, verbose = False):
    query_embedding = embedding_mask([query])
    distances, indices = vector_mask.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""Help me to determine if the following context provided, how many of them contains the information contributed to answer generation for the
    query. And calculate the Recall@50 for me which has the following equation: Recall@50 = Number of relevant documents / 46
    This is the context:\n{context}\nQuestion:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    if type(response.json()) != list:
        print(response.json())
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response

query = "Can you summarize for me the main content in the earning call trancript for AMD"
response = chat(query, k = 50, verbose = True)

The response is:
 Answer:
Based on the provided segments, here are the main content points from the AMD earnings call transcript:

1. **Supply Chain and Market Share**:
   - AMD has secured its supply chain, from wafers to substrates, to meet product demand.
   - The company achieved a significant milestone by reaching double-digit market share (10%) in the Server CPU market in the June quarter, a goal they had previously set.

2. **Product Roadmap and Execution**:
   - The introduction of the second-generation EPYC processors (Rome) was critical for AMD’s resurgence, offering double the performance of competitors and demonstrating AMD’s ability to deliver on promises.
   - The third-generation EPYC processors (Milan) are important for maintaining trust and continued adoption, as they further demonstrate AMD's commitment to its roadmap and performance improvements.

3. **Business Impact of COVID-19**:
   - The pandemic has had mixed effects on AMD's business, with some segments experie

In [333]:
# Using the normallized embeddings for the query

d = normalized_embeddings_mask.shape[1]
vector_mask_norm = faiss.IndexFlatL2(d)
vector_mask_norm.add(normalized_embeddings_mask)

# Query embedding for retrieval
query = "Who are the C-level executives in all the company"
query_embedding = embedding_mask([query])

# Search in FAISS
k = 5  # number of relevant docs to retrieve
distances, indices = vector_mask.search(query_embedding, k)

# Display results
print("Top relevant documents:\n")
for i, idx in enumerate(indices[0]):
    print(f"{i + 1}. {chunks[idx]} (Distance: {distances[0][i]:.4f})\n")

Top relevant documents:

1. the Q&A session at the end of our prepared remarks later on. Since this is my first call as the CEO of SGL Carbon, I would like to take a few minutes to share with you some information about myself, my management style and what my first thoughts are after 10 weeks at SGL Carbon. I was born and raised in the northern German city of Bremen, where I also studied and completed my PhD in chemistry. I started my professional career at Bayer as a laboratory manager, and in total I spent 10 years at Bayer in various roles, followed by another 10 years at LANXESS in different operational and administrative positions. Most recently, I was CEO of SALTIGO, a subsidiary of LANXESS AG, which manufactures precursors for pharmaceuticals and agricultural products. Overall, I have 23 years of experience in (Distance: 16.6575)

2. many people in person over the phone and also via video conference. My first impression is that SGL Carbon has many strong points. Some of which are

In [335]:
def chat(query, k = 5, verbose = False):
    query_embedding = F.normalize(embedding_mask([query]), p = 2, dim =1)
    distances, indices = vector_mask_norm.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""Help me to determine if the following context provided, how many of them contains the information contributed to answer generation for the
    query. And calculate the Recall@50 for me which has the following equation: Recall@50 = Number of relevant documents / 46
    This is the context:\n{context}\nQuestion:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    if type(response.json()) != list:
        print(reponse.json())
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response

query = "Can you summarize for me the main content in the earning call trancript for AMD"
response = chat(query, k = 50, verbose = True)

The response is:
 Answer:
Based on the provided snippets, I can summarize the key points from the earning call transcript for AMD:

1. **Supply Chain and Market Share**: AMD has successfully lined up its supply chain, from wafers to substrates, to meet its goals. The company has achieved a significant market share milestone, with double-digit market share in the server CPU market, particularly with its EPYC processors. The introduction of the second-generation EPYC processors (Rome) has been crucial, offering double the performance of competitors and reinforcing customer confidence in AMD's execution and reliability.

2. **Product Roadmap and Execution**: The Zen CPU Core Roadmap, introduced in 2017, has been a critical factor in AMD's resurgence. The Zen architecture provided a significant improvement in performance, which has driven AMD's success. The company's focus on consistent execution and meeting roadmaps has built trust with customers and the market.

3. **Future Outlook and S

### Use Sentence Transformers to obtain embeddings

In [27]:
st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
st_embeddings = st_model.encode(chunks)


In [28]:
d = st_embeddings.shape[1]
vector_st = faiss.IndexFlatL2(d)
vector_st.add(st_embeddings)

# Query embedding for retrieval
query = "Who are the C-level executives in all the company"
query_embedding = st_model.encode([query])

# Search in FAISS
k = 5  # number of relevant docs to retrieve
distances, indices = vector_st.search(query_embedding, k)

# Display results
print("Top relevant documents:\n")
for i, idx in enumerate(indices[0]):
    print(f"{i + 1}. {chunks[idx]} (Distance: {distances[0][i]:.4f})\n")

Top relevant documents:

1. the Q&A session at the end of our prepared remarks later on. Since this is my first call as the CEO of SGL Carbon, I would like to take a few minutes to share with you some information about myself, my management style and what my first thoughts are after 10 weeks at SGL Carbon. I was born and raised in the northern German city of Bremen, where I also studied and completed my PhD in chemistry. I started my professional career at Bayer as a laboratory manager, and in total I spent 10 years at Bayer in various roles, followed by another 10 years at LANXESS in different operational and administrative positions. Most recently, I was CEO of SALTIGO, a subsidiary of LANXESS AG, which manufactures precursors for pharmaceuticals and agricultural products. Overall, I have 23 years of experience in (Distance: 1.0994)

2. many people in person over the phone and also via video conference. My first impression is that SGL Carbon has many strong points. Some of which are 

In [339]:
def chat(query, k = 5, verbose = False):
    query_embedding = st_model.encode([query])
    distances, indices = vector_st.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""Help me to determine if the following context provided, how many of them contains the information contributed to answer generation for the
    query. And calculate the Recall@50 for me which has the following equation: Recall@50 = Number of relevant documents / 46
    This is the context:\n{context}\nQuestion:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    if type(response.json()) != list:
        print(reponse.json())
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response

query = "Can you summarize for me the main content in the earning call trancript for AMD"
response = chat(query, k = 50, verbose = True)

The response is:
 Answer:
Based on the provided snippets, I can summarize the key points from the earning call transcript for AMD:

1. **Supply Chain and Market Share**: AMD has successfully lined up its supply chain, from wafers to substrates, to meet its goals. The company has achieved a significant market share milestone, with double-digit market share in the server CPU market, particularly with its EPYC processors. The introduction of the second-generation EPYC processors (Rome) has been crucial, offering double the performance of competitors and reinforcing customer confidence in AMD's execution and reliability.

2. **Product Roadmap and Execution**: The Zen CPU Core Roadmap, introduced in 2017, has been a critical factor in AMD's resurgence. The Zen architecture provided a significant improvement in performance, which has driven AMD's success. The company's focus on consistent execution and meeting roadmaps has built trust with customers and the market.

3. **Future Outlook and S

# Prompt Engineering and Answer Generation

In [154]:
model_id = "Qwen/Qwen2.5-72B-Instruct"
API_URL = f"https://api-inference.huggingface.co/models/{model_id}"

In [156]:
# Use sentence transformers to embed all the text for the answer Generation
def query_chat(query, k = 5, verbose = False):
    query_embedding = st_model.encode([query])
    distances, indices = vector_st.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""Answer the question based only on the following context:\n{context}\nAnswer the question based on the above context:\n{query}"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    print(f"The response is:\n {response.json()[0]["generated_text"]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response

### Query 1

In [158]:
query_1 = "What are the list of companies being mentioned inside the documents"
response_1 = query_chat(query_1, k = 20)

The response is:
 Answer the question based only on the following context:/n1. we'll open the call for questions. Our second quarter results were issued yesterday evening and a slide presentation is available on the Company's website at www.wisgrp.com. If you turn to Slide 2 in the deck, I will review the Safe Harbor statement. During this call, we may make some forward-looking statements during the formal discussion, as well as during the Q&A session. These statements apply to future events, which are subject to risks and uncertainties, as well as other factors that could cause actual results to differ materially from what is stated here today. These risks and uncertainties and other factors are provided in the earnings release and slides, as well as with other documents filed with the SEC. You can find all these documents on our website or at www.sec.gov. During
2. After their prepared remarks, we will open up the call for questions. If you do not have a copy of today's earnings rele

In [186]:
generated_text = response_1.json()[0]['generated_text']
input = "inside the documents?"
print(generated_text[generated_text.index(input) + len(input) +2:])

Based on the provided text, the companies that have been mentioned are:

1. **Wisgrp** - Mentioned in the context of the company's website for accessing the second quarter results.
2. **Pintec** - The CEO and CFO of Pintec are mentioned as participants in the call.
3. **thyssenkrupp** - The company's Q3 numbers are being discussed.
4. **JD.com** - Mentioned in the context of Hong Kong IPOs.
5. **NetEase** - Mentioned in the context of Hong Kong IPOs.
6. **T. Rowe Price, Franklin Templeton, and Amundi** - Reputable mutual fund managers mentioned in a partnership context.
7. **Unique Fabricating** - The company's quarterly report is mentioned.
8. **ERYTECH** - The company is discussed in the context of an At The Market (ATM) program and convertible notes.
9. **Hut 8** - Mentioned in the context of a hosting service.
10. **Northland** - The company's financial flexibility and global development footprint are discussed.
11. **Cowen and Company** - Mentioned as a sales agent for ERYTECH’s A

### Query 2

In [164]:
def query_chat(query, k = 5, verbose = False):
    query_embedding = st_model.encode([query])
    distances, indices = vector_st.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""Answer the question based only on the following context:\n{context}\nAnswer the question based on the above context:\n{query}\nOnly return the company name and nothing else"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    print(f"The response is:\n {response.json()[0]["generated_text"]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response
query_2 = "What are the list of companies being mentioned inside the documents"
response_2 = query_chat(query_2, k = 20)

The response is:
 Answer the question based only on the following context:
1. we'll open the call for questions. Our second quarter results were issued yesterday evening and a slide presentation is available on the Company's website at www.wisgrp.com. If you turn to Slide 2 in the deck, I will review the Safe Harbor statement. During this call, we may make some forward-looking statements during the formal discussion, as well as during the Q&A session. These statements apply to future events, which are subject to risks and uncertainties, as well as other factors that could cause actual results to differ materially from what is stated here today. These risks and uncertainties and other factors are provided in the earnings release and slides, as well as with other documents filed with the SEC. You can find all these documents on our website or at www.sec.gov. During

2. After their prepared remarks, we will open up the call for questions. If you do not have a copy of today's earnings rele

In [184]:
text = response_2.json()[0]["generated_text"]
input = "nothing else."
print(text[text.index(input) + len(input) +2:])

Pintec
thyssenkrupp
ERYTECH
Unique Fabricating
Hut 8
Northland
T. Rowe Price
Franklin Templeton
Amundi
JD.com
NetEase [Indiscernible] [Indiscernible] Note: The last two entries are not clearly identified in the context and may not be proper company names.
Ganzhou Aixin

Note: [Indiscernible] companies may not be valid company names and are included as per the text. Please verify these entries. 

To provide a precise list, based on clear mentions:

Pintec
thyssenkrupp
ERYTECH
Unique Fabricating
Hut 8
Northland
T. Rowe Price
Franklin Templeton
Amundi
JD.com
NetEase
Ganzhou Aixin To provide a precise list of companies mentioned in the documents, based on clear and identifiable names:

1. Pintec
2. thyssenkrupp
3. ERYTECH
4. Unique Fabricating
5. Hut 8
6. Northland
7. T. Rowe Price
8. Franklin Templeton
9. Amundi
10. JD.com
11. NetEase
12. Ganzhou Aixin

The entries marked as [Indiscernible] could not be clearly identified and are not included in the list.


### Query 3

In [195]:
def query_chat(query, k = 5, verbose = False):
    query_embedding = st_model.encode([query])
    distances, indices = vector_st.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""Answer the question based only on the following context:\n{context}\nAnswer the question based on the above context:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response
query_3 = "List all the C-level Executives"
response_3 = query_chat(query_3, k = 20)

The response is:
 Based on the given context, the C-level Executives are:

1. Torsten Derr - CEO of SGL Carbon
2. Michael Majerus - CFO of SGL Carbon
3. Tim Höttges - CEO of Deutsche Telekom
4. Christian Illek - CFO of Deutsche Telekom
5. Lorne Weil - Executive Chairman of a company (not specified the company name, but it is likely referring to the company where Lorne Weil is presenting the earnings call)
6. Brooks Pierce - A C-level executive, likely COO or similar, from a company (same as Lorne Weil)
7. Stewart Baker - Another C-level executive from a company (same as Lorne Weil)
8. Daniel Yuan - Chief of Staff and Head of IR at Futu Holdings
9. Leaf Li - Chairman and Chief Executive Officer of Futu Holdings
10. Arthur Chen - Chief Financial Officer of Futu Holdings
11. Robin Xu - Senior Vice President at Futu Holdings
12. Tracy Pagliara - President and CEO of Williams Industrial Services Group
13. Randy Lay - Senior Vice President and CFO of Williams Industrial Services Group
14. Gu

### Query 4

In [193]:
# Try to ask specific question that doesn't exist in the vector database
query_4 = "What are the 5 highlights being mentioned in the KLA Corporation Fourth Quarter Earning Call Transcript"
response_4 = query_chat(query_4, k = 5)

The response is:
 e provided context does not contain any specific information related to the KLA Corporation Fourth Quarter Earning Call Transcript. It appears that the context provided is from different earnings call transcripts from Teekay Corporation, Williams Industrial Services Group, and Unique Fabricating, but not from KLA Corporation. Therefore, there are no 5 highlights from the KLA Corporation Fourth Quarter Earning Call Transcript available in the given context. If you have specific details or transcripts from the KLA Corporation call, please share them, and I'll be happy to help identify the highlights.



### Query 5

In [200]:
# A more general question that is not included inside the context
query_5 = "Who is the KLA CEO"
response_5 = query_chat(query_5, k = 5)

The response is:
 The provided context does not mention the CEO of KLA. The context primarily discusses the restructuring plans of K+S and the new CEO of SGL Carbon, but it does not provide information about KLA's CEO. Therefore, we cannot accurately answer your question based on the given context. If you need information about KLA's CEO, you may want to look up the latest news or official press releases from KLA. However, based on the information provided, we can identify that the new CEO of SGL Carbon has a background in chemistry and experience in the chemical industry, having worked at Bayer and LANXESS before joining SGL Carbon. 
But to directly answer your question: the context does not contain information about the CEO of KLA.



### Query 6 

In [218]:
def query_chat(query, k = 5, verbose = False):
    query_embedding = st_model.encode([query])
    distances, indices = vector_st.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""Answer the question based on the following context:\n{context}\nQuestion:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    if type(response.json()) != list:
        print(reponse.json())
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response
query_6 = "Who is the KLA CEO"
response_6 = query_chat(query_6, k = 20, verbose = True)

The response is:
 Based on the information provided in the given text, the CEO of KLA is not mentioned. Therefore, I cannot provide an answer to the question. However, the context provided refers to a conference call with SGL Carbon where Torsten Derr, the CEO, is leading the call. If the question about KLA's CEO is relevant to this document, additional information is required. 

However, if you are asking for the CEO of SGL Carbon, the answer is:

Torsten Derr. 

For accurate information about the CEO of KLA, you may need to refer to the latest press releases or official company sources. 

Is there any specific text or context you are looking at for KLA's CEO? Please provide more details if needed. 

If you are asking for KLA specifically, the CEO of KLA as of the most recent information (which might not be in the provided context) is:

Rick Wallace. 

Would you like more details about Rick Wallace or KLA?

The answer is based on the context:
1. in difficult market conditions.  The Eu

### Query 7

In [243]:
def query_chat(query, k = 5, verbose = False):
    query_embedding = st_model.encode([query])
    distances, indices = vector_st.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""Answer the question based on the following context:\n{context}\nQuestion:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    if type(response.json()) != list:
        print(response.json())
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response

In [216]:
query_7 = "What is the Level 2, Level 3 and Level 4 Ford Motor refers to"
response_7 = query_chat(query_7, k = 10, verbose = True)

The response is:
 Answer:
In the context of Ford Motor and autonomous driving, different levels of automation from Level 2 to Level 4 are distinguished by the extent of a vehicle's ability to manage driving tasks and the role of the driver. Here’s a breakdown:

### Level 2 (Advanced Driver Assistance Systems - ADAS):
- **Description**: The vehicle can perform some driving tasks, such as steering, accelerating, and braking, but the driver must remain engaged and monitor the driving environment at all times.
- **Example**: Ford’s Mustang Mach-E with Co-Pilot 360 and Active Drive Assist. These systems allow for advanced hands-off, eyes-off capabilities on certain highways but still require the driver to be attentive and ready to take control.

### Level 3 (Conditional Automation):
- **Description**: The vehicle can handle most driving tasks in specific conditions, such as on highways. However, the driver must be ready to take control if the system cannot handle a situation.
- **Key Challe

### Query 8

In [239]:
# We ask the question that's not directly addressed in the earning call transcript. Surprisingly, the text is able to directly identify the missing of 
# information and give us the correct context. This could be largely attributed to the adaptive generative AI being employed. It could tell that 
# certain parts follow after another and provide a coherent answer to the question. However, the retrieval system doesn't seems to necessarily 
# retrieving all the consecutives chunks that are relevant to the question posed. This could be improved by maybe prompt engineering
query_8 = "It took AMD 6 quarters to grow from 5% to 10% market share. How do the representatives from AMD think the slope of share gain curve will be in the future?"
response_8 = query_chat(query_8, k = 25, verbose = True)

The response is:
 Answer:
The representatives from AMD are not willing to set new benchmarks with fine granularity or provide a slope for the share gain curve. They do aspire over time to exceed their historical high water mark, which was around 26% to 27% unit share, and they believe they have the roadmap to achieve this. However, they are primarily focused on leveraging the momentum they have built with their ecosystem, partners, and customers through the first and second generations of EPYC, and continuing to build on that. They aren't giving a specific timeframe or intermediate milestones for future share gains.

The answer is based on the context:
1. industry was your analyst meeting back in early March and no such number was given at that time. But I did want to just get your thoughts on the market share side of things.   It took you about six quarters or a year and a half to go from the 5% to the 10% market share. How do you think the slope of that share gain curve looks going f

### Query 9

In [245]:
# Many irrelevant chunks appear in the context
query_9 = "Can you summarize for me the main content in the earning call transcript for AMD"
response_9 = query_chat(query_9, k = 50, verbose = True)

The response is:
 Answer:
The earning call transcript for AMD highlighted the company's performance, particularly in the server CPU market, which has been a significant growth driver due to the traction of the EPYC processors. Key points included:

1. **Supply Chain and Planning:** AMD emphasized that they have a robust supply chain from wafers to substrates, and they are well-planned to meet demand.

2. **Market Share Growth:** The company celebrated reaching double-digit market share in the Server CPU market, attributed to the success of the EPYC processors.

3. **Rome and Milan:** The second-generation EPYC (Rome) was critical for re-establishing AMD in the market, offering high performance and predictability in execution. Milan, the next generation, is anticipated to further cement AMD's reputation for delivering on its roadmap and performance commitments.

4. **Customer Confidence:** AMD’s consistent execution and high-performance offerings have given customers the confidence to i

In [249]:
# In comparison to the summarize being done by the same model using the full text (Check for its efficiency)
with open("Earning Call Transcript/100-article-4374466-advanced-micro-devices-inc-amd-presents-deutsche-bank-2020-virtual-technology-brokers.txt") as file:
    amd_text = file.read()
input = f"""Answer the question based on the following context:\n{amd_text}\nQuestion:\n{query_9}\n"""
data = {"inputs": input}
response = requests.post(API_URL, headers=headers, json=data)
if type(response.json()) != list:
        print(response.json())
text  = response.json()[0]["generated_text"]
print(f"The response is:\n {text[text.index(input) + len(input):]}\n")

The response is:
 The transcript from the 2020 Deutsche Bank Technology Conference features a discussion between Ross Seymore, a Semiconductor Analyst at Deutsche Bank, and Forrest Norrod, SVP and GM of the Datacenter and Embedded Business Solutions Group at Advanced Micro Devices (AMD). Key points from the conversation are as follows:

### General Business Overview
- **Semi-Custom Business**: Forrest commented that AMD has a strong ramp in the second half of 2020 across multiple product lines, with demand remaining strong and the ramp going as expected, though he couldn't provide specifics about any particular product or customer.

### Impact of COVID-19
- **Productivity**: Despite initial concerns, productivity has either remained stable or increased. Engineers and teams have adapted to remote work, maintaining project timelines and product launches.
- **Demand**: The pandemic has accelerated IT transformation, driving increased demand for infrastructure both in the cloud and on-prem

### Query 10

In [251]:
def query_chat(query, k = 5, verbose = False):
    query_embedding = st_model.encode([query])
    distances, indices = vector_st.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""You are an AI assistant specialized in financial analysis. Chunks of the earnings call transcript will be provided as the sole information.
    Each individual Earning Call Transcript of a company is split into multiple chunks with 100 characters overlapping. 
    Mention explicitly if the answer to the question doesn't exist in the context or if the information is insufficient. 
    Answer the question based on the following context:\n{context}\nQuestion:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    if type(response.json()) != list:
        print(reponse.json())
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response

In [255]:
query_10 = "Can you summarize for me the main content in the earning call trancript for AMD"
response_10 = query_chat(query_10, k = 50, verbose = True)

The response is:
 Answer:
Based on the provided context, the main points from the AMD earnings call transcript include:

1. **Supply Chain Confidence:** The company is confident in its supply chain from wafers to substrates, ensuring they can meet their production goals.

2. **Server CPU Market Share:** AMD has achieved double-digit market share in the Server CPU market, hitting around 10% in the June quarter. This achievement is a significant milestone, with the company expected to continue its growth trajectory.

3. **Product Performance and Customer Trust:** The second-generation EPYC processors, known as Rome, have been crucial in building customer trust and confidence. These processors offer double the performance of competitors' offerings and demonstrate AMD's ability to deliver on its promises.

4. **Strategic Execution:** The company emphasizes the importance of predictable execution, which has given customers confidence to invest in AMD's infrastructure and optimize for its mi

### Query 11

In [277]:
# Using medium size chunks to split the documents (Around 10 chunks)
medium_text_splitter = RecursiveCharacterTextSplitter(chunk_size = 4000, chunk_overlap = 100)
medium_docs = medium_text_splitter.create_documents(texts)
medium_chunks = list(map(lambda x:x.page_content, medium_docs))
st_embeddings = st_model.encode(medium_chunks)
d = st_embeddings.shape[1]
medium_vector_st = faiss.IndexFlatL2(d)
medium_vector_st.add(st_embeddings)

In [284]:
def query_chat(query, vector_db, chunks, k = 5, verbose = False):
    query_embedding = st_model.encode([query])
    distances, indices = vector_db.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""You are an AI assistant specialized in financial analysis. Chunks of the earnings call transcript will be provided as the sole information.
    Each individual Earning Call Transcript of a company is split into multiple chunks with 100 characters overlapping. 
    Mention explicitly if the answer to the question doesn't exist in the context or if the information is insufficient. 
    Answer the question based on the following context:\n{context}\nQuestion:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    if type(response.json()) != list:
        print(reponse.json())
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response

response_11 = query_chat(query_10, k = 10, verbose = True, vector_db = medium_vector_st, chunks = medium_chunks)

The response is:
 Answer:
The provided passage does not contain a complete earnings call transcript for AMD, but it does include a segment where an executive is answering questions about AMD's strategic direction and market opportunities. Key points include:

1. **Interest in ARM in Data Centers**: The executive acknowledges long-standing interest in ARM architecture for data centers but emphasizes AMD's focus on providing high-performance CPU and GPU solutions. They believe this focus will keep x86 as the dominant architecture in the core market.

2. **GPU as a Growth Driver**:
   - **VDI and Remote Rendering**: AMD sees significant potential in virtual desktop infrastructure (VDI) and remote rendering, especially with the increase in work-from-home trends post-COVID.
   - **HPC**: They are focusing on traditional high-performance computing (HPC) applications such as scientific and medical research.
   - **Machine Learning and AI**: AMD views machine learning and artificial intelligen

### Query 12

In [291]:
# Using medium size chunks to split the documents (Around 10 chunks)
large_text_splitter = RecursiveCharacterTextSplitter(chunk_size = 20000, chunk_overlap = 1000)
large_docs = large_text_splitter.create_documents(texts)
large_chunks = list(map(lambda x:x.page_content, large_docs))
st_embeddings = st_model.encode(large_chunks)
d = st_embeddings.shape[1]
large_vector_st = faiss.IndexFlatL2(d)
large_vector_st.add(st_embeddings)

In [295]:
def query_chat(query, vector_db, chunks, k = 5, verbose = False):
    query_embedding = st_model.encode([query])
    distances, indices = vector_db.search(query_embedding, k)
    context = ""
    for i, idx in enumerate(indices[0]):
        context += f"{i + 1}. {chunks[idx]}\n\n"
    input = f"""You are an AI assistant specialized in financial analysis. Chunks of the earnings call transcript will be provided as the sole information.
    Each individual Earning Call Transcript of a company is split into multiple chunks with 100 characters overlapping. 
    Mention explicitly if the answer to the question doesn't exist in the context or if the information is insufficient. 
    Answer the question based on the following context:\n{context}\nQuestion:\n{query}\n"""
    data = {"inputs": input}
    response = requests.post(API_URL, headers=headers, json=data)
    if type(response.json()) != list:
        print(reponse.json())
    text  = response.json()[0]["generated_text"]
    print(f"The response is:\n {text[text.index(input) + len(input):]}\n")
    if verbose:
        print(f"The answer is based on the context:\n{context}")
    return response

response_12 = query_chat(query_10, k = 4, verbose = True, vector_db = large_vector_st, chunks = large_chunks)

The response is:
 Answer:
The provided transcript segments do not contain information from an AMD earnings call. The provided text includes segments from earnings calls for Pintec Technology Holdings Limited, MSG Networks, Unique Fabricating, and Teekay Corporation. If you have a specific part of an AMD earnings call you wish to be summarized, please provide that segment. Otherwise, I can summarize one of the provided companies' earnings calls. Let me know how you would like to proceed. 

If you would like a summary of one of the companies mentioned, please specify which company's earnings call you are interested in. For example:

- Pintec Technology Holdings Limited
- MSG Networks
- Unique Fabricating
- Teekay Corporation

I will be happy to summarize the content for you.

The answer is based on the context:
1. Good morning and good evening everyone. Thank you for standing by and welcome to Pintec Technology Holdings Limited's First Half 2020 Earnings Call. At this time, all participa