In [1]:
import requests
from dotenv import load_dotenv
import os
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

load_dotenv('../.env')
rapidapi_key = os.getenv('X-RapidAPI-Key')
open_ai_key = os.getenv('OPENAI')


In [None]:
url = "https://everyearthquake.p.rapidapi.com/earthquakesByDate"

querystring = {"startDate":"2021-01-01",
               "endDate":"2023-12-10",
               "start":"1","count":"100",
               "type":"earthquake",
               "latitude":"33.962523",
               "longitude":"-118.3706975",
               "radius":"1000",
               "units":"miles"
               ,"magnitude":"3",
               "intensity":"1"}

headers = {
	"X-RapidAPI-Key": rapidapi_key,
	"X-RapidAPI-Host": "everyearthquake.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)


In [None]:
import json
data=response.json()['data']
with open('earthquakes.json', 'w') as f:
    json.dump(data, f)

In [None]:
import json
with open('earthquakes.json', 'r') as f:
    data_r = json.load(f)

In [None]:
import pandas as pd 
df = pd.json_normalize(data_r, record_path=None)

In [None]:
from src.jsonconverter import JsonToDocument

converter = JsonToDocument(content_field="title")
results = converter.run(sources=["earthquakes.json"])
documents = results["documents"]

## Build Indexing pipeline

In [2]:
from src.jsonconverter import JsonToDocument

from haystack import Pipeline
from haystack.document_stores import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter


document_store = InMemoryDocumentStore()
converter = JsonToDocument(content_field="title", flatten_field=None, one_doc_per_row=True)
embedder = SentenceTransformersDocumentEmbedder()
writer = DocumentWriter(document_store=document_store)

indexing_pipeline = Pipeline()
indexing_pipeline.add_component("converter", converter)
indexing_pipeline.add_component("embedder", embedder)
indexing_pipeline.add_component("writer", writer)

indexing_pipeline.connect("converter", "embedder")
indexing_pipeline.connect("embedder", "writer")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
indexing_pipeline.run(data={"converter":{"sources": ["earthquakes.json"]}})

Converting JSON files to Documents: 100%|██████████| 1/1 [00:00<00:00, 30.68it/s]
Batches: 100%|██████████| 4/4 [00:03<00:00,  1.15it/s]


{'writer': {'documents_written': 100}}

In [4]:
document_store.filter_documents()

[Document(id=fee6a5687926e2bebd63b4e8f3a0d32883e087815ed3ae9d6190e2f693c6b9d9, content: 'M 3.7 - 25 km SSW of Los Banos, CA', meta: {'id': 'nc73972711', 'magnitude': '3.7', 'type': 'earthquake', 'title': 'M 3.7 - 25 km SSW of Los Banos, CA', 'date': '2023-12-07T06:16:51', 'time': '1701929811830', 'updated': '1702005347995', 'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/nc73972711', 'detailUrl': 'https://earthquake.usgs.gov/earthquakes/feed/v1.0/detail/nc73972711.geojson', 'felt': '68', 'cdi': '3', 'mmi': '4', 'alert': '', 'status': 'reviewed', 'tsunami': '0', 'sig': '234', 'net': 'nc', 'code': '73972711', 'ids': ',ew1701929810,nc73972711,us7000lgur,', 'sources': ',ew,nc,us,', 'types': ',dyfi,focal-mechanism,nearby-cities,origin,phase-data,scitech-link,shake-alert,shakemap,', 'nst': '151', 'dmin': '0.1422', 'rms': '0.2', 'gap': '25', 'magType': 'ml', 'geometryType': 'Point', 'depth': '6.53', 'latitude': '36.85', 'longitude': '-120.976', 'place': '25 km SSW of Los Banos, CA',

### Question and Answering pipeline

In [5]:

from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import GPTGenerator

######## Complete this section #############
prompt_template = """\
Use the following context to answer the user's question. If the context provided doesn't answer the question - please respond with: "I don't know".

### CONTEXT
{% for doc in documents %}
  {{ doc.content }}
{% endfor %}

### USER QUESTION
{{query}}


"""
prompt_builder = PromptBuilder(prompt_template)
############################################
query_embedder = SentenceTransformersTextEmbedder()
retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=100)
llm = GPTGenerator(api_key=open_ai_key)

In [6]:
pipeline = Pipeline()
pipeline.add_component(instance=query_embedder, name="query_embedder")
pipeline.add_component(instance=retriever, name="retriever")
pipeline.add_component(instance=prompt_builder, name="prompt_builder")
pipeline.add_component(instance=llm, name="llm")

pipeline.connect("query_embedder.embedding", "retriever.query_embedding")
pipeline.connect("retriever.documents", "prompt_builder.documents")
pipeline.connect("prompt_builder", "llm")

In [7]:
query = "Where did the earthquake happen?"
result = pipeline.run(data={"query_embedder": {"text": query}, "prompt_builder": {"query": query}})
print(result['llm']['replies'][0])

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.32s/it]


The earthquakes in the context happened in various locations including Ocotillo, CA; Humboldt Hill, CA; Ferndale, CA; Fontana, CA; Fullerton, CA; Pacific Grove, CA; Burney, CA; Fort Ross, CA; Imperial, CA; Big Bear City, CA; San Clemente Is. (SE tip), CA; Bodfish, CA; Big Pine, CA; Isleton, CA; Emiliano Zapata, Mexico; Coyanosa, Texas; The Geysers, CA; Segundo, Colorado; Toyah, Texas; Lima, Montana; Stanley, Idaho; Walker, CA; Petrolia, CA; Woodruff, Utah; Jal, New Mexico; Stanton, Texas; Stanwood, Washington; Ackerly, Texas; Kanarraville, Utah; Burley, Washington; Millbrae, CA; Weston, Colorado; and Nevada.


In [8]:
query = "Identify entries in the documents for earthquakes with a magnitude of 5 or greater? Provide the date,location, and magnitude."
result = pipeline.run(data={"query_embedder": {"text": query}, "prompt_builder": {"query": query}})
print(result['llm']['replies'][0])

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.61it/s]


I don't know.


In [None]:
query = "How many earthquakes"
result = pipeline.run(data={"query_embedder": {"text": query}, "prompt_builder": {"query": query}})
print(result['llm']['replies'][0])

## Compare against the dataframe

In [None]:
df