In [None]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): no versions are pinned, so a re-run may pick up different
# releases — consider pinning once a working combination is known.
%pip install datasets pandas pymongo sentence_transformers
# -U upgrades transformers to the newest available release.
%pip install -U transformers
# NOTE(review): presumably needed for device_map="auto" in the model-loading cell — confirm.
%pip install accelerate

In [None]:
# Load Dataset
import pandas as pd
from datasets import load_dataset

# Fetch the "hhe1ibeb/xinyi_geodata" dataset through `datasets.load_dataset`.
dataset = load_dataset("hhe1ibeb/xinyi_geodata")

# Materialise the "train" split as a pandas DataFrame for the cells below.
dataset_df = pd.DataFrame(dataset["train"])

# Preview the first five rows (the default for head()).
dataset_df.head()

In [None]:
# Data Preparation

# Drop rows whose "description" column is missing; the English embeddings are
# computed from that column. `subset` is passed as a list because a bare string
# is only accepted by newer pandas releases.
dataset_df = dataset_df.dropna(subset=["description"])
print("\nNumber of missing values in each column after removal:")
print(dataset_df.isnull().sum())

dataset_df.head(5)

In [10]:
from sentence_transformers import SentenceTransformer

# Model used by get_embedding_en to encode text.
# https://huggingface.co/thenlper/gte-large
# NOTE(review): get_embedding_en reads this global by name at call time, so
# rebinding `embedding_model` anywhere else changes the model it encodes with.
embedding_model = SentenceTransformer("thenlper/gte-large")


def get_embedding_en(text: str) -> list[float]:
    """Return the embedding of ``text`` from the module-level ``embedding_model``.

    Args:
        text: Text to embed. Missing values (``None``, NaN or any other
            non-string) and blank strings are treated as empty input.

    Returns:
        The embedding as a plain Python list, or an empty list when there is
        nothing to embed.
    """
    # A DataFrame column may hold None/NaN; calling .strip() on those would
    # raise AttributeError, so every non-string counts as empty input.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    # Convert the encoder output to a plain list before returning it.
    return embedding_model.encode(text).tolist()

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Embed each English description, one row at a time, and store the vectors
# in a new "embedding-en" column.
dataset_df["embedding-en"] = [
    get_embedding_en(description) for description in dataset_df["description"]
]

dataset_df.head(5)

In [15]:
from sentence_transformers import SentenceTransformer

# https://huggingface.co/thenlper/gte-large-zh
# Bound to its own name on purpose: get_embedding_en looks up the global
# `embedding_model` at call time, so rebinding that name here would make the
# English helper encode with this model on a top-to-bottom run.
embedding_model_zh = SentenceTransformer("thenlper/gte-large-zh")


def get_embedding_zh(text: str) -> list[float]:
    """Return the embedding of ``text`` from ``embedding_model_zh``.

    Args:
        text: Text to embed. Missing values (``None``, NaN or any other
            non-string) and blank strings are treated as empty input.

    Returns:
        The embedding as a plain Python list, or an empty list when there is
        nothing to embed.
    """
    # Only the "description" column is cleaned with dropna, so the Mandarin
    # column may still hold None/NaN; .strip() on those would raise.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    # Convert the encoder output to a plain list before returning it.
    return embedding_model_zh.encode(text).tolist()

In [None]:
dataset_df["embedding-zh"] = dataset_df["descriptions-mandarin"].apply(get_embedding_zh)

dataset_df.head()

## Connect to MongoDB

In [1]:
# The connection string carries credentials, so it is taken from the
# environment (or prompted for) instead of being written into the notebook.
# SECURITY: a URI with a real password was previously stored in this cell and
# in its saved output — rotate that database user's password.
import os
from getpass import getpass

if not os.environ.get("MONGO_URI"):
    # getpass keeps the secret out of the cell source and the saved output.
    os.environ["MONGO_URI"] = getpass("MongoDB connection URI: ")


In [3]:
import pymongo
import os

def get_mongo_client(mongo_uri):
    """Connect to MongoDB and verify that the deployment is reachable.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        A ``pymongo.MongoClient`` that has answered a ping, or ``None`` when
        the server could not be reached.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient() connects lazily and does not raise when the server is
        # unreachable; a cheap "ping" forces a round-trip so the success
        # message below is only printed for a working connection.
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        if client is not None:
            # Release the background resources of the unusable client.
            client.close()
        return None

    print("Connection to MongoDB successful")
    return client


# Read the connection string from the environment.
mongo_uri = os.getenv("MONGO_URI")
if mongo_uri:
    mongo_client = get_mongo_client(mongo_uri)
else:
    # With no URI, MongoClient would fall back to its default host and appear
    # to succeed against the wrong deployment, so no connection is attempted.
    print("MONGO_URI not set in environment variables")
    mongo_client = None

Connection to MongoDB successful


In [4]:

# Ingest data into MongoDB
# Handles for the "xinyi_geodata" database and its "collection_1" collection;
# `collection` is what the search helpers below receive.
db = mongo_client["xinyi_geodata"]
collection = db["collection_1"]
# Delete any existing records in the collection
# (left disabled; uncomment to wipe the collection before re-ingesting)
# collection.delete_many({})

In [None]:
# One dict per DataFrame row, ready to be inserted as MongoDB documents.
documents = dataset_df.to_dict(orient="records")
# Insertion is left disabled; uncomment to write the documents.
# collection.insert_many(documents)

# print("Data ingestion into MongoDB completed")

In [6]:
def vector_search_en(user_query, collection, num_candidates=150, limit=4):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    num_candidates (int): Number of candidate matches to consider (default 150).
    limit (int): Number of top matches to return (default 4).

    Returns:
    list: A list of matching documents (dicts with lat, lon, description and
    score). Empty when no embedding could be produced for the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding_en(user_query)

    # get_embedding_en reports "nothing to embed" with an empty list, not
    # None. Return an empty result list so callers that iterate over the hits
    # keep working, and never send an empty vector to $vectorSearch.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding-en",
                "numCandidates": num_candidates,
                "limit": limit,
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "lat": 1,  # Include the lat field
                "lon": 1,  # Include the lon field
                "description": 1,  # Include the description field
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialise the cursor
    return list(collection.aggregate(pipeline))

In [7]:
def get_search_result(query, collection):
    """Run the English vector search and format the hits as one text block.

    Each hit becomes a line of the form
    "Lat: ..., Lon: ..., Description: ...", with "N/A" for any missing field.
    Returns an empty string when there are no hits.
    """
    hits = vector_search_en(query, collection)

    formatted = [
        f"Lat: {hit.get('lat', 'N/A')}, Lon: {hit.get('lon', 'N/A')}, Description: {hit.get('description', 'N/A')}\n"
        for hit in hits
    ]
    return "".join(formatted)

In [11]:
# Run an English query and build the prompt from the retrieved sources
query = "I want to find a place where I get a lot of stores nearby"
source_information = get_search_result(query, collection)
# The prompt is the query, an instruction line, then the formatted hits.
combined_information = (
    f"Query: {query}\n"
    "According to the results, suggest the best place in response to the query:\n"
    f"{source_information}."
)

print(combined_information)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Query: I want to find a place where I get a lot of stores nearby
According to the results, suggest the best place in response to the query:
Lat: 25.045736666264823, Lon: 121.57837663001078, Description: - **Traffic:**
  - The area appears to be a pedestrian shopping corridor, with narrow lanes likely limiting vehicular access.
  - Foot traffic seems high with many shoppers, indicating lively commerce but potentially crowded spaces.

- **Attractions:**
  - Numerous clothing stores with a wide variety of apparel.
  - Prominent signage and inviting displays aimed at attracting shoppers.
  - Potentially part of a larger market or shopping district offering diverse retail options.

- **Overall Neighborhood Atmosphere:**
  - Vibrant and bustling with shoppers and a densely packed retail environment.
  - Energetic and commercial ambiance, ideal for those who enjoy a dynamic and lively setting.
  - Traditional market feel with closely situated stores, suggesting a well-established retail area 

In [12]:
def vector_search_zh(user_query, collection, num_candidates=150, limit=4):
    """
    Perform a vector search over the Mandarin embeddings in the MongoDB
    collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    num_candidates (int): Number of candidate matches to consider (default 150).
    limit (int): Number of top matches to return (default 4).

    Returns:
    list: A list of matching documents (dicts with lat, lon,
    descriptions-mandarin and score). Empty when no embedding could be
    produced for the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding_zh(user_query)

    # get_embedding_zh reports "nothing to embed" with an empty list, not
    # None. Return an empty result list so callers that iterate over the hits
    # keep working, and never send an empty vector to $vectorSearch.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index_zh",
                "queryVector": query_embedding,
                "path": "embedding-zh",
                "numCandidates": num_candidates,
                "limit": limit,
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "lat": 1,  # Include the lat field
                "lon": 1,  # Include the lon field
                "descriptions-mandarin": 1,  # Include the Mandarin description field
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialise the cursor
    return list(collection.aggregate(pipeline))

In [13]:
def get_search_result_zh(query, collection):
    """Run the Mandarin vector search and format the hits as one text block.

    Each hit becomes a line of the form
    "Lat: ..., Lon: ..., Description: ...", where the description comes from
    the "descriptions-mandarin" field and "N/A" stands in for any missing
    field. Returns an empty string when there are no hits.
    """
    hits = vector_search_zh(query, collection)

    formatted = [
        f"Lat: {hit.get('lat', 'N/A')}, Lon: {hit.get('lon', 'N/A')}, Description: {hit.get('descriptions-mandarin', 'N/A')}\n"
        for hit in hits
    ]
    return "".join(formatted)

In [16]:
# Run a Mandarin query and build the prompt from the retrieved sources
query = "我想要找一個商店很多的地方"
source_information = get_search_result_zh(query, collection)
# The prompt is the query, an instruction line, then the formatted hits.
combined_information = (
    f"Query: {query}\n"
    "According to the results, suggest the best place in response to the query:\n"
    f"{source_information}."
)

print(combined_information)

Query: 我想要找一個商店很多的地方
According to the results, suggest the best place in response to the query:
Lat: 25.045736666264823, Lon: 121.57837663001078, Description: - **交通：**
  - 這個地區看起來是一條行人購物區，狹窄的巷道可能限制了車輛進入。
  - 人流量很高，有很多購物者，顯示出蓬勃的商業活動，但可能有擁擠的空間。

- **景點：**
  - 多家服裝店，提供各種各樣的服飾。
  - 顯眼的招牌和吸引人的陳列，旨在吸引購物者。
  - 可能是一個更大的市場或購物區的一部分，提供多樣化的零售選擇。

- **整體社區氛圍：**
  - 熱鬧而繁忙，有很多購物者和密集的零售環境。
  - 充滿活力和商業氛圍，非常適合喜歡充滿活力和熱鬧場所的人。
  - 傳統市場的感覺，店鋪緊密相鄰，表明這是一個擁有各種本地商家的成熟零售區域。
Lat: 25.041046696913387, Lon: 121.56666076376725, Description: - **交通：**
  - 這個地區的道路寬敞，車道多條，顯示交通順暢。
  - 有行人過街處，提升了步行便利性。
  - 車輛和機車的存在較為普遍，顯示典型的城市交通情況。

- **景點：**
  - 有一個知名的購物中心，以大型的「Breeze」標誌為識別，表明有大型零售空間。
  - 附近有高樓大廈，表示辦公樓和可能的住宅單元混合存在。
  - 街道級別有一些小型商業和零售場所，滿足日常需求。

- **整體社區氛圍：**
  - 繁華的城市氛圍，呈現都市繁忙的感覺。
  - 現代高樓大廈與舊建築結合，提供了當代與傳統建築的融合。
  - 可能會是一個充滿活力的區域，提供各種設施，包括購物、餐飲和商業服務。
  - 街道整潔維護良好，顯示組織有序且活力四射的都市環境。
Lat: 25.04623, Lon: 121.57837, Description: - **交通：**
  - 此地區呈狹窄的小巷，車輛通行受限。
  - 適合行人交通，不適合車輛通行。

- **景點：**
  - 圖片顯示店面關閉，可能是營業時間外或商業活動衰退的跡象。
 

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint shared by the tokenizer and the model.
GEMMA_CHECKPOINT = "google/gemma-2b-it"

tokenizer = AutoTokenizer.from_pretrained(GEMMA_CHECKPOINT)
# CPU-only alternative: uncomment the next line and drop the device_map call below.
# model = AutoModelForCausalLM.from_pretrained(GEMMA_CHECKPOINT)
# GPU-enabled load: device_map="auto" lets the library decide weight placement.
model = AutoModelForCausalLM.from_pretrained(GEMMA_CHECKPOINT, device_map="auto")

In [None]:
# Tokenise the prompt and move the tensors to the device the model lives on.
# The model is loaded with device_map="auto", so hard-coding 'cuda' fails on a
# CPU-only machine; model.device follows wherever the weights were placed.
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
# Generate up to 500 new tokens and print the decoded sequence.
response = model.generate(**input_ids, max_new_tokens=500)
print(tokenizer.decode(response[0]))