In [None]:
!pip install -U  openai lancedb
!pip install langchain-experimental langchain-openai langchain
!pip install sentence-transformers



In [None]:
# load the data using pandas
import pandas as pd
import os
import time
import lancedb
from lancedb.embeddings import EmbeddingFunctionRegistry
import getpass
import pandas as pd
from lancedb.pydantic import LanceModel, Vector

# Load the HS-code trade data into a DataFrame.
df = pd.read_csv("/content/hs_code_india_6digi_best.csv")


# Make sure OPENAI_API_KEY is set without writing the secret into the notebook:
# keep a key that is already exported in the environment, otherwise prompt for it
# (getpass hides the typed input, so the key never lands in a cell or its output).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Preview the first rows of the loaded CSV to check the columns and values.
df.head()

Unnamed: 0,HSCode,Commodity,Year_2022_2023,Share_Percentage_22_23,Year_2023_2024,Share_Percentage_23_24
0,100111,DURUM WHEAT: SEED,4.91,0.0,0.0,0.0
1,100119,OTHER DURUM WHEAT EXCL SEED,29.41,0.0,0.0,0.0
2,100191,WHEAT AND MESLIN SEED EXCL. DURUM WHEAT,107.73,0.0,0.12,0.0
3,100199,OTHER WHEAT AND MESLIN,46941.02,0.013,94.27,0.0003
4,100210,RYE -SEED,3.16,0.0,0.0,0.0


Linear Combination Reranker
This is the default re-ranker used by LanceDB. It combines the results of semantic and full-text search using a linear combination of their scores. The weight of the linear combination is configurable and defaults to 0.7, i.e., 70% weight for semantic search and 30% weight for full-text search.

In [None]:
!pip install tantivy



In [None]:
import lancedb
from lancedb.embeddings import EmbeddingFunctionRegistry
from lancedb.rerankers import LinearCombinationReranker

# Connect to the LanceDB database stored at /tmp/aa.
db = lancedb.connect("/tmp/aa")

# Fetch the "sentence-transformers" entry from LanceDB's embedding-function registry
# and instantiate it on the CPU. `func` is what the table schema uses to declare its
# source and vector fields.
registry = EmbeddingFunctionRegistry.get_instance()
embedder_factory = registry.get("sentence-transformers")
func = embedder_factory.create(device="cpu")


class Words(LanceModel):
    """Row schema for the HS-code table.

    `Commodity` is the only field marked as the embedding source: it is the sole
    free-text column, and the single `vector` field needs exactly one source to be
    computed from. The HS code and the yearly trade values are stored as plain
    columns so they come back with every search hit but do not take part in the
    embedding.
    """

    HSCode: int
    # NOTE(review): the yearly values look numeric in the CSV preview but are
    # declared as str here; kept as-is — confirm whether float is intended.
    Year_2022_2023: str
    Year_2023_2024: str
    # Text that `func` embeds into `vector`.
    Commodity: str = func.SourceField()
    # Embedding column; its width is taken from the embedding function itself.
    vector: Vector(func.ndims()) = func.VectorField()


# Create the table from the Words schema; mode="overwrite" replaces a table with the
# same name left over from an earlier run. Then load the CSV rows into it.
table = db.create_table("eximhs", schema=Words, mode="overwrite")
table.add(data=df)

# Build the full-text-search (FTS) index on the Commodity column. replace=True
# overwrites an index left behind by a previous run, so re-executing this cell does
# not trip over an already-existing index.
table.create_fts_index("Commodity", replace=True)

# Other rerankers (ColBERT, Cohere, ...) can be used as well; see
# https://blog.lancedb.com/hybrid-search-and-custom-reranking-with-lancedb-4c10a6a3447e/
# `weight` is the share given to the semantic (vector) score:
# 0 means pure text search (BM-25), 1 means pure semantic (vector) search.
reranker = LinearCombinationReranker(weight=0.3)

query = "what is HS code of sugar ?"

# Hybrid search: run the query against both the vector and the FTS index, merge the
# two result lists with the reranker, and keep the 10 best rows as a DataFrame.
lance_reranker_op = (
    table.search(query, query_type="hybrid")
    .rerank(reranker=reranker)
    .limit(10)
    .to_pandas()
)

lance_reranker_op

Unnamed: 0,HSCode,Year_2022_2023,Year_2023_2024,Commodity,vector,_relevance_score
0,170114,493967.01,38651.73,OTHER CANE SUGAR:,"[-0.09651693, -0.054201603, -0.08507634, 0.044...",0.943882
1,17,2725578.38,206998.8,SUGARS AND SUGAR CONFECTIONERY.,"[-0.044817165, -0.061849847, -0.005438532, 0.0...",0.87149
2,17,2725578.38,206998.8,SUGARS AND SUGAR CONFECTIONERY.,"[-0.044817165, -0.061849847, -0.005438532, 0.0...",0.87149
3,170290,41657.79,3899.81,"OTHER, INCLUDING INVERT SUGAR AND OTHER SUGAR ...","[-0.06565988, -0.09819972, -0.029922914, 0.074...",0.860843
4,170220,9.14,0.42,MAPLE SUGAR AND MAPLE SYRUP,"[-0.055785332, -0.052705675, -0.028172577, 0.0...",0.728385
5,170113,56381.7,4173.9,CANE SUGAR SPECIFIED IN SUBHEADING NOTE 2 TO T...,"[-0.075881355, 0.009683033, -0.027468659, -0.0...",0.560491
6,170410,11547.24,1150.48,CHEWING GUM W/N SUGAR COATED,"[-0.082156956, -0.010985073, -0.010356254, 0.0...",0.532612
7,170260,11.15,0.03,"OTHER FRUCTOSE AND FRUCTOSE SYRUP, CONTAINING ...","[-0.011077197, -0.06980431, -0.07643254, 0.056...",0.078796
8,170219,252.04,2.67,OTHER LACTOSE AND LACTOSE SYRUP,"[0.021483433, -0.12754624, -0.0002809844, 0.04...",0.04903
9,81110,671.91,689.66,"STRAWBERS,UNCOOKD/COOKD BY STEMNG/BOLNG IN WAT...","[-0.041225344, -0.051881045, 0.008875609, 0.04...",0.034394


In [None]:
# As we can see, we are getting the output from our vector DB.

In [None]:
# Drop the relevance score and the embedding vector: neither is needed for the next
# steps, which only use the human-readable columns.
# The result is bound directly to its own name instead of going through `df`, so the
# raw CSV frame that the indexing cell passes to `table.add` is not overwritten and
# re-running that cell still loads the full dataset.
result_from_lancedb = lance_reranker_op.drop(columns=["_relevance_score", "vector"])

In [None]:
# Display the cleaned search results (score and vector columns removed).
result_from_lancedb

Unnamed: 0,HSCode,Year_2022_2023,Year_2023_2024,Commodity
0,170114,493967.01,38651.73,OTHER CANE SUGAR:
1,17,2725578.38,206998.8,SUGARS AND SUGAR CONFECTIONERY.
2,17,2725578.38,206998.8,SUGARS AND SUGAR CONFECTIONERY.
3,170290,41657.79,3899.81,"OTHER, INCLUDING INVERT SUGAR AND OTHER SUGAR ..."
4,170220,9.14,0.42,MAPLE SUGAR AND MAPLE SYRUP
5,170113,56381.7,4173.9,CANE SUGAR SPECIFIED IN SUBHEADING NOTE 2 TO T...
6,170410,11547.24,1150.48,CHEWING GUM W/N SUGAR COATED
7,170260,11.15,0.03,"OTHER FRUCTOSE AND FRUCTOSE SYRUP, CONTAINING ..."
8,170219,252.04,2.67,OTHER LACTOSE AND LACTOSE SYRUP
9,81110,671.91,689.66,"STRAWBERS,UNCOOKD/COOKD BY STEMNG/BOLNG IN WAT..."


In [None]:
# Rebind the name to a copy of the search-result frame.
# NOTE(review): no later cell visible in this notebook reads lance_reranker_op, so
# this looks redundant — confirm before removing.
lance_reranker_op = lance_reranker_op.copy()

In [None]:
import pandas as pd
from langchain_openai import OpenAI
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI


# Chat model that drives the agent; temperature=0 asks for the least random output.
# NOTE(review): "gpt-3.5-turbo-0613" is a dated model snapshot — confirm it is still served.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

# Build an agent that answers questions about the retrieved rows.
agent = create_pandas_dataframe_agent(
    llm,
    result_from_lancedb,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    # Executor options must go through agent_executor_kwargs: extra keyword
    # arguments passed directly to create_pandas_dataframe_agent are not forwarded
    # to the AgentExecutor, so a top-level handle_parsing_errors has no effect.
    agent_executor_kwargs={"handle_parsing_errors": True},
    # Explicit opt-in: the pandas agent runs model-written Python against the frame.
    allow_dangerous_code=True,
)


# Ask the same question that was used for retrieval; the returned dict is displayed.
agent.invoke(query)





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThe HS code for sugar is 1701.[0m

[1m> Finished chain.[0m


{'input': 'what is HS code of sugar ?',
 'output': 'The HS code for sugar is 1701.'}