In [None]:
import csv
import pandas as pd
import spacy
from spacy import registry
from spacy.tokens import Doc



In [None]:
# Define a new entity label
MY_LABEL = "MY_LABEL"

@spacy.Language.component("my_custom_component")
def my_custom_component(doc):
    """Relabel every ``ORG`` entity in ``doc`` as ``MY_LABEL``.

    Registered as the spaCy pipeline component ``"my_custom_component"``.

    Args:
        doc: The spaCy ``Doc`` being processed by the pipeline.

    Returns:
        The same ``doc``, with its entities replaced by a relabelled copy.
    """
    # Spans obtained from ``doc.ents`` are views: assigning to ``ent.label_``
    # does not update the entities stored on the Doc (and some spaCy versions
    # reject the assignment outright). Build replacement spans and write the
    # whole entity tuple back instead.
    relabelled = []
    for ent in doc.ents:
        if ent.label_ == "ORG":
            # Same token range and knowledge-base id, new label.
            ent = spacy.tokens.Span(
                doc, ent.start, ent.end, label=MY_LABEL, kb_id=ent.kb_id
            )
        relabelled.append(ent)
    doc.ents = relabelled
    return doc

In [None]:
# Load the spaCy "en_core_web_sm" pipeline. `nlp` is the shared pipeline object
# that the cells below call to turn raw text into a Doc with entities.
# NOTE(review): "my_custom_component" is registered above but is not added to
# this pipeline in the cells shown here, so it does not run - confirm intent.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Entity extraction helper: run the pipeline and keep only the wanted labels
def extract_entities(text, entities_to_extract):
    """Return the entity spans of ``text`` whose label is in ``entities_to_extract``.

    Args:
        text: String to run through the module-level ``nlp`` pipeline.
        entities_to_extract: Container of entity labels to keep.

    Returns:
        list: The matching spaCy entity spans, in document order.
    """
    matching = []
    for span in nlp(text).ents:
        # Keep the span object itself (not just its text)
        if span.label_ in entities_to_extract:
            matching.append(span)
    return matching

In [None]:
# Sanity check: run the pipeline on a sample sentence and print every detected
# entity together with its label.
doc = nlp("Steve Jobs was the CEO of Apple and lives in New York.")
for ent in doc.ents:
    print(ent.text, ent.label_)

Steve Jobs PERSON
Apple ORG
New York GPE


In [None]:
# Define some text inputs to test with: two sentences that mention companies
# and one (text2) that names no organisation or amount of money.
text1 = "Apple is looking at buying U.K. startup for $1 billion"
text2 = "The quick brown fox jumps over the lazy dog"
text3 = "Microsoft releases new version of Windows"

In [None]:
# Define the entity types to extract (spaCy entity labels); passed explicitly
# to the two-argument extract_entities in the next cells.
entities_to_extract = ["ORG", "MONEY"]

In [None]:
# Call the extract_entities function with different text inputs.
# NOTE: this uses the two-argument signature; a later cell redefines
# extract_entities with a single parameter, so this cell must run before it.
entities1 = extract_entities(text1, entities_to_extract)
entities2 = extract_entities(text2, entities_to_extract)
entities3 = extract_entities(text3, entities_to_extract)

In [None]:
# Print the results. Each list holds spaCy entity spans, which are displayed as
# their text (hence no quotes around the items in the output).
print("Entities in text 1:", entities1)
print("Entities in text 2:", entities2)
print("Entities in text 3:", entities3)

Entities in text 1: [Apple, $1 billion]
Entities in text 2: []
Entities in text 3: [Microsoft]


In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# NOTE(review): google.colab is only importable inside Google Colab, so this
# cell fails in any other environment.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the csv file into a DataFrame. The path points into the mounted Google
# Drive, so the drive-mount cell must have run first.
df = pd.read_csv("/content/drive/MyDrive/dataset.csv")

# Print the frame; pandas abbreviates large frames in the printed output.
print(df)

        RegionName               CountryName        State       City  \
0    South America  United States of America        Texas  Southlake   
1    South America  United States of America        Texas  Southlake   
2    South America  United States of America        Texas  Southlake   
3    South America  United States of America        Texas  Southlake   
4    South America  United States of America        Texas  Southlake   
..             ...                       ...          ...        ...   
395           Asia                     India  Maharashtra     Bombay   
396           Asia                     India  Maharashtra     Bombay   
397           Asia                     India  Maharashtra     Bombay   
398           Asia                     India  Maharashtra     Bombay   
399           Asia                     India  Maharashtra     Bombay   

    PostalCode     WarehouseAddress    WarehouseName     EmployeeName  \
0        26192  2014 Jabberwocky Rd  Southlake Texas     Summe

In [None]:
# Define the entities to extract from the dataset.
# NOTE: this rebinds the `entities_to_extract` name that earlier held
# ["ORG", "MONEY"]; the one-argument extract_entities defined below reads this
# module-level list at call time.
entities_to_extract = ["DATE", "PERSON", "ORG", "PRODUCT"]

Apply entity extraction techniques: Use natural language processing (NLP) techniques such as named entity recognition (NER) to extract the relevant entities from the pre-processed data. This involves analyzing the data to identify words or phrases that represent the entities of interest.

In [None]:
# Define a function to extract the entities from a text
def extract_entities(text, labels=None):
    """Return the text of every entity in ``text`` whose label is wanted.

    This definition replaces the earlier two-argument ``extract_entities``
    (which returned span objects). The optional second parameter keeps those
    earlier two-argument call sites from raising ``TypeError`` if they are
    re-run after this cell; note that this version returns strings.

    Args:
        text: String to run through the module-level ``nlp`` pipeline.
        labels: Optional container of entity labels to keep. When omitted, the
            module-level ``entities_to_extract`` is read at call time, which
            is the original behaviour of this function.

    Returns:
        list[str]: ``ent.text`` for each matching entity, in document order
        (duplicates are kept).
    """
    if labels is None:
        # Fall back to the notebook-level configuration.
        labels = entities_to_extract
    return [ent.text for ent in nlp(text).ents if ent.label_ in labels]


In [None]:
# Example usage: extract the configured entity types from one sample sentence
# and print the resulting list of entity strings.
text = "Summer Payne brought this model of Intel Xeon E5-2699 V3 (OEM/Tray) CPU"
entities = extract_entities(text)
print(entities)

['Summer Payne', 'Intel', 'Xeon E5-2699', 'OEM/Tray', 'CPU']


Create a semantic search index: Use the extracted entities to create an index of the dataset (the CSV file loaded into `df`) that maps each entity to the cells, identified by row and column, in which it appears.

In [None]:
# Create an index of the dataset: entity text -> list of (row label, column)
def build_entity_index(frame):
    """Map each extracted entity to the cells of ``frame`` that contain it.

    Args:
        frame: DataFrame whose cells are converted to text and passed to
            ``extract_entities``.

    Returns:
        dict: ``{entity_text: [(row_label, column_name), ...]}`` with cells
        listed in row-then-column order. Row labels are the ones yielded by
        ``frame.iterrows()``.
    """
    entity_index = {}
    # The same cell text can occur in many rows; run the NLP pipeline only
    # once per distinct text and reuse the result.
    entities_by_text = {}
    for row_label, row in frame.iterrows():
        for col in frame.columns:
            value = row[col]
            if pd.isna(value):
                # Missing cells would otherwise be indexed as the text "nan".
                continue
            cell_text = str(value)
            if cell_text not in entities_by_text:
                entities_by_text[cell_text] = extract_entities(cell_text)
            for entity in entities_by_text[cell_text]:
                entity_index.setdefault(entity, []).append((row_label, col))
    return entity_index


index = build_entity_index(df)

print(index)

{'26192': [(0, 'PostalCode'), (1, 'PostalCode'), (2, 'PostalCode'), (3, 'PostalCode'), (4, 'PostalCode'), (5, 'PostalCode'), (6, 'PostalCode'), (7, 'PostalCode'), (8, 'PostalCode'), (9, 'PostalCode'), (10, 'PostalCode'), (11, 'PostalCode'), (12, 'PostalCode'), (13, 'PostalCode'), (14, 'PostalCode'), (15, 'PostalCode'), (16, 'PostalCode'), (17, 'PostalCode'), (18, 'PostalCode'), (19, 'PostalCode'), (20, 'PostalCode'), (21, 'PostalCode'), (22, 'PostalCode'), (23, 'PostalCode'), (24, 'PostalCode'), (25, 'PostalCode'), (26, 'PostalCode'), (27, 'PostalCode'), (28, 'PostalCode'), (29, 'PostalCode'), (30, 'PostalCode'), (31, 'PostalCode'), (32, 'PostalCode'), (33, 'PostalCode'), (34, 'PostalCode'), (35, 'PostalCode'), (36, 'PostalCode'), (37, 'PostalCode'), (38, 'PostalCode'), (39, 'PostalCode'), (40, 'PostalCode'), (41, 'PostalCode'), (42, 'PostalCode'), (43, 'PostalCode')], '2014': [(0, 'WarehouseAddress'), (1, 'WarehouseAddress'), (2, 'WarehouseAddress'), (3, 'WarehouseAddress'), (4, 'Ware

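Perform a semantic search: Use the index to search for relevant information based on the entities extracted from the query. This involves extracting the entities from the query, looking each one up in the index to find the cells that contain it, and returning the values of those cells. Note that the results are currently ordered by row and column; ranking them by relevance is not implemented yet.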

In [None]:
# Define a function to perform a semantic search
def semantic_search(query):
    """Return the values of all indexed cells that share an entity with ``query``.

    Args:
        query: Free-text query; its entities are obtained with
            ``extract_entities``.

    Returns:
        list: The values of the matching ``df`` cells, de-duplicated by cell
        and ordered by (row label, column name). Empty when no entity of the
        query is present in ``index``.
    """
    hits = set()
    for entity in extract_entities(query):
        # Entities that were never indexed simply contribute nothing.
        hits.update(index.get(entity, ()))
    # The index stores row *labels* (as yielded by iterrows), so look cells up
    # by label with .at. Positional .iloc only agrees with the labels for a
    # default RangeIndex and builds a whole row per hit.
    return [df.at[row_label, col] for row_label, col in sorted(hits)]


In [None]:
# Example usage: search the index with a free-text query and print the values
# of every cell that shares an extracted entity with the query.
results = semantic_search("Show me items related to Summer Payne's new Intel Xeon E5-2699 V3 (OEM/Tray) product")
print(results)

['Intel Xeon E5-2699 V3 (OEM/Tray)', 'Intel Xeon E5-2697 V3', 'Intel Xeon E5-2698 V3 (OEM/Tray)', 'Intel Xeon E5-2697 V4', 'Intel Xeon E5-2685 V3 (OEM/Tray)', 'Intel Xeon E5-2695 V3 (OEM/Tray)', 'Intel Xeon E5-2697 V2', 'Intel Xeon E5-2695 V4', 'Intel Xeon E5-2695 V2', 'Intel Xeon E5-2643 V2 (OEM/Tray)', 'Intel Xeon E5-2690 (OEM/Tray)', 'Intel Xeon E5-2687W V3', 'Intel Xeon E5-2687W V4', 'Intel Xeon E5-2667 V3 (OEM/Tray)', 'Intel Xeon E5-2690 V4', 'Intel Xeon E5-2690 V3', 'Intel Xeon E5-2470V2', 'Intel Xeon E5-2683 V4', 'Intel Xeon E5-2637 V2 (OEM/Tray)', 'Intel Xeon E5-2683 V4 (OEM/Tray)', 'Intel Xeon E5-2699 V4 (OEM/Tray)', 'Intel Xeon E5-1680 V3 (OEM/Tray)', 'Intel Xeon E5-2643 V4 (OEM/Tray)', 'Intel Xeon E5-2670 V3', 'Intel Xeon E5-2680', 'Intel Xeon E5-2680 V4', 'Intel Xeon E5-2680 V3 (OEM/Tray)', 'Intel Xeon E5-2643 V3 (OEM/Tray)', 'Intel Xeon E5-2660 V4', 'Intel Xeon E5-2660 V3', 'Intel Xeon E5-2660 V3 (OEM/Tray)', 'Intel Xeon E5-2650 V2', 'Intel Xeon E5-2650 V3', 'Intel Xeon E5