In [7]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

# Load the datasets (adjust paths as needed)
purchase_data = pd.read_csv('./cleaned_datasets/final_adjusted_faulted_purchase_records.csv')
supplier_data = pd.read_csv('./cleaned_datasets/corrected_supplier_certifications.csv')

# Display first few rows
purchase_data.head()


Unnamed: 0,PO_NUM,ITEM_NAME,SUPPLIER_NAME,SUPPLIER_CODE,ORDERED_QUANTITY,PRICE,PO_VALUE,DOWNPAYMENT_DATE,DELIVERY_DATE,PART_DESCRIPTION,FAULTED_PARTS
0,100000,robotics arm,Pepperl+Fuchs Factory Automation Pvt. Ltd,VD423416,207,4087.92,846199.44,2023-11-03,2023-11-21,standard robotics arm,0
1,100001,hydraulic actuator,PHOENIX CONTACT INDIA PVT LTD,VD454846,36,151.11,5439.96,2023-05-18,2023-07-11,linear hydraulic actuator,0
2,100002,frequency inverter,ELECTRICAL CONTROL SYSTEMS PVT. LTD.,VD401852,95,4495.88,427108.6,2023-10-20,2024-01-28,high-performance frequency inverter,0
3,100003,servo amplifier,EVOKE GLOBAL,VD773282,20,1857.38,37147.6,2023-07-13,2023-08-04,compact servo amplifier,0
4,100004,DC motor,KUDAMM CORPORATION,VD157319,54,2336.32,126161.28,2023-01-03,2023-01-14,5V DC motor,5


In [8]:
# Jaccard similarity function
def jaccard_similarity(query, document):
    query_set = set(query.lower().split())
    document_set = set(document.lower().split())
    intersection = query_set.intersection(document_set)
    union = query_set.union(document_set)
    return len(intersection) / len(union)

# Cosine similarity using CountVectorizer
def cosine_similarity_func(query, documents):
    vectorizer = CountVectorizer().fit_transform([query] + documents)
    vectors = vectorizer.toarray()
    return cosine_similarity(vectors)[0, 1:]

# Sentence-BERT similarity function
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
def embedding_similarity(query, documents):
    query_embedding = model.encode(query, convert_to_tensor=True)
    doc_embeddings = model.encode(documents, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, doc_embeddings).squeeze()
    return scores.numpy()


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
def search_items(query, technique='jaccard', top_n=5):
    unique_items = purchase_data[['ITEM_NAME', 'PART_DESCRIPTION', 'PRICE', 'DOWNPAYMENT_DATE']].drop_duplicates()
    descriptions = unique_items['PART_DESCRIPTION'].tolist()

    if technique == 'jaccard':
        similarities = [jaccard_similarity(query, desc) for desc in descriptions]
    elif technique == 'cosine':
        similarities = cosine_similarity_func(query, descriptions)
    elif technique == 'embedding':
        similarities = embedding_similarity(query, descriptions)

    unique_items['SIMILARITY'] = similarities
    ranked_items = unique_items.sort_values(by='SIMILARITY', ascending=False).head(top_n)

    result = ranked_items.merge(
        purchase_data[['ITEM_NAME', 'PRICE', 'DOWNPAYMENT_DATE']].groupby('ITEM_NAME').agg(
            avg_price=('PRICE', 'mean'), last_price=('PRICE', 'last')
        ).reset_index(), on='ITEM_NAME', how='left'
    )
    return result[['ITEM_NAME', 'PART_DESCRIPTION', 'SIMILARITY', 'avg_price', 'last_price']]


In [12]:
query = input("Describe the item you want").strip().lower()
technique = input("Select a similarity technique (jaccard, cosine, embedding): ").strip().lower()
result = search_items(query, technique)
print(result)


          ITEM_NAME           PART_DESCRIPTION  SIMILARITY    avg_price  \
0  proximity switch  standard proximity switch    0.816497  2483.950271   
1  proximity switch  standard proximity switch    0.816497  2483.950271   
2  proximity switch  standard proximity switch    0.816497  2483.950271   
3  proximity switch  standard proximity switch    0.816497  2483.950271   
4  proximity switch  standard proximity switch    0.816497  2483.950271   

   last_price  
0      397.07  
1      397.07  
2      397.07  
3      397.07  
4      397.07  
