In [None]:
!pip install spacy scikit-learn textract


Collecting spacy
  Using cached spacy-3.5.0-cp311-cp311-macosx_10_9_x86_64.whl (6.7 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.2.1-cp311-cp311-macosx_10_9_x86_64.whl (9.0 MB)
Collecting textract
  Using cached textract-1.6.5-py3-none-any.whl (23 kB)
Collecting preshed<3.1.0,>=3.0.2
  Using cached preshed-3.0.8-cp311-cp311-macosx_10_9_x86_64.whl (103 kB)
Collecting thinc<8.2.0,>=8.1.0
  Using cached thinc-8.1.7-cp311-cp311-macosx_10_9_x86_64.whl (763 kB)
Collecting srsly<3.0.0,>=2.4.3
  Using cached srsly-2.4.5-cp311-cp311-macosx_10_9_x86_64.whl (488 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Using cached catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting typer<0.8.0,>=0.3.0
  Using cached typer-0.7.0-py3-none-any.whl (38 kB)
Collecting pathy>=0.10.0
  Using cached pathy-0.10.1-py3-none-any.whl (48 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4
  Using cached pydantic-1.10.5-cp311-cp311-macosx_10_9_x86_64.whl (2.8 MB)
Collecting langcodes<4.0.0,>=3.2.0
  Using cached

  DEPRECATION: lxml is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559
  Running setup.py install for lxml ...

In [None]:
import os
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import textract

# Name of the spaCy pipeline package used for named-entity extraction
SPACY_MODEL_NAME = "en_core_web_sm"

# Shared pipeline instance; the functions below call it as `nlp(text)`
nlp = spacy.load(SPACY_MODEL_NAME)


In [None]:
def extract_entities(text, chunk_size=100000):
    """Return the text of every named entity spaCy finds in ``text``.

    The input is processed in consecutive slices of ``chunk_size`` characters
    and the entities of *all* slices are collected, in document order.

    Args:
        text: The string to analyse.
        chunk_size: Maximum number of characters passed to the pipeline in a
            single call. Must be positive.

    Returns:
        A list of entity strings (``ent.text``); empty when ``text`` is empty
        or no entities are found. Duplicates are kept.

    Raises:
        ValueError: If ``chunk_size`` is not positive.

    Note:
        Slices are cut at fixed character offsets, so an entity that straddles
        a slice boundary may be split or missed.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    entities = []
    for start in range(0, len(text), chunk_size):
        chunk_doc = nlp(text[start:start + chunk_size])
        # Accumulate per chunk; keeping only the last parsed chunk would
        # silently drop every entity from the earlier ones.
        entities.extend(ent.text for ent in chunk_doc.ents)
    return entities


In [None]:
def load_data(directory):
    """Read every supported document in ``directory`` as plain text.

    Supported types, matched case-insensitively on the file name suffix:

    * ``.xlsx`` -- read with ``pd.read_excel`` and rendered via ``to_string()``
    * ``.csv``  -- read with ``pd.read_csv`` and rendered via ``to_string()``
    * ``.docx`` / ``.pdf`` -- extracted with ``textract.process`` and decoded
      as UTF-8

    Only the top level of ``directory`` is scanned; entries that are not
    regular files, and files with any other suffix, are skipped.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A tuple ``(file_paths, file_contents)`` of two parallel lists: the
        full path of each loaded file and its text. Entries are ordered by
        file name so repeated runs give the same order.
    """
    file_paths = []
    file_contents = []

    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # A sub-directory whose name happens to end in ".csv" etc. is not data.
        if not os.path.isfile(file_path):
            continue

        # Compare on the lower-cased name so "REPORT.PDF" is not skipped.
        lowered_name = file_name.lower()
        if lowered_name.endswith('.xlsx'):
            text = pd.read_excel(file_path).to_string()
        elif lowered_name.endswith('.csv'):
            text = pd.read_csv(file_path).to_string()
        elif lowered_name.endswith(('.docx', '.pdf')):
            text = textract.process(file_path).decode('utf-8')
        else:
            continue

        file_paths.append(file_path)
        file_contents.append(text)

    return file_paths, file_contents

In [None]:
def semantic_search(query, file_contents, top_n=5):
    """Rank documents against ``query`` by TF-IDF over named entities.

    Each document (and the query) is reduced to the named entities returned
    by ``extract_entities``; those entity strings are the TF-IDF terms.
    Documents are then ordered by cosine similarity to the query.

    Args:
        query: The search string.
        file_contents: Sequence of document texts to search.
        top_n: Maximum number of matches to return.

    Returns:
        A list of ``(document_text, similarity)`` tuples, best match first,
        with at most ``top_n`` items. Returns ``[]`` when ``file_contents``
        is empty or ``top_n`` is not positive. Documents sharing no entity
        with the query are still returned, with similarity 0.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer.fit_transform`` when no
            entity is found in any document (empty vocabulary).
    """
    # Nothing to rank: return before fitting a vectorizer on an empty corpus.
    if len(file_contents) == 0 or top_n <= 0:
        return []

    def entity_tokens(text):
        # Case-fold after extraction so "Intel" and "intel" are one term.
        return [entity.lower() for entity in extract_entities(text)]

    # lowercase=False: by default the vectorizer lowercases text *before* the
    # tokenizer runs, so the NER model would only ever see lower-cased text.
    # token_pattern=None: unused with a custom tokenizer; avoids the warning.
    vectorizer = TfidfVectorizer(
        tokenizer=entity_tokens,
        lowercase=False,
        token_pattern=None,
    )

    # Learn the entity vocabulary from the documents, then project the query
    # into the same vector space.
    vectorized_contents = vectorizer.fit_transform(file_contents)
    query_vector = vectorizer.transform([query])

    # One similarity score per document
    similarities = cosine_similarity(query_vector, vectorized_contents).flatten()

    # Indices of the highest-scoring documents, best first
    top_indices = similarities.argsort()[::-1][:top_n]

    return [(file_contents[i], similarities[i]) for i in top_indices]


In [None]:
# Folder containing the documents to search. Set the SEMANTIC_SEARCH_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the author's original local path.
data_dir = os.environ.get(
    "SEMANTIC_SEARCH_DATA_DIR",
    "/Users/smhabibulmursaleenchowdhury/Desktop/Masters Project/Data",
)

In [None]:
# Search phrase to look up in the corpus
query = "Intel Xeon E5-2699 V3 (OEM/Tray)?"

# Read every supported document found under data_dir
file_paths, file_contents = load_data(directory=data_dir)



In [None]:
# Rank the loaded documents against the query (default number of matches)
results = semantic_search(query=query, file_contents=file_contents)

In [None]:
# Show each matched document followed by its similarity score
for match_text, score in results:
    report = f"Data: {match_text}\nSimilarity: {score}\n"
    print(report)