In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the CSV export into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer='data_2023-07-09 10_45_27 AM.csv')
data.head(n=5)

Unnamed: 0,ROW_ID,MODULE,LEVEL2,NAME,DESCRIPTION,KEYWORDS,ORACLE_MODULE,QUERY_PATH
0,1,Finance,Accounts Payable,Account Segment By Modified Date Report,The report generates COA,COA;account segment;modified date report,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...
1,2,Finance,Accounts Payable,ACH Formats,ACH payment formats for both CCD and PPD,ACH;CCD;PPD;payment,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...
2,3,Finance,Accounts Payable,AP Aging Summary Report,Report is developed to extract the AP aging de...,AP;date;debit balance;supplier,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...
3,4,ERP,Other,Approval Hierarchy Report,Approval Hierarchy Report,approval;hierarchy,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...
4,5,ERP,Other,ESS Diagnostics Dashboard,ESS Diagnostics Dashboard,ESS;diagnostics;dashboard,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...


In [3]:
import spacy

In [4]:
# Load spaCy's small English pipeline
nlp = spacy.load('en_core_web_sm')

# Take a private copy of spaCy's English stop-word set, so the set used for
# filtering is independent of the library's shared STOP_WORDS object
stopwords = spacy.lang.en.stop_words.STOP_WORDS
new_stopwords = set(stopwords)

def keyword_cleaner(line, stop_words=None):
    """Turn a ';'-separated keyword string into a lower-cased, space-separated one.

    Each ';'-delimited entry is lower-cased and stripped of surrounding
    whitespace. Empty entries and entries that are (as a whole) a stop word
    are dropped; multi-word entries are kept as-is.

    Parameters
    ----------
    line : str or any
        Raw keyword cell value. Non-string values (e.g. NaN for a missing
        cell) are treated as "no keywords".
    stop_words : collection of str, optional
        Entries to discard. Defaults to the module-level ``new_stopwords``.

    Returns
    -------
    str
        The remaining entries joined by single spaces ("" if none remain).
    """
    # Missing CSV cells arrive as float NaN, which has no .lower()
    if not isinstance(line, str):
        return ""

    if stop_words is None:
        stop_words = new_stopwords

    entries = (entry.strip() for entry in line.lower().split(";"))
    return " ".join(entry for entry in entries if entry and entry not in stop_words)

In [5]:
# Clean every KEYWORDS cell element-wise and store the result in a new column
data['NEW_KEYWORDS'] = data['KEYWORDS'].map(keyword_cleaner)

In [6]:
# Preview the frame, now including the NEW_KEYWORDS column
data.head(n=5)

Unnamed: 0,ROW_ID,MODULE,LEVEL2,NAME,DESCRIPTION,KEYWORDS,ORACLE_MODULE,QUERY_PATH,NEW_KEYWORDS
0,1,Finance,Accounts Payable,Account Segment By Modified Date Report,The report generates COA,COA;account segment;modified date report,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...,coa account segment modified date report
1,2,Finance,Accounts Payable,ACH Formats,ACH payment formats for both CCD and PPD,ACH;CCD;PPD;payment,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...,ach ccd ppd payment
2,3,Finance,Accounts Payable,AP Aging Summary Report,Report is developed to extract the AP aging de...,AP;date;debit balance;supplier,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...,ap date debit balance supplier
3,4,ERP,Other,Approval Hierarchy Report,Approval Hierarchy Report,approval;hierarchy,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...,approval hierarchy
4,5,ERP,Other,ESS Diagnostics Dashboard,ESS Diagnostics Dashboard,ESS;diagnostics;dashboard,Oracle R13,https://pwc.sharepoint.com/sites/US-ADV-Oracle...,ess diagnostics dashboard


In [7]:
import nltk

# word_tokenize (used in a later cell) needs the 'punkt' tokenizer models.
# Download them only when they are not already installed, so the notebook
# runs on a fresh environment without re-downloading on every run.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jvkch\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Expects the DataFrame 'data' with the cleaned 'NEW_KEYWORDS' column
# produced by the earlier cells.

# Tokenize the cleaned keyword strings into individual words
data['TOKENIZED_KEYWORDS'] = data['NEW_KEYWORDS'].apply(word_tokenize)

# Train Word2Vec model.
# A single worker thread is used so that training order (and therefore the
# learned vectors and the nearest-neighbour result downstream) is repeatable;
# multi-threaded training introduces scheduling jitter. The seed is gensim's
# default, stated explicitly. Across interpreter launches PYTHONHASHSEED must
# also be fixed for bit-identical vectors.
word2vec_model = Word2Vec(
    sentences=data['TOKENIZED_KEYWORDS'].tolist(),
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,
)

# Save the trained Word2Vec model
word2vec_model.save('word2vec_model_1.bin')

In [10]:
from gensim.models import KeyedVectors

# The file was written with Word2Vec.save(), so load it back as a full
# Word2Vec model; the word vectors are then reached through its `.wv` attribute.
word_embeddings_model = Word2Vec.load('word2vec_model_1.bin')

def document_embedding(keywords):
    """Average the vectors of a document's in-vocabulary tokens.

    Parameters
    ----------
    keywords : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray or None
        Element-wise mean of the vectors found in ``word_embeddings_model.wv``,
        or None when none of the tokens are in the vocabulary.
    """
    vectors = word_embeddings_model.wv
    known = []
    for token in keywords:
        if token in vectors:
            known.append(vectors[token])

    if not known:
        return None
    return np.mean(known, axis=0)
    
def vectorize_query(query):
    """Embed a free-text query in the same space as the document embeddings.

    The query is lower-cased and tokenized with ``word_tokenize`` — the same
    tokenizer applied to the documents before training — so that words with
    attached punctuation (e.g. "report,") still match the vocabulary. The
    averaging itself is delegated to ``document_embedding``.

    Parameters
    ----------
    query : str
        User query text.

    Returns
    -------
    numpy.ndarray or None
        Mean vector of the in-vocabulary query tokens, or None when no token
        of the query is in the vocabulary.
    """
    query_tokens = word_tokenize(query.lower())
    return document_embedding(query_tokens)

In [11]:
# Attach one averaged embedding per row (None when no token is in the vocabulary)
data['DOCUMENT_EMBEDDING'] = data['TOKENIZED_KEYWORDS'].apply(document_embedding)

# Keep only the rows that actually have an embedding, then stack them into
# the matrix used for nearest neighbor search; document_indices remembers
# which DataFrame row each matrix row came from
embedded_docs = data['DOCUMENT_EMBEDDING'].dropna()
X = np.vstack(embedded_docs.to_numpy())
document_indices = embedded_docs.index

In [28]:
# Sample free-text request to search for
user_query = "get ppd ess report"

# Embed the request with the same model used for the documents
user_query_vector = vectorize_query(query=user_query)

In [15]:
from sklearn.neighbors import NearestNeighbors

# Build the nearest neighbor search index over the document embeddings.
# Note: scikit-learn's NearestNeighbors performs an exact search (the
# 'ann_index' name is kept because later cells refer to it).
n_neighbors = 5  # You can adjust this parameter based on the number of desired neighbors.

# kneighbors() raises if asked for more neighbors than there are fitted
# samples, so cap the value at the number of embedded documents.
ann_index = NearestNeighbors(n_neighbors=min(n_neighbors, len(X)), algorithm='auto').fit(X)

In [29]:
# Find the most similar document to the user query
if user_query_vector is None:
    # vectorize_query returns None when no query word is in the vocabulary
    raise ValueError("None of the words in the user query are in the Word2Vec vocabulary.")

distances, indices = ann_index.kneighbors([user_query_vector])

# 'indices' are row positions in X. X was built only from rows whose
# DOCUMENT_EMBEDDING is not null, and document_indices holds the DataFrame
# labels of exactly those rows, so map the position back through it.
# (data.dropna() would also drop rows with a NaN in any other column and
# shift the positions out of alignment with X.)
best_position = indices[0][0]
most_similar_document_id = data.loc[document_indices[best_position], 'ROW_ID']

# Output the most similar document_id
print("The most similar document_id to the user query is:", most_similar_document_id)

The most similar document_id to the user query is: 2
