In [10]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process, fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wangd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
resolved = pd.read_csv('resolved_queries.csv')
new = pd.read_csv('new_queries.csv')

In [3]:
resolved

Unnamed: 0,Query_ID,Pre_Resolved_Query
0,1,Unable to connect to the internet
1,2,Payment failed during checkout
2,3,App crashes when opening settings
3,4,Forgot password and unable to reset
4,5,Unable to upload files to the server


In [4]:
new

Unnamed: 0,Variation_Query,Matches_With_Query_ID
0,Unabel to conect to the internet,1
1,Can’t connect to internet,1
2,Intenet not working,1
3,Payment failed while chekout,2
4,Payment did not go through during chckout,2
5,Payment issue at check out,2
6,Application crashes when opening setings,3
7,App crash when going to settings,3
8,Settings cause the app to chrash,3
9,Forgot passwrd and cant reset,4


In [6]:
def preprocess_text(column):
    # Convert to string and lowercase
    column = column.astype(str).str.lower()

    # Remove punctuation
    column = column.apply(lambda text: text.translate(str.maketrans('', '', string.punctuation)))

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    column = column.apply(lambda text: " ".join([word for word in text.split() if word not in stop_words]))

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    column = column.apply(lambda text: " ".join([lemmatizer.lemmatize(word) for word in text.split()]))

    return column

In [11]:
resolved.Pre_Resolved_Query=preprocess_text(resolved.Pre_Resolved_Query)
new.Variation_Query=preprocess_text(new.Variation_Query)

In [15]:
def fuzzy_match(query, choices, scorer = fuzz.token_sort_ratio, threshold=60):
    best_match, score, index = process.extractOne(query, choices)
    if score >= threshold:
        return best_match
    else:
        return None

In [16]:
print("Fuzzy Matching Example:")
for i, query in enumerate(new['Variation_Query'].head(5)):
    match = fuzzy_match(query, resolved['Pre_Resolved_Query'])
    print(f"New Query: {query}\nBest Match: {match}\n")

Fuzzy Matching Example:
New Query: unabel conect internet
Best Match: unable connect internet

New Query: can’t connect internet
Best Match: unable connect internet

New Query: intenet working
Best Match: None

New Query: payment failed chekout
Best Match: payment failed checkout

New Query: payment go chckout
Best Match: payment failed checkout



In [19]:
tfidf = TfidfVectorizer()
tfidf_resolved = tfidf.fit_transform(resolved['Pre_Resolved_Query'])
tfidf_new = tfidf.transform(new['Variation_Query'])

cosine_sim = cosine_similarity(tfidf_new, tfidf_resolved)

In [20]:
# Determine the best matches
def find_best_matches(cosine_sim, resolved, new, threshold=0.5):
    matches = []
    for i, sims in enumerate(cosine_sim):
        best_idx = np.argmax(sims)
        best_score = sims[best_idx]
        if best_score >= threshold:
            matches.append({
                'Variation_Query': new.iloc[i]['Variation_Query'],
                'Resolved_Query': resolved.iloc[best_idx]['Pre_Resolved_Query'],
                'Similarity_Score': best_score
            })
    return pd.DataFrame(matches)

In [22]:
matches_df = find_best_matches(cosine_sim, resolved, new, threshold=0.5)

matches_df.sort_values('Similarity_Score', ascending=False)

Unnamed: 0,Variation_Query,Resolved_Query,Similarity_Score
15,checkout page say payment failed,payment failed checkout,1.0
12,cant upload file server,unable upload file server,0.932706
1,can’t connect internet,unable connect internet,0.903782
16,setting page crash app immediately,app crash opening setting,0.866025
6,app crash going setting,app crash opening setting,0.866025
9,forgotten password unable reset,forgot password unable reset,0.842627
11,unable uplod file server,unable upload file server,0.842627
2,payment failed chekout,payment failed checkout,0.816497
17,password reset link working,forgot password unable reset,0.761551
13,file uploading server working,unable upload file server,0.761551
