Task 1

In [1]:
import pandas as pd
import re

# Read the already-resolved queries and the new query variations from CSV files
# located in the notebook's working directory.
resolved_path, new_path = 'resolved_queries.csv', 'new_queries.csv'
resolved_queries = pd.read_csv(resolved_path)
new_queries = pd.read_csv(new_path)

In [2]:
# Display the first rows of the resolved-queries table (DataFrame.head default row count).
resolved_queries.head()

Unnamed: 0,Query_ID,Pre_Resolved_Query
0,1,Unable to connect to the internet
1,2,Payment failed during checkout
2,3,App crashes when opening settings
3,4,Forgot password and unable to reset
4,5,Unable to upload files to the server


In [3]:
# Display the first rows of the new-queries table (DataFrame.head default row count).
new_queries.head()

Unnamed: 0,Variation_Query,Matches_With_Query_ID
0,Unabel to conect to the internet,1
1,Can’t connect to internet,1
2,Intenet not working,1
3,Payment failed while chekout,2
4,Payment did not go through during chckout,2


In [4]:
def preprocess_text(text):
    """Normalise a raw query string for similarity matching.

    The text is lower-cased, every character that is not an ASCII letter
    ``a-z`` or whitespace is deleted (digits and punctuation are dropped,
    not replaced by a space), runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str or any
        The raw query. Non-string values (for example ``None`` or the
        ``NaN`` float pandas uses for empty CSV cells) are treated as an
        empty query instead of raising ``AttributeError``.

    Returns
    -------
    str
        The normalised query; ``''`` for non-string input.
    """
    # Missing cells arrive as float NaN / None and have no .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Delete anything that is not a lowercase ASCII letter or whitespace.
    text = re.sub(r'[^a-z\s]', '', text)
    # Collapse whitespace runs left behind by the deletions above.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add a normalised copy of each query text as a new 'Processed_Query' column on both tables.
resolved_queries['Processed_Query'] = resolved_queries['Pre_Resolved_Query'].map(preprocess_text)
new_queries['Processed_Query'] = new_queries['Variation_Query'].map(preprocess_text)

In [5]:
# Display the first rows again; the table now also carries the 'Processed_Query' column.
resolved_queries.head()

Unnamed: 0,Query_ID,Pre_Resolved_Query,Processed_Query
0,1,Unable to connect to the internet,unable to connect to the internet
1,2,Payment failed during checkout,payment failed during checkout
2,3,App crashes when opening settings,app crashes when opening settings
3,4,Forgot password and unable to reset,forgot password and unable to reset
4,5,Unable to upload files to the server,unable to upload files to the server


In [6]:
# Display the first rows again; the table now also carries the 'Processed_Query' column.
new_queries.head()

Unnamed: 0,Variation_Query,Matches_With_Query_ID,Processed_Query
0,Unabel to conect to the internet,1,unabel to conect to the internet
1,Can’t connect to internet,1,cant connect to internet
2,Intenet not working,1,intenet not working
3,Payment failed while chekout,2,payment failed while chekout
4,Payment did not go through during chckout,2,payment did not go through during chckout


In [7]:
from fuzzywuzzy import fuzz

#pick the resolved query that is most similar to one unresolved query
def fuzzy_match(resolved_queries, unresolved_query, method='token_set_ratio'):
    """Return ``(Query_ID, score)`` for the best fuzzy match of *unresolved_query*.

    Parameters
    ----------
    resolved_queries : DataFrame
        Must provide ``'Processed_Query'`` and ``'Query_ID'`` columns; it is
        walked row by row with ``iterrows()``.
    unresolved_query : str
        The (already pre-processed) query to look up.
    method : str
        ``'ratio'`` or ``'partial_ratio'`` select the matching ``fuzz``
        scorer; any other value uses ``fuzz.token_set_ratio``.

    Returns
    -------
    tuple
        ``(best Query_ID, highest score)``. When no row scores above 0 the
        result is ``(None, 0)``. On ties the earliest row wins.
    """
    # Anything that is not one of the two named alternatives falls back to token_set_ratio.
    scorer_name = method if method in ('ratio', 'partial_ratio') else 'token_set_ratio'
    scorer = getattr(fuzz, scorer_name)

    winner_id, top_score = None, 0
    for _, record in resolved_queries.iterrows():
        candidate_score = scorer(record['Processed_Query'], unresolved_query)
        # Strictly greater: a later row with an equal score does not displace the current winner.
        if candidate_score > top_score:
            winner_id, top_score = record['Query_ID'], candidate_score

    return winner_id, top_score

# Score every new query against the resolved set; each result is a (Query_ID, score) pair.
fuzzy_results = [fuzzy_match(resolved_queries, query) for query in new_queries['Processed_Query']]

# Split the pairs into columns with comprehensions instead of zip(*...):
# unpacking zip(*[]) into two targets raises ValueError when new_queries has no rows.
new_queries['Best_Match_Fuzzy'] = [match_id for match_id, _ in fuzzy_results]
new_queries['Fuzzy_Score'] = [score for _, score in fuzzy_results]

new_queries[['Variation_Query', 'Best_Match_Fuzzy', 'Fuzzy_Score']]

Unnamed: 0,Variation_Query,Best_Match_Fuzzy,Fuzzy_Score
0,Unabel to conect to the internet,1,95
1,Can’t connect to internet,1,88
2,Intenet not working,1,49
3,Payment failed while chekout,2,83
4,Payment did not go through during chckout,2,70
5,Payment issue at check out,2,57
6,Application crashes when opening setings,3,88
7,App crash when going to settings,3,86
8,Settings cause the app to chrash,3,65
9,Forgot passwrd and cant reset,4,75


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Stack resolved queries first, then new queries, so one TF-IDF vocabulary covers both sets.
all_queries = pd.concat([resolved_queries['Processed_Query'], new_queries['Processed_Query']])

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(all_queries)

# Rows keep the stacking order above, so the split point is the number of resolved queries.
n_resolved = len(resolved_queries)
resolved_tfidf = tfidf_matrix[:n_resolved]
unresolved_tfidf = tfidf_matrix[n_resolved:]

# cosine_sim[i, j] is the similarity between new query i and resolved query j.
cosine_sim = cosine_similarity(unresolved_tfidf, resolved_tfidf)

# Map the winning column position to the actual Query_ID value rather than
# assuming the IDs are exactly 1..N in row order (the former `argmax + 1`).
best_positions = cosine_sim.argmax(axis=1)
new_queries['Best_Match_TFIDF'] = resolved_queries['Query_ID'].to_numpy()[best_positions]
new_queries['Cosine_Similarity'] = cosine_sim.max(axis=1)
# NOTE: when a new query shares no vocabulary with any resolved query, its whole row is 0.0
# and argmax returns position 0, so the first resolved query is still reported.
# Treat Cosine_Similarity == 0 as "no match" when reading these columns.

In [9]:
# Display the TF-IDF best match and its cosine similarity for the first rows of new_queries.
new_queries[['Variation_Query', 'Best_Match_TFIDF', 'Cosine_Similarity']].head()

Unnamed: 0,Variation_Query,Best_Match_TFIDF,Cosine_Similarity
0,Unabel to conect to the internet,1,0.555973
1,Can’t connect to internet,1,0.691635
2,Intenet not working,1,0.0
3,Payment failed while chekout,2,0.388131
4,Payment did not go through during chckout,2,0.319082


Task 2

In [23]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [24]:
# Read the base names and the name variations to be matched against them
# from CSV files located in the notebook's working directory.
base_names_path, variations_path = 'base_names.csv', 'name_variations.csv'
base_names = pd.read_csv(base_names_path)
name_variations = pd.read_csv(variations_path)

In [25]:
# Display the first rows of the base-names table (DataFrame.head default row count).
base_names.head()

Unnamed: 0,Base_Name_ID,Base_Name
0,1,John Smith
1,2,Jennifer Brown
2,3,Michael O'Connor
3,4,Maria Garcia
4,5,Robert Lee


In [26]:
# Display the first rows of the name-variations table (DataFrame.head default row count).
name_variations.head()

Unnamed: 0,Variation,Matches_With_Base_Name
0,Thomas King,Thomas King
1,ThomasKing,Thomas King
2,Maria Garcia,Maria Garcia
3,MaryLewis,Mary Lewis
4,Nancy W.,Nancy Wright


In [27]:
# Extract the two name columns as plain Python lists.
base_names_list = base_names['Base_Name'].to_list()
name_variations_list = name_variations['Variation'].to_list()

In [28]:
def clean_name(name):
    """Return *name* with outer whitespace stripped and inner whitespace runs collapsed.

    NOTE(review): ``clean_name`` was called here but is not defined in any
    visible cell of this notebook, so this cell raised ``NameError`` on a
    fresh kernel. This whitespace-only normalisation is a conservative
    stand-in: the displayed match table shows names with their original
    case and punctuation, so nothing more aggressive is applied. Replace it
    if the original helper did more.

    Non-string values (e.g. missing cells) are returned as ``''``.
    """
    if not isinstance(name, str):
        return ''
    # str.split() with no arguments splits on any whitespace run and drops empty pieces.
    return ' '.join(name.split())


# Normalise both lists in place (same names, cleaned values).
base_names_list = [clean_name(name) for name in base_names_list]
name_variations_list = [clean_name(name) for name in name_variations_list]

In [29]:
# Accumulator for (variation name, best matched base name, score) tuples.
matches = []

In [30]:
# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every row to a list created in another cell.
matches = []
for variation_name in name_variations_list:
    # extractOne returns the highest-scoring (choice, score) pair from base_names_list;
    # token_sort_ratio is the scorer used to compare each candidate with the variation.
    best_match, score = process.extractOne(variation_name, base_names_list, scorer=fuzz.token_sort_ratio)
    matches.append((variation_name, best_match, score))

In [31]:
# One row per variation: the variation itself, the base name it matched best, and the match score.
match_columns = ['Variation Name', 'Best Matched Base Name', 'Score']
matches_df = pd.DataFrame(data=matches, columns=match_columns)

In [32]:
# Display the table of name variations with their best matched base name and score.
matches_df

Unnamed: 0,Variation Name,Best Matched Base Name,Score
0,Thomas King,Thomas King,100
1,ThomasKing,Thomas King,57
2,Maria Garcia,Maria Garcia,100
3,MaryLewis,Mary Lewis,53
4,Nancy W.,Nancy Wright,74
...,...,...,...
95,Jennifer- Brown,Jennifer Brown,100
96,Daniel- Scott,Daniel Scott,100
97,David M.,David Martinez,67
98,Paul Allen.,Paul Allen,100
