# Import Required Libraries

In [1]:
import pandas as pd

from fuzzywuzzy import fuzz
import re # use regular expression (regex) operations on top of fuzzscore
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from names_matcher import NamesMatcher

# Fuzzy vs Names Matching vs Cosine Similarity

In [2]:
# cosine similarity function

def compare_cosine_similarity(product1, product2):
    names = [product1, product2]
    # create the document term matrix
    count_vectorizer = CountVectorizer(stop_words='english')
    count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(names)
    
    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, 
                  columns=count_vectorizer.get_feature_names_out(), 
                  index=['name1', 'name2'])
    result = cosine_similarity(df, df)
    return result[0][1]

## Positive Sample

In [13]:
# sample 1 - products that can/should be matched
bbw_1 = "BODEGAS VALDUERO 6 AÑOS RESERVA PREMIUM 2015"
comp_1 = ["Ribera Del Duero 6 Años Reserva Premium", "Bodegas Valduero Ribera Del Duero Reserva Premium 6 Anos"]

In [16]:
results_1 = {
    "cosine similarity": [],
    "fuzz.ratio": [],
    "fuzz.token_set_ratio": [],
    "names matcher": []
}

for pdt in comp_1:
    results_1["cosine similarity"].append(round(compare_cosine_similarity(bbw_1, pdt),2))
    results_1["fuzz.ratio"].append(fuzz.ratio(bbw_1.lower(), pdt.lower())/100)
    results_1["fuzz.token_set_ratio"].append(fuzz.token_set_ratio(bbw_1, pdt)/100)
    results_1["names matcher"].append(NamesMatcher()([bbw_1], [pdt])[1][0])

In [17]:
results_1

{'cosine similarity': [0.5, 0.58],
 'fuzz.ratio': [0.8, 0.72],
 'fuzz.token_set_ratio': [0.74, 0.88],
 'names matcher': [1.0, 1.0]}

In [18]:
df_1 = pd.DataFrame(results_1, index=[comp_1[0],comp_1[1]])
df_1

Unnamed: 0,cosine similarity,fuzz.ratio,fuzz.token_set_ratio,names matcher
Ribera Del Duero 6 Años Reserva Premium,0.5,0.8,0.74,1.0
Bodegas Valduero Ribera Del Duero Reserva Premium 6 Anos,0.58,0.72,0.88,1.0


## Negative Sample

In [33]:
# sample 2 - products that are NOT supposed to be matched - with extra tests this time
bbw_2 = "BATASIOLO MOSCATO SPUNANTE NV"
comp_2 = ["Prunotto Moscato 2020",
          "BATASIOLO MOSCATO D’ASTI DOCG BOSC DLA REI 2019",
          "BROWN BROTHERS MOSCATO",
          "Batasiolo Bosc D'la Rei Moscato d'Asti"
         ]

In [34]:
results_2 = {
    "cosine similarity": [],
    "fuzz.ratio": [],
    "fuzz.token_set_ratio": [],
    "names matcher": []
}

for pdt in comp_2:
    results_2["cosine similarity"].append(round(compare_cosine_similarity(bbw_2, pdt),2))
    results_2["fuzz.ratio"].append(fuzz.ratio(bbw_2.lower(), pdt.lower())/100)
    results_2["fuzz.token_set_ratio"].append(fuzz.token_set_ratio(bbw_2, pdt)/100)
    results_2["names matcher"].append(round(NamesMatcher()([bbw_2], [pdt])[1][0],2))

In [35]:
results_2

{'cosine similarity': [0.29, 0.35, 0.29, 0.41],
 'fuzz.ratio': [0.44, 0.58, 0.43, 0.6],
 'fuzz.token_set_ratio': [0.52, 0.74, 0.51, 0.74],
 'names matcher': [0.85, 0.83, 0.85, 0.9]}

In [36]:
df_2 = pd.DataFrame(results_2, index=[comp_2[0],comp_2[1],comp_2[2],comp_2[3]])
df_2

Unnamed: 0,cosine similarity,fuzz.ratio,fuzz.token_set_ratio,names matcher
Prunotto Moscato 2020,0.29,0.44,0.52,0.85
BATASIOLO MOSCATO D’ASTI DOCG BOSC DLA REI 2019,0.35,0.58,0.74,0.83
BROWN BROTHERS MOSCATO,0.29,0.43,0.51,0.85
Batasiolo Bosc D'la Rei Moscato d'Asti,0.41,0.6,0.74,0.9


## Conclusion
- names matcher seems to be very lenient and inaccurate when comparing products that are not supposed to be matched
- on the other hand, cosine similarity score seems to be compressed. relatively small spread between the scores of positive vs negative samples
    - quite unforgiving for the positve sample
- token set ratio is also quite lenient for negative samples as it is a more flexible approach than purely using Levenshtein distance

## Hence, we end up using fuzz.ratio() as our main names matcher.
- it performs decently well for positive and negative samples
- note: we compared more products than in these 2 samples (i.e. we did not use ONLY these 2 samples to pick the method)

## More about the algorithms used
- fuzz.ratio: using Levenshtein distance (number of single character insertions, deletions or substitutions required to change one string into another)
- cosine similarity: n-gram matching
- names matching: similarity metric is inspired by max(ratio, token_set_ratio) by the fuzzywuzzy library

# Example Use Case: Matching BBW Products with BBW Orders

In [30]:
# === get product & order data === #
products_df = pd.read_csv('../datasets/products.csv')
products_df = products_df[['product_name','shopify_product_id','product_type']]

products_df["product_name"].value_counts() # we see that there are duplicates
products_df = products_df.drop_duplicates(subset=['product_name'])
print('Products df shape: ' + str(products_df.shape))

orders_df = pd.read_csv('../datasets/cleaned_orders.csv')
print('Orders df shape: ' + str(orders_df.shape))

# === merge product & order data to find exact matches of product name === #
merged = orders_df.merge(products_df, left_on='item_id', right_on="product_name", how="left")
print('Merged df shape: ' + str(merged.shape))

Products df shape: (2355, 3)
Orders df shape: (19467, 13)
Merged df shape: (19467, 16)


In [31]:
# merged_cleaned: contains all the orders that have exact matches to product name from product data
merged_cleaned = merged.dropna(subset=['shopify_product_id'])
print('merged_cleaned df shape: ' + str(merged_cleaned.shape))

# dropped: contains all the orders that don't have exact matches
dropped = merged[merged['shopify_product_id'].isna()]
print('dropped df shape: ' + str(dropped.shape))

# === to check which order items have been dropped === #
# values = dropped['item_id'].value_counts()
# values.to_csv('datasets/items_dropped.csv')

merged_cleaned df shape: (10444, 16)
dropped df shape: (9023, 16)


In [32]:
# products dictionary, key: product_name, values: { shopify_product_id, product_type } 
products_dict = products_df.set_index('product_name').to_dict('index') 

In [33]:
copied_dropped = dropped.copy() # orders with no exact product name matches

for index, row in dropped.iterrows():
    # preliminary checks to reduce comp power -> check if substring inside pdt dict keys
    volume_pattern = r'(- \d+\s*(ml|ML|l|L))'  # Match digits followed by "ml,", "ML", "L," or "l"
    year_pattern = r'(\b\d{4}\b)'  # Match year/vintage
    # Remove volume & year patterns from the string
    string = re.sub(volume_pattern, '', row['item_id'], flags=re.IGNORECASE) # case-insensitive
    string = re.sub(year_pattern, '', string)
    res = [key for key in products_dict.keys() if string in key]

    fuzzthreshold = 70 # threshold for fuzzy score - currently an arbitrary number

    if len(res) == 0: # if the new string gives no matches, do fuzzy score with pdt dict
        max_score = 0
        best_match = ""
        for key in products_dict.keys():
            if fuzz.ratio(row['item_id'], key) > max_score and max_score >= fuzzthreshold:
                best_match = key
                max_score = fuzz.ratio(row['item_id'], key)

    if len(res) >= 1:
        scores = []
        for pdt in res: # find similarity score based on ORIGINAL pdt name from orders data
            scores.append(fuzz.ratio(row['item_id'], pdt))
        max_score = max(scores) # find max score - best match
        best_match = res[scores.index(max_score)]

    if max_score >= fuzzthreshold:
        copied_dropped.at[index, 'product_name'] = best_match
        copied_dropped.at[index, 'shopify_product_id'] = products_dict[best_match]['shopify_product_id']
        copied_dropped.at[index, 'product_type'] = products_dict[best_match]['product_type']

In [34]:
# merged_cleaned2: contains orders that have good-enough product name matches
merged_cleaned2 = copied_dropped.dropna(subset=['shopify_product_id'])
print('merged_cleaned2 df shape: ' + str(merged_cleaned2.shape))

# final: all matched products from 1st & 2nd round of matching
final = pd.concat([merged_cleaned, merged_cleaned2], ignore_index=True)
print('final df shape: ' + str(final.shape))

merged_cleaned2 df shape: (3744, 16)
final df shape: (14188, 16)


In [36]:
final.to_csv('../datasets/matched_orders.csv')