In [1]:
import time
import re
import pandas as pd
import numpy as np
import jellyfish as jf
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')



In [32]:
df = pd.read_csv('product_matching_synthetic.csv')
df.rename(columns ={'internal_name':'description'}, inplace = True)
df.head()

Unnamed: 0,external_name,description,category_label,match
0,Grey Speck Gran Vinyl Floor Tiles,Update your floor with a peel and stick vinyl ...,Flooring & Tiling Clearance,1
1,Decorative Aggregates Blue Slate - 17kg,Blue Slate chips 30-50mm. This decorative surf...,"Fencing, Decking & Landscaping Deals",1
2,Decorative Aggregates Plum Slate - 17kg,Purple tones give this slate a rich but bright...,"Fencing, Decking & Landscaping Deals",1
3,Dec Aggs Cotswold Buff 10-20mm - 17kg,These creamy buff 10-20mm stones chips will br...,"Fencing, Decking & Landscaping Deals",1
4,Artificial Grass Rubber Tile - 0.3m,This grass topped tile is made from recycled r...,"Fencing, Decking & Landscaping Deals",1


In [33]:
df['description'] = df["description"].map(str)


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519783 entries, 0 to 519782
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   external_name   519783 non-null  object
 1   description     519783 non-null  object
 2   category_label  519695 non-null  object
 3   match           519783 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 15.9+ MB


In [35]:
def matching_numbers(external_name, description):

    external_numbers = set(re.findall(r'[0-9]+', external_name))
    internal_numbers = set(re.findall(r'[0-9]+', description))    
    union = external_numbers.union(internal_numbers)
    intersection = external_numbers.intersection(internal_numbers)

    if len(external_numbers)==0 and len(internal_numbers) == 0:
        return 1
    else:
        return (len(intersection)/ len(union))

In [36]:
def engineer_features(df):

    df['description'] = df['description'].str.lower()
    df['external_name'] = df['external_name'].str.lower()
    
    df['levenshtein_distance'] = df.apply(
    lambda x: jf.levenshtein_distance(x['external_name'], 
                                      x['description']), axis=1)

    df['hamming_distance'] = df.apply(
    lambda x: jf.hamming_distance(x['external_name'], 
                                  x['description']), axis=1)

    df['jaro_similarity'] = df.apply(
    lambda x: jf.jaro_similarity(x['external_name'], 
                                  x['description']), axis=1)

    df['match_rating_comparison'] = df.apply(
    lambda x: jf.match_rating_comparison(x['external_name'], 
                                         x['description']), axis=1).fillna(0).astype(int)

    df['ratio'] = df.apply(
    lambda x: fuzz.ratio(x['external_name'], 
                         x['description']), axis=1)


    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(value=0, inplace=True)

    return df

In [37]:
df = engineer_features(df)


In [38]:
df[df.columns[1:]].corr()['match'][:].sort_values(ascending=False)


match                      1.000000
jaro_similarity            0.129769
ratio                      0.127881
match_rating_comparison    0.124974
levenshtein_distance      -0.010700
hamming_distance          -0.010789
Name: match, dtype: float64

In [39]:
X = df[['levenshtein_distance', 'hamming_distance',
       'jaro_similarity','match_rating_comparison','ratio']].values
y = df['match'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [40]:
def get_confusion_matrix_values(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    return(cm[0][0], cm[0][1], cm[1][0], cm[1][1])

classifiers = {
    "XGBClassifier":XGBClassifier(n_estimators=1000, learning_rate=0.1),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "RandomForestClassifier":RandomForestClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "MLP": MLPClassifier()
}

df_results = pd.DataFrame(columns=['model', 'accuracy', 'mae', 'precision',
                                   'recall','f1','roc','run_time','tp','fp',
                                   'tn','fn'])

for key in classifiers:

    start_time = time.time()
    classifier = classifiers[key]
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_pred)
    classification = classification_report(y_test, y_pred, zero_division=0)
    run_time = format(round((time.time() - start_time)/60,2))
    tp, fp, fn, tn = get_confusion_matrix_values(y_test, y_pred)

    row = {'model': key,
           'accuracy': accuracy,
           'mae': mae,
           'precision': precision,
           'recall': recall,
           'f1': f1,
           'roc': roc,
           'run_time': run_time,
           'tp': tp,
           'fp': fp,
           'tn': tn,
           'fn': fn,
          }
    df_results = df_results.append(row, ignore_index=True)

df_results.head()

Unnamed: 0,model,accuracy,mae,precision,recall,f1,roc,run_time,tp,fp,tn,fn
0,XGBClassifier,0.879296,0.120704,0.855841,0.322898,0.46889,0.656075,3.74,85870,933,5539,11615
1,DecisionTreeClassifier,0.929509,0.070491,0.760913,0.835257,0.796354,0.891696,0.05,82301,4502,14328,2826
2,RandomForestClassifier,0.958521,0.041479,0.916894,0.823248,0.867551,0.904251,1.44,85523,1280,14122,3032
3,AdaBoostClassifier,0.844253,0.155747,0.759849,0.08208,0.148156,0.538477,0.23,86358,445,1408,15746
4,MLP,0.845763,0.154237,0.7118,0.109712,0.19012,0.550467,5.12,86041,762,1882,15272


In [None]:
n_estimators = [50,100,150,200]
criterion=['gini', 'entropy', 'log_loss']
max_depth = [5, 10, 20]
max_features=['sqrt', 'log2']


start = time.perf_counter()

param_grid = dict(
                n_estimators=n_estimators,
                criterion=criterion,
                max_depth=max_depth,
                max_features=max_features,
                n_jobs=[6]
)

model = RandomForestClassifier(random_state=0)

grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring='roc_auc'
                           )

print('Running GridSearchCV...')
best_model = grid_search.fit(X_train, y_train)
best_score = round(best_model.score(X_test, y_test), 4)
best_params = best_model.best_params_

print('Score:', best_score)
print('Optimum parameters', best_params)

finish = time.perf_counter()
run_time = (finish - start / 60)
print(f"Completed task in {run_time:0.4f} minutes")

In [41]:
classifier = RandomForestClassifier(n_estimators=100)
model = classifier.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [42]:
print(classification_report(y_test, y_pred, labels=[1, 0], 
                            target_names=['match', 'not match']))

              precision    recall  f1-score   support

       match       0.92      0.82      0.87     17154
   not match       0.97      0.99      0.98     86803

    accuracy                           0.96    103957
   macro avg       0.94      0.90      0.92    103957
weighted avg       0.96      0.96      0.96    103957



In [43]:
results = pd.DataFrame(data={'predictions': y_pred, 'actual': y_test})
results['result'] = np.where(results['predictions']==results['actual'], 1, 0)
results.tail(20)

Unnamed: 0,predictions,actual,result
103937,0,0,1
103938,1,1,1
103939,0,0,1
103940,0,0,1
103941,0,0,1
103942,0,1,0
103943,0,0,1
103944,0,0,1
103945,0,0,1
103946,0,0,1


In [44]:
def get_closest_matches(external_name):

    unique_descriptions = df['description'].unique().tolist()
    closest_matches = process.extract(external_name, 
                  unique_descriptions, 
                  scorer=fuzz.token_set_ratio)

    return closest_matches

In [45]:
def prepare_data(external_name):

    closest_matches = get_closest_matches(external_name)

    df = pd.DataFrame(columns=['external_name', 'description'])

    for match in closest_matches:
        row = {'external_name': external_name, 'description': match[0]}
        df = df.append(row, ignore_index=True)

    return df

In [46]:
closest_data = prepare_data("Homebase Top Soil - 25L")
closest_data.head()

Unnamed: 0,external_name,description
0,Homebase Top Soil - 25L,"for lawn preparation and general planting, the..."
1,Homebase Top Soil - 25L,create a natural and contemporary look in your...
2,Homebase Top Soil - 25L,this soap dispenser features a white plastic c...
3,Homebase Top Soil - 25L,for creating new beds and borders or simply im...
4,Homebase Top Soil - 25L,every inch industrial. franklin combines a mix...


In [47]:
data = engineer_features(closest_data)
data = data[['levenshtein_distance', 'hamming_distance',
       'jaro_similarity','match_rating_comparison','ratio']]

In [48]:
y_pred = model.predict_proba(data)[:,1]


In [49]:
data = data.assign(prediction=y_pred)
data = data.merge(closest_data)
data[['external_name','description','prediction']].head()

Unnamed: 0,external_name,description,prediction
0,homebase top soil - 25l,"for lawn preparation and general planting, the...",0.98
1,homebase top soil - 25l,create a natural and contemporary look in your...,0.1
2,homebase top soil - 25l,this soap dispenser features a white plastic c...,0.06
3,homebase top soil - 25l,for creating new beds and borders or simply im...,0.0
4,homebase top soil - 25l,every inch industrial. franklin combines a mix...,0.21


In [50]:
import pickle
pickle.dump(model, open('best_model', 'wb'))

In [51]:
loaded_model = pickle.load(open('best_model', 'rb'))
loaded_model = loaded_model.fit(X_train, y_train)
y_pred = loaded_model.predict(X_test)

In [52]:
print(classification_report(y_test, y_pred, labels=[1, 0], target_names=['match', 'not match']))

              precision    recall  f1-score   support

       match       0.92      0.82      0.87     17154
   not match       0.97      0.98      0.98     86803

    accuracy                           0.96    103957
   macro avg       0.94      0.90      0.92    103957
weighted avg       0.96      0.96      0.96    103957



In [53]:
results = pd.DataFrame(data={'predictions': y_pred, 'actual': y_test})
results['result'] = np.where(results['predictions']==results['actual'], 1, 0)
results.head(20)

Unnamed: 0,predictions,actual,result
0,0,0,1
1,1,1,1
2,0,0,1
3,1,1,1
4,0,0,1
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,0,1


In [None]:
data=pd.read_csv('Add your csv file path')
data1=data['product']
result=[]
for i in data1:
    closest_data = prepare_data(i)
    data = engineer_features(closest_data)
    data = data[['levenshtein_distance', 'hamming_distance','jaro_similarity','match_rating_comparison','ratio']]
    y_pred = model.predict_proba(data)[:,1]
    if (max(y_pred)) >= 0.5:
        data = data.assign(prediction=y_pred)
        data = data.merge(closest_data)
        res=data[['external_name','description','prediction']]
        result.append(res)

final_match = pd.concat(result, ignore_index=True, sort=False)
final_match
