In [4]:
import pandas as pd
import textdistance
import os
from collections import Counter

# Read the Excel file.
# The path is a raw string because it contains backslashes followed by
# characters (\P, \J, \D) that are not valid escape sequences; in a plain
# literal these raise a SyntaxWarning on recent Python versions. The
# resulting string value is unchanged.
file_path = r'D:\Proy\Jupyter\DGILb.xlsx'
df = pd.read_excel(file_path)

# Select the two columns you want to compare
column1 = df['DGIL']
column2 = df['CNIS']

# Compute similarity metrics
def compare_columns(col1, col2, metric):
    """Score two columns element by element with a string metric.

    Values are paired positionally with ``zip`` (so iteration stops at the
    shorter input) and each value is converted with ``str`` before being
    handed to ``metric``.

    Args:
        col1: Iterable of values forming the left side of each pair.
        col2: Iterable of values forming the right side of each pair.
        metric: Callable taking two strings and returning a score.

    Returns:
        A list holding one ``metric`` result per pair, in input order.
    """
    return [metric(str(left), str(right)) for left, right in zip(col1, col2)]

# Similarity functions from textdistance, one per algorithm.
# normalized_similarity is the same as similarity but returns a value between 0 and 1
jw = textdistance.jaro_winkler.normalized_similarity
levenshtein = textdistance.levenshtein.normalized_similarity
damerau_levenshtein = textdistance.damerau_levenshtein.normalized_similarity

# Score column1 against column2 pair by pair, once per algorithm
# (Jaro-Winkler, then Levenshtein, then Damerau-Levenshtein).
jw_similarities, levenshtein_similarities, damerau_levenshtein_similarities = (
    compare_columns(column1, column2, scorer)
    for scorer in (jw, levenshtein, damerau_levenshtein)
)

# Label each row per algorithm: a score at or above the threshold is
# 'similar', anything below it is 'nonsimilar'. One new column per algorithm.
similarity_threshold = 0.8
for result_column, scores in (
    ('jaro_winkler_comparison_result', jw_similarities),
    ('levenshtein_comparison_result', levenshtein_similarities),
    ('damerau_levenshtein_comparison_result', damerau_levenshtein_similarities),
):
    df[result_column] = [
        'similar' if score >= similarity_threshold else 'nonsimilar'
        for score in scores
    ]

# Tally the 'similar' / 'nonsimilar' labels in each algorithm's result column
jw_result_counts, lev_result_counts, dl_result_counts = (
    Counter(df[result_column])
    for result_column in (
        'jaro_winkler_comparison_result',
        'levenshtein_comparison_result',
        'damerau_levenshtein_comparison_result',
    )
)

# Report both tallies for every algorithm, in the same order as above
for similar_label, nonsimilar_label, tally in (
    ("Jaro-Winkler similar count:", "Jaro-Winkler nonsimilar count:", jw_result_counts),
    ("Levenshtein similar count:", "Levenshtein nonsimilar count:", lev_result_counts),
    ("Damerau-Levenshtein similar count:", "Damerau-Levenshtein nonsimilar count:", dl_result_counts),
):
    print(similar_label, tally['similar'])
    print(nonsimilar_label, tally['nonsimilar'])

# Save the DataFrame with the new columns to a new Excel file.
# The path is a raw string because \P, \J and \D are not valid escape
# sequences; a plain literal raises a SyntaxWarning on recent Python
# versions. The resulting string value is unchanged.
output_file_path = r'D:\Proy\Jupyter\DGILb_compared.xlsx'
# index=False keeps the DataFrame index out of the written sheet
df.to_excel(output_file_path, index=False)


Jaro-Winkler similar count: 5930
Jaro-Winkler nonsimilar count: 2953
Levenshtein similar count: 5591
Levenshtein nonsimilar count: 3292
Damerau-Levenshtein similar count: 5591
Damerau-Levenshtein nonsimilar count: 3292


In [5]:
# Display the DataFrame, which now carries the three *_comparison_result columns
df

Unnamed: 0,Cons,DGIL,CNIS,jaro_winkler_comparison_result,levenshtein_comparison_result,damerau_levenshtein_comparison_result
0,1,acido acetilsalicilico tableta cada tableta co...,acido acetilsalicilico tableta cada tableta co...,similar,similar,similar
1,2,acido acetilsalicilico tableta soluble o eferv...,acido acetilsalicilico tableta soluble o eferv...,similar,similar,similar
2,3,paracetamol tableta cada tableta contiene para...,paracetamol tableta cada tableta contiene para...,similar,similar,similar
3,4,paracetamol supositorio cada supositorio conti...,paracetamol supositorio cada supositorio conti...,similar,similar,similar
4,5,paracetamol solucion oral cada ml contiene par...,paracetamol solucion oral cada ml contiene par...,similar,similar,similar
...,...,...,...,...,...,...
8878,8879,oxicodona naloxona tableta de liberacion prolo...,oxicodona naloxona tableta de liberacion prolo...,similar,similar,similar
8879,8880,sulpirida tabletas o capsulas cada capsula o t...,sulpirida tabletas o capsulas cada capsula o t...,similar,similar,similar
8880,8881,sulpirida tabletas o capsulas cada capsula o t...,sulpirida tabletas o capsulas cada capsula o t...,similar,similar,similar
8881,8882,alprazolam tableta cada tableta contiene alpra...,alprazolam tableta cada tableta contiene alpra...,similar,similar,similar


In [6]:
# Open the exported workbook with its associated application
# (os.startfile is only available on Windows).
# Raw string: \P, \J and \D are not valid escape sequences, so a plain
# literal raises a SyntaxWarning on recent Python versions.
os.startfile(r'D:\Proy\Jupyter\DGILb_compared.xlsx')