### Approximate string matching with FuzzyWuzzy

- FuzzyWuzzy is a Python library that uses Levenshtein distance to calculate the differences between sequences
- following https://github.com/seatgeek/fuzzywuzzy
- and https://towardsdatascience.com/natural-language-processing-for-fuzzy-string-matching-with-python-6632b7824c49

In [1]:
import db_conn
import pymysql
import pandas as pd
from fuzzywuzzy import fuzz



In [2]:
# fuzz.ratio scores how similar the two complete strings are, with character order taken into account.
fuzz.ratio('mesna', 'mesnex')
# A result of 73 means the two strings are 73% similar.

73

In [3]:
# fuzz.partial_ratio scores partial-string similarity: the shorter string is
# matched against the best-aligned portion of the longer one.
fuzz.partial_ratio('I love computer science', 'computer programming is what I love the most')

43

In [4]:
# fuzz.token_sort_ratio splits both strings into words and sorts them before
# comparing, so the order of the words does not affect the score.
fuzz.token_sort_ratio('I love computer science', 'computer programming is what I love the most')

57

In [5]:
# fuzz.token_set_ratio compares the sets of words in the two strings, so
# repeated words (and word order) do not affect the score.
fuzz.token_set_ratio('I love computer science', 'computer programming is what I love the most')

79

In [6]:
# fuzz.token_set_ratio again, this time against a shorter phrase that shares
# only the word 'computer' with the first string.
fuzz.token_set_ratio('I love computer science', 'computer programming')

57

In [7]:
# Reuse the connection object exposed by the project-local db_conn module.
conn = db_conn.CONN
# Ask for a DictCursor, which returns each row as a dict keyed by column name.
cursor = conn.cursor(pymysql.cursors.DictCursor)

In [8]:
# Load the drug dictionary: fetch every row of dict_collapsed_final and wrap
# the fetched rows in a DataFrame.
cursor.execute('SELECT * FROM dict_collapsed_final')
drugs = pd.DataFrame(cursor.fetchall())

24070

In [9]:
# Preview the first 10 rows of the drug dictionary.
drugs.head(10)

Unnamed: 0,cui1,cui1_str,cui2,cui2_str
0,C0000294,mesna,C0000294,mesna
1,C0000294,mesna,C0206008,coenzima m
2,C0000294,mesna,C0206008,coenzym m
3,C0000294,mesna,C0206008,coenzyme m
4,C0000294,mesna,C0206008,reduced com
5,C0000294,mesna,C0678115,uromitexan
6,C0000294,mesna,C0721681,mesnex
7,C0000294,mesna,C0733873,astad 7093
8,C0000294,mesna,C0733874,mistabron
9,C0000294,mesna,C0733874,mistabronco


## cui1 와 cui2 string을 비교하기 = 연습으로

## token_set_ratio

In [10]:
def _cui_pair_score(row):
    """Return the token_set_ratio between a row's cui1_str and cui2_str."""
    return fuzz.token_set_ratio(row['cui1_str'], row['cui2_str'])


# Score every (cui1_str, cui2_str) pair row by row and keep the result in a
# new 'fuzzy_perc' column.
drugs['fuzzy_perc'] = drugs.apply(_cui_pair_score, axis=1)

In [11]:
# Preview the first rows, now including the fuzzy_perc column.
drugs.head()

Unnamed: 0,cui1,cui1_str,cui2,cui2_str,fuzzy_perc
0,C0000294,mesna,C0000294,mesna,100
1,C0000294,mesna,C0206008,coenzima m,27
2,C0000294,mesna,C0206008,coenzym m,14
3,C0000294,mesna,C0206008,coenzyme m,27
4,C0000294,mesna,C0206008,reduced com,25


In [12]:
# For names that are not exactly identical but differ only in part of their
# spelling, how similar must two names be before they can be treated as the
# same drug? Inspect the first 10 pairs scoring above 70 but below 100.
drugs[(drugs.fuzzy_perc>70) & (drugs.fuzzy_perc<100)][:10]

Unnamed: 0,cui1,cui1_str,cui2,cui2_str,fuzzy_perc
6,C0000294,mesna,C0721681,mesnex,73
14,C0000294,mesna,C0917779,mesnum,73
32,C0000477,fampridine,C0000477,4 aminopyridine,72
34,C0000477,fampridine,C0000477,dalfampridine,87
35,C0000477,fampridine,C0000477,fampridina,90
37,C0000477,fampridine,C0000477,fampridinum,86
45,C0000608,aminocaproic acid,C0000608,acide aminocaproque,83
48,C0000608,aminocaproic acid,C0000608,aminohexanoic acid,80
68,C0000618,mercaptopurine,C0000618,mercaptopurina,93
70,C0000618,mercaptopurine,C0000618,mercaptopurinum,90


In [13]:
# Inspect the first 10 pairs whose score is below 50.
drugs[(drugs.fuzzy_perc<50)][:10]

Unnamed: 0,cui1,cui1_str,cui2,cui2_str,fuzzy_perc
1,C0000294,mesna,C0206008,coenzima m,27
2,C0000294,mesna,C0206008,coenzym m,14
3,C0000294,mesna,C0206008,coenzyme m,27
4,C0000294,mesna,C0206008,reduced com,25
5,C0000294,mesna,C0678115,uromitexan,40
7,C0000294,mesna,C0733873,astad 7093,27
8,C0000294,mesna,C0733874,mistabron,43
9,C0000294,mesna,C0733874,mistabronco,38
10,C0000294,mesna,C0733874,mucofluid,14
12,C0000294,mesna,C0733876,ucb 3983,0


In [14]:
# Turn the sample-table export into a list of sentences: one entry per line,
# trailing newline removed, leaving out every line that contains '***'.
with open('./new_sample_data_m.txt', 'r') as handle:
    sents = [row.strip('\n') for row in handle if '***' not in row]

In [15]:
# Preview the first 10 sentences that were read.
sents[:10]

['Adverse event ,  Adverse event : Cardiovascular',
 'Adverse event ,  Cardiovascular , Omapatrilat (n=289) : 20 (7%)',
 'Adverse event ,  Cardiovascular , Lisinopril (n=284) : 34 (12%)',
 'Adverse event ,  Cardiovascular , p : 0·04',
 'Adverse event ,  Adverse event : Cardiovascular , Heart failure',
 'Adverse event ,  Cardiovascular , Omapatrilat (n=289) : 2 (10%)',
 'Adverse event ,  Cardiovascular , Lisinopril (n=284) : 9 (26%)',
 'Adverse event ,  Adverse event : Cardiovascular , Ischaemic',
 'Adverse event ,  Cardiovascular , Omapatrilat (n=289) : 7 (35%)',
 'Adverse event ,  Cardiovascular , Lisinopril (n=284) : 12 (4%)']

In [16]:
# Score every distinct cui1 drug name against every sentence (lazily), then
# keep only the pairs whose token_set_ratio is at least 80.
_scored_pairs = (
    (name, sentence, fuzz.token_set_ratio(name, sentence))
    for name in drugs.cui1_str.unique()
    for sentence in sents
)
similarity_gtl_80 = [
    {'drug': name, 'sen': sentence, 'ratio': score}
    for name, sentence, score in _scored_pairs
    if score >= 80
]

In [17]:
# Collect the matches scoring 80 or more into a DataFrame (one row per drug/sentence pair).
df_sim_gtl_80 = pd.DataFrame(similarity_gtl_80)

In [18]:
# (number of matching drug/sentence pairs, number of columns)
df_sim_gtl_80.shape

(792, 3)

In [19]:
# Preview the first matches.
df_sim_gtl_80.head()

Unnamed: 0,drug,ratio,sen
0,salbutamol,100,forced expiratory volume in 1 s (L) Measured a...
1,salbutamol,100,forced expiratory volume in 1 s (L) Measured a...
2,salbutamol,100,forced expiratory volume in 1 s percentage pre...
3,salbutamol,100,forced expiratory volume in 1 s percentage pre...
4,salbutamol,100,forced expiratory volume in 1 s percentage pre...


In [21]:
# Show the full text of every cell instead of truncating long sentences.
# pandas >= 1.0 spells "no limit" as None; the old -1 sentinel was deprecated
# there and is rejected by current releases. Older releases accept only an
# integer, so fall back to -1 when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Matches that passed the threshold but score below 100.
df_sim_gtl_80[df_sim_gtl_80.ratio<100]

Unnamed: 0,drug,ratio,sen


In [22]:
# Dictionary rows where 'zinc acetate' appears as either the cui1 or the cui2 name.
drugs.loc[(drugs.cui1_str=='zinc acetate') | (drugs.cui2_str=='zinc acetate')]

Unnamed: 0,cui1,cui1_str,cui2,cui2_str,fuzzy_perc
20289,C0149368,zinc acetate,C0149368,ccris 3346,18
20290,C0149368,zinc acetate,C0149368,zinc acetate,100
20291,C0149368,zinc acetate,C0728966,galzin,44
20292,C0149368,zinc acetate,C2343333,zinx,38
20293,C0149368,zinc acetate,C2962315,peleverus,19


In [23]:
# Dictionary rows where 'abiraterone acetate' appears as either the cui1 or the cui2 name.
drugs.loc[(drugs.cui1_str=='abiraterone acetate') | (drugs.cui2_str=='abiraterone acetate')]

Unnamed: 0,cui1,cui1_str,cui2,cui2_str,fuzzy_perc
21993,C0754011,abiraterone,C2607886,abiraterone acetate,100


## extract

In [24]:
from fuzzywuzzy import process

In [25]:
# process.extract ranks the candidate strings against the query with the given
# scorer and returns up to `limit` (candidate, score) tuples.
choices = ['I love computer science', 'COMPUTER SCIENCE', 'computer programming', 'programming IT']
process.extract('computer science', choices, limit=3, scorer=fuzz.token_set_ratio)

[('I love computer science', 100),
 ('COMPUTER SCIENCE', 100),
 ('computer programming', 67)]

In [26]:
# For every distinct cui1 drug name, keep its three best-scoring sentences
# (by token_set_ratio) as a single-key dict {drug: [(sentence, score), ...]}.
# No score threshold is applied at this stage.
similarity_gtl_80_extract = []
for d in drugs.cui1_str.unique():
    sens = process.extract(d, sents, limit=3, scorer=fuzz.token_set_ratio)
    # The previous guard `len(sens) >= 0` was always true; only record drugs
    # for which extract actually returned candidates.
    if sens:
        similarity_gtl_80_extract.append({d: sens})

In [47]:
# Flatten the per-drug candidate lists into one record per (drug, sentence),
# keeping only candidates whose score is strictly greater than 80.
pairs = []
for entry in similarity_gtl_80_extract:
    # Each entry is a single-key dict: {drug name: [(sentence, score), ...]}.
    drug_name = next(iter(entry))
    pairs.extend(
        {'drug': drug_name, 'sen': candidate[0], 'perc': candidate[1]}
        for candidate in entry[drug_name]
        if candidate[1] > 80
    )

In [50]:
# Preview the filtered drug/sentence pairs as a DataFrame.
pd.DataFrame(pairs).head()

Unnamed: 0,drug,perc,sen
0,salbutamol,100,"forced expiratory volume in 1 s (L) Measured after salbutamol was administered. , beclometasone dipropionate / formoterol fumarate / glycopyrronium bromide (N=687) : 1·11 (0·32)"
1,salbutamol,100,"forced expiratory volume in 1 s (L) Measured after salbutamol was administered. , beclometasone dipropionate / formoterol fumarate (N=680) : 1·10 (0·33)"
2,salbutamol,100,"forced expiratory volume in 1 s percentage predicted Measured after salbutamol was administered. , beclometasone dipropionate / formoterol fumarate / glycopyrronium bromide (N=687) : 36·9 (8·4)"
3,azathioprine,100,"Reduction in disease activity (SASSAD) Adjusted for minimisation variables , Azathioprine (n=41) : 12·0"
4,azathioprine,100,"Reduction in % body area involved Adjusted for minimisation variables , Azathioprine (n=41) : 25·8"
