# Importing Libaries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Data Preview

In [2]:
# Preview Data
duke_hr = pd.read_csv('../data/sample_duke_hr_vanilla.csv', index_col=0)
duke_hr

Unnamed: 0,query_id,query_str,ans_id,ans_str
0,1,What is the total number of work hours per week?,1,"The working hours is 42 hours per week, exclud..."
1,2,How long is the official working hours?,2,The official working hours for Monday to Thurs...
2,3,Are there any other things to take note of reg...,3,"Depending on the nature of work, confirmed ful..."
3,4,Can you tell me about the company's policy on ...,4,Employees shall be entitled to all government ...
4,5,Can you tell me how overtime works in this com...,5,Employees are not entitled to payment for over...
5,6,What happens if I fall sick and not able to at...,6,"In the event of illness, an employee must noti..."
6,7,What is the probation & confirmation about?,7,Employees are normally required to serve a pro...
7,8,Can you tell me more about the retirement poli...,8,The retirement age of an employee is currently...
8,9,Can you tell me about the disciplinary procedu...,9,All employees are subject to the disciplinary ...
9,10,Can you tell me about MVC (Monthly Variable Co...,10,This is applicable to Executive & Professional...


# Requesting Data from Honey Kids Asia

In [3]:
# Request for Page Data
short_forms_page = requests.get("https://honeykidsasia.com/ultimate-guide-to-singapore-acronyms/")
short_forms_page

<Response [200]>

In [4]:
# Initiating BeautifulSoup Object
soup = BeautifulSoup(short_forms_page.content, 'html.parser')

# Store the Abbreviations as a Dataset

In [5]:
# Dictionary with Key as the Full Form and Value as the Abbreviation
short_forms = [row.get_text() for row in soup.find_all('td')]
short_forms_dict = {short_forms[i]: short_forms[i-1] for i in range(1, len(short_forms), 2)}
short_forms_dict

{'Anglo-Chinese School': 'ACS',
 'Admissions Exercise for International Students': 'AEIS',
 'Australian International School': 'AIS',
 'Ang Mo Kio': 'AMK',
 'Association of Women for Action and Research': 'AWARE',
 'Ayer Rajah Expressway': 'AYE',
 'Bukit Timah Expressway': 'BKE',
 'Bak Kut Teh': 'BKT',
 'Bukit Timah': 'BT',
 'Built to Order (Flats)': 'BTO',
 'Changi Business Park': 'CBP',
 'Community Club': 'CC',
 'Co-Curricular Activity': 'CCA',
 'Certificate of Entitlement': 'COE',
 'Central Provident Fund': 'CPF',
 'Central Expressway': 'CTE',
 'Development Bank of Singapore': 'DBS',
 'Dover Court International School': 'DCIS',
 'Dulwich College Singapore': 'DCS',
 'Dependent Pass': 'DP',
 'Din Tai Fung': 'DTF',
 'Executive Condominium': 'EC',
 'East Coast Parkway': 'ECP',
 'East Coast Park': 'ECP',
 'Employment Pass': 'EP',
 'Electronic Road Pricing': 'ERP',
 'Foreign Domestic Worker': 'FDW',
 'Gardens by the Bay': 'GBTB',
 'GEMS World Academy (Singapore)': 'GEMS',
 'German Europea

In [6]:
duke_hr.loc[len(duke_hr)] = [len(duke_hr)+1, 'Can you tell me more about the National University of Singapore', len(duke_hr)+1, 'Yes, sure..']

In [7]:
duke_hr

Unnamed: 0,query_id,query_str,ans_id,ans_str
0,1,What is the total number of work hours per week?,1,"The working hours is 42 hours per week, exclud..."
1,2,How long is the official working hours?,2,The official working hours for Monday to Thurs...
2,3,Are there any other things to take note of reg...,3,"Depending on the nature of work, confirmed ful..."
3,4,Can you tell me about the company's policy on ...,4,Employees shall be entitled to all government ...
4,5,Can you tell me how overtime works in this com...,5,Employees are not entitled to payment for over...
5,6,What happens if I fall sick and not able to at...,6,"In the event of illness, an employee must noti..."
6,7,What is the probation & confirmation about?,7,Employees are normally required to serve a pro...
7,8,Can you tell me more about the retirement poli...,8,The retirement age of an employee is currently...
8,9,Can you tell me about the disciplinary procedu...,9,All employees are subject to the disciplinary ...
9,10,Can you tell me about MVC (Monthly Variable Co...,10,This is applicable to Executive & Professional...


# Basic 

In [8]:
last_index= len(duke_hr)

# if multiple same shortforms appear
for index, row in duke_hr.iterrows():
    temp_arr = []
    temp = ''
    for key in short_forms_dict.keys():
        if (key.lower() in row['query_str'].lower()):
            if (row['query_str'] in temp_arr):
                temp = temp.replace(key, short_forms_dict[key])
                print(f'Multiple characters have to be mapped for this row: {row["query_str"]}')
            else:
                temp = row['query_str'].replace(key, short_forms_dict[key])
            temp_arr.append(row['query_str'])
    if (temp != ''):
        duke_hr.loc[last_index] = [last_index + 1, temp, last_index + 1, row['ans_str']]   
        last_index += 1

Multiple characters have to be mapped for this row: Tell me more about Employment Passes / Dependent Passes regarding leaving service.
Multiple characters have to be mapped for this row: Can you tell me more about the National University of Singapore


In [9]:
duke_hr

Unnamed: 0,query_id,query_str,ans_id,ans_str
0,1,What is the total number of work hours per week?,1,"The working hours is 42 hours per week, exclud..."
1,2,How long is the official working hours?,2,The official working hours for Monday to Thurs...
2,3,Are there any other things to take note of reg...,3,"Depending on the nature of work, confirmed ful..."
3,4,Can you tell me about the company's policy on ...,4,Employees shall be entitled to all government ...
4,5,Can you tell me how overtime works in this com...,5,Employees are not entitled to payment for over...
5,6,What happens if I fall sick and not able to at...,6,"In the event of illness, an employee must noti..."
6,7,What is the probation & confirmation about?,7,Employees are normally required to serve a pro...
7,8,Can you tell me more about the retirement poli...,8,The retirement age of an employee is currently...
8,9,Can you tell me about the disciplinary procedu...,9,All employees are subject to the disciplinary ...
9,10,Can you tell me about MVC (Monthly Variable Co...,10,This is applicable to Executive & Professional...


# Testing Ground 

In [10]:
test_strings = ["National University of Singapore ", "national university of singapore", "national uni of sg", "national universiti of singapore", "NationalUniversityofSingapore", "National University of Singaproe", "NUS", "Singapore National University"]

correct_answer = "National University of Singapore"

# test_strings = [input(f'Input the Test String {i+1}') for i in range(5)]
# correct_answer = input('Enter the Value in the Mapped Dictionary')

## The Levenshtein Distance


In [11]:
import numpy as np

def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
    """ levenshtein_ratio_and_distance:
        Calculates levenshtein distance between two strings.
        If ratio_calc = True, the function computes the
        levenshtein distance ratio of similarity between two strings
        For all i and j, distance[i,j] will contain the Levenshtein
        distance between the first i characters of s and the
        first j characters of t
    """
    # Initialize matrix of zeros
    rows = len(s)+1
    cols = len(t)+1
    distance = np.zeros((rows,cols),dtype = int)

    # Populate matrix of zeros with the indeces of each character of both strings
    for i in range(1, rows):
        for k in range(1,cols):
            distance[i][0] = i
            distance[0][k] = k

    # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions    
    for col in range(1, cols):
        for row in range(1, rows):
            if s[row-1] == t[col-1]:
                cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0
            else:
                # In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio
                # the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1.
                if ratio_calc == True:
                    cost = 2
                else:
                    cost = 1
            distance[row][col] = min(distance[row-1][col] + 1,      # Cost of deletions
                                 distance[row][col-1] + 1,          # Cost of insertions
                                 distance[row-1][col-1] + cost)     # Cost of substitutions
    if ratio_calc == True:
        # Computation of the Levenshtein Distance Ratio
        Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))
        return Ratio
    else:
        # print(distance) # Uncomment if you want to see the matrix showing how the algorithm computes the cost of deletions,
        # insertions and/or substitutions
        # This is the minimum number of edits needed to convert string a to string b
        return "The strings are {} edits away".format(distance[row][col])

In [12]:
# This seems to work
for test_str in test_strings:
    print('=' * 100)
    print(f'Testing {test_str} now..')
    distance = levenshtein_ratio_and_distance(test_str, correct_answer)
    print(distance)
    ratio = levenshtein_ratio_and_distance(test_str, correct_answer,ratio_calc = True)
    print(ratio)

Testing National University of Singapore  now..
The strings are 1 edits away
0.9846153846153847
Testing national university of singapore now..
The strings are 3 edits away
0.90625
Testing national uni of sg now..
The strings are 17 edits away
0.6
Testing national universiti of singapore now..
The strings are 4 edits away
0.875
Testing NationalUniversityofSingapore now..
The strings are 3 edits away
0.9508196721311475
Testing National University of Singaproe now..
The strings are 2 edits away
0.96875
Testing NUS now..
The strings are 29 edits away
0.17142857142857143
Testing Singapore National University now..
The strings are 23 edits away
0.6229508196721312


## Fuzzy Wuzzy 

In [13]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

### Simple Ratio 

In [14]:
# This seems to work
for test_str in test_strings:
    print('=' * 100)
    print(f'Testing {test_str} now..')
    ratio = fuzz.ratio(test_str, correct_answer)
    print(f'The ratio between these two strings is {ratio}')

Testing National University of Singapore  now..
The ratio between these two strings is 98
Testing national university of singapore now..
The ratio between these two strings is 91
Testing national uni of sg now..
The ratio between these two strings is 60
Testing national universiti of singapore now..
The ratio between these two strings is 88
Testing NationalUniversityofSingapore now..
The ratio between these two strings is 95
Testing National University of Singaproe now..
The ratio between these two strings is 97
Testing NUS now..
The ratio between these two strings is 17
Testing Singapore National University now..
The ratio between these two strings is 62


### Partial Ratio 

In [15]:
for test_str in test_strings:
    print('=' * 100)
    print(f'Testing {test_str} now..')
    simple_ratio = fuzz.ratio(test_str, correct_answer)
    print(f'The simple ratio between these two strings is {simple_ratio}')
    partial_ratio = fuzz.partial_ratio(test_str, correct_answer)
    print(f'The partial ratio between these two strings is {partial_ratio}')

Testing National University of Singapore  now..
The simple ratio between these two strings is 98
The partial ratio between these two strings is 100
Testing national university of singapore now..
The simple ratio between these two strings is 91
The partial ratio between these two strings is 91
Testing national uni of sg now..
The simple ratio between these two strings is 60
The partial ratio between these two strings is 61
Testing national universiti of singapore now..
The simple ratio between these two strings is 88
The partial ratio between these two strings is 88
Testing NationalUniversityofSingapore now..
The simple ratio between these two strings is 95
The partial ratio between these two strings is 90
Testing National University of Singaproe now..
The simple ratio between these two strings is 97
The partial ratio between these two strings is 97
Testing NUS now..
The simple ratio between these two strings is 17
The partial ratio between these two strings is 33
Testing Singapore Nati

### Token Sort Ratio

In [16]:
# This seems to perform btr when strings are inverted
for test_str in test_strings:
    print('=' * 100)
    print(f'Testing {test_str} now..')
    simple_ratio = fuzz.ratio(test_str, correct_answer)
    print(f'The simple ratio between these two strings is {simple_ratio}')
    partial_ratio = fuzz.partial_ratio(test_str, correct_answer)
    print(f'The partial ratio between these two strings is {partial_ratio}')
    tokensort_ratio = fuzz.token_sort_ratio(test_str, correct_answer)
    print(f'The token sort ratio between these two strings is {tokensort_ratio}')

Testing National University of Singapore  now..
The simple ratio between these two strings is 98
The partial ratio between these two strings is 100
The token sort ratio between these two strings is 100
Testing national university of singapore now..
The simple ratio between these two strings is 91
The partial ratio between these two strings is 91
The token sort ratio between these two strings is 100
Testing national uni of sg now..
The simple ratio between these two strings is 60
The partial ratio between these two strings is 61
The token sort ratio between these two strings is 72
Testing national universiti of singapore now..
The simple ratio between these two strings is 88
The partial ratio between these two strings is 88
The token sort ratio between these two strings is 97
Testing NationalUniversityofSingapore now..
The simple ratio between these two strings is 95
The partial ratio between these two strings is 90
The token sort ratio between these two strings is 62
Testing National U

### Token Set Ratio 

In [17]:
# This seems to perform btr when strings are inverted
for test_str in test_strings:
    print('=' * 100)
    print(f'Testing {test_str} now..')
    simple_ratio = fuzz.ratio(test_str, correct_answer)
    print(f'The simple ratio between these two strings is {simple_ratio}')
    partial_ratio = fuzz.partial_ratio(test_str, correct_answer)
    print(f'The partial ratio between these two strings is {partial_ratio}')
    tokensort_ratio = fuzz.token_sort_ratio(test_str, correct_answer)
    print(f'The token sort ratio between these two strings is {tokensort_ratio}')
    tokenset_ratio = fuzz.token_set_ratio(test_str, correct_answer)
    print(f'The token set ratio between these two strings is {tokenset_ratio}')

Testing National University of Singapore  now..
The simple ratio between these two strings is 98
The partial ratio between these two strings is 100
The token sort ratio between these two strings is 100
The token set ratio between these two strings is 100
Testing national university of singapore now..
The simple ratio between these two strings is 91
The partial ratio between these two strings is 91
The token sort ratio between these two strings is 100
The token set ratio between these two strings is 100
Testing national uni of sg now..
The simple ratio between these two strings is 60
The partial ratio between these two strings is 61
The token sort ratio between these two strings is 72
The token set ratio between these two strings is 76
Testing national universiti of singapore now..
The simple ratio between these two strings is 88
The partial ratio between these two strings is 88
The token sort ratio between these two strings is 97
The token set ratio between these two strings is 97
Test

### Process

In [18]:
process.extract("National University of Singapore", test_strings, limit=3)

[('National University of Singapore ', 100),
 ('national university of singapore', 100),
 ('national universiti of singapore', 97)]

In [19]:
process.extractOne("National University of Singapore", test_strings)

('National University of Singapore ', 100)

# Implementation 

In [20]:
import itertools
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
short_forms_dict 

{'Anglo-Chinese School': 'ACS',
 'Admissions Exercise for International Students': 'AEIS',
 'Australian International School': 'AIS',
 'Ang Mo Kio': 'AMK',
 'Association of Women for Action and Research': 'AWARE',
 'Ayer Rajah Expressway': 'AYE',
 'Bukit Timah Expressway': 'BKE',
 'Bak Kut Teh': 'BKT',
 'Bukit Timah': 'BT',
 'Built to Order (Flats)': 'BTO',
 'Changi Business Park': 'CBP',
 'Community Club': 'CC',
 'Co-Curricular Activity': 'CCA',
 'Certificate of Entitlement': 'COE',
 'Central Provident Fund': 'CPF',
 'Central Expressway': 'CTE',
 'Development Bank of Singapore': 'DBS',
 'Dover Court International School': 'DCIS',
 'Dulwich College Singapore': 'DCS',
 'Dependent Pass': 'DP',
 'Din Tai Fung': 'DTF',
 'Executive Condominium': 'EC',
 'East Coast Parkway': 'ECP',
 'East Coast Park': 'ECP',
 'Employment Pass': 'EP',
 'Electronic Road Pricing': 'ERP',
 'Foreign Domestic Worker': 'FDW',
 'Gardens by the Bay': 'GBTB',
 'GEMS World Academy (Singapore)': 'GEMS',
 'German Europea

In [22]:
# Idea 1

# Step 1: Get the keys of the dictionary (lowercase)
# Step 2: Tokenize the Words in the Sentence (lowercase)
# Step 3: Compare both lists to find out how many overlaps 
# Step 4: If more than one word overlaps, then detect it
# Step 5: Then check the words that are detected and find their respective indexes in the sentence.
# Step 6: Locate them and extract the string out
# Step 7: Compare the extracted string and the string in the dictionary key with Levenshtein Distance (If metrics > 0.8)
# Step 8: If the extracted string passes the test above, change the dictionary key to the extracted string and map it onto the sentence.

In [23]:
def idea1(sentence):
    paired_words = []
    related_scores = []
    long_forms = short_forms_dict.keys()
    words = sentence.split(' ')
    words = [word for word in words if not word in stopwords.words('english')]
#     dict_count = {key: 0 for key in long_forms}
    for long_form, word in itertools.product(long_forms, words):
        if (word.lower() in long_form.lower()):
            ratio = fuzz.ratio(word, long_form)
            paired_words.append([word, long_form])
            related_scores.append(ratio)
    max_score_index = related_scores.index(max(related_scores))
    text_to_replace = paired_words[max_score_index][0]
    sentence_index = sentence.find(text_to_replace)
    sliced_sentence = sentence[:sentence_index]
    short_form = short_forms_dict[paired_words[max_score_index][1]]
    print(short_form)
    print(sliced_sentence + short_form + sentence[sentence_index-1: ])
#     filter_dict_count = {k: v for k, v in dict_count.items() if v > 0}
#     keys = list(filter_dict_count.keys())
#     counts = list(filter_dict_count.values())
#     max_no = max(counts)
#     highest = [max_no for _ in range(counts.count(max_no))]
#     print(highest) 
#     # First Case (If only one Key) 
#     if (len(filter_dict_count.keys()) == 1):
#         keyword = list(filter_dict_count.keys())[0]
#         print(f'{keyword} has to be replaced into {short_forms_dict[keyword]}.')
#     # Second Case (Unique Key with Maximum Value)
#     elif (len(highest) == 1):
#         position = counts.index(2)
#         print(f'{keys[position]} has to be replaced into {short_forms_dict[keys[position]]}.')
#     # Second Case (Cannot find anyway)
#     elif (len(filter_dict_count.keys()) == 0):
#         return None
    
#     max_key = max(dict_count, key=dict_count.get)
#     print(dict_count)
#     print(max_key)
#     print(short_forms_dict[max_key])

In [24]:
idea1('Explain more about National University of Singapore')

SG
Explain more about National University of SG Singapore


In [None]:
# transform the user inputs into a similar metric