In [52]:
import pandas as pd
df = pd.read_csv('condo.csv')

# Underscore-joined variant of every property name; runs of whitespace collapse
# into a single '_' exactly as '_'.join(x.split()) would.
df['PropName2'] = df['PropName'].str.split().str.join('_')

# One property name per line is the training corpus. Joining a single-column
# row with ' ' yields the value itself, so the column can be used directly.
text = df['PropName'].tolist()

# Pin the encoding: fastText reads its input as UTF-8, while open() would
# otherwise fall back to the platform's locale encoding. Plain 'w' is enough
# because the file is never read back through this handle.
with open('name_address.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in text)

In [108]:
# Keep only the text after the last comma of each address.
df['Address'].map(lambda address: address.rsplit(',', 1)[-1])

0             Penang
1           Selangor
2       Kuala Lumpur
3       Kuala Lumpur
4             Penang
           ...      
849           Penang
850           Penang
851          Sarawak
852            Sabah
853         Selangor
Name: Address, Length: 854, dtype: object

In [55]:
import fasttext

# Train unsupervised skip-gram vectors on the exported corpus file, then
# persist the trained model to disk.
word_vec = fasttext.train_unsupervised(
    'name_address.txt',
    model='skipgram',
    dim=100,
    ws=1,
    minn=5,
    maxn=10,
    wordNgrams=1,
    thread=6,
)
word_vec.save_model("model_prop_vec.bin")

In [56]:
# Closest vocabulary entries to the query token, returned as (score, token) pairs.
word_vec.get_nearest_neighbors(word='cyber')

[(0.3540104925632477, 'Putra'),
 (0.2650090456008911, '(Putra'),
 (0.23735281825065613, 'Seroja'),
 (0.2106447070837021, '(Bukit'),
 (0.17398576438426971, '(Seri'),
 (0.16797855496406555, 'Block'),
 (0.14284098148345947, 'Casa'),
 (0.13618676364421844, '</s>'),
 (0.1289900243282318, 'Orkid'),
 (0.12857921421527863, 'Teratai')]

In [57]:
import difflib

In [101]:
word = 'Avant'

# Match case-insensitively, but keep a lookup from each lower-cased name back to
# its original spelling so the result is the real property name rather than a
# .title() approximation (which would mangle acronyms or mixed-case names).
names_by_lower = {name.lower(): name for name in df['PropName'].dropna()}
close = difflib.get_close_matches(word.lower(), list(names_by_lower), n=1, cutoff=0.5)

# get_close_matches returns an empty list when nothing clears the cutoff;
# evaluate to None in that case instead of raising IndexError.
names_by_lower[close[0]] if close else None

'Avant Court'

In [89]:
# Index label of the property name with the highest Levenshtein similarity
# ratio to `word`, compared case-insensitively.
(
    df['PropName']
    .map(
        lambda name: levenshtein_ratio_and_distance(
            word.lower(), name.lower(), ratio_calc=True
        )
    )
    .idxmax()
)

122

In [90]:
# Property name stored at integer position 122.
df['PropName'].iat[122]

'Cahaya Permai'

In [62]:
import numpy as np
def levenshtein_ratio_and_distance(s, t, ratio_calc=False):
    """Compute the Levenshtein distance, or a similarity ratio, between two strings.

    A dynamic-programming matrix is filled so that ``distance[i, j]`` holds the
    edit cost between the first ``i`` characters of ``s`` and the first ``j``
    characters of ``t``. The answer is the bottom-right cell.

    Parameters
    ----------
    s, t : str
        The strings to compare. Either (or both) may be empty.
    ratio_calc : bool, default False
        If true, a substitution costs 2 instead of 1 (chosen to align the
        result with the Python Levenshtein package) and the function returns
        the similarity ratio ``(len(s) + len(t) - cost) / (len(s) + len(t))``.

    Returns
    -------
    float or str
        With ``ratio_calc`` true: the similarity ratio in ``[0, 1]``; two empty
        strings are treated as identical and yield ``1.0``.
        Otherwise: the message ``"The strings are N edits away"`` where ``N``
        is the Levenshtein distance.
    """
    rows = len(s) + 1
    cols = len(t) + 1
    distance = np.zeros((rows, cols), dtype=int)

    # Base cases: turning a prefix into the empty string (or the reverse) costs
    # one edit per character. Filling the borders directly also covers the
    # case where one of the strings is empty.
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    # Substitution is priced as delete + insert when computing the ratio.
    substitution_cost = 2 if ratio_calc else 1

    for row in range(1, rows):
        for col in range(1, cols):
            cost = 0 if s[row - 1] == t[col - 1] else substitution_cost
            distance[row, col] = min(
                distance[row - 1, col] + 1,         # deletion
                distance[row, col - 1] + 1,         # insertion
                distance[row - 1, col - 1] + cost,  # substitution / match
            )

    # Read the bottom-right cell explicitly; relying on the loop variables
    # would fail when a loop body never runs (empty input).
    edits = distance[-1, -1]

    if ratio_calc:
        total_length = len(s) + len(t)
        if total_length == 0:
            # Two empty strings are identical; avoid dividing by zero.
            return np.float64(1.0)
        return (total_length - edits) / total_length

    # Minimum number of edits needed to convert s into t.
    return "The strings are {} edits away".format(edits)