In [16]:
import pandas as pd
import numpy as np
import re
import string

In [10]:
from textdistance import Levenshtein

In [11]:
from nltk.tokenize import TweetTokenizer

In [12]:
def clean_tweet(str1):
    word=re.sub(r'^RT[\s]+','',str1)
    word=re.sub(r'\$\w*','',word)
    word=re.sub(r'https?:\/\/.*[\r\n]*','',word)
    word=re.sub(r'#','',word)
    tokenizer=TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
    tweet_tokens=tokenizer.tokenize(str1)
    word_clean=[]
    for words in tweet_tokens:
        if words not in string.punctuation:
            word_clean.append(words)
    return word_clean

In [17]:
sentence='@apoorva Language is less about words and more about the meaning behind them. If you spend all your time learning vocabulary and grammar, you will never be able to fluently speak a language because you will have little to talk about. These short stories give you the opportunity to understand big ideas in context.'
tok=clean_tweet(sentence)
tok

['language',
 'is',
 'less',
 'about',
 'words',
 'and',
 'more',
 'about',
 'the',
 'meaning',
 'behind',
 'them',
 'if',
 'you',
 'spend',
 'all',
 'your',
 'time',
 'learning',
 'vocabulary',
 'and',
 'grammar',
 'you',
 'will',
 'never',
 'be',
 'able',
 'to',
 'fluently',
 'speak',
 'a',
 'language',
 'because',
 'you',
 'will',
 'have',
 'little',
 'to',
 'talk',
 'about',
 'these',
 'short',
 'stories',
 'give',
 'you',
 'the',
 'opportunity',
 'to',
 'understand',
 'big',
 'ideas',
 'in',
 'context']

In [25]:
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]

In [26]:
levenshtein('cherry', 'merry')

2

In [28]:
for i in tok:
    dist=levenshtein('cherry', i)
    print(i,dist)

language 8
is 6
less 5
about 6
words 5
and 6
more 5
about 6
the 4
meaning 7
behind 6
them 4
if 6
you 6
spend 5
all 6
your 5
time 6
learning 7
vocabulary 7
and 6
grammar 7
you 6
will 6
never 5
be 5
able 6
to 6
fluently 6
speak 5
a 6
language 8
because 7
you 6
will 6
have 5
little 6
to 6
talk 6
about 6
these 4
short 4
stories 6
give 6
you 6
the 4
opportunity 9
to 6
understand 8
big 6
ideas 5
in 6
context 6


In [60]:
dist1=53
def min_dist(l1,dist1):
    for i in l1:
        dist=levenshtein('cherry', i)
        if dist<=dist1:
            dist1=dist
        else:
            continue
    print([x for x in l1 if levenshtein('cherry', x)==dist1])
    print(dist1)

In [73]:
min_dist(tok,dist1)

['them', 'the', 'these', 'short']
4


In [44]:
from textdistance import levenshtein #will use this with pandas to reduce time

In [51]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer=WordNetLemmatizer()
input_str=' '.join(tok)
input_str=word_tokenize(input_str)
for word in input_str:
    print(lemmatizer.lemmatize(word))

language
is
le
about
word
and
more
about
the
meaning
behind
them
if
you
spend
all
your
time
learning
vocabulary
and
grammar
you
will
never
be
able
to
fluently
speak
a
language
because
you
will
have
little
to
talk
about
these
short
story
give
you
the
opportunity
to
understand
big
idea
in
context


In [46]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Shreya
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [124]:
dist1=100
word='cherry'
def min_dist_fuzz(l1,dist1,word):
    for i in l1:
        distant = SequenceMatcher(None,i, word)
        rate=distant.ratio()
        if rate<=dist1:
            dist1=rate
        else:
            continue
    print([x for x in l1 if SequenceMatcher(None,x, word).ratio()==dist1])
    print(distant)

In [127]:
from difflib import SequenceMatcher
m = SequenceMatcher(None,"NEW YORK METS", "NEW YORK MEATS")

In [128]:
m.ratio()

0.9629629629629629

In [72]:
min_dist_fuzz(tok,dist1,word)

['is', 'if', 'in', 'and', 'about', 'all', 'big', 'to', 'a', 'talk', 'will']
<difflib.SequenceMatcher object at 0x000000E3C5F672B0>


In [71]:
tok= set(tok) 

In [137]:
dist1=0
word='cherry'
def min_dist_fuzz_spar(l1,dist1,word):
    for i in l1:
        distant = fuzz.partial_ratio(i, word)
        if distant>=dist1:
            dist1=distant       
        else:
            continue
    print([x for x in l1 if fuzz.partial_ratio(x, word)==dist1])
    #print(dist1)

In [138]:
min_dist_fuzz_spar(tok,dist1,word)

['the']


In [110]:
from fuzzywuzzy import fuzz

In [106]:
pip install fuzzywuzzy

Collecting fuzzywuzzyNote: you may need to restart the kernel to use updated packages.
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy

Successfully installed fuzzywuzzy-0.18.0


In [139]:
min_dist_fuzz_spar(tok,dist1,'vocab')

['a', 'vocabulary']


In [140]:
min_dist_fuzz_spar(tok,dist1,'under')

['understand']


In [141]:
min_dist_fuzz_spar(tok,dist1,'sight')

['big', 'to']


In [142]:
min_dist_fuzz_spar(tok,dist1,'towards')

['to', 'a']


In [143]:
min_dist_fuzz_spar(tok,dist1,'blame')

['a']


In [144]:
min_dist_fuzz_spar(tok,dist1,'cherry')

['the']


In [145]:
min_dist_fuzz(tok,dist1,'merry')

['is', 'if', 'in', 'and', 'about', 'all', 'big', 'to', 'a', 'talk', 'will']
<difflib.SequenceMatcher object at 0x000000E3CA87C2E0>


In [147]:
min_dist_fuzz(tok,dist1,'cheered')

['is', 'if', 'in', 'about', 'all', 'big', 'to', 'a', 'you', 'talk', 'will']
<difflib.SequenceMatcher object at 0x000000E3CA87C220>


In [148]:
min_dist_fuzz(tok,dist1,'today') #Sparse is necessary

['is', 'if', 'in', 'big', 'be', 'give', 'never', 'will', 'less']
<difflib.SequenceMatcher object at 0x000000E3CA87C9D0>
