### Preprocessing Steps

* Case normalization
* Special-case tokens
* Remove punctuation
* Tokenize on whitespace
* Stopword removal
* Spell correction
* Stemming
* (Synonym substitution)
* Vectorization

In [1]:
import nltk

In [2]:
# Sample commands used throughout the preprocessing walkthrough. They include
# mixed case, punctuation, a bare number, "(1,1)" coordinates, questions and a
# misspelling ("Fnid").
original_sentences = [
    "Sucky, move 3 spaces forward",
    "Rotate right please Sucky?",
    "i'd like you to move to (1,1)",
    "yo sucky, some gold at (1,1) please!",
    "Where are you?",
    "Fnid me some GOLD!"
]

In [3]:
# Case normalization: lower-case every sentence.
sentences = original_sentences
sent2 = list(map(str.lower, sentences))
sent2

['sucky, move 3 spaces forward',
 'rotate right please sucky?',
 "i'd like you to move to (1,1)",
 'yo sucky, some gold at (1,1) please!',
 'where are you?',
 'fnid me some gold!']

In [4]:
import re

# Matches coordinates like (3,5)
recoord = re.compile(r'\(\d+,\d+\)')
# Matches strings of digits
renumber = re.compile(r'\d+')

def subst_special(line):
    """Replace coordinates, numbers and a trailing '?' with placeholder tokens.

    Args:
        line: The text to process.

    Returns:
        A copy of ``line`` in which every coordinate such as ``(3,5)`` is
        replaced by ``xxcoordxx`` and every remaining run of digits by
        ``xxnumberxx``. If the result ends with ``?``, the text
        `` xxqmarkxx`` is appended. An empty string is returned unchanged.
    """
    # Coordinates are replaced first; otherwise the digits inside "(1,1)"
    # would be consumed by the plain-number pattern.
    with_coords = recoord.sub('xxcoordxx', line)
    with_numbers = renumber.sub('xxnumberxx', with_coords)
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if with_numbers.endswith('?'):
        with_numbers += ' xxqmarkxx'
    return with_numbers


In [5]:
# Special-case tokens: swap coordinates, numbers and question marks for placeholders.
sent3 = list(map(subst_special, sent2))
sent3

['sucky, move xxnumberxx spaces forward',
 'rotate right please sucky? xxqmarkxx',
 "i'd like you to move to xxcoordxx",
 'yo sucky, some gold at xxcoordxx please!',
 'where are you? xxqmarkxx',
 'fnid me some gold!']

In [6]:
import string
def remove_punct(str):
    """Return the given text with every ``string.punctuation`` character dropped."""
    kept = []
    for ch in str:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)

In [7]:
# Remove punctuation from every sentence.
sent4 = list(map(remove_punct, sent3))
sent4

['sucky move xxnumberxx spaces forward',
 'rotate right please sucky xxqmarkxx',
 'id like you to move to xxcoordxx',
 'yo sucky some gold at xxcoordxx please',
 'where are you xxqmarkxx',
 'fnid me some gold']

In [8]:
# Tokenize: split each sentence on runs of whitespace.
sent5 = list(map(str.split, sent4))
sent5

[['sucky', 'move', 'xxnumberxx', 'spaces', 'forward'],
 ['rotate', 'right', 'please', 'sucky', 'xxqmarkxx'],
 ['id', 'like', 'you', 'to', 'move', 'to', 'xxcoordxx'],
 ['yo', 'sucky', 'some', 'gold', 'at', 'xxcoordxx', 'please'],
 ['where', 'are', 'you', 'xxqmarkxx'],
 ['fnid', 'me', 'some', 'gold']]

In [9]:
from nltk.corpus import stopwords
# NLTK's English stopword list, extended with the robot's name.
stop_words = stopwords.words('english')
stop_words.append('sucky')

def remove_stopwords(words):
    """Return the tokens of ``words`` that are not in ``stop_words``, in order."""
    kept = []
    for token in words:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


In [12]:
# Size of the stopword list (NLTK's English list plus the added 'sucky').
len(stop_words)

180

In [10]:
# Stopword removal applied to each tokenized sentence.
sent6 = list(map(remove_stopwords, sent5))
sent6

[['move', 'xxnumberxx', 'spaces', 'forward'],
 ['rotate', 'right', 'please', 'xxqmarkxx'],
 ['id', 'like', 'move', 'xxcoordxx'],
 ['yo', 'gold', 'xxcoordxx', 'please'],
 ['xxqmarkxx'],
 ['fnid', 'gold']]

In [16]:
# Import (and fetch, if needed) the word list in this cell so it runs on a
# fresh kernel instead of relying on a later cell having been executed first.
from nltk.corpus import words
nltk.download('words', quiet=True)

# Number of entries in the NLTK word list.
len(words.words())

236736

In [17]:
from nltk.corpus import words
from nltk.metrics.distance import edit_distance
nltk.download('words')

# Load the word list once. words.words() rebuilds the full list on every call,
# so calling it inside correct() made each lookup re-read the corpus.
vocabulary = words.words()
# Set copy for fast membership tests; the list keeps the original order.
vocabulary_set = frozenset(vocabulary)

# Recognizes "special tokens" so they won't be spell corrected
special = re.compile(r'xx\w+xx')

def correct(word):
    """Return ``word`` if it is known, otherwise its closest word-list entry.

    A word is returned unchanged when it starts with a special token
    (``special.match`` anchors only at the start) or appears in the word
    list. Otherwise the entry with the smallest edit distance is returned;
    ties are broken by ordinary string ordering of the candidates, so
    capitalised entries sort ahead of lower-case ones.

    Prints the word being corrected and the chosen ``(distance, word)`` pair.
    """
    if special.match(word) or word in vocabulary_set:
        return word
    print(f"Correcting {word}")
    distances = ((edit_distance(word, realword), realword) for realword in vocabulary)
    closest = min(distances)
    print(f"Got {closest}")
    return closest[1]

def spell_correct(words):
    """Apply ``correct`` to every token in ``words`` and return the new list."""
    return [correct(word) for word in words]


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\hanks\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [18]:
# Spell correction applied to each token list.
sent7 = list(map(spell_correct, sent6))
sent7

Correcting spaces
Got (1, 'space')
Correcting fnid
Got (1, 'Enid')


[['move', 'xxnumberxx', 'space', 'forward'],
 ['rotate', 'right', 'please', 'xxqmarkxx'],
 ['id', 'like', 'move', 'xxcoordxx'],
 ['yo', 'gold', 'xxcoordxx', 'please'],
 ['xxqmarkxx'],
 ['Enid', 'gold']]

In [19]:
from nltk.stem.porter import PorterStemmer
# One shared Porter stemmer instance, used by stem() below.
porter = PorterStemmer()

def stem(words):
    """Return the Porter stem of each token in ``words``, preserving order."""
    stemmed = []
    for token in words:
        stemmed.append(porter.stem(token))
    return stemmed


In [20]:
# Stemming applied to each spell-corrected token list.
sent8 = list(map(stem, sent7))
sent8

[['move', 'xxnumberxx', 'space', 'forward'],
 ['rotat', 'right', 'pleas', 'xxqmarkxx'],
 ['id', 'like', 'move', 'xxcoordxx'],
 ['yo', 'gold', 'xxcoordxx', 'pleas'],
 ['xxqmarkxx'],
 ['enid', 'gold']]

#### Vectorization

In [21]:
# Only the special-token substitution is applied here, to the raw sentences.
sentSubst = list(map(subst_special, original_sentences))
sentSubst

['Sucky, move xxnumberxx spaces forward',
 'Rotate right please Sucky? xxqmarkxx',
 "i'd like you to move to xxcoordxx",
 'yo sucky, some gold at xxcoordxx please!',
 'Where are you? xxqmarkxx',
 'Fnid me some GOLD!']

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# Learn the vocabulary and count tokens per sentence in one pass.
counts = cv.fit_transform(sentSubst)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# One row per sentence, one column per vocabulary token.
dfVec = pd.DataFrame(counts.toarray(), columns=feature_names)

In [23]:
# Preview the top rows of the token-count table (one row per sentence, one column per token).
dfVec.head()

Unnamed: 0,are,at,fnid,forward,gold,like,me,move,please,right,...,some,spaces,sucky,to,where,xxcoordxx,xxnumberxx,xxqmarkxx,yo,you
0,0,0,0,1,0,0,0,1,0,0,...,0,1,1,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,1,...,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,1,0,0,...,0,0,0,2,0,1,0,0,0,1
3,0,1,0,0,1,0,0,0,1,0,...,1,0,1,0,0,1,0,0,1,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
