# TEXT PROCESSING PRACTICE

#### Convert text data to numbers so that they can be fed to mathematical models.

### Step 1) TEXT PREPARATION
Cleaning (remove punctuation, convert all words to upper / lower case, 
remove formatting e.g. html tags)

Stemming VS Lemmatization:
- Stemming cuts off prefixes and/or ends of words; quicker but less precise
- Lemmatization considers context and returns actual word in vocabulary

In [2]:
import nltk
import numpy as np
import string
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

file = open('space_invaders.txt', encoding='utf-8')
doc = file.read()
file.close()

In [3]:
#Tokenization (into sentences)
sentences = nltk.sent_tokenize(doc)
print(sentences)

['Space Invaders is a fixed shooter in which the player controls a laser cannon by moving it horizontally across the bottom of the screen and firing at descending aliens.', 'The aim is to defeat five rows of eleven aliens—although some versions feature different numbers—that move horizontally back and forth across the screen as they advance toward the bottom of the screen.', "The player's laser cannon is partially protected by several stationary defense bunkers—the number also varies by version—that are gradually destroyed from the top and bottom by blasts from either the aliens or the player.", 'The player defeats an alien and earns points by shooting it with the laser cannon.', "As more aliens are defeated, the aliens' movement and the game's music both speed up.", 'Defeating all the aliens on-screen brings another wave that is more difficult, a loop which can continue endlessly.', 'A special "mystery ship" will occasionally move across the top of the screen and award bonus points if

In [5]:
#Tokenization (into words)
from nltk import word_tokenize

words = word_tokenize(doc)
print(words)

['Space', 'Invaders', 'is', 'a', 'fixed', 'shooter', 'in', 'which', 'the', 'player', 'controls', 'a', 'laser', 'cannon', 'by', 'moving', 'it', 'horizontally', 'across', 'the', 'bottom', 'of', 'the', 'screen', 'and', 'firing', 'at', 'descending', 'aliens', '.', 'The', 'aim', 'is', 'to', 'defeat', 'five', 'rows', 'of', 'eleven', 'aliens—although', 'some', 'versions', 'feature', 'different', 'numbers—that', 'move', 'horizontally', 'back', 'and', 'forth', 'across', 'the', 'screen', 'as', 'they', 'advance', 'toward', 'the', 'bottom', 'of', 'the', 'screen', '.', 'The', 'player', "'s", 'laser', 'cannon', 'is', 'partially', 'protected', 'by', 'several', 'stationary', 'defense', 'bunkers—the', 'number', 'also', 'varies', 'by', 'version—that', 'are', 'gradually', 'destroyed', 'from', 'the', 'top', 'and', 'bottom', 'by', 'blasts', 'from', 'either', 'the', 'aliens', 'or', 'the', 'player', '.', 'The', 'player', 'defeats', 'an', 'alien', 'and', 'earns', 'points', 'by', 'shooting', 'it', 'with', 'the

In [6]:
#Stemming
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()
print([lm.lemmatize(token) for token in word_tokenize(doc)])

['Space', 'Invaders', 'is', 'a', 'fixed', 'shooter', 'in', 'which', 'the', 'player', 'control', 'a', 'laser', 'cannon', 'by', 'moving', 'it', 'horizontally', 'across', 'the', 'bottom', 'of', 'the', 'screen', 'and', 'firing', 'at', 'descending', 'alien', '.', 'The', 'aim', 'is', 'to', 'defeat', 'five', 'row', 'of', 'eleven', 'aliens—although', 'some', 'version', 'feature', 'different', 'numbers—that', 'move', 'horizontally', 'back', 'and', 'forth', 'across', 'the', 'screen', 'a', 'they', 'advance', 'toward', 'the', 'bottom', 'of', 'the', 'screen', '.', 'The', 'player', "'s", 'laser', 'cannon', 'is', 'partially', 'protected', 'by', 'several', 'stationary', 'defense', 'bunkers—the', 'number', 'also', 'varies', 'by', 'version—that', 'are', 'gradually', 'destroyed', 'from', 'the', 'top', 'and', 'bottom', 'by', 'blast', 'from', 'either', 'the', 'alien', 'or', 'the', 'player', '.', 'The', 'player', 'defeat', 'an', 'alien', 'and', 'earns', 'point', 'by', 'shooting', 'it', 'with', 'the', 'laser

In [9]:
#alternative way for writing the above is to tokenize first, then lemmatize
print([lm.lemmatize(token) for token in words])

['Space', 'Invaders', 'is', 'a', 'fixed', 'shooter', 'in', 'which', 'the', 'player', 'control', 'a', 'laser', 'cannon', 'by', 'moving', 'it', 'horizontally', 'across', 'the', 'bottom', 'of', 'the', 'screen', 'and', 'firing', 'at', 'descending', 'alien', '.', 'The', 'aim', 'is', 'to', 'defeat', 'five', 'row', 'of', 'eleven', 'aliens—although', 'some', 'version', 'feature', 'different', 'numbers—that', 'move', 'horizontally', 'back', 'and', 'forth', 'across', 'the', 'screen', 'a', 'they', 'advance', 'toward', 'the', 'bottom', 'of', 'the', 'screen', '.', 'The', 'player', "'s", 'laser', 'cannon', 'is', 'partially', 'protected', 'by', 'several', 'stationary', 'defense', 'bunkers—the', 'number', 'also', 'varies', 'by', 'version—that', 'are', 'gradually', 'destroyed', 'from', 'the', 'top', 'and', 'bottom', 'by', 'blast', 'from', 'either', 'the', 'alien', 'or', 'the', 'player', '.', 'The', 'player', 'defeat', 'an', 'alien', 'and', 'earns', 'point', 'by', 'shooting', 'it', 'with', 'the', 'laser

Stop words - words that are very common and not valuable to models for differentiation purposes. 
We will remove them before doing further processing.

Note: there is no universal list of stop words; each Natural Language Processing(NLP) Tool has its own list.

In [10]:
#print list of stopwords by NLTK
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [12]:
#removing stop words
from nltk import word_tokenize
from nltk.corpus import stopwords

stops = stopwords.words('english')

tokens = [token for token in word_tokenize(doc) if token not in stops]
#print(tokens)

#further processing : remove punctuation
words = [word for word in tokens if word.isalpha() or word.isalnum()]
print(words)

['Space', 'Invaders', 'fixed', 'shooter', 'player', 'controls', 'laser', 'cannon', 'moving', 'horizontally', 'across', 'bottom', 'screen', 'firing', 'descending', 'aliens', 'The', 'aim', 'defeat', 'five', 'rows', 'eleven', 'versions', 'feature', 'different', 'move', 'horizontally', 'back', 'forth', 'across', 'screen', 'advance', 'toward', 'bottom', 'screen', 'The', 'player', 'laser', 'cannon', 'partially', 'protected', 'several', 'stationary', 'defense', 'number', 'also', 'varies', 'gradually', 'destroyed', 'top', 'bottom', 'blasts', 'either', 'aliens', 'player', 'The', 'player', 'defeats', 'alien', 'earns', 'points', 'shooting', 'laser', 'cannon', 'As', 'aliens', 'defeated', 'aliens', 'movement', 'game', 'music', 'speed', 'Defeating', 'aliens', 'brings', 'another', 'wave', 'difficult', 'loop', 'continue', 'endlessly', 'A', 'special', 'mystery', 'ship', 'occasionally', 'move', 'across', 'top', 'screen', 'award', 'bonus', 'points', 'destroyed', 'The', 'aliens', 'attempt', 'destroy', 'pl

In [17]:
#another example - removing stop words and punctuation
text = 'he likes cats and dogs, and teaching machines to "learn"!'

#remove stop words
words = [word for word in word_tokenize(text) if word not in stops]

#remove punctuation
only_words = [k for k in words if k.isalpha() or k.isalnum()]
print(only_words)

['likes', 'cats', 'dogs', 'teaching', 'machines', 'learn']


In [25]:
#another way to cleanse data
import string
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
#only need to do once per system: nltk.download('stopwords')
#only need to do once per system: nltk.download('wordnet')

sent_clean = []
punc = str.maketrans('', '', string.punctuation)
for sent in sentences:
    sent_no_punc = doc.translate(punc)
    words = sent_no_punc.lower().split()
    words = [lm.lemmatize(word, 'v')
            for word in words if word not in stops]
    sent_clean.append(' '.join(words))
    
print(sent_clean)

['space invaders fix shooter player control laser cannon move horizontally across bottom screen fire descend alien aim defeat five row eleven aliens—although versions feature different numbers—that move horizontally back forth across screen advance toward bottom screen players laser cannon partially protect several stationary defense bunkers—the number also vary version—that gradually destroy top bottom blast either alien player player defeat alien earn point shoot laser cannon alien defeat alien movement game music speed defeat alien onscreen bring another wave difficult loop continue endlessly special mystery ship occasionally move across top screen award bonus point destroy alien attempt destroy players cannon fire approach bottom screen reach bottom alien invasion declare successful game end tragically otherwise end generally players last cannon destroy enemys projectiles space invaders create japanese designer tomohiro nishikado spend year design game develop necessary hardware pr

## Step 2) GENERATE FEATURES FOR TEXT

Common techniques:
    - Bag of Words (BOW) : i.e. all unique words in a corpus
    - TF-IDF : extracting meaningful data from text; highlight meaningful words

#### BoW Steps...

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
#load text (here i will use the text from above)

#create countvectorizer object for transform
bow = CountVectorizer()

#tokenize and build the vocab (shows words present and no. of times they appear)
bow.fit(words)
print(bow.vocabulary_)

{'likes': 3, 'cats': 0, 'dogs': 1, 'teaching': 5, 'machines': 4, 'learn': 2}


In [27]:
#BoW Alternative:

#generate feature vectors using BoW
bow = CountVectorizer()
feature_vectors = bow.fit_transform(only_words).toarray()
vocab = bow.get_feature_names()

df = pd.DataFrame(data=feature_vectors, index = ['word1', 'word2', 'word3', 'word4', 'word5', 'word6'], columns=vocab)
print(df)

       cats  dogs  learn  likes  machines  teaching
word1     0     0      0      1         0         0
word2     1     0      0      0         0         0
word3     0     1      0      0         0         0
word4     0     0      0      0         0         1
word5     0     0      0      0         1         0
word6     0     0      1      0         0         0


#### TF-IDF...

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer


#load text

#create tfidfvectorizer object for transform
tfidf = TfidfVectorizer()
feature_vectors = tfidf.fit_transform(only_words).toarray()
vocab = tfidf.get_feature_names()

df = pd.DataFrame(data=feature_vectors, index=['word1', 'word2', 'word3', 'word4', 'word5', 'word6'], columns=vocab)
print(df)

       cats  dogs  learn  likes  machines  teaching
word1   0.0   0.0    0.0    1.0       0.0       0.0
word2   1.0   0.0    0.0    0.0       0.0       0.0
word3   0.0   1.0    0.0    0.0       0.0       0.0
word4   0.0   0.0    0.0    0.0       0.0       1.0
word5   0.0   0.0    0.0    0.0       1.0       0.0
word6   0.0   0.0    1.0    0.0       0.0       0.0


#### Cosine Similarity : after tfidf transforms documents into vectors, use cosine similarity to compare similarity between docs
    - the cosine similarity between 2 vectors(i.e. docs) is the cosine angle between them
    - similar cosine angle = docs are similar
    - cosine similarity is a measure of ORIENTATION, not magnitude!

#### Cosine Similarity steps...

In [33]:
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#create function for data pre-processing to use on both corpus and query string
def preprocess(docs):
    cleansed = []
    
    punc = str.maketrans('', '', string.punctuation)
    for doc in docs:
        doc_no_punc = doc.translate(punc)
        words = doc_no_punc.lower().split()
        words = [lm.lemmatize(word, 'v')
                for word in words if word not in stops]
        cleansed.append(' '.join(words))
        
    return cleansed

In [34]:
#define documents
docs = [ 'John has some cats.',
           'Cats, being cats, eat fish.',
           'I ate a big fish']

#define query that you want to compare against documents
query = ['cats and fish']

#apply your preprocessing function on both (docs and query)
clean_docs = preprocess(docs)
clean_query = preprocess(query)

#fit tfidf model using corpus (i.e. docs)
tfidf = TfidfVectorizer()
tfidf.fit(clean_docs)

#use fitted model to generate feature_vectors for both corpus and query string
fv_docs = tfidf.transform(clean_docs).toarray()
fv_query = tfidf.transform(clean_query).toarray()

df = pd.DataFrame(data=fv_query, index=['Query String'], columns=tfidf.get_feature_names())
print(df)

              big       cat  eat      fish  john
Query String  0.0  0.707107  0.0  0.707107   0.0


In [35]:
#perform cosine similarity for query string against docs
similarity = cosine_similarity(fv_query, fv_docs)
cos_sim = pd.DataFrame(data=similarity, index=['cosine similarity'], columns=['doc 1', 'doc 2', 'doc3'])
print(cos_sim)

                      doc 1     doc 2     doc3
cosine similarity  0.428046  0.866025  0.36618


#### * higher cosine similarity value => query and doc are more similar! *