[**Blueprints for Text Analysis Using Python**](https://github.com/blueprints-for-text-analytics-python/blueprints-text)  
Jens Albrecht, Sidharth Ramachandran, Christian Winkler

**If you like the book or the code examples here, please leave a friendly comment on [Amazon.com](https://www.amazon.com/Blueprints-Text-Analytics-Using-Python/dp/149207408X)!**
<img src="../rating.png" width="100"/>

# Chapter 5:<div class='tocSkip'/>

# Feature Engineering and Syntactic Similarity

## Remark<div class='tocSkip'/>

The code in this notebook differs slightly from the printed book. 

Several layout and formatting commands, like `figsize` to control figure size or subplot commands are removed in the book.

All of this is done to simplify the code in the book and put the focus on the important parts instead of formatting.

## Setup<div class='tocSkip'/>

Set directory locations. If working on Google Colab: copy files and install required libraries.

In [1]:
import sys, os
ON_COLAB = 'google.colab' in sys.modules

if ON_COLAB:
    GIT_ROOT = 'https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master'
    os.system(f'wget {GIT_ROOT}/ch05/setup.py')

%run -i setup.py

You are working on a local system.
Files will be searched relative to "..".


## Load Python Settings<div class="tocSkip"/>

Common imports, defaults for formatting in Matplotlib, Pandas etc.

In [2]:
%run "$BASE_DIR/settings.py"

%reload_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'png'

# Data preparation

In [3]:
sentences = ["It was the best of times", 
             "it was the worst of times", 
             "it was the age of wisdom", 
             "it was the age of foolishness"]

tokenized_sentences = [[t for t in sentence.split()] for sentence in sentences]

vocabulary = set([w for s in tokenized_sentences for w in s])

import pandas as pd
[[w, i] for i,w in enumerate(vocabulary)]

[['worst', 0],
 ['of', 1],
 ['it', 2],
 ['wisdom', 3],
 ['It', 4],
 ['times', 5],
 ['best', 6],
 ['foolishness', 7],
 ['age', 8],
 ['the', 9],
 ['was', 10]]

# One-hot by hand

In [4]:
def onehot_encode(tokenized_sentence):
    return [1 if w in tokenized_sentence else 0 for w in vocabulary]

onehot = [onehot_encode(tokenized_sentence) for tokenized_sentence in tokenized_sentences]

for (sentence, oh) in zip(sentences, onehot):
    print("%s: %s" % (oh, sentence))

[0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1]: It was the best of times
[1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1]: it was the worst of times
[0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1]: it was the age of wisdom
[0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1]: it was the age of foolishness


In [5]:
pd.DataFrame(onehot, columns=list(vocabulary))

Unnamed: 0,worst,of,it,wisdom,It,times,best,foolishness,age,the,was
0,0,1,0,0,1,1,1,0,0,1,1
1,1,1,1,0,0,1,0,0,0,1,1
2,0,1,1,1,0,0,0,0,1,1,1
3,0,1,1,0,0,0,0,1,1,1,1


In [6]:
sim = [onehot[0][i] & onehot[1][i] for i in range(0, len(vocabulary))]
sum(sim)

4

In [7]:
import numpy as np
np.dot(onehot[0], onehot[1])

4

In [8]:
np.dot(onehot, onehot[1])

array([4, 6, 4, 4])

## Out of vocabulary

In [9]:
onehot_encode("the age of wisdom is the best of times".split())

[0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0]

In [10]:
onehot_encode("John likes to watch movies. Mary likes movies too.".split())

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

## document term matrix

In [11]:
onehot

[[0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1],
 [1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1],
 [0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1],
 [0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1]]

## similarities

In [12]:
import numpy as np
np.dot(onehot, np.transpose(onehot))

array([[6, 4, 3, 3],
       [4, 6, 4, 4],
       [3, 4, 6, 5],
       [3, 4, 5, 6]])

# scikit learn one-hot vectorization

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer
lb = MultiLabelBinarizer()
lb.fit([vocabulary])
lb.transform(tokenized_sentences)

array([[1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1],
       [0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0]])

# CountVectorizer

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [15]:
more_sentences = sentences + ["John likes to watch movies. Mary likes movies too.",
                              "Mary also likes to watch football games."]
pd.DataFrame(more_sentences)

Unnamed: 0,0
0,It was the best of times
1,it was the worst of times
2,it was the age of wisdom
3,it was the age of foolishness
4,John likes to watch movies. Mary likes movies too.
5,Mary also likes to watch football games.


In [16]:
cv.fit(more_sentences)

In [17]:
print(cv.get_feature_names_out())

['age' 'also' 'best' 'foolishness' 'football' 'games' 'it' 'john' 'likes'
 'mary' 'movies' 'of' 'the' 'times' 'to' 'too' 'was' 'watch' 'wisdom'
 'worst']


In [18]:
dt = cv.transform(more_sentences)

In [19]:
dt

<6x20 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [20]:
pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0,0,1,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,0,0,1
2,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0
3,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,2,1,2,0,0,0,1,1,0,1,0,0
5,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(dt[0], dt[1])

array([[0.83333333]])

In [22]:
len(more_sentences)

6

In [23]:
pd.DataFrame(cosine_similarity(dt, dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.83,0.67,0.67,0.0,0.0
1,0.83,1.0,0.67,0.67,0.0,0.0
2,0.67,0.67,1.0,0.83,0.0,0.0
3,0.67,0.67,0.83,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.52
5,0.0,0.0,0.0,0.0,0.52,1.0


# TF/IDF

In [24]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)

In [25]:
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,age,also,best,foolishness,football,games,it,john,likes,mary,movies,of,the,times,to,too,was,watch,wisdom,worst
0,0.0,0.0,0.57,0.0,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.34,0.34,0.47,0.0,0.0,0.34,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.34,0.34,0.47,0.0,0.0,0.34,0.0,0.0,0.57
2,0.47,0.0,0.0,0.0,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.34,0.34,0.0,0.0,0.0,0.34,0.0,0.57,0.0
3,0.47,0.0,0.0,0.57,0.0,0.0,0.34,0.0,0.0,0.0,0.0,0.34,0.34,0.0,0.0,0.0,0.34,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31,0.5,0.25,0.61,0.0,0.0,0.0,0.25,0.31,0.0,0.25,0.0,0.0
5,0.0,0.42,0.0,0.0,0.42,0.42,0.0,0.0,0.34,0.34,0.0,0.0,0.0,0.0,0.34,0.0,0.0,0.34,0.0,0.0


In [30]:
pd.DataFrame(cosine_similarity(tfidf_dt, tfidf_dt))

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.68,0.46,0.46,0.0,0.0
1,0.68,1.0,0.46,0.46,0.0,0.0
2,0.46,0.46,1.0,0.68,0.0,0.0
3,0.46,0.46,0.68,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.43
5,0.0,0.0,0.0,0.0,0.43,1.0


In [26]:
headlines = pd.read_csv(ABCNEWS_FILE, parse_dates=["publish_date"])
headlines.head()

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting licence
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers


In [31]:
# ABCNEWS contains over 1 million headlines
headlines.shape[0]

1103663

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
dt = tfidf.fit_transform(headlines["headline_text"])

In [33]:
dt

<1103663x95878 sparse matrix of type '<class 'numpy.float64'>'
	with 7001357 stored elements in Compressed Sparse Row format>

In [42]:
# We can convert dt to a dense array to view its contents
dt[:10,:].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
dt.data.nbytes

56010856

In [35]:
%%time
cosine_similarity(dt[0:10000], dt[0:10000])

CPU times: user 83.3 ms, sys: 234 ms, total: 317 ms
Wall time: 316 ms


array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.16913596,
        0.16792138],
       [0.        , 0.        , 0.        , ..., 0.16913596, 1.        ,
        0.33258708],
       [0.        , 0.        , 0.        , ..., 0.16792138, 0.33258708,
        1.        ]])

## Stopwords

In [46]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
print(len(stopwords))
tfidf = TfidfVectorizer(stop_words=list(stopwords))
dt = tfidf.fit_transform(headlines["headline_text"])
dt

326


<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

In [45]:
type(stopwords)

set

## min_df

In [48]:
tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x58527 sparse matrix of type '<class 'numpy.float64'>'
	with 5607113 stored elements in Compressed Sparse Row format>

In [50]:
tfidf = TfidfVectorizer(stop_words=list(stopwords), min_df=.0001)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x6772 sparse matrix of type '<class 'numpy.float64'>'
	with 4816381 stored elements in Compressed Sparse Row format>

## max_df

In [52]:
tfidf = TfidfVectorizer(stop_words=list(stopwords), max_df=0.1)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x95600 sparse matrix of type '<class 'numpy.float64'>'
	with 5644186 stored elements in Compressed Sparse Row format>

In [54]:
tfidf = TfidfVectorizer(max_df=0.05)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

<1103663x95873 sparse matrix of type '<class 'numpy.float64'>'
	with 6380678 stored elements in Compressed Sparse Row format>

## n-grams

In [55]:
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
print(dt.shape)
print(dt.data.nbytes)
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,3), min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
print(dt.shape)
print(dt.data.nbytes)

InvalidParameterError: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'‘ve', 'by', 'enough', 'its', 'was', 'up', 'other', 'between', 'mine', 'ourselves', 'somehow', 'already', "'d", 'how', 'thru', 'during', 'among', '’d', 'our', 'or', 'back', 'each', 'same', 'ca', 'became', 'throughout', 'even', 'less', 'could', 'i', '‘m', 'further', 'often', 'your', 'yet', 'to', 'call', 'forty', 'latter', 'hereafter', 'after', 'however', 'else', 'over', 'done', 'anyway', 'must', 'thus', 'meanwhile', 'alone', 'always', 'otherwise', 'now', 'formerly', 'twelve', 'doing', 'which', 'because', '‘re', 'do', 'sometime', 'hundred', 'more', 'n‘t', 'whenever', 'when', 'yourselves', '‘ll', 'below', 'here', 'who', 'anyone', 'next', "'s", 'nobody', 'many', 'whom', 'at', 'mostly', 'really', 'on', 'such', 'into', 'thereafter', 'besides', 'seeming', 'themselves', 're', 'himself', 'be', 'nine', 'few', 'he', 'if', 'due', 'while', 'others', 'those', 'there', 'nor', 'has', 'one', 'until', 'since', 'above', 'are', 'everything', 'nothing', 'why', 'last', 'well', 'before', 'part', 'say', 'across', 'two', 'us', 'amongst', 'might', 'amount', 'anywhere', 'their', 'yourself', 'me', 'hence', 'used', 'n’t', 'quite', 'four', 'give', 'moreover', 'therefore', 'will', 'within', 'anything', 'hereby', '‘s', 'perhaps', 'wherever', 'first', 'anyhow', 'sometimes', 'seemed', 'around', 'very', 'than', 'least', 'onto', 'off', 'somewhere', 'my', 'beyond', 'again', 'we', 'therein', "'ve", 'someone', 'fifty', 'him', 'were', 'neither', 'whence', 'top', '’s', 'none', 'been', 'down', 'only', 'whereupon', 'noone', 'no', 'does', 'move', 'what', 'per', 'is', "'ll", 'am', 'ours', 'have', 'every', 'another', 'but', 'whoever', 'it', '‘d', 'almost', 'keep', 'ever', 'whatever', "'re", 'former', 'indeed', 'although', 'latterly', 'front', 'should', 'hereupon', 'becomes', 'seem', 'third', 'rather', 'seems', 'that', 'yours', 'though', 'the', 'using', 'whose', 'may', 'along', 'several', 'without', 'did', 'from', 'put', 'with', 'behind', 'against', 'much', 'can', 'some', 'via', 'his', 'myself', 'something', 'either', 'take', 'whereas', 'together', 'sixty', 'where', 'this', 'make', 'name', 'and', 'whole', 'elsewhere', 'a', 'everywhere', 'any', '’m', 'everyone', "'m", 'fifteen', 'cannot', 'most', 'never', 'still', 'she', 'so', 'own', 'then', 'about', 'her', 'empty', 'hers', 'all', 'beside', 'thereby', 'eight', 'six', 'being', 'out', 'whither', 'thence', 'through', 'for', 'an', 'upon', 'full', 'side', 'become', 'as', 'get', 'once', 'show', 'becoming', 'afterwards', 'three', 'serious', 'just', 'wherein', 'itself', '’ll', 'thereupon', 'these', 'nowhere', 'would', 'you', 'towards', 'they', 'five', 'whereafter', 'twenty', 'go', '’ve', 'except', 'made', 'had', 'in', '’re', 'also', 'herein', 'them', 'whether', 'both', 'bottom', 'under', 'beforehand', 'nevertheless', 'ten', 'various', 'toward', 'too', 'please', 'not', 'herself', 'of', 'eleven', 'see', 'namely', 'whereby', 'unless', 'regarding', "n't"} instead.

## Lemmas

In [57]:
from tqdm.auto import tqdm
import spacy
nlp = spacy.load("en_core_web_sm")
nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]
for i, row in tqdm(headlines.iterrows(), total=len(headlines)):
    doc = nlp(str(row["headline_text"]))
    headlines.at[i, "lemmas"] = " ".join([token.lemma_ for token in doc])
    headlines.at[i, "nav"] = " ".join([token.lemma_ for token in doc if token.pos_ in nouns_adjectives_verbs])

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
headlines.head()

In [None]:
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(headlines["lemmas"].map(str))
dt

In [None]:
tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

## remove top 10,000

In [None]:
top_10000 = pd.read_csv("https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt", header=None)
tfidf = TfidfVectorizer(stop_words=set(top_10000.iloc[:,0].values))
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

In [None]:
tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words=set(top_10000.iloc[:,0].values), min_df=2)
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

## Finding document most similar to made-up document

In [None]:
tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)
dt = tfidf.fit_transform(headlines["lemmas"].map(str))
dt

In [None]:
made_up = tfidf.transform(["australia and new zealand discuss optimal apple size"])

In [None]:
sim = cosine_similarity(made_up, dt)

In [None]:
sim[0]

In [None]:
headlines.iloc[np.argsort(sim[0])[::-1][0:5]][["publish_date", "lemmas"]]

# Finding the most similar documents

In [None]:
# there are "test" headlines in the corpus
stopwords.add("test")
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2, norm='l2')
dt = tfidf.fit_transform(headlines["headline_text"])

### Timing Cosine Similarity

In [None]:
%%time
cosine_similarity(dt[0:10000], dt[0:10000], dense_output=False)

In [None]:
%%time
r = cosine_similarity(dt[0:10000], dt[0:10000])
r[r > 0.9999] = 0
print(np.argmax(r))

In [None]:
%%time
r = cosine_similarity(dt[0:10000], dt[0:10000], dense_output=False)
r[r > 0.9999] = 0
print(np.argmax(r))

### Timing Dot-Product

In [None]:
%%time
r = np.dot(dt[0:10000], np.transpose(dt[0:10000]))
r[r > 0.9999] = 0
print(np.argmax(r))

## Batch

In [None]:
%%time
batch = 10000
max_sim = 0.0
max_a = None
max_b = None
for a in range(0, dt.shape[0], batch):
    for b in range(0, a+batch, batch):
        print(a, b)
        #r = np.dot(dt[a:a+batch], np.transpose(dt[b:b+batch]))
        r = cosine_similarity(dt[a:a+batch], dt[b:b+batch], dense_output=False)
        # eliminate identical vectors
        # by setting their similarity to np.nan which gets sorted out
        r[r > 0.9999] = 0
        sim = r.max()
        if sim > max_sim:
            # argmax returns a single value which we have to 
            # map to the two dimensions            
            (max_a, max_b) = np.unravel_index(np.argmax(r), r.shape)
            # adjust offsets in corpus (this is a submatrix)
            max_a += a
            max_b += b
            max_sim = sim

In [None]:
print(max_a, max_b)

In [None]:
print(max_sim)

In [None]:
pd.set_option('max_colwidth', -1)
headlines.iloc[[max_a, max_b]][["publish_date", "headline_text"]]

# Finding most related words

In [None]:
tfidf_word = TfidfVectorizer(stop_words=stopwords, min_df=1000)
dt_word = tfidf_word.fit_transform(headlines["headline_text"])

In [None]:
r = cosine_similarity(dt_word.T, dt_word.T)
np.fill_diagonal(r, 0)

In [None]:
voc = tfidf_word.get_feature_names()
size = r.shape[0] # quadratic
for index in np.argsort(r.flatten())[::-1][0:40]:
    a = int(index/size)
    b = index%size
    if a > b:  # avoid repetitions
        print('"%s" related to "%s"' % (voc[a], voc[b]))