# From Tokens to Numbers: The Document-Term Matrix

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter

# Path & Counter
from pathlib import Path
from collections import Counter

# SciPy
from scipy import sparse
from scipy.spatial.distance import pdist

# Sys
import sys

# Warnings
import warnings

# IPywidget
from ipywidgets import interact, FloatRangeSlider

# spaCy
import spacy

# Scikit-Learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [2]:
%matplotlib inline

In [3]:
# Seed NumPy's global random number generator so random draws are repeatable
np.random.seed(42)

# Use seaborn's plain 'white' figure style
sns.set_style('white')

# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

### Loading BBC Data

In [4]:
# Root folder of the corpus; the folder that directly contains an article
# file supplies that article's topic label
path = Path('..', 'data', 'bbc')

# sorted() consumes the glob generator directly and fixes the document order
files = sorted(path.glob('**/*.txt'))

doc_list = []

for file in files:
    topic = file.parts[-2]
    # First line of the file is the heading, every following line is body text
    article = file.read_text(encoding='latin1').split('\n')
    heading = article[0].strip()
    body = ' '.join(line.strip() for line in article[1:]).strip()
    doc_list.append([topic, heading, body])

#### DataFrame Conversion

In [6]:
# One row per article, built from the [topic, heading, body] records
docs = pd.DataFrame(doc_list, columns=['topic', 'heading', 'body'])
# Print column dtypes, non-null counts and memory usage
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    0 non-null      object
 1   heading  0 non-null      object
 2   body     0 non-null      object
dtypes: object(3)
memory usage: 132.0+ bytes


#### Inspecting Results

In [9]:
# Display ten randomly drawn articles
docs.sample(10)

#### Data Drawn from 5 different Categories

In [None]:
# Share of articles per topic, rendered as percentages
(docs.topic
 .value_counts(normalize=True)
 .to_frame('count')
 .style
 .format({'count': '{:,.2%}'.format}))

### Exploring Corpus

#### Token Count via Counter()

In [16]:
# Total number of whitespace-separated words over all article bodies
word_count = docs.body.str.split().str.len().sum()

# Report the corpus total and the average per article
print(f'Total word count: {word_count:,d} | per article: {word_count/len(docs):,.0f}')

In [17]:
# Frequency of every whitespace-separated token across all article bodies
token_count = Counter()

for i, doc in enumerate(docs.body.tolist(), 1):
    # Progress marker after every 500 documents
    if i % 500 == 0:
        print(i, end=' ', flush=True)
    # str.split() without arguments already drops all surrounding whitespace,
    # so the tokens need no additional strip()
    token_count.update(doc.split())

In [18]:
# Series of counts indexed by token, ordered from most to least frequent
# (the order returned by Counter.most_common())
tokens = (pd.DataFrame(token_count.most_common(), columns=['token', 'count'])
          .set_index('token')
          .squeeze())

In [19]:
# Number of top tokens to plot; drives both the slice and the title so the
# two cannot drift apart
n = 50
(tokens
 .iloc[:n]
 .plot
 .bar(figsize=(14, 4), title=f'Most frequent {n} of {len(tokens):,d} tokens'))
sns.despine()
plt.tight_layout()
plt.show()

### Document-Term Matrix with `CountVectorizer`

#### Key Parameters

In [20]:
# The docstring lives on the class, so no instance is needed to print it
print(CountVectorizer.__doc__)

#### Document Frequency Distribution

In [26]:
# binary=True records only whether a token occurs in a document, not how often;
# max_df=1.0 and min_df=1 apply no document-frequency filtering
binary_vectorizer = CountVectorizer(max_df=1.0,
                                    min_df=1,
                                    binary=True)

# Learn the vocabulary and build the sparse document-term matrix in one pass
binary_dtm = binary_vectorizer.fit_transform(docs.body)

In [27]:
# Display the sparse matrix summary
binary_dtm

In [28]:
# Rows are documents, columns are vocabulary tokens
n_docs, n_tokens = binary_dtm.shape

In [29]:
# Vocabulary of the fitted vectorizer, in matrix column order
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# (successor: get_feature_names_out()) — confirm the scikit-learn version this notebook targets
tokens_dtm = binary_vectorizer.get_feature_names()

### `CountVectorizer` skips certain Tokens by Default

In [30]:
# Whitespace-split tokens that are absent from the CountVectorizer vocabulary
tokens.index.difference(pd.Index(tokens_dtm))

#### Persisting Result

In [31]:
# Output folder for the persisted matrix and token list
results_path = Path('results', 'bbc')

# exist_ok=True makes creation idempotent without a separate existence check
results_path.mkdir(parents=True, exist_ok=True)

In [32]:
dtm_path = results_path / 'binary_dtm.npz'

# Persist the sparse binary matrix once; an existing file is left untouched
if not dtm_path.exists():
    sparse.save_npz(dtm_path, binary_dtm)

In [33]:
token_path = results_path / 'tokens.csv'

if not token_path.exists():
    # header=False keeps the file header-less on every pandas version, which
    # is what the header=None reader below expects
    pd.Series(tokens_dtm).to_csv(token_path, index=False, header=False)
else:
    # read_csv's `squeeze` keyword was removed in pandas 2.0; squeezing the
    # single column afterwards yields the same Series
    tokens = pd.read_csv(token_path, header=None).squeeze('columns')

In [34]:
# Share of documents containing each token: column sums of the binary matrix
# divided by the number of documents
doc_freq = pd.Series(np.array(binary_dtm.sum(axis=0)).squeeze()).div(n_docs)

# Largest number of distinct vocabulary tokens in any single document (max row sum)
max_unique_tokens = np.array(binary_dtm.sum(axis=1)).squeeze().max()

### `min_df` vs `max_df`: Interactive Visualization

In [35]:
# Two-handled slider over [0, 1]; its (lower, upper) value is passed to
# document_frequency_simulator as the document-frequency window
df_range = FloatRangeSlider(value=[0.0, 1.0],
                            min=0,
                            max=1,
                            step=0.0001,
                            description='Doc. Freq.',
                            disabled=False,
                            continuous_update=True,
                            orientation='horizontal',
                            readout=True,
                            readout_format='.1%',
                            layout={'width': '800px'})

@interact(df_range=df_range)
def document_frequency_simulator(df_range):
    """Plot how a document-frequency window reshapes the binary DTM.

    `df_range` is a (lower, upper) pair of document-frequency shares. Tokens
    whose share lies inside the window (bounds included) are kept. The left
    panel shows the distribution of kept tokens per document, the right panel
    the number of documents per kept token.
    """
    lower, upper = df_range
    selected = doc_freq.between(left=lower, right=upper)
    left = selected.sum()

    fig, (ax_docs, ax_tokens) = plt.subplots(ncols=2, figsize=(14, 6))

    # Restrict the matrix to the columns of the surviving tokens
    reduced_dtm = binary_dtm.tocsc()[:, np.flatnonzero(selected)]
    tokens_per_doc = np.array(reduced_dtm.sum(axis=1)).squeeze()
    docs_per_token = pd.Series(np.array(reduced_dtm.sum(axis=0)).squeeze())

    # (axis, data, title, x label, y label, upper x limit) for each panel
    panels = [
        (ax_docs, tokens_per_doc, 'Unique Tokens per Doc',
         '# Unique Tokens', '# Documents (log scale)', max_unique_tokens),
        (ax_tokens, docs_per_token, 'Document Frequency',
         '# Documents', '# Tokens', n_docs),
    ]
    for ax, values, heading, x_label, y_label, x_max in panels:
        sns.distplot(values, ax=ax, kde=False, norm_hist=False)
        ax.set_title(heading)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_yscale('log')
        ax.set_xlim(0, x_max)
        # Applied after set_yscale so the plain-number formatter is the one in effect
        ax.yaxis.set_major_formatter(ScalarFormatter())

    fig.suptitle(
        f'Document/Term Frequency Distribution | # Tokens: {left:,d} ({left/n_tokens:.2%})',
        fontsize=14)
    sns.despine()
    fig.tight_layout()
    fig.subplots_adjust(top=.9)

### Most Similar Documents

In [36]:
# Dense copy of the binary document-term matrix, passed to pdist below
m = binary_dtm.todense()

# Cosine distance between every pair of documents, as a flat (condensed) vector
pairwise_distances = pdist(m, metric='cosine')

In [37]:
# Position of the smallest distance within the condensed distance vector
closest = pairwise_distances.argmin()

In [38]:
# pdist only returns distances for pairs (i, j) with i < j, so the matching
# index arrays are those of the strict upper triangle (k=1). Including the
# diagonal shifts every position and reports the wrong pair of documents.
rows, cols = np.triu_indices(n_docs, k=1)
rows[closest], cols[closest]

In [39]:
# Write the two articles side by side, one column per document
# NOTE(review): 6 and 245 are hard-coded, presumably copied from an earlier run of the
# closest-pair lookup — confirm they still match its output
docs.iloc[6].to_frame(6).join(docs.iloc[245].to_frame(245)).to_csv(results_path / 'most_similar.csv')

In [40]:
# Display the article at row position 6 (hard-coded)
docs.iloc[6]

In [41]:
# Sum the two binary rows per token and tally the sums:
# 2 = token in both documents, 1 = in exactly one, 0 = in neither
pd.DataFrame(binary_dtm[[6, 245], :].todense()).sum(0).value_counts()

### Baseline Document-Term Matrix

In [42]:
# Baseline: default CountVectorizer settings
vectorizer = CountVectorizer() 
doc_term_matrix = vectorizer.fit_transform(docs.body)
# Display the sparse matrix summary
doc_term_matrix

In [43]:
# (number of documents, number of tokens)
doc_term_matrix.shape

#### Inspecting Tokens

In [44]:
# Vocabulary in matrix column order
words = vectorizer.get_feature_names()
# Peek at the first ten entries
words[:10]

#### Inspecting Doc-Term Matrix

In [45]:
# Wrap the sparse matrix in a DataFrame with sparse columns labelled by token
doc_term_matrix_df = pd.DataFrame.sparse.from_spmatrix(doc_term_matrix, columns=words)
doc_term_matrix_df.head()

#### Most Frequent Terms

In [46]:
# Column sums of the document-term matrix, one total per token
word_freq = doc_term_matrix_df.sum(axis=0).astype(int)
# Five tokens with the highest totals
word_freq.sort_values(ascending=False).head() 

#### Computing Relative Term Frequency

In [51]:
# Refit with binary=True so each entry marks presence of a token in a document
vectorizer = CountVectorizer(binary=True)
doc_term_matrix = vectorizer.fit_transform(docs.body)
doc_term_matrix.shape

In [52]:
words = vectorizer.get_feature_names()
# Number of documents containing each token (column sums of the binary matrix)
word_freq = doc_term_matrix.sum(axis=0)

# Collapse the column-sum result to a 1D array
word_freq_1d = np.asarray(word_freq).squeeze()

# Ten tokens found in the largest share of documents
(pd.Series(word_freq_1d, index=words)
 .div(docs.shape[0])
 .sort_values(ascending=False)
 .head(10))

#### Visualizing Doc-Term Matrix

In [53]:
# Heatmap of the document-term matrix (rows: documents, columns: tokens);
# todense() materializes the full matrix for plotting
sns.heatmap(pd.DataFrame(doc_term_matrix.todense(), columns=words), cmap='Blues')
plt.gcf().set_size_inches(14, 8);
plt.show()

### Using Thresholds to Reduce The Number of Tokens

In [55]:
# Drop tokens found in more than 20% of the documents or in fewer than 3 documents,
# plus scikit-learn's built-in English stop words
vectorizer = CountVectorizer(max_df=.2, min_df=3, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(docs.body)
doc_term_matrix.shape

### Use CountVectorizer with Lemmatization

#### Building a Custom `tokenizer` for Lemmatization with `spacy`

In [57]:
# NOTE(review): the 'en' shortcut presumably requires spaCy 2.x; newer releases expect
# the full model package name — confirm against the installed version
nlp = spacy.load('en')


def tokenizer(doc):
    """Return the lemma of every token in `doc`, skipping punctuation and whitespace tokens."""
    return [token.lemma_
            for token in nlp(doc)
            if not (token.is_punct or token.is_space)]

In [58]:
# Use the spaCy-based lemmatizing tokenizer in place of CountVectorizer's own tokenization
vectorizer = CountVectorizer(tokenizer=tokenizer, binary=True)
doc_term_matrix = vectorizer.fit_transform(docs.body)
doc_term_matrix.shape

In [59]:
lemmatized_words = vectorizer.get_feature_names()
# Number of documents containing each lemma (column sums of the binary matrix)
word_freq = doc_term_matrix.sum(axis=0)
word_freq_1d = np.squeeze(np.asarray(word_freq))
# Convert the counts to a share of all documents, indexed by lemma
word_freq_1d = pd.Series(word_freq_1d, index=lemmatized_words).div(docs.shape[0])
# Ascending sort, so tail(20) shows the 20 most widespread lemmas
word_freq_1d.sort_values().tail(20)

### Document-Term Matrix with `TfidfVectorizer`

#### Key Parameters

In [60]:
# The docstring lives on the class, so no instance is needed to print it
print(TfidfTransformer.__doc__)

### How Term Frequency - Inverse Document Frequency Works

In [61]:
# Three toy documents with mixed case and punctuation
sample_docs = ['call you tomorrow', 
                'Call me a taxi', 
                'please call me... PLEASE!']

#### Computing Term Frequency

In [62]:
# Raw term counts per toy document, as a dense matrix
vectorizer = CountVectorizer()
tf_dtm = vectorizer.fit_transform(sample_docs).todense()
# Vocabulary in column order; reused as column labels in the following cells
tokens = vectorizer.get_feature_names()

In [63]:
# Term counts labelled by token: one row per document
term_frequency = pd.DataFrame(data=tf_dtm, 
                              columns=tokens)
print(term_frequency)

#### Computing Document Frequency

In [64]:
# binary=True marks presence only, so the column sums below count documents per token
vectorizer = CountVectorizer(binary=True)

df_dtm = vectorizer.fit_transform(sample_docs).todense().sum(axis=0)

In [65]:
# Single-row frame: number of toy documents containing each token
document_frequency = pd.DataFrame(data=df_dtm,
                                  columns=tokens)

print(document_frequency)

#### Computing TfIDF

#### Computing TfIDF

In [67]:
# Simplest weighting: each term count divided by that token's document frequency
# (no logarithm and no normalization applied here)
tfidf = pd.DataFrame(data=tf_dtm/df_dtm, columns=tokens)
print(tfidf)

### The Smoothing Effect

In [68]:
# Smoothed idf with L2 row normalization; raw (non-binary) counts, no sublinear tf scaling
vect = TfidfVectorizer(smooth_idf=True, 
                       norm='l2',            
                       sublinear_tf=False,   
                       binary=False)
# Fit on the toy documents and print the weights labelled by token
print(pd.DataFrame(vect.fit_transform(sample_docs).todense(), 
             columns=vect.get_feature_names()))

#### TfIDF with New Articles

In [69]:
# Tf-idf weights for the article bodies, excluding scikit-learn's English stop words
tfidf = TfidfVectorizer(stop_words='english')
dtm_tfidf = tfidf.fit_transform(docs.body)
# Vocabulary in matrix column order
tokens = tfidf.get_feature_names()
dtm_tfidf.shape

In [70]:
# Sum each token's tf-idf weight over all documents (.A1 flattens the 1 x n result)
# and rank tokens from highest to lowest total
token_freq = (pd.DataFrame({'tfidf': dtm_tfidf.sum(axis=0).A1,
                            'token': tokens})
              .sort_values('tfidf', ascending=False))

In [71]:
# Ten highest- and ten lowest-ranked tokens in one table.
# DataFrame.append was removed in pandas 2.0; pd.concat yields the same frame on every version.
pd.concat([token_freq.head(10), token_freq.tail(10)]).set_index('token')

### Summarizing News Articles using TfIDF Weights

#### Random Article Selection

In [72]:
# Draw one random article; squeeze() turns the single-row frame into a Series
article = docs.sample(1).squeeze()
# The Series name is the article's row label in `docs`
article_id = article.name

In [73]:
# Print topic and heading, then the full body text
print(f'Topic:\t{article.topic.capitalize()}\n\n{article.heading}\n')
print(article.body.strip())

#### Selecting Most Relevant Tokens by tfidf Value

In [74]:
# Tf-idf row of the sampled article as a flat array.
# article_id is a row label used as a matrix row position; the two coincide because
# `docs` is built with the default integer index
article_tfidf = dtm_tfidf[article_id].todense().A1
article_tokens = pd.Series(article_tfidf, index=tokens)
# Ten tokens with the highest weight in this article
article_tokens.sort_values(ascending=False).head(10)

#### Comparing to Random Selection

In [75]:
# Ten words drawn at random from the article body, for comparison
pd.Series(article.body.split()).sample(10).tolist()

### Creating Train & Test Sets

#### Stratified `train_test_split`

In [76]:
# Hold out 50 articles for testing, stratified on the topic column; fixed seed for repeatability
train_docs, test_docs = train_test_split(docs, 
                                         stratify=docs.topic, 
                                         test_size=50, 
                                         random_state=42)

In [77]:
# (rows, columns) of the training and the test frame
train_docs.shape, test_docs.shape

In [78]:
# Number of test articles per topic (the column is already a Series)
test_docs.topic.value_counts()

#### Vectorizing train & test sets

In [79]:
# Same document-frequency thresholds and stop-word list as before, with the vocabulary
# capped at 2000 features
vectorizer = CountVectorizer(max_df=.2, 
                             min_df=3, 
                             stop_words='english', 
                             max_features=2000)

# The vocabulary is learned from the training articles only
train_dtm = vectorizer.fit_transform(train_docs.body)
words = vectorizer.get_feature_names()
# train_dtm

In [80]:
# transform() (not fit_transform) reuses the vocabulary fitted on the training articles
test_dtm = vectorizer.transform(test_docs.body)
test_dtm