# Topics as Map or Index

After loading the data, we are going to run scikit-learn's NMF algorithm, which is both fast for smaller corpora and deterministic in its outcomes, varying the number of components until we seem to have arrived at a stable number.

We are including both main and other TED-curated events here so that we can see how much the separate events are part of the larger map or if they are distinct.

Or maybe this is a good time to try out **k-means clustering**?

### Imports and Data

In [None]:
# Imports, Functions, Stopwords
import pandas as pd, re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
import gensim, numpy as np
# from itertools import combinations


# NOTE: this rebinds the name `stopwords` from the imported NLTK corpus reader
# to a plain set of English stop words. Re-running this line without first
# re-running the import above fails, because a set has no `.words` method.
stopwords = set(stopwords.words('english'))

# parentheticals = [ "\(laughter\)", "\(applause\)", "\(music\)", "\(video\)", 
#                   "\(laughs\)", "\(applause ends\)", "\(audio\)", "\(singing\)", 
#                   "\(music ends\)", "\(cheers\)", "\(cheering\)", "\(recording\)", 
#                   "\(beatboxing\)", "\(audience\)", "\(guitar strum\)", 
#                   "\(clicks metronome\)", "\(sighs\)", "\(guitar\)", "\(marimba sounds\)", 
#                   "\(drum sounds\)" ]

# def remove_parens(text):
#     new_text = text
#     for rgx_match in parentheticals:
#         new_text = re.sub(rgx_match, ' ', new_text.lower(), flags=re.IGNORECASE)
#     return new_text

In [None]:
# Read the table of talks from the output directory
df = pd.read_csv('../output/TEDall.csv')

# One transcript per talk, as a plain Python list
talks = df["text"].tolist()

# Short labels for later use: each public URL with the common site prefix removed
labels = [
    re.sub('https://www.ted.com/talks/', '', url)
    for url in df["public_url"].tolist()
]

## TF-IDF

In [None]:
# Parametrize the vectorizer:
#   - stop_words: scikit-learn documents this argument as 'english', a list,
#     or None, and newer releases reject other container types, so the
#     stop-word set is passed as a sorted list (sorting keeps it deterministic).
#   - min_df = 2 drops terms that occur in fewer than two talks.
#   - max_df = 0.9 drops terms that occur in more than 90% of the talks.
tfidf_vectorizer = TfidfVectorizer(stop_words = sorted(stopwords),
                                   min_df = 2,
                                   max_df = 0.9)

# Vectorize our texts
tfidf = tfidf_vectorizer.fit_transform(talks)
tfidf.shape

In [None]:
# `get_feature_names` is gone from newer scikit-learn releases; prefer
# `get_feature_names_out` and fall back for older installations.
# Either way `terms` ends up as a plain list, one entry per matrix column.
try:
    terms = list(tfidf_vectorizer.get_feature_names_out())
except AttributeError:
    terms = tfidf_vectorizer.get_feature_names()
print(f"Vocabulary has {len(terms)} distinct terms.")

In [None]:
import operator
def rank_terms( A, terms ):
    """Rank terms by their total weight over all rows of A.

    A is summed along axis 0 and the result is read as a single row
    (``totals[0, col]``), so A must be a matrix type whose column sums keep
    two dimensions. ``terms`` supplies the label for each column.

    Returns a list of (term, weight) pairs sorted by weight, largest first.
    """
    column_totals = A.sum(axis=0)
    weight_by_term = {
        term: column_totals[0, position]
        for position, term in enumerate(terms)
    }
    return sorted(weight_by_term.items(), key=lambda entry: entry[1], reverse=True)

In [None]:
# Rank every vocabulary term by its summed TF-IDF weight, then list the top 20
ranking = rank_terms( tfidf, terms )
for i, pair in enumerate( ranking[0:20] ):
    # pair is (term, weight); i+1 turns the 0-based index into a 1-based rank
    print(f"{i+1:2d} {pair[0]} ({pair[1]:.2f})")

## K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Baseline k-means configuration: 25 clusters, random centroid initialisation,
# 10 restarts (n_init), at most 300 iterations per run, and a fixed seed.
kmeans = KMeans(
    init="random",
    n_clusters=25,
    n_init=10,
    max_iter=300,
    random_state=42
)

In [None]:
# Cluster the TF-IDF matrix, then report the fitted model's inertia_ and n_iter_
kmeans.fit(tfidf)
print(kmeans.inertia_, kmeans.n_iter_)

In [None]:
# Now let's try it for a range of possible clusters:
# These settings are shared by every run; only n_clusters changes in the loop.
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}

# A list holds the SSE values for each k
sse = []
# k takes the values 25, 35, ..., 95
for k in range(25, 101, 10):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(tfidf)
    # inertia_ is the value this notebook records as the SSE for this k
    sse.append(kmeans.inertia_)
# NOTE: after the loop, `kmeans` refers to the last model fitted (k=95)

In [None]:
# Plot SSE against the number of clusters to look for an elbow.
# The x values repeat the range used when `sse` was filled, so the two must
# be kept in step if either one changes.
plt.style.use("fivethirtyeight")
plt.plot(range(25, 101, 10), sse)
plt.xticks(range(25, 101, 10))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

**K-means** doesn't turn up a clear answer: the slight elbow at 45 doesn't show up when you run `KneeLocator`:

```python
from kneed import KneeLocator

kl = KneeLocator(range(25, 101, 10), sse, curve="convex", direction="decreasing")
print(kl.elbow)

UserWarning: No knee/elbow found
None
```

## Topic Coherence

What follows is another way of approaching k through topic coherence as demonstrated by Derek Greene. The current implementation is from his notebook with the following steps:

1. Create the topic models
2. Build word embedding models
3. Select the number of topics

### 1. Create the Topic Models

In [None]:
# Smallest and largest number of topics to try; the NMF loop below iterates
# up to kmax + 1, so both ends are included.
kmin, kmax = 20, 80

In [None]:
# Fit one NMF model per candidate k and keep its two factor matrices
topic_models = []
# try each value of k
for k in range(kmin, kmax + 1, 10):
    print("Applying NMF for k=%d ..." % k )
    # run NMF
    model = NMF( init="nndsvd", n_components=k ) 
    # W is the transformed TF-IDF matrix; H is the fitted components_ matrix
    W = model.fit_transform( tfidf )
    H = model.components_    
    # store for later
    topic_models.append( (k,W,H) )
# NOTE: `model`, `W` and `H` stay bound to the last k once the loop finishes

### 2. Build the Word Embedding Models

In [None]:
class TokenGenerator:
    """Re-iterable stream of token lists, one list per document.

    Each document is split with a regex that matches runs of two or more
    word characters. Tokens found in ``stopwords`` are replaced by the
    placeholder ``"<stopword>"``; every other token is kept unchanged.
    The stop-word check is an exact membership test and tokens are not
    lowercased, so matching is case-sensitive.
    """

    def __init__( self, documents, stopwords ):
        self.documents = documents
        self.stopwords = stopwords
        self.tokenizer = re.compile( r"(?u)\b\w\w+\b" )

    def __iter__( self ):
        # This runs at the start of every pass, so the message appears once per pass
        print("Building Word2Vec model ...")
        for text in self.documents:
            yield [
                "<stopword>" if word in self.stopwords else word
                for word in self.tokenizer.findall( text )
                if word in self.stopwords or len(word) >= 2
            ]

In [None]:
# Stream the talks as token lists, with stop words replaced by a placeholder
docgen = TokenGenerator( talks, stopwords )
# The model is asked for 500 dimensions (size=500), a minimum count of 20
# (min_count=20) and sg=1.
# NOTE(review): per the gensim docs, min_count is a total word-frequency
# cut-off and sg=1 selects skip-gram — confirm against the installed version.
# NOTE(review): `size=` looks like the gensim 3.x keyword (later renamed
# `vector_size`) — confirm before upgrading gensim.
w2v_model = gensim.models.Word2Vec(docgen, size=500, min_count=20, sg=1)

In [None]:
# Report how many terms the trained model kept.
# NOTE(review): `wv.vocab` looks like the gensim 3.x API — confirm against
# the installed gensim version before upgrading.
print( f"Model has {len(w2v_model.wv.vocab)} terms." )

In [None]:
# Save the trained model into the same output directory the talks were read from
w2v_model.save("../output/w2v_model.bin")

In [None]:
# To re-load this model, run
#w2v_model = gensim.models.Word2Vec.load("../output/w2v_model.bin")

### 3. Select the Number of Topics

In [None]:
def calculate_coherence( w2v_model, term_rankings ):
    """Return the mean pairwise embedding similarity across all topics.

    For each topic, every unordered pair of its terms is scored with the
    embedding's ``similarity`` and the scores are averaged; the result is
    the mean of those per-topic averages.

    Parameters:
        w2v_model: either a trained Word2Vec model (vectors reached through
            its ``wv`` attribute) or any object that offers
            ``similarity(a, b)`` directly.
        term_rankings: a sequence of topics, each a sequence of terms.

    Returns:
        The mean topic score as a float. An empty ``term_rankings`` gives
        0.0, and a topic with fewer than two terms contributes 0.0, because
        there are no pairs to score.

    Any error raised by the similarity lookup itself (for example for a term
    the embedding does not know) is not caught here.
    """
    # Imported locally because the file-level itertools import is commented out.
    from itertools import combinations

    # Accept a full model or the keyed vectors themselves.
    vectors = getattr( w2v_model, "wv", w2v_model )

    if not term_rankings:
        return 0.0

    overall_coherence = 0.0
    for topic_terms in term_rankings:
        # check each pair of terms
        pair_scores = [
            vectors.similarity( first, second )
            for first, second in combinations( topic_terms, 2 )
        ]
        # get the mean for all pairs in this topic (no pairs -> contributes 0)
        if pair_scores:
            overall_coherence += sum(pair_scores) / len(pair_scores)
    # get the mean score across all topics
    return overall_coherence / len(term_rankings)

def get_descriptor( all_terms, H, topic_index, top ):
    """Return the ``top`` highest-weighted terms for one topic.

    Row ``topic_index`` of H is sorted by weight, largest first, and the
    leading ``top`` column indices are mapped to their entries in
    ``all_terms``.
    """
    # argsort is ascending, so reverse it to put the heaviest columns first
    ranked_columns = np.argsort( H[topic_index,:] )[::-1]
    return [all_terms[column] for column in ranked_columns[:top]]

In [None]:
# Score every stored topic model; k_values[i] pairs with coherences[i]
k_values = []
coherences = []
# NOTE: unpacking here rebinds the module-level names k, W and H
for (k,W,H) in topic_models:
    # Get all of the topic descriptors - the term_rankings, based on top 10 terms
    term_rankings = []
    for topic_index in range(k):
        term_rankings.append( get_descriptor( terms, H, topic_index, 10 ) )
    # Now calculate the coherence based on our Word2vec model
    k_values.append( k )
    coherences.append( calculate_coherence( w2v_model, term_rankings ) )
    print(f"K={k}: Coherence={coherences[-1]:.4f}")

## NMF

With 45 as the (only slightly) suggested starting point, we run the NMF decomposition, bracketing it with 35 and 55, and then inspect the results by hand.

In [None]:
# First a function to make printing the most associated words with a topic:
def print_keywords(model, feature_names, n_top_words):
    """Print one line per topic: its index, a comma, then its top words.

    The words are the ``n_top_words`` entries of ``feature_names`` whose
    weights in that row of ``model.components_`` are largest, heaviest
    first, separated by single spaces.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Slice the ascending argsort from the end to get the heaviest first
        top_positions = topic.argsort()[:-n_top_words - 1:-1]
        top_words = " ".join(feature_names[i] for i in top_positions)
        print(f"{topic_idx},"+top_words)

In [None]:
import csv

In [None]:
# Writes the same "index,words" rows that print_keywords prints, but to a CSV file.
def save_to_csv(model, feature_names, keywords, output_dir='../output'):
    """Write each topic's top terms to ``<output_dir>/topics-NMF-<k>.csv``.

    Parameters:
        model: a fitted model exposing ``components_`` (one row per topic).
        feature_names: sequence mapping column indices to terms.
        keywords: how many top terms to write per topic. This fills the
            same third position that ``n_top_words`` has in
            ``print_keywords``.
        output_dir: directory for the file; defaults to the notebook's
            output directory.

    The ``<k>`` in the file name is the number of rows in
    ``model.components_``. Each row of the file holds the topic index and
    the topic's top terms joined by single spaces, heaviest first. There is
    no header row.

    Returns:
        The path of the file that was written.
    """
    n_topics = len(model.components_)
    path = f'{output_dir}/topics-NMF-{n_topics}.csv'
    # newline='' leaves line endings to the csv module, as its docs require
    with open(path, mode='w', newline='', encoding='utf-8') as the_file:
        writer = csv.writer(the_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for topic_idx, topic in enumerate(model.components_):
            # argsort is ascending: reverse it, then keep the leading entries
            top_indices = topic.argsort()[::-1][:keywords]
            # Pass a list so each value is one column; a bare string would be
            # split into one column per character.
            writer.writerow([topic_idx, " ".join(feature_names[i] for i in top_indices)])
    return path

In [None]:
# Words to show per topic, and the number of topics to extract
n_top_words = 15
n_components = 55

# Here's our parameters
nmf = NMF(n_components = n_components,
          init="nndsvd"
         )
# NOTE: this rebinds `model`, which the earlier NMF loop also assigned
model = nmf.fit(tfidf)
# `get_feature_names` is gone from newer scikit-learn releases; prefer
# `get_feature_names_out` and fall back for older installations.
try:
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
except AttributeError:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [None]:
# We used this to output the topics and then copy and paste them into a CSV.
# print_keywords(nmf, tfidf_feature_names, n_top_words)

In [None]:
# Factorise the TF-IDF matrix. The original call passed `X`, which is not
# defined at this point in the notebook; `tfidf` is the document-term matrix
# the model above was fitted on.
W = model.fit_transform(tfidf)
H = model.components_

In [None]:
# Create a dataframe from the document-term matrix.
# `vec` and `texts` are never defined in this notebook, so this reuses the
# TF-IDF matrix and its vocabulary built earlier (`tfidf` and `terms`).
# NOTE(review): confirm the TF-IDF matrix is the one this cell was meant to show.
X = tfidf
# toarray() yields a dense ndarray, one row per talk and one column per term
term_matrix = pd.DataFrame(X.toarray(), columns=terms)
term_matrix.shape