In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import numpy as np

In [2]:
# Load the article dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/with_summary')
# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,claps,reading_time,title,text,summary
0,8300,11,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T...","Building a bot for the sake of it, letting it ..."
1,1400,7,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...,The basic syntax of lambda functions is: Note ...
2,2800,11,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...,"For example, we have the month each client joi..."
3,1300,7,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...,"Software Consultant, Adjunct Professor, Publis..."
4,935,11,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...,A note about off-policy vs on-policy learning:...


In [3]:
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
import re

# ASCII punctuation characters, one per list element; used to drop punctuation tokens.
punct = list(string.punctuation)
# NLTK's English stop-word list; tokens found in it are discarded during cleaning.
sw = stopwords.words('english')

def pos_replace(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Only the first character of the tag is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Every other tag (including an empty string) falls back to noun.
    """
    wordnet_pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps the empty-tag case on the noun fallback.
    return wordnet_pos_by_prefix.get(treebank_tag[:1], wordnet.NOUN)

def bare_text(text):
    """Clean ``text`` and return the result as a single string.

    The cleaning steps (lower-casing, stop-word removal, lemmatization and
    punctuation removal) are delegated to ``tokens`` so the two functions
    always apply the same pipeline; the resulting tokens are then joined
    back together with ``TreebankWordDetokenizer``.

    Args:
        text: raw document text.

    Returns:
        The cleaned text as one string.
    """
    return TreebankWordDetokenizer().detokenize(tokens(text))

def tokens(text):
    """Turn raw text into a list of lower-cased, lemmatized content tokens.

    Steps, in order: newlines become spaces, text is lower-cased, a space is
    inserted after '.', ',', '?' or '!' when one is missing, the text is
    tokenized, stop words are removed, each token is lemmatized using its
    POS tag, and punctuation tokens are dropped.

    Args:
        text: raw document text.

    Returns:
        List of cleaned token strings (empty for empty input).
    """
    # Replace newlines with a space (not an empty string) so the last word of
    # one line is not fused with the first word of the next line.
    text = text.replace('\n', ' ').lower()
    # Adds spaces where they are missing after punctuation
    text = re.sub(r'(?<=[.,\?!])(?=[^\s])', r' ', text)

    # Sets give constant-time membership tests for the filters below.
    stop_words = set(sw)
    # Curly quotes and the bare hyphen are not part of string.punctuation's
    # tokens as produced here, so they are filtered explicitly.
    unwanted = set(punct) | {"’", "-", "‘"}

    # Text is already lower-cased, so tokens can be compared directly.
    words = [w for w in word_tokenize(text) if w not in stop_words]

    # Lemmatize each word with the WordNet POS derived from its Treebank tag.
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, pos_replace(tag))
              for word, tag in pos_tag(words)]

    # Punctuation is removed last, after tagging, as in the original pipeline.
    return [w for w in lemmas if w not in unwanted]

In [4]:
# Replace every summary with its cleaned form (see bare_text).
# NOTE: this overwrites the column in place, so re-running the cell
# cleans text that has already been cleaned.
df.summary = df.summary.apply(bare_text)

In [5]:
# Separate frame whose `text` column holds the cleaned article bodies;
# `df.text` itself keeps the raw text.
df2 = df.assign(text=df.text.apply(bare_text))

In [5]:
# Series of article bodies taken from `df` (the raw text; only `df2.text` was cleaned above).
text = df.text

# Topic Modeling with BERT
---
Code adapted from the article [Topic Modeling with BERT](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6).

Generate text embeddings using BERT

In [6]:
from sentence_transformers import SentenceTransformer

# Pre-trained sentence-embedding model used to encode the articles.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# `text` is already the Series of article bodies, so the previous `text.text`
# raised AttributeError on a fresh run. Pass the documents as a plain list of
# strings, which is the input type `encode` documents.
embeddings = model.encode(text.tolist(), show_progress_bar=True)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=11.0), HTML(value='')))




Dimensionality reduction using UMAP. This step is optional if the downstream clustering method copes well with high dimensionality (e.g. k-Means).

In [21]:
import umap

# Reduce the sentence embeddings to 5 dimensions before clustering.
# UMAP is stochastic: without a fixed `random_state` the reduced embeddings,
# and therefore the clusters and topic sizes below, change on every run.
umap_embeddings = umap.UMAP(n_neighbors=15,
                            n_components=5,
                            metric='cosine',
                            random_state=42).fit_transform(embeddings)

Cluster the documents

In [22]:
import hdbscan

# Cluster the UMAP-reduced embeddings; `cluster.labels_` is used below as the
# topic id of each document.
# NOTE(review): a label of -1 presumably marks points HDBSCAN left unclustered
# (noise) — confirm against the HDBSCAN documentation.
cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(umap_embeddings)

Class-based TF-IDF implementation (term frequencies are computed per topic rather than per document)

In [44]:
# One row per article: its text and the cluster label assigned to it.
d = {'Doc': df.text, 'Topic': cluster.labels_}
docs_df = pd.DataFrame(d, columns=['Doc', 'Topic'])
# Positional document id (0..n-1).
docs_df = docs_df.assign(Doc_ID=range(len(docs_df)))
# Concatenate all articles of a topic into one space-joined string per topic.
docs_per_topic = (
    docs_df
    .groupby(['Topic'], as_index=False)['Doc']
    .agg(' '.join)
)

In [48]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute class-based TF-IDF scores, treating each entry of ``documents`` as one class.

    Args:
        documents: sequence of strings, one (joined) document per class.
        m: numerator of the IDF ratio (the caller passes the number of
            original, un-joined documents).
        ngram_range: n-gram range forwarded to ``CountVectorizer``.

    Returns:
        Tuple ``(tf_idf, count)``: ``tf_idf`` is an array with one row per
        vocabulary term and one column per entry of ``documents``; ``count``
        is the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range, tokenizer=tokens).fit(documents)
    term_counts = count.transform(documents).toarray()

    # Term frequency: each class's counts divided by that class's total count.
    # Transposing puts terms on rows so the per-class totals broadcast over columns.
    totals_per_class = term_counts.sum(axis=1)
    tf = term_counts.T / totals_per_class

    # Inverse frequency: log of m over each term's count summed across classes.
    totals_per_term = term_counts.sum(axis=0)
    idf = np.log(m / totals_per_term).reshape(-1, 1)

    return tf * idf, count
  
# Score terms per topic: one joined document per topic, with m set to the number of rows in df.
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(df))



Topic representation: the 20 highest-scoring words for each topic

In [50]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the ``n`` highest-scoring vocabulary terms for every topic.

    Args:
        tf_idf: 2-D array of scores with one row per vocabulary term and one
            column per topic (it is transposed internally).
        count: fitted vectorizer exposing ``get_feature_names_out()`` or the
            older ``get_feature_names()``.
        docs_per_topic: DataFrame whose ``Topic`` column lists the topic
            labels in the same order as the columns of ``tf_idf``.
        n: number of terms to keep per topic.

    Returns:
        Dict mapping each topic label to a list of ``(term, score)`` tuples,
        ordered from highest to lowest score.
    """
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; use its replacement when present and fall back for old versions.
    if hasattr(count, 'get_feature_names_out'):
        words = list(count.get_feature_names_out())
    else:
        words = count.get_feature_names()

    labels = list(docs_per_topic.Topic)
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n columns hold the n largest scores.
    top_indices = scores_by_topic.argsort()[:, -n:]
    return {
        label: [(words[j], scores_by_topic[i][j]) for j in top_indices[i]][::-1]
        for i, label in enumerate(labels)
    }

def extract_topic_sizes(df):
    """Count the documents in each topic, largest topic first.

    Args:
        df: DataFrame with a ``Topic`` column and a ``Doc`` column.

    Returns:
        DataFrame with columns ``Topic`` and ``Size`` (non-null ``Doc``
        entries per topic), sorted by ``Size`` in descending order.
    """
    docs_per_label = df.groupby(['Topic'])['Doc'].count()
    size_table = docs_per_label.rename('Size').reset_index()
    return size_table.sort_values('Size', ascending=False)

# Top 20 terms per topic, keyed by topic label.
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
# Number of documents per topic, largest first.
topic_sizes = extract_topic_sizes(docs_df)
topic_sizes.head(10)



Unnamed: 0,Topic,Size
0,-1,123
3,2,94
4,3,70
2,1,31
1,0,19


# SpaCy + NER

In [6]:
import spacy

# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')
  
# Run NER on a single article: the row of `df.text` with index label 1.
article = df.text[1]
  
doc = nlp(article)
  
# List every detected entity with its character offsets, label and the label's description.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, spacy.explain(ent.label_))

StackOverflow 218 231 ORG Companies, agencies, institutions, etc.
Python for Data Science and Machine Learning 1144 1188 ORG Companies, agencies, institutions, etc.
Youtube 1232 1239 GPE Countries, cities, states
Python 1386 1392 DATE Absolute or relative dates or periods
StackOverflow 1509 1522 ORG Companies, agencies, institutions, etc.
Python 1595 1601 GPE Countries, cities, states
NumPy 1603 1608 ORG Companies, agencies, institutions, etc.
Pandas 1614 1620 ORG Companies, agencies, institutions, etc.
one 2241 2244 CARDINAL Numerals that do not fall under another type
one 2434 2437 CARDINAL Numerals that do not fall under another type
Python 2477 2483 GPE Countries, cities, states
2 3155 3156 CARDINAL Numerals that do not fall under another type
Numpy 3443 3448 NORP Nationalities or religious or political groups
NumPy 3616 3621 ORG Companies, agencies, institutions, etc.
Linspace 3949 3957 PERSON People, including fictional
Linspace 4000 4008 PERSON People, including fictional
Pandas

In [21]:
#Add transform to entity list
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans

matcher = PhraseMatcher(nlp.vocab)

# Phrases to tag with the custom TRANS entity label.
phrase_list = ['transforms', 'transform']
phrase_patterns = [nlp(phrase) for phrase in phrase_list]

# NOTE(review): the positional `None` (on_match callback) is the spaCy v2 call
# signature; spaCy v3 expects `matcher.add('TRANS', phrase_patterns)` —
# confirm which version this notebook targets.
matcher.add('TRANS', None, *phrase_patterns)

# Each match is a (match_id, start_token, end_token) tuple.
matches = matcher(doc)

TRANS = doc.vocab.strings[u'TRANS']
new_ents = [Span(doc, match[1], match[2], label = TRANS) for match in matches]
# Assigning overlapping spans to doc.ents raises an error, which happened
# whenever a match overlapped an existing entity or the cell was run twice.
# filter_spans drops overlaps; existing entities come first in the list so
# they win ties against new spans of the same length.
doc.ents = filter_spans(list(doc.ents) + new_ents)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_, spacy.explain(ent.label_))

StackOverflow 218 231 ORG Companies, agencies, institutions, etc.
Python for Data Science and Machine Learning 1144 1188 ORG Companies, agencies, institutions, etc.
Youtube 1232 1239 GPE Countries, cities, states
Python 1386 1392 DATE Absolute or relative dates or periods
StackOverflow 1509 1522 ORG Companies, agencies, institutions, etc.
Python 1595 1601 GPE Countries, cities, states
NumPy 1603 1608 ORG Companies, agencies, institutions, etc.
Pandas 1614 1620 ORG Companies, agencies, institutions, etc.
one 2241 2244 CARDINAL Numerals that do not fall under another type
one 2434 2437 CARDINAL Numerals that do not fall under another type
Python 2477 2483 GPE Countries, cities, states
transforms 2989 2999 TRANS None
2 3155 3156 CARDINAL Numerals that do not fall under another type
Numpy 3443 3448 NORP Nationalities or religious or political groups
NumPy 3616 3621 ORG Companies, agencies, institutions, etc.
Linspace 3949 3957 PERSON People, including fictional
Linspace 4000 4008 PERSON Pe

In [22]:
from spacy import displacy

# Render the document inline in the notebook with its entities highlighted.
displacy.render(doc, style = 'ent', jupyter = True)