Testing out some NLP techniques using the in-built Wall Street Journal dataset in NLTK

In [13]:
import nltk
from nltk.stem.wordnet import *
from nltk.book import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem.snowball import PorterStemmer
default_stopwords = set(nltk.corpus.stopwords.words('english'))

https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial

## Topic Modeling with LDA

In [19]:
#WSJ_freq = nltk.FreqDist(text7)

stemmer = PorterStemmer()
lda = LatentDirichletAllocation(n_components=7, max_iter=5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)

lemm = WordNetLemmatizer()
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

tf_vectorizer = LemmaCountVectorizer(max_df=0.95, 
                                     min_df=2,
                                     stop_words='english',
                                     decode_error='ignore')

tf = tf_vectorizer.fit_transform(text7)

In [20]:
# train lda
wsj_lda = lda.fit(tf)

In [21]:
n_top_words = 8

# Define helper function to print top words
def print_top_words(model, feature_names, n_top_words):
    for index, topic in enumerate(model.components_):
        message = "\nTopic #{}:".format(index)
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])
        print(message)
        print("="*70)

print("\nTopics in LDA model: ")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(wsj_lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:million program corp bond profit time analyst industry

Topic #1:market say trading sale rrb cent plan 30

Topic #2:company stock billion month investor bank buy ich

Topic #3:year new 000 future lrb 50 quarter service

Topic #4:said mr index business investment 10 rate contract

Topic #5:price president york day exchange rose term yesterday

Topic #6:share issue government executive house financial october trader


## Stemming

In [63]:
[ (stemmer.stem(w),f) for (w,f) in WSJ_freq.most_common(80) if w.lower() not in default_stopwords and w.isalpha()]

[('said', 628),
 ('million', 383),
 ('compani', 260),
 ('year', 212),
 ('say', 210),
 ('would', 209),
 ('market', 176),
 ('new', 165),
 ('new', 162),
 ('trade', 162),
 ('billion', 159),
 ('also', 147),
 ('stock', 136),
 ('presid', 133),
 ('one', 132)]

In [61]:
[ (w,f) for (w,f) in WSJ_freq.most_common(20) if w.lower() not in default_stopwords ]

[(',', 4885),
 ('.', 3828),
 ('*-1', 1123),
 ('0', 1099),
 ('*', 965),
 ("'s", 864),
 ('*T*-1', 806),
 ('*U*', 744),
 ('$', 718),
 ('``', 702),
 ("''", 684)]