#### Topic Modelling
Use Gensim's LDA (Latent Dirichlet Allocation) model to extract 10 topics from the documents in `newsgroup_data`.

In [4]:
import pickle
import gensim
from google.colab import drive
from sklearn.feature_extraction.text import CountVectorizer

# Mount Google Drive into the Colab filesystem (prompts for an authorization code).
drive.mount('/content/gdrive')
# Drive folder that holds the data files; the data-loading cell builds its path from it.
# NOTE(review): the name `dir` shadows Python's built-in dir(); it is kept because
# the data-loading cell references it.
dir="/content/gdrive/My Drive/Colab Notebooks/NLP/"
# List the current working directory (IPython shell escape).
!ls

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
gdrive	sample_data


In [11]:
# Unpickle the saved list of newsgroup documents (pickle needs binary mode).
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open(dir + 'newsgroups', 'rb') as f:
    newsgroup_data = pickle.load(f)

# Bag-of-words settings: keep tokens of three or more word characters, drop English
# stop words, and drop tokens that occur in fewer than 20 documents or in more
# than 20% of the documents.
vect = CountVectorizer(
    min_df=20,
    max_df=0.2,
    stop_words='english',
    token_pattern='(?u)\\b\\w\\w\\w+\\b',
)
# Learn the vocabulary and build the sparse document-term matrix.
X = vect.fit_transform(newsgroup_data)

# Wrap the sparse matrix as a streaming gensim corpus; documents_columns=False
# because scikit-learn stores one document per row.
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# vect.vocabulary_ maps word -> id; invert it to id -> word for LdaModel's id2word.
id_word_map = {token_id: token for token, token_id in vect.vocabulary_.items()}
print("Vocab size", len(id_word_map))
# Preview the first ten (id, word) entries.
dict(list(id_word_map.items())[:10])

Vocab size 901


{23: 'address',
 33: 'america',
 76: 'best',
 335: 'group',
 409: 'know',
 514: 'new',
 544: 'organization',
 726: 'similar',
 842: 'usa',
 899: 'york'}

In [12]:
# Fit LDA on the bag-of-words corpus: 10 topics, 25 training passes, and a
# fixed random_state so the run can be repeated.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=id_word_map,
    passes=25,
    random_state=34,
)
# Record the gensim version used for this run.
print(gensim.__version__)

3.6.0


#### lda_topics

*Returns a list of 10 tuples (topic_id, string of the topic's top words weighted by their probabilities).*

In [13]:
def lda_topics(num_topics=10, num_words=10):
    """Return the most significant words for the topics of the fitted LDA model.

    Args:
        num_topics: Number of topics to include. Defaults to 10, the number
            of topics the model was trained with.
        num_words: Number of top words to show per topic. Defaults to 10.

    Returns:
        The list produced by ``ldamodel.show_topics``: one
        ``(topic_id, 'prob*"word" + ...')`` tuple per topic.
    """
    return ldamodel.show_topics(num_topics=num_topics, num_words=num_words)

lda_topics()

[(0,
  '0.056*"edu" + 0.043*"com" + 0.033*"thanks" + 0.022*"mail" + 0.021*"know" + 0.020*"does" + 0.014*"info" + 0.012*"monitor" + 0.010*"looking" + 0.010*"don"'),
 (1,
  '0.024*"ground" + 0.018*"current" + 0.018*"just" + 0.013*"want" + 0.013*"use" + 0.011*"using" + 0.011*"used" + 0.010*"power" + 0.010*"speed" + 0.010*"output"'),
 (2,
  '0.061*"drive" + 0.042*"disk" + 0.033*"scsi" + 0.030*"drives" + 0.028*"hard" + 0.028*"controller" + 0.027*"card" + 0.020*"rom" + 0.018*"floppy" + 0.017*"bus"'),
 (3,
  '0.023*"time" + 0.015*"atheism" + 0.014*"list" + 0.013*"left" + 0.012*"alt" + 0.012*"faq" + 0.012*"probably" + 0.011*"know" + 0.011*"send" + 0.010*"months"'),
 (4,
  '0.025*"car" + 0.016*"just" + 0.014*"don" + 0.014*"bike" + 0.012*"good" + 0.011*"new" + 0.011*"think" + 0.010*"year" + 0.010*"cars" + 0.010*"time"'),
 (5,
  '0.030*"game" + 0.027*"team" + 0.023*"year" + 0.017*"games" + 0.016*"play" + 0.012*"season" + 0.012*"players" + 0.012*"win" + 0.011*"hockey" + 0.011*"good"'),
 (6,
  '0.0

#### topic_distribution, given a new document

*This function should return a list of tuples (topic_id, probability)*

In [0]:
# Unseen sample post (about Pluto and Charon) used to query the fitted topic model.
# Kept as a one-element list; it is passed to vect.transform in topic_distribution.
new_doc = [
    "\n\nIt's my understanding that the freezing will start to occur because "
    "of the\ngrowing distance of Pluto and Charon from the Sun, due to it's\n"
    "elliptical orbit. It is not due to shadowing effects. \n\n\n"
    "Pluto can shadow Charon, and vice-versa.\n\n"
    "George Krumins\n-- "
]

In [16]:
def topic_distribution(doc=None):
    """Infer the LDA topic distribution of a single new document.

    Args:
        doc: Raw text of the document, either as a string or as a list of
            strings. When omitted, the notebook-level ``new_doc`` is used,
            which matches the original zero-argument behaviour.

    Returns:
        A list of ``(topic_id, probability)`` tuples for the first document
        supplied.
    """
    docs = new_doc if doc is None else doc
    # The vectorizer is given a list of documents, so wrap a bare string.
    if isinstance(docs, str):
        docs = [docs]

    # Only transform: reuse the vocabulary already fitted on the training documents.
    doc_term_matrix = vect.transform(docs)

    # Local name chosen so the notebook-level training `corpus` is not shadowed.
    doc_corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)

    # For a corpus input, get_document_topics gives one topic list per document.
    per_doc_topics = ldamodel.get_document_topics(doc_corpus)
    return list(per_doc_topics)[0]

topic_distribution()

[(0, 0.02000183),
 (1, 0.020002047),
 (2, 0.020000001),
 (3, 0.49658147),
 (4, 0.020002764),
 (5, 0.020002853),
 (6, 0.020001695),
 (7, 0.020001367),
 (8, 0.020001847),
 (9, 0.3434041)]