In [5]:
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
import sklearn

In [6]:
file = open('data/12.08.txt').read()

In [7]:
custom_stopwords = ['versions',
                     'hindi',
                     'published',
                     'laying',
                     'mentioned',
                     'papers',
                     'august',
                     'statement',
                     'copy',
                     'rules',
                     'showing',
                     'sebi',
                     'delay',
                     'regulations',
                     'reasons',
                     'english',
                     'report',
                     'government',
                     'india',
                     'notification',
                     'amendment',
                     'comptroller',
                     'gazette',
                     'ladnro',
                     'union',
                     'general',
                     'auditor',
                     'dated']

In [8]:
sents = []
for line in file.split('\n'):
    if line != "":
        pattern = re.compile('[a-zA-Z]*')
        words = [word for word in pattern.findall(line) if len(word) > 3 and 
                                                           word.lower() not in stopwords.words('english') + custom_stopwords]
        if len(words) > 4:
            sents.append([word.lower() for word in words])

In [9]:
def get_word_frequency(sents):
    word_counter = {}
    for sent in sents:
        for word in sent:
            try:
                word_counter[word] += 1
            except:
                word_counter[word] = 1
    return word_counter

word_freq = get_word_frequency(sents)

In [11]:
kw_pair = []
for key in sorted(word_freq, key=word_freq.__getitem__, reverse=True):
    kw_pair.append((key, word_freq[key]))
kw_pair[:10]

[('shri', 38),
 ('securities', 35),
 ('exchange', 33),
 ('minister', 32),
 ('board', 32),
 ('aircraft', 28),
 ('year', 25),
 ('madam', 22),
 ('ministry', 19),
 ('indian', 18)]

In [14]:
bgs = nltk.ngrams([word for sent in sents for word in sent], 2)
fdist = nltk.FreqDist(bgs)
fdist.most_common()[:10]

[(('exchange', 'board'), 30),
 (('securities', 'exchange'), 30),
 (('disclosure', 'requirements'), 11),
 (('would', 'like'), 11),
 (('issue', 'capital'), 8),
 (('capital', 'disclosure'), 8),
 (('board', 'issue'), 7),
 (('corporation', 'limited'), 6),
 (('review', 'working'), 6),
 (('state', 'ministry'), 6)]

In [15]:
# try out collocations

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words([word for sent in sents for word in sent])
finder.nbest(bigram_measures.likelihood_ratio, 10)

[('exchange', 'board'),
 ('securities', 'exchange'),
 ('disclosure', 'requirements'),
 ('would', 'like'),
 ('review', 'working'),
 ('capital', 'disclosure'),
 ('list', 'essential'),
 ('renewal', 'recognition'),
 ('issue', 'capital'),
 ('alongwith', 'audited')]

In [16]:
finder.nbest(bigram_measures.pmi, 10)

[('accepted', 'recommendation'),
 ('achievements', 'beneficial'),
 ('adding', 'either'),
 ('addition', 'bombardier'),
 ('addressed', 'challenged'),
 ('administrative', 'functioning'),
 ('adopt', 'updated'),
 ('adulterated', 'spoiled'),
 ('advantage', 'loopholes'),
 ('afford', 'costly')]

In [17]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = nltk.collocations.TrigramCollocationFinder.from_words([word for sent in sents for word in sent])
finder.nbest(trigram_measures.likelihood_ratio, 10)

[('securities', 'exchange', 'board'),
 ('exchange', 'board', 'issue'),
 ('exchange', 'board', 'substantial'),
 ('exchange', 'board', 'depositories'),
 ('exchange', 'board', 'mumbai'),
 ('exchange', 'board', 'listing'),
 ('exchange', 'board', 'delisting'),
 ('exchange', 'board', 'securities'),
 ('exchange', 'board', 'intermediaries'),
 ('exchange', 'board', 'mutual')]

In [None]:
from sklearn import decomposition

num_topics = 10
clf = decomposition.NMF(n_components=num_topics, random_state=1)

In [60]:
help(clf.fit_transform)

Help on method fit_transform in module sklearn.decomposition.nmf:

fit_transform(X, y=None, W=None, H=None) method of sklearn.decomposition.nmf.NMF instance
    Learn a NMF model for the data X and returns the transformed data.
    
    This is more efficient than calling fit followed by transform.
    
    Parameters
    ----------
    X: {array-like, sparse matrix}, shape (n_samples, n_features)
        Data matrix to be decomposed
    
    W : array-like, shape (n_samples, n_components)
        If init='custom', it is used as initial guess for the solution.
    
    H : array-like, shape (n_components, n_features)
        If init='custom', it is used as initial guess for the solution.
    
    Attributes
    ----------
    components_ : array-like, shape (n_components, n_features)
        Factorization matrix, sometimes called 'dictionary'.
    
    n_iter_ : int
        Actual number of iterations for the transform.
    
    Returns
    -------
    W: array, shape (n_samples, n_com

In [140]:
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()

In [141]:
words = [word for sent in sents for word in sent]
sent2vec = count_vectorizer.fit_transform([str(' '.join(sent)) for sent in sents])

In [135]:
# from nltk.corpus import brown

# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.pipeline import FeatureUnion
# from sklearn.naive_bayes import MultinomialNB

# # Let's get more text from NLTK
# text = [" ".join(i) for i in brown.sents()[:100]]
# # I'm just gonna assign random tags.
# labels = ['yes']*50 + ['no']*50
# count_vectorizer = CountVectorizer(stop_words="english", min_df=3)
# tf_transformer = TfidfVectorizer(use_idf=True)
# combined_features = FeatureUnion([("counts", count_vectorizer), ("tfidf", tf_transformer)]).fit_transform(text)
# classifier = MultinomialNB()
# classifier.fit(combined_features, labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [143]:
vocab = pd.np.array(count_vectorizer.get_feature_names())

In [144]:
len(vocab)

1180

In [145]:
clf.fit_transform(sent2vec)

array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  2.06348584e-02,   2.41431151e-03,   7.75545783e-02, ...,
          1.82550893e-01,   7.81280366e-02,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   7.54152828e-02,   0.00000000e+00],
       ..., 
       [  2.03679735e-02,   0.00000000e+00,   1.62521593e-02, ...,
          0.00000000e+00,   0.00000000e+00,   1.24134082e-02],
       [  6.31347664e-05,   5.37454014e-03,   0.00000000e+00, ...,
          1.77491042e-03,   3.03745227e-03,   1.27721688e-03],
       [  2.37185588e-02,   0.00000000e+00,   2.01254362e-02, ...,
          0.00000000e+00,   0.00000000e+00,   1.35589001e-02]])

In [152]:
topic_words = []

for topic in clf.components_:
    word_idx = pd.np.argsort(topic)[::-1][0:5]
    topic_words.append([vocab[i] for i in word_idx])

In [153]:
topic_words

[['banks', 'whether', 'taken', 'public', 'shri'],
 ['aircraft', 'indian', 'deployed', 'three', 'search'],
 ['milk', 'technology', 'products', 'ministry', 'food'],
 ['minister', 'start', 'first', 'startups', 'capital'],
 ['national', 'list', 'essential', 'devices', 'medical'],
 ['securities', 'exchange', 'board', 'issue', 'requirements'],
 ['radar', 'flight', 'july', 'port', 'blair'],
 ['compulsory', 'cadets', 'member', 'come', 'lakh'],
 ['years', 'accidents', 'figure', 'accident', 'committee'],
 ['prices', 'stent', 'come', 'cent', 'going']]

In [154]:
# NMF seems to work quite better.