In [130]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
import spacy
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from sklearn.decomposition import LatentDirichletAllocation

In [111]:
# Make sure the NLTK 'punkt' tokenizer data is available locally.
nltk.download('punkt')

# Load spaCy's small English pipeline; `nlp` is used by the tokenizer below.
nlp = spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Srujan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [112]:
# Toy corpus: one short sentence per document. The raw strings are kept
# untouched because they are shown next to the topic weights later on.
original_doc = [
    "Hi! This is trump. I'm participating in elections.",
    "Hi this is bolt. I'm the fastest man n earth",
    "Hi this is angelina jolie. I'm an actor.",
    "caperio acted very well in titanic.",
    "Joey is a very good in movies",
    "Virat kohli has many fans",
    "Virat kohli is a very good cricket player",
    "yuvraj hit 6 sixes of six balls",
    "prudvi dont have balls to talk to a girl",
    "srujan is creative",
    "srujan is genious",
    "srujan is friends with sid, harsha, lakshmi, prudvi,likitha, pushyami, niharika",
]


In [113]:
def tokenize(text):
    """Return the lemmas of the meaningful tokens in ``text``.

    The text is run through the module-level spaCy pipeline ``nlp``;
    punctuation and stop-word tokens are dropped and the lemma of every
    remaining token is returned, in document order.

    An earlier NLTK-based pass (length/digit filter plus lowercasing) used
    to run first, but its result was overwritten before being used, so it
    has been removed. The returned value is unchanged.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmas of the tokens that are neither punctuation nor stop words.
    """
    doc = nlp(text)
    return [token.lemma_ for token in doc if not (token.is_punct or token.is_stop)]

In [114]:
# Build the cleaned corpus from the raw sentences: tokenize/lemmatize each
# document, then re-join the lemmas into one space-separated string so the
# vectorizer can consume it. Reading from `original_doc` (not `documents`)
# keeps this cell runnable on a fresh kernel and safe to re-run.
documents = [" ".join(tokenize(text)) for text in original_doc]

In [141]:
# TF-IDF over unigrams and bigrams of the already-cleaned documents.
# Case is left as produced by the tokenizer (lowercase=False), no extra
# stop-word list is applied, and the vocabulary is capped at 10 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    lowercase=False,
    stop_words=None,
    min_df=1,
    max_df=0.75,
    max_features=10,
)
tfidf_vectors = vectorizer.fit_transform(documents)

In [142]:
# Show the sparse TF-IDF matrix as a dense array (documents x features).
# `.toarray()` replaces the deprecated `.A` shortcut on SciPy sparse matrices.
tfidf_vectors.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.57735027, 0.57735027, 0.        , 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.        , 0.        ],
       [0.5       , 0.5       , 0.        , 0.        , 0.5       ,
        0.        , 0.5       , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 1.       

In [143]:
# Vocabulary learned by the vectorizer, in column order of the TF-IDF matrix.
# `get_feature_names()` was removed from scikit-learn; use the `_out` variant.
list(vectorizer.get_feature_names_out())

['Virat',
 'Virat kohli',
 'ball',
 'girl',
 'good',
 'hi',
 'kohli',
 'prudvi',
 'prudvi ball',
 'srujan']

# Matrix factorization

In [144]:
# Factorize the TF-IDF matrix into 3 topics with NMF.
# The seed is fixed so the factorization is repeatable.
clf = decomposition.NMF(
    random_state=111,
    n_components=3,
)

# W1: document-topic weights returned by the fit.
W1 = clf.fit_transform(tfidf_vectors)
# H1: topic-feature weights stored on the fitted model.
H1 = clf.components_



In [145]:
# Topic-feature weights from NMF: one row per topic, one column per vocabulary feature.
H1

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        1.31607401, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.07406274, 0.0528619 , 0.        ,
        0.        , 0.        , 0.35551022, 0.0528619 , 1.21241784],
       [0.59460356, 0.59460356, 0.        , 0.        , 0.59460356,
        0.        , 0.59460356, 0.        , 0.        , 0.        ]])

In [146]:
# Document-topic weights from NMF: one row per document, one column per topic.
W1

array([[0.75983569, 0.        , 0.        ],
       [0.75983569, 0.        , 0.        ],
       [0.75983569, 0.        , 0.        ],
       [0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.42044821],
       [0.        , 0.        , 0.72823766],
       [0.        , 0.        , 0.84089642],
       [0.        , 0.04607558, 0.        ],
       [0.        , 0.15840041, 0.        ],
       [0.        , 0.75426392, 0.        ],
       [0.        , 0.75426392, 0.        ],
       [0.        , 0.66511413, 0.        ]])

In [147]:
# Each row of H1 holds one topic's non-negative weight for every vocabulary
# feature. These are unnormalized NMF weights, not probabilities.
# For each topic, keep the `num_words` features with the largest weights.

num_words = 3

# `get_feature_names()` was removed from scikit-learn; use the `_out` variant.
vocab = np.asarray(vectorizer.get_feature_names_out())


def top_words(t):
    """Return the `num_words` highest-weighted features of row `t`, heaviest first."""
    # argsort is ascending, so walk the last `num_words` indices backwards.
    return [vocab[i] for i in np.argsort(t)[:-num_words - 1:-1]]


topic_words = [top_words(t) for t in H1]
topics = [' '.join(t) for t in topic_words]

In [148]:
# One string per NMF topic: its top features joined by spaces (features may be bigrams).
topics

['hi srujan prudvi ball', 'srujan prudvi ball', 'kohli Virat kohli Virat']

In [149]:
# Document-topic table for the NMF model: one row per document, one column
# per topic (weights shown to 2 decimals), plus the raw text and the index of
# the highest-weighted topic.
colnames = ["Topic" + str(i) for i in range(clf.n_components)]
docnames = ["Doc" + str(i) for i in range(len(documents))]
df_doc_topic = pd.DataFrame(np.round(W1, 2), columns=colnames, index=docnames)

# Pick the dominant topic from the unrounded weights: rounding first can turn
# close weights into a tie, and argmax would then return the wrong topic.
# NOTE: a document whose weights are all zero still gets topic 0 (argmax
# returns the first index on ties), so treat that label as "no topic".
significant_topic = np.argmax(W1, axis=1)
df_doc_topic['document'] = original_doc
df_doc_topic['dominant_topic'] = significant_topic

In [150]:
# Per-document NMF topic weights, with the source sentence and its dominant topic.
df_doc_topic

Unnamed: 0,Topic0,Topic1,Topic2,document,dominant_topic
Doc0,0.76,0.0,0.0,Hi! This is trump. I'm participating in elections.,0
Doc1,0.76,0.0,0.0,Hi this is bolt. I'm the fastest man n earth,0
Doc2,0.76,0.0,0.0,Hi this is angelina jolie. I'm an actor.,0
Doc3,0.0,0.0,0.0,caperio acted very well in titanic.,0
Doc4,0.0,0.0,0.42,Joey is a very good in movies,2
Doc5,0.0,0.0,0.73,Virat kohli has many fans,2
Doc6,0.0,0.0,0.84,Virat kohli is a very good cricket player,2
Doc7,0.0,0.05,0.0,yuvraj hit 6 sixes of six balls,1
Doc8,0.0,0.16,0.0,prudvi dont have balls to talk to a girl,1
Doc9,0.0,0.75,0.0,srujan is creative,1


# Latent Dirichlet allocation from sklearn

In [151]:
# Fit a 3-topic LDA model on the same feature matrix used for NMF.
# LDA inference is stochastic, so the seed is fixed to make the topic
# assignments reproducible across kernel restarts.
# NOTE(review): LDA is usually fit on raw term counts (CountVectorizer)
# rather than TF-IDF weights. Confirm that TF-IDF input is intended here.
clf = LatentDirichletAllocation(n_components=3, random_state=111)
preds = clf.fit_transform(tfidf_vectors)
preds

array([[0.66554614, 0.16730171, 0.16715215],
       [0.66554614, 0.16730171, 0.16715215],
       [0.66554614, 0.16730171, 0.16715215],
       [0.33333333, 0.33333333, 0.33333333],
       [0.16734397, 0.16775108, 0.66490495],
       [0.12268564, 0.12309265, 0.7542217 ],
       [0.11167523, 0.11201399, 0.77631078],
       [0.16720458, 0.66546956, 0.16732587],
       [0.11614672, 0.77142396, 0.11242932],
       [0.66539353, 0.16738906, 0.16721741],
       [0.66539353, 0.16738906, 0.16721741],
       [0.70722456, 0.15337682, 0.13939862]])

In [152]:
# Topic-feature weights of the fitted LDA model: one row per topic, one column per vocabulary feature.
clf.components_

array([[0.33440636, 0.33440636, 0.33467311, 0.33426376, 0.33500843,
        3.32663086, 0.33440636, 1.05771707, 0.33426376, 2.98886012],
       [0.33505049, 0.33505049, 1.79108244, 0.86781741, 0.33601808,
        0.33713054, 0.33505049, 0.81583097, 0.86781741, 0.33713344],
       [1.40789342, 1.40789342, 0.33493927, 0.33435175, 1.8289735 ,
        0.33623859, 1.40789342, 0.33661137, 0.33435175, 0.33605088]])

In [153]:
# For each LDA topic, keep the `num_words` features with the largest weights
# in the corresponding row of `clf.components_`.
num_words = 3

# `get_feature_names()` was removed from scikit-learn; use the `_out` variant.
vocab = np.asarray(vectorizer.get_feature_names_out())


def top_words(t):
    """Return the `num_words` highest-weighted features of row `t`, heaviest first."""
    # argsort is ascending, so walk the last `num_words` indices backwards.
    return [vocab[i] for i in np.argsort(t)[:-num_words - 1:-1]]


topic_words = [top_words(t) for t in clf.components_]
topics = [' '.join(t) for t in topic_words]

In [154]:
# One string per LDA topic: its top features joined by spaces (features may be bigrams).
topics

['hi srujan prudvi', 'ball prudvi ball girl', 'good Virat kohli']

In [157]:
# Document-topic table for the LDA model: one row per document, one column
# per topic (values shown to 2 decimals), plus the raw text and the index of
# the highest-valued topic.
colnames = ["Topic" + str(i) for i in range(clf.n_components)]
docnames = ["Doc" + str(i) for i in range(len(documents))]
df_doc_topic = pd.DataFrame(np.round(preds, 2), columns=colnames, index=docnames)

# Pick the dominant topic from the unrounded values: rounding first can turn
# close values into a tie (e.g. 0.33 / 0.33 / 0.33), and argmax would then
# simply return the first column.
significant_topic = np.argmax(preds, axis=1)
df_doc_topic['document'] = original_doc
df_doc_topic['dominant_topic'] = significant_topic

In [158]:
# Per-document LDA topic values, with the source sentence and its dominant topic.
df_doc_topic

Unnamed: 0,Topic0,Topic1,Topic2,document,dominant_topic
Doc0,0.67,0.17,0.17,Hi! This is trump. I'm participating in elections.,0
Doc1,0.67,0.17,0.17,Hi this is bolt. I'm the fastest man n earth,0
Doc2,0.67,0.17,0.17,Hi this is angelina jolie. I'm an actor.,0
Doc3,0.33,0.33,0.33,caperio acted very well in titanic.,0
Doc4,0.17,0.17,0.66,Joey is a very good in movies,2
Doc5,0.12,0.12,0.75,Virat kohli has many fans,2
Doc6,0.11,0.11,0.78,Virat kohli is a very good cricket player,2
Doc7,0.17,0.67,0.17,yuvraj hit 6 sixes of six balls,1
Doc8,0.12,0.77,0.11,prudvi dont have balls to talk to a girl,1
Doc9,0.67,0.17,0.17,srujan is creative,0
