In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [3]:
ds = pd.read_csv('quora_questions.csv')

ds.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# **Step 1: Create Model**

In [5]:
#Using the count vectorizer and tfidf transform in one step using TfidfVectorizer()
tfidf = TfidfVectorizer(max_df=0.95, min_df = 2, stop_words = 'english') #so max_df allows us to ignore certain tokens that have really high document frequency, so in our case we're saying ignore tokens that show up in 95% of documents
#min_df is the opposite, where we're saying a token has to show up at least in 2 different documents. If we selected a number between 0 and 1, it would've taken it as a percentage of the total number of documents instead, just like we did in max_df
#stop words just removes common words like the, a, etc

In [6]:
dtm = tfidf.fit_transform(ds['Question']) #Document Term Matrix, which counts each occurrence of each unique word throughout every single document (each text message) and add it to the DTM.
dtm #we got a sparse matrix with 404289 unique words across the 38669 questions

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [33]:
nnmf_model = NMF(n_components = 12) #creating an instance of our NMF import from above. n_components = 12 means we want 12 general topics returned

nnmf_model.fit(dtm) #fitting our document term matrix to our nnmf

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=12, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

# **Step 2: Grab the Vocabulary of Words**

In [34]:
print("features =", len(tfidf.get_feature_names())) #so we got 54777 features, which is just every single word
print("type =", type(tfidf.get_feature_names())) #we can see it's just a list of all our feature names. So since it's just a list, we can just grab index positions off the list to get the words
print("index 117 word is", tfidf.get_feature_names()[2401]) #grabbing index 2401

features = 38669
type = <class 'list'>
index 117 word is allusions


# **Step 3: Grab the Highest Coefficient Words per Topic**

In [35]:
for index, topic in enumerate(nnmf_model.components_):
  print(f"THE TOP 20 WORDS FOR TOPIC #{index}")
  print([tfidf.get_feature_names()[i] for i in topic.argsort()[-20:]]) #sorting the list from lowest to highest and grabbing the last (highest) 20
  print('\n')
  #so previously with LDA, we picked highest probability words, but with NMF we're picking highest coefficient value words
  #looks like topic 0 is about reviews, topic 3 is about earning money online, topic 7 is about presidential elections, topic 20 is about indian economics, etc

THE TOP 20 WORDS FOR TOPIC #0
['friend', 'read', 'thing', 'website', 'weight', 'place', 'visit', 'places', 'phone', 'buy', 'time', 'laptop', 'ways', 'movie', '2016', 'books', 'book', 'movies', 'way', 'best']


THE TOP 20 WORDS FOR TOPIC #1
['grads', 'majors', 'relationship', 'recruit', 'person', 'looking', 'exist', 'girl', 'look', 'compare', 'really', 'cost', 'time', 'sex', 'long', 'work', 'feel', 'like', 'mean', 'does']


THE TOP 20 WORDS FOR TOPIC #2
['writer', 'people', 'marked', 'search', 'use', 'add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 20 WORDS FOR TOPIC #3
['making', 'investment', 'website', 'job', 'using', 'friends', 'facebook', 'black', 'internet', 'free', 'easiest', 'home', 'easy', 'youtube', 'ways', 'way', 'earn', 'online', 'make', 'money']


THE TOP 20 WORDS FOR TOPIC #4
['employees', 'work', 'balance', 'earth', 'like', 'death', 'did', 'changed', 'want', 

# **Step 4: Attached Topic Numbers to the Questions**

In [39]:
topic_results = nnmf_model.transform(dtm) #applying a transform to our NMF using dtm

In [37]:
topic_results.shape #so we have the original 404289 articles along with 12 topics

(404289, 12)

In [43]:
topic_results[2]*100 #the array of index 1 is showing the coefficient distribution of a document (second article in this case) belonging to a particular topic. 
#We can see the Topic 3 has the highest coefficient at 0.26070159

array([0.05908917, 0.07069578, 0.17386785, 0.26070159, 0.01946388,
       0.0288865 , 0.02178018, 0.        , 0.06405511, 0.1035396 ,
       0.15918499, 0.03993592])

In [45]:
ds['Question'][2] #so above, we inferred Topic 3 is about earning money online, in which case this partly falls under it as it is about online access

'How can I increase the speed of my internet connection while using a VPN?'

In [44]:
topic_results[2].argmax() #using argmax() to grab the index position of the highest probability, which is index 2 in this case

3

In [47]:
ds['Topic'] = topic_results.argmax(axis=1) #adding the topic results as a column to the original pandas dataframe
ds

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,0
2,How can I increase the speed of my internet co...,3
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",1
...,...,...
404284,How many keywords are there in the Racket prog...,6
404285,Do you believe there is life after death?,4
404286,What is one coin?,11
404287,What is the approx annual cost of living while...,11


In [57]:
mytopic_dict = {0: 'Movie/Film', 1:'Student Life', 2:'Q&A',3:'Internet', 4:'Meaning of Life', 5:'India/China Politics', 6:'Programming', 7:'Presidential Election', 8:'Language', 9:'People', 10:'Fitness', 11:'Indian Economics'} #creating a dictionary for each topic
ds['Topic Label'] = ds['Topic'].map(mytopic_dict)
ds

Unnamed: 0,Question,Topic,Topic Label
0,What is the step by step guide to invest in sh...,5,India/China Politics
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,0,Movie/Film
2,How can I increase the speed of my internet co...,3,Internet
3,Why am I mentally very lonely? How can I solve...,11,Indian Economics
4,"Which one dissolve in water quikly sugar, salt...",1,Student Life
...,...,...,...
404284,How many keywords are there in the Racket prog...,6,Programming
404285,Do you believe there is life after death?,4,Meaning of Life
404286,What is one coin?,11,Indian Economics
404287,What is the approx annual cost of living while...,11,Indian Economics


In [None]:
|