In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Quora questions dataset from a CSV file located next to the notebook.
df = pd.read_csv('./quora_questions.csv')

In [3]:
# Preview the loaded frame (pandas shows only the first and last rows of a long frame).
df

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."
...,...
404284,How many keywords are there in the Racket prog...
404285,Do you believe there is life after death?
404286,What is one coin?
404287,What is the approx annual cost of living while...


In [4]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# TF-IDF vectorizer used to turn each question into a weighted term vector.
# The three options below prune the vocabulary before weighting.
tfidf = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms that occur in fewer than 2 documents
    max_df=0.95,           # ignore terms that occur in more than 95% of documents
)

In [6]:
# Learn the vocabulary and IDF weights from the 'Question' column and build
# the document-term matrix (one row per question, one column per kept term).
dtm = tfidf.fit_transform(df['Question'])

In [10]:
# Vocabulary size after the stop-word / min_df / max_df pruning.
# `vocabulary_` is the fitted term -> column-index mapping, so its length is
# the number of features. It replaces `get_feature_names()`, which was
# deprecated in scikit-learn 1.0 and removed in 1.2, and it avoids building
# the full list of names just to count them.
len(tfidf.vocabulary_)

38669

In [11]:
# Non-negative matrix factorization with 20 components (topics).
# random_state is fixed so repeated runs use the same seed.
model = NMF(n_components=20, random_state=42)

In [12]:
# Fit the NMF model on the TF-IDF document-term matrix.
model.fit(dtm)



NMF(n_components=20, random_state=42)

In [14]:
# Sanity check: components_ has one row per requested component.
len(model.components_)

20

In [15]:
# Shape of the topic-term matrix: (number of topics, vocabulary size).
model.components_.shape

(20, 38669)

In [18]:
# Term indices of the first topic ordered by ascending weight, so the
# highest-weighted terms are at the end of the array.
model.components_[0].argsort()

array([    0, 22613, 22611, ...,  5268, 22925,  4632], dtype=int64)

In [19]:
# Print the highest-weighted vocabulary terms of every NMF topic.
#
# The feature names are fetched once, before the loop. The previous version
# called `tfidf.get_feature_names()` for every printed word, rebuilding the
# whole vocabulary list each time, and that method was removed in
# scikit-learn 1.2 (`get_feature_names_out()` is its replacement).
N_TOP_WORDS = 15
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(model.components_):
    # argsort is ascending, so the last N_TOP_WORDS indices are the heaviest terms.
    top_indices = topic.argsort()[-N_TOP_WORDS:]
    print(f"The top {N_TOP_WORDS} words for topic #{index}")
    # .tolist() yields plain Python strings, keeping the printed list format unchanged.
    print(feature_names[top_indices].tolist())
    print('\n')

The top 15 words for topic #0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


The top 15 words for topic #1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


The top 15 words for topic #2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


The top 15 words for topic #3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


The top 15 words for topic #4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


The top 15 words for topic #5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 

In [20]:
# Project every question onto the learned topics; per the scikit-learn API
# the result has one row per document and one column per topic.
topic_result = model.transform(dtm)

In [21]:
# Label each question with the index of its strongest topic, i.e. the column
# holding the largest weight in that question's row of topic_result.
df['topic'] = np.argmax(topic_result, axis=1)

In [22]:
# Display the frame again, now including the newly added 'topic' column.
df

Unnamed: 0,Question,topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14
...,...,...
404284,How many keywords are there in the Racket prog...,6
404285,Do you believe there is life after death?,4
404286,What is one coin?,11
404287,What is the approx annual cost of living while...,11
