In [1]:
'''This workbook uses NMF for topic modelling for over 400,000 Quora questions. 
Aim here is to find out what the topics are. User needs to pick number of topics.
K ODonnell 25/06/20 '''

import pandas as pd

In [3]:
quora = pd.read_csv('quora_questions.csv')

In [10]:
quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [14]:
print(quora.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404289 entries, 0 to 404288
Data columns (total 1 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Question  404289 non-null  object
dtypes: object(1)
memory usage: 3.1+ MB
None


In [None]:
# Df has over 400k entries, so will use Non-Negative Matrix Factorization

In [5]:
# Importing TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
tfidf = TfidfVectorizer(max_df=0.99, min_df=1, stop_words='english')

In [9]:
# Making document term matrix
dtm = tfidf.fit_transform(quora['Question'])

In [15]:
# Importing NMF
from sklearn.decomposition import NMF

In [16]:
# Setting 40 topics (say)
nmf_model = NMF(n_components=40)

In [17]:
# This takes a while...400k entries!
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=40, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [19]:
# Showing length of feature names
len(tfidf.get_feature_names())

67533

In [20]:
# Getting top 10 random words
import random
for i in range(10):
    random_word_id = random.randint(0,len(tfidf.get_feature_names()))
    print(tfidf.get_feature_names()[random_word_id])

shaggy
m1
ashrama
sematary
vadodara
widowmaker
ferraris
battled
nohara
hurricanes


In [21]:
# Number of topics
len(nmf_model.components_)

40

In [24]:
# Number of words in a topic 
len(nmf_model.components_[39])

67533

In [25]:
# Sinlge topic '0'
single_topic = nmf_model.components_[0]

In [30]:
# Top 10 word indeces for this topic 0:
single_topic.argsort()[-10:]

array([55244, 24528, 10891, 46231, 64387, 46244, 34721, 65132,  9740,
        8601])

In [31]:
# Index of largest words
top_word_indices = single_topic.argsort()[-10:]

In [32]:
# Top words for topic 0
for index in top_word_indices:
    print(tfidf.get_feature_names()[index])

site
friend
buy
place
visit
places
laptop
ways
book
best


In [33]:
# Getting top 20 words for each topic

for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 20 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 20 WORDS FOR TOPIC #0
['songs', 'course', 'company', 'sites', 'digital', 'smartphone', 'marketing', 'institute', 'delhi', 'coaching', 'site', 'friend', 'buy', 'place', 'visit', 'places', 'laptop', 'ways', 'book', 'best']


THE TOP 20 WORDS FOR TOPIC #1
['person', 'say', 'need', 'come', 'dream', 'says', 'affect', 'majors', 'universities', 'grads', 'recruit', 'looking', 'really', 'exist', 'compare', 'cost', 'long', 'feel', 'mean', 'does']


THE TOP 20 WORDS FOR TOPIC #2
['message', 'change', 'views', 'needing', 'users', 'picture', 'write', 'topics', 'improvement', 'profile', 'follow', 'writer', 'asked', 'add', 'post', 'delete', 'answers', 'answer', 'question', 'quora']


THE TOP 20 WORDS FOR TOPIC #3
['home', 'blog', 'dollars', 'faster', 'hair', 'video', 'interesting', 'million', 'fast', 'easy', 'great', '000', 'app', 'better', 'happy', 'month', 'youtube', 'friends', 'money', 'make']


THE TOP 20 WORDS FOR TOPIC #4
['human', 'positions', 'departments', 'living', 'planets', 'decis

['websites', 'seo', 'organic', 'commerce', 'promote', 'video', 'company', 'site', 'social', 'blog', 'cost', 'online', 'com', 'app', 'create', 'build', 'download', 'free', 'traffic', 'website']


THE TOP 20 WORDS FOR TOPIC #37
['site', 'person', '25', '22', 'size', 'speed', 'skipping', 'penis', '21', 'old', 'iq', 'possible', 'years', '20', 'blog', 'age', 'ways', 'traffic', 'height', 'increase']


THE TOP 20 WORDS FOR TOPIC #38
['grades', 'person', 'worse', 'health', 'plan', 'war', 'liar', 'choice', 'jail', 'debate', 'president', 'russia', 'hate', 'presidential', 'election', 'policy', 'vote', 'better', 'hillary', 'clinton']


THE TOP 20 WORDS FOR TOPIC #39
['3g', 'track', 'support', 'whatsapp', 'sentence', 'cell', 'apps', 'card', 'jio', 'sim', 'using', 'google', 'app', 'mobile', 'buy', 'iphone', 'android', 'number', 'phone', 'use']




In [34]:
# Attatch topic to original DF

In [36]:
quora['Topic'] = topic_results.argmax(axis=1)

In [39]:
print(quora.head())

                                            Question  Topic
0  What is the step by step guide to invest in sh...      9
1  What is the story of Kohinoor (Koh-i-Noor) Dia...     16
2  How can I increase the speed of my internet co...     37
3  Why am I mentally very lonely? How can I solve...      8
4  Which one dissolve in water quikly sugar, salt...     39
