In [1]:
import pandas as pd

In [2]:
# Load the Quora questions dataset from a CSV file in the working directory.
data = pd.read_csv('quora_questions.csv')

In [3]:
# Preview the first rows to confirm the file was parsed as expected.
data.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [4]:
# Number of rows (one question per row) in the dataset.
len(data)

404289

In [5]:
# Inspect one raw question (row label 99) to see what the text looks like.
data.loc[99, 'Question']

'Why is the number for Skype at 1-855-425-3768 always busy?'

In [6]:
# Pre-processing: turn the question text into a TF-IDF document-term matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# TF-IDF vectorizer configuration:
#   max_df=0.95          -> drop terms that occur in more than 95% of the questions
#   min_df=2             -> drop terms that occur in fewer than 2 questions
#   stop_words='english' -> drop scikit-learn's built-in English stop words
# NOTE(review): LDA is normally fitted on raw term counts (CountVectorizer);
# feeding it TF-IDF weights is unusual -- confirm this is intentional.
tdfidf = TfidfVectorizer(max_df=0.95,min_df=2,stop_words='english')

In [9]:
# Learn the vocabulary from the questions and build the document-term matrix
# (one row per question, one column per vocabulary term).
dtm = tdfidf.fit_transform(data['Question'])

In [12]:
# (number of questions, vocabulary size)
dtm.shape

(404289, 38669)

In [13]:
# Topic modelling with Latent Dirichlet Allocation (LDA)

In [14]:
from sklearn.decomposition import LatentDirichletAllocation

In [16]:
# LDA model with 5 topics; random_state is fixed so repeated runs give the same topics.
LDA = LatentDirichletAllocation(n_components=5,random_state=101)

In [17]:
# Fit the topic model on the full document-term matrix.
LDA.fit(dtm)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=5, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=101,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [19]:
# Size of the vocabulary learned by the vectorizer (matches the column count of dtm).
len(tdfidf.get_feature_names())

38669

In [20]:
import random

# Fetch the vocabulary once: get_feature_names() rebuilds the whole list on every call.
# NOTE: on scikit-learn >= 1.0 use get_feature_names_out(); get_feature_names()
# was removed in 1.2.
feature_names = tdfidf.get_feature_names()

# Print 10 randomly chosen vocabulary terms as a quick sanity check.
for _ in range(10):
    # randrange() excludes the upper bound, so the index is always valid.
    # The previous random.randint(0, 38669) was inclusive and could return
    # 38669, one past the last valid index of a 38669-term vocabulary.
    random_word_id = random.randrange(len(feature_names))
    print(feature_names[random_word_id])

meta
cloudy
conjugate
fever
proxima
free
maggi
unrequited
occur
apk


In [21]:
# Topic-word weight matrix: one row per topic, one column per vocabulary term.
LDA.components_

array([[2.01534850e-01, 2.02887086e-01, 2.02769149e-01, ...,
        2.00061373e-01, 2.00064275e-01, 2.00061373e-01],
       [2.02446040e-01, 2.01683079e-01, 2.00044891e-01, ...,
        2.00067158e-01, 2.00069646e-01, 2.00067158e-01],
       [2.06472185e-01, 2.03912739e-01, 1.26618094e+00, ...,
        2.00065737e-01, 2.72100657e+00, 2.00065737e-01],
       [2.05465034e-01, 2.98637610e+02, 2.00044018e-01, ...,
        2.00064961e-01, 2.00067686e-01, 2.00064961e-01],
       [1.82850666e+01, 2.04687316e-01, 2.00046359e-01, ...,
        1.80975329e+00, 2.00066531e-01, 1.80975329e+00]])

In [22]:
# Word weights of the first topic (index 0, printed as "topic-0" further down).
single_topic = LDA.components_[0] # first topic, index 0

In [23]:
# Vocabulary indices ordered from the lowest to the highest weight in this topic.
single_topic.argsort()

array([13236, 19780, 19035, ..., 19489,  4632, 28046], dtype=int64)

In [24]:
# argsort is ascending, so the last 10 entries are the 10 highest-weighted words.
top_words_index = single_topic.argsort()[-10:]

In [25]:
# Vocabulary indices of the top 10 words, least important first.
top_words_index

array([20727, 32870, 38026, 22673, 24561, 21116, 37515, 19489,  4632,
       28046], dtype=int64)

In [26]:
# Translate the top-word indices back into the actual words.
# The vocabulary is fetched once up front; calling get_feature_names() inside
# the loop rebuilt the full vocabulary list for every printed word.
feature_names = tdfidf.get_feature_names()
for index in top_words_index:
    print(feature_names[index])

love
start
work
money
online
make
way
know
best
quora


In [27]:
# Show the 15 highest-weighted words for every topic.
# The vocabulary is fetched once up front: get_feature_names() rebuilds the
# whole list on each call, and it used to be called once per printed word.
feature_names = tdfidf.get_feature_names()
for i,topic in enumerate(LDA.components_):
    print(f"Top words in topic-{i}")
    # argsort is ascending, so the last 15 indices carry the largest weights,
    # listed from least to most important.
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print("\n")
    print("\n")

Top words in topic-0
['ways', 'learn', 'year', 'questions', 'improve', 'love', 'start', 'work', 'money', 'online', 'make', 'way', 'know', 'best', 'quora']




Top words in topic-1
['com', 'long', 'happen', 'learning', 'women', 'difference', 'prepare', 'india', 'war', 'like', 'really', 'sex', 'job', 'does', 'time']




Top words in topic-2
['india', 'black', 'need', 'earn', 'notes', '1000', '500', 'person', 'donald', 'lose', 'feel', 'weight', 'mean', 'trump', 'does']




Top words in topic-3
['examples', 'instagram', 'movie', 'movies', 'programming', 'language', 'think', 'thing', 'day', 'things', '2016', 'people', 'world', 'best', 'life']




Top words in topic-4
['used', 'read', 'company', 'good', 'college', 'india', 'app', 'android', 'does', 'change', 'important', 'buy', 'engineering', 'use', 'best']






In [28]:
# Per-question topic scores from the fitted model
# (presumably one row per question and one column per topic -- confirm via topic_results.shape).
topic_results = LDA.transform(dtm)

In [29]:
# Label each question with the index of its highest-scoring topic
# (0-based, matching the "topic-{i}" numbering printed above).
data['Topic'] = topic_results.argmax(axis=1)

In [31]:
# Preview the first rows with their assigned topic rather than rendering the
# entire 404,289-row frame.
data.head(10)

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,1
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,2
2,How can I increase the speed of my internet co...,0
3,Why am I mentally very lonely? How can I solve...,4
4,"Which one dissolve in water quikly sugar, salt...",2
5,Astrology: I am a Capricorn Sun Cap moon and c...,2
6,Should I buy tiago?,4
7,How can I be a good geologist?,4
8,When do you use シ instead of し?,3
9,Motorola (company): Can I hack my Charter Moto...,4
