In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
ds = pd.read_csv('quora_questions.csv')

ds.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


# **Step 1: Create Model**

In [3]:
ds['Question'][117] #seeing a sample question
#also yes 1984 was depressing lol

'Did you find the ending of the novel "1984" depressing?'

In [4]:
len(ds) #we have 404289 articles in this dataset

404289

In [5]:
cv = CountVectorizer(max_df=0.95, min_df = 2, stop_words = 'english') #so max_df allows us to ignore certain tokens that have really high document frequency, so in our case we're saying ignore tokens that show up in 95% of documents
#min_df is the opposite, where we're saying a token has to show up at least in 2 different documents. If we selected a number between 0 and 1, it would've taken it as a percentage of the total number of documents instead, just like we did in max_df
#stop words just removes common words like the, a, etc

In [6]:
dtm = cv.fit_transform(ds['Question']) #Document Term Matrix, which counts each occurrence of each unique word throughout every single document (each text message) and add it to a DTM.
dtm #we got a sparse matrix with 38669 unique words across the 404289 questions

<404289x38669 sparse matrix of type '<class 'numpy.int64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [7]:
LDA = LatentDirichletAllocation(n_components = 12) #n_components = 12 means we want 12 general topics returned,

LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=12, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

# **Step 2: Grab the Vocabulary of Words**

In [15]:
print("features =",len(cv.get_feature_names())) #so we got 38669 features, which is just every single word
print("type =", type(cv.get_feature_names())) #we can see it's just a list of all our feature names. So since it's just a list, we can just grab index positions off the list to get the words
print("index word 2401 is", cv.get_feature_names()[2401]) #grabbing index 2401

features = 38669
type = <class 'list'>
index word 2401 is allusions


# **Step 3: Topics**

In [9]:
len(LDA.components_) #there are 12 components, which are the topics

12

In [10]:
type(LDA.components_) #so it's just a numpy array, so we can do numpy manipulations to it

numpy.ndarray

In [11]:
LDA.components_.shape #so it's 12 topics by 38669 words

(12, 38669)

# **Step 4: Grab the Highest Probability Words per Topic**

In [17]:
for index, topic in enumerate(LDA.components_):
  print(f"THE TOP 15 WORDS FOR TOPIC #{index}")
  print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
  print('\n')
  print('\n')
  #looks like topic 0 is about education, topic 2 is about social media, topic 7 is about Engineering/Science, etc

THE TOP 15 WORDS FOR TOPIC #0
['high', 'series', 'year', 'did', 'pay', 'better', 'watch', 'university', 'best', 'tv', 'good', 'school', 'college', 'mean', 'does']




THE TOP 15 WORDS FOR TOPIC #1
['english', 'non', 'speak', '000', 'does', 'products', 'idea', 'test', 'exam', 'laptop', 'india', 'iphone', 'prepare', 'best', 'buy']




THE TOP 15 WORDS FOR TOPIC #2
['hack', 'person', 'used', 'think', 'say', 'does', 'whatsapp', 'stop', 'car', 'facebook', 'earth', 'word', 'did', 'instagram', 'people']




THE TOP 15 WORDS FOR TOPIC #3
['skills', 'rupee', 'black', 'does', 'money', 'rs', 'indian', 'ways', 'improve', 'english', '1000', 'notes', '500', 'best', 'way']




THE TOP 15 WORDS FOR TOPIC #4
['marketing', 'usa', 'pakistan', 'place', 'countries', 'china', 'like', 'study', 'best', 'company', 'country', 'live', 'war', 'india', 'world']




THE TOP 15 WORDS FOR TOPIC #5
['height', 'number', 'new', 'purpose', 'years', 'meaning', 'math', '2016', 'start', 'movies', 'think', 'business', 'best'

# **Step 6: Attached Topic Numbers to the Articles**

In [18]:
topic_results = LDA.transform(dtm) #applying a transform to our LDA using dtm

topic_results.shape #so we have the original 404289 questions along wiht 12 topics

(404289, 12)

In [20]:
topic_results[117]*100 #the array of index 0 is showing the probability of a document (117th question in this case) belonging to a particular topic. We can see the topic 0 is most likely at 50.1%

array([50.13827443,  1.38889054,  1.38892962,  1.3888999 ,  1.38891572,
        1.38895616,  1.3889372 ,  1.38889614,  1.38904399,  1.3888923 ,
        1.38888893, 35.97247506])

In [22]:
ds['Question'][117] #so above, we inferred Topic 0 is about education, in which case this is somewhat along those lines as 1984 is a popular book read in classrooms

'Did you find the ending of the novel "1984" depressing?'

In [23]:
topic_results[117].argmax() #using argmax() to grab the index position of the highest probability, which is index 0 in this case

0

In [24]:
ds['Topic'] = topic_results.argmax(axis=1) #adding the topic results as a column to the original pandas dataframe
ds

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,3
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,8
2,How can I increase the speed of my internet co...,9
3,Why am I mentally very lonely? How can I solve...,2
4,"Which one dissolve in water quikly sugar, salt...",10
...,...,...
404284,How many keywords are there in the Racket prog...,9
404285,Do you believe there is life after death?,2
404286,What is one coin?,2
404287,What is the approx annual cost of living while...,4


In [26]:
mytopic_dict = {0: 'Education', 1:'Exams', 2:'Social Media',3:'English Language', 4:'India/China Politics', 5:'New Year', 6:'Relationship', 7:'STEM', 8:'Election', 9:'Internet', 10:'Fitness', 11:'Entertainment'} #creating a dictionary for each topic
ds['Topic Label'] = ds['Topic'].map(mytopic_dict)
ds

Unnamed: 0,Question,Topic,Topic Label
0,What is the step by step guide to invest in sh...,3,English Language
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,8,Election
2,How can I increase the speed of my internet co...,9,Internet
3,Why am I mentally very lonely? How can I solve...,2,Social Media
4,"Which one dissolve in water quikly sugar, salt...",10,Fitness
...,...,...,...
404284,How many keywords are there in the Racket prog...,9,Internet
404285,Do you believe there is life after death?,2,Social Media
404286,What is one coin?,2,Social Media
404287,What is the approx annual cost of living while...,4,India/China Politics
