In [1]:
# Udemy project but extended for more experimentation and investigation

In [3]:
import pandas as pd
quora = pd.read_csv('quora_questions.csv') # Example dataset used in the project

In [5]:
quora.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404289 entries, 0 to 404288
Data columns (total 1 columns):
Question    404289 non-null object
dtypes: object(1)
memory usage: 3.1+ MB


In [6]:
quora.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [7]:
# Dependencies
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


## Basic case - baseline CountVectorizer (max_df=0.9, min_df=2, English stop words) and 10 components

In [8]:
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')

In [13]:
# countVec - countvectorizer
# n - number of components

def LDA_generator(countVec, n, documents=None, random_state=42):
    """Fit an LDA topic model on the document-term matrix built by ``countVec``.

    Parameters
    ----------
    countVec : vectorizer exposing ``fit_transform`` (a ``CountVectorizer`` here)
        It is fitted as a side effect of this call, so its vocabulary can be
        read afterwards (e.g. to print the top words per topic).
    n : int
        Number of topics; forwarded as ``n_components``.
    documents : iterable of str, optional
        Texts to model. When omitted, the notebook-level ``quora['Question']``
        column is used, which is what the original implementation always did.
    random_state : int, optional
        Seed forwarded to ``LatentDirichletAllocation``. Defaults to 42, the
        value that was previously hard-coded.

    Returns
    -------
    LatentDirichletAllocation
        The fitted model.
    """
    if documents is None:
        # Fall back to the notebook global so existing two-argument calls keep working.
        documents = quora['Question']
    dtm = countVec.fit_transform(documents)
    LDA = LatentDirichletAllocation(n_components=n, random_state=random_state)
    LDA.fit(dtm)
    return LDA

In [15]:
LDA_basic = LDA_generator(cv,10) # this takes a while to run

In [16]:
LDA_basic

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [20]:
def topic_generator(LDA, countVec, n_words=10):
    """Print the highest-weighted vocabulary words for every topic of ``LDA``.

    Parameters
    ----------
    LDA : object exposing ``components_``
        Iterating ``components_`` must yield one weight vector per topic, each
        supporting ``argsort()`` (NumPy arrays for a fitted scikit-learn LDA).
    countVec : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``; position ``k`` of the returned vocabulary
        corresponds to weight ``k`` of every topic vector.
    n_words : int, optional
        How many words to print per topic (default 10). Words are listed in
        ascending weight order, so the strongest word comes last.

    Raises
    ------
    ValueError
        If ``n_words`` is smaller than 1.
    """
    if n_words < 1:
        # A slice of [-0:] would silently return the whole vocabulary.
        raise ValueError("n_words must be a positive integer")

    # Prefer the current scikit-learn accessor and fall back to the legacy
    # name, so the function works regardless of the installed version.
    get_names = getattr(countVec, 'get_feature_names_out', None) or countVec.get_feature_names
    # Build the vocabulary once: it is identical for every topic, and
    # re-creating it for each printed word is needlessly expensive.
    vocabulary = list(get_names())

    for i, topic in enumerate(LDA.components_):
        print(f"The TOP {n_words} words for each topic #{i}")
        print([vocabulary[index] for index in topic.argsort()[-n_words:]])
        # Same six blank lines the three print('\n') calls used to emit.
        print('\n' * 5)

In [21]:
topic_generator(LDA_basic,cv)

The TOP 10 words for each topic #0
['android', 'good', 'career', 'google', 'difference', 'software', 'engineering', 'examples', 'best', 'does']






The TOP 10 words for each topic #1
['rs', 'india', 'think', 'english', 'black', 'stop', 'indian', '1000', 'notes', '500']






The TOP 10 words for each topic #2
['universe', 'compare', 'water', 'did', 'average', 'energy', 'good', 'time', 'does', 'life']






The TOP 10 words for each topic #3
['ask', 'day', 'question', 'movie', 'things', 'questions', 'know', 'new', 'people', 'quora']






The TOP 10 words for each topic #4
['women', 'country', 'differences', 'rid', 'password', 'college', 'difference', 'car', 'india', 'job']






The TOP 10 words for each topic #5
['war', 'work', 'sex', 'long', 'did', 'feel', 'mean', 'world', 'like', 'does']






The TOP 10 words for each topic #6
['days', 'difference', 'school', 'science', 'study', 'live', 'good', 'lose', 'weight', 'best']






The TOP 10 words for each topic #7
['donald', 'start',

## More components (20) with a smaller vocabulary - min_df raised from 2 to 5 drops the rarer words

In [31]:
cv_2 = CountVectorizer(max_df=0.9, min_df=5, stop_words='english')

In [32]:
LDA_2 = LDA_generator(cv_2,20) # this takes a while to run

In [33]:
LDA_2

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=20, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [34]:
topic_generator(LDA_2,cv_2)

The TOP 10 words for each topic #0
['boyfriend', 'friend', 'guy', 'like', 'big', 'effects', 'girlfriend', 'high', 'iphone', 'person']






The TOP 10 words for each topic #1
['universities', 'looking', 'india', 'new', 'does', 'earn', 'online', 'money', 'difference', 'make']






The TOP 10 words for each topic #2
['parents', 'china', 'like', 'education', 'programming', 'website', 'language', 'relationship', 'india', 'indian']






The TOP 10 words for each topic #3
['video', 'hair', 'laptop', 'good', 'phone', 'free', 'buy', 'way', 'learn', 'best']






The TOP 10 words for each topic #4
['living', 'usa', 'purpose', 'war', 'happen', 'good', 'pakistan', 'meaning', 'india', 'life']






The TOP 10 words for each topic #5
['fall', 'terms', 'getting', 'state', 'better', 'differences', 'chinese', 'friends', 'school', 'love']






The TOP 10 words for each topic #6
['sentence', 'bad', 'card', 'history', 'average', 'social', 'word', 'examples', 'used', 'use']






The TOP 10 words for e

In [40]:
# Alternative labelling: summarise each topic id with a keyword or short phrase.
topic_categorization_dict = {
    0: 'Relationship',
    1: 'Earning',
    2: 'Education',
    3: 'Internet Search',
    4: 'Purpose',
    5: 'Fall Semester',
    6: 'History',
    7: 'Politics',
    8: 'Social Network',
    9: 'Business',
    10: 'Safety',
    11: 'Essentials',
    12: 'Networks',
    13: 'Travel',
    14: 'Corporations',
    15: 'Management',
    16: 'Dating',
    17: 'Wealth',
    18: 'Weight Loss',
    19: 'Questions',
}

In [41]:
# Translate each numeric topic id in quora['Topic'] into its label from
# topic_categorization_dict and store it in a new 'Topic Label' column.
# NOTE(review): the cell that creates quora['Topic'] is not visible in this
# excerpt - confirm it runs before this one on a fresh kernel.
quora['Topic Label'] = quora['Topic'].map(topic_categorization_dict)
quora.head()

Unnamed: 0,Question,Topic,Topic Label
0,What is the step by step guide to invest in sh...,4,Purpose
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,9,Business
2,How can I increase the speed of my internet co...,3,Internet Search
3,Why am I mentally very lonely? How can I solve...,1,Earning
4,"Which one dissolve in water quikly sugar, salt...",3,Internet Search
