In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
blogs_df = pd.read_csv("blog_posts_dataset.csv")

In [3]:
blogs_df.head(5)

Unnamed: 0.1,Unnamed: 0,author_id,sex,age,occupation,zodiac_sign,post
0,0,4162441,male,16,Student,Sagittarius,\n\n\t \n DESTINY... you might n...
1,1,4162441,male,16,Student,Sagittarius,\n\n\t \n DEAR ANGEL.. you say it...
2,2,4162441,male,16,Student,Sagittarius,\n\n\t \n MAIN AUR MERI TANHAI (jagjeet s...
3,3,4162441,male,16,Student,Sagittarius,\n\n\t \n mail addressrs(s) urlLink http...
4,4,4162441,male,16,Student,Sagittarius,\n\n\t \n RAP- ALLRISE so stand back caus...


In [4]:
blogs_df = blogs_df.drop(columns=["Unnamed: 0"])

In [5]:
blogs_df.head(5)

Unnamed: 0,author_id,sex,age,occupation,zodiac_sign,post
0,4162441,male,16,Student,Sagittarius,\n\n\t \n DESTINY... you might n...
1,4162441,male,16,Student,Sagittarius,\n\n\t \n DEAR ANGEL.. you say it...
2,4162441,male,16,Student,Sagittarius,\n\n\t \n MAIN AUR MERI TANHAI (jagjeet s...
3,4162441,male,16,Student,Sagittarius,\n\n\t \n mail addressrs(s) urlLink http...
4,4162441,male,16,Student,Sagittarius,\n\n\t \n RAP- ALLRISE so stand back caus...


In [6]:
blogs_df['post_clean'] = blogs_df['post'].apply(lambda post: gensim.utils.simple_preprocess(str(post), deacc=True))
blogs_df['post_clean'].head(5)

0    [destiny, you, might, not, say, anything, but,...
1    [dear, angel, you, say, it, or, you, don, but,...
2    [main, aur, meri, tanhai, jagjeet, singh, awar...
3    [mail, addressrs, urllink, http, rediff, com, ...
4    [rap, allrise, so, stand, back, cause, don, no...
Name: post_clean, dtype: object

In [7]:
data_words = blogs_df.post_clean.to_list()
data_words[:10]

[['destiny',
  'you',
  'might',
  'not',
  'say',
  'anything',
  'but',
  'can',
  'hear',
  'you',
  'have',
  'chosen',
  'me',
  'your',
  'life',
  'partner',
  'so',
  'have',
  'dear',
  'so',
  'have',
  'dear',
  'my',
  'first',
  'dream',
  'my',
  'first',
  'extreme',
  'my',
  'first',
  'love',
  'was',
  'waiting',
  'for',
  'my',
  'destiny',
  'what',
  'should',
  'do',
  'with',
  'myself',
  'tell',
  'me',
  'my',
  'heart',
  'what',
  'should',
  'do',
  'with',
  'myself',
  'tell',
  'me',
  'should',
  'fly',
  'with',
  'this',
  'beautiful',
  'nature',
  'or',
  'should',
  'play',
  'with',
  'these',
  'winds',
  'should',
  'try',
  'to',
  'reach',
  'the',
  'skies',
  'or',
  'should',
  'pray',
  'to',
  'the',
  'mother',
  'earth',
  'what',
  'should',
  'do',
  'with',
  'myself',
  'friends',
  'tell',
  'me',
  'she',
  'talked',
  'in',
  'such',
  'way',
  'gave',
  'me',
  'dreams',
  'with',
  'thousand',
  'colours',
  'like',
  'stand'

In [8]:
len(data_words)

659355

In [9]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['destiny', 'you', 'might', 'not', 'say', 'anything', 'but', 'can', 'hear', 'you', 'have', 'chosen', 'me', 'your', 'life', 'partner', 'so', 'have', 'dear', 'so', 'have', 'dear', 'my', 'first', 'dream', 'my', 'first', 'extreme', 'my', 'first', 'love', 'was', 'waiting', 'for', 'my', 'destiny', 'what', 'should', 'do', 'with', 'myself', 'tell', 'me', 'my', 'heart', 'what', 'should', 'do', 'with', 'myself', 'tell', 'me', 'should', 'fly', 'with', 'this', 'beautiful', 'nature', 'or', 'should', 'play', 'with', 'these', 'winds', 'should', 'try', 'to', 'reach', 'the', 'skies', 'or', 'should', 'pray', 'to', 'the', 'mother', 'earth', 'what', 'should', 'do', 'with', 'myself', 'friends', 'tell', 'me', 'she', 'talked', 'in', 'such', 'way', 'gave', 'me', 'dreams', 'with', 'thousand', 'colours', 'like', 'stand', 'in', 'the', 'middle', 'of', 'island', 'and', 'she', 'shows', 'me', 'all', 'the', 'love', 'she', 'has', 'my', 'first', 'dream', 'my', 'first', 'extreme', 'my', 'first', 'love', 'was', 'waiting'

In [14]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

# Initialize spacy 'en_core_web_sm' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [16]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)


In [18]:
data_words_bigrams[0]

['destiny',
 'might',
 'say',
 'anything',
 'hear',
 'chosen',
 'life',
 'partner',
 'dear',
 'dear',
 'first',
 'dream',
 'first',
 'extreme',
 'first',
 'love',
 'waiting',
 'destiny',
 'tell',
 'heart',
 'tell',
 'fly',
 'beautiful',
 'nature',
 'play',
 'winds',
 'try',
 'reach',
 'skies',
 'pray',
 'mother',
 'earth',
 'friends',
 'tell',
 'talked',
 'way',
 'gave',
 'dreams',
 'thousand',
 'colours',
 'like',
 'stand',
 'middle',
 'island',
 'shows',
 'love',
 'first',
 'dream',
 'first',
 'extreme',
 'first',
 'love',
 'waiting',
 'destiny',
 'nil']

In [22]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['destiny', 'say', 'hear', 'choose', 'life', 'partner', 'dear', 'dear', 'first', 'dream', 'first', 'extreme', 'first', 'love', 'wait', 'destiny', 'tell', 'heart', 'tell', 'fly', 'beautiful', 'nature', 'play', 'wind', 'try', 'reach', 'sky', 'pray', 'mother', 'earth', 'friend', 'tell', 'talk', 'way', 'give', 'dream', 'colour', 'stand', 'middle', 'island', 'show', 'love', 'first', 'dream', 'first', 'extreme', 'first', 'love', 'wait', 'destiny', 'nil']]


In [23]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 3), (5, 3), (6, 1), (7, 2), (8, 6), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 3), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 3), (31, 1), (32, 2), (33, 1), (34, 1)]]


In [24]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('beautiful', 1),
  ('choose', 1),
  ('colour', 1),
  ('dear', 2),
  ('destiny', 3),
  ('dream', 3),
  ('earth', 1),
  ('extreme', 2),
  ('first', 6),
  ('fly', 1),
  ('friend', 1),
  ('give', 1),
  ('hear', 1),
  ('heart', 1),
  ('island', 1),
  ('life', 1),
  ('love', 3),
  ('middle', 1),
  ('mother', 1),
  ('nature', 1),
  ('nil', 1),
  ('partner', 1),
  ('play', 1),
  ('pray', 1),
  ('reach', 1),
  ('say', 1),
  ('show', 1),
  ('sky', 1),
  ('stand', 1),
  ('talk', 1),
  ('tell', 3),
  ('try', 1),
  ('wait', 2),
  ('way', 1),
  ('wind', 1)]]

In [26]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [27]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10000,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [28]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.073*"song" + 0.061*"music" + 0.040*"band" + 0.035*"play" + 0.024*"listen" '
  '+ 0.019*"rock" + 0.018*"sing" + 0.015*"dance" + 0.015*"album" + '
  '0.013*"hear"'),
 (1,
  '0.033*"get" + 0.022*"think" + 0.020*"people" + 0.019*"really" + 0.019*"say" '
  '+ 0.016*"go" + 0.014*"good" + 0.014*"guy" + 0.014*"look" + 0.013*"make"'),
 (2,
  '0.091*"game" + 0.066*"play" + 0.036*"team" + 0.025*"win" + 0.016*"player" + '
  '0.013*"season" + 0.012*"sport" + 0.012*"ball" + 0.012*"year" + '
  '0.011*"first"'),
 (3,
  '0.093*"book" + 0.085*"read" + 0.066*"write" + 0.030*"story" + 0.017*"word" '
  '+ 0.014*"page" + 0.013*"paper" + 0.012*"writer" + 0.012*"art" + '
  '0.011*"letter"'),
 (4,
  '0.102*"school" + 0.060*"class" + 0.027*"student" + 0.025*"test" + '
  '0.023*"study" + 0.023*"teacher" + 0.022*"year" + 0.016*"high" + '
  '0.014*"college" + 0.014*"grade"'),
 (5,
  '0.038*"eat" + 0.027*"wear" + 0.024*"food" + 0.023*"hair" + 0.015*"black" + '
  '0.014*"color" + 0.012*"shirt" + 0.011*"sho

In [29]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.731647551098098

Coherence Score:  0.4955674542526249


In [30]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

In [31]:
lda_model_30 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10000,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [34]:
pprint(lda_model_30.print_topics())
doc_lda = lda_model[corpus]

[(5,
  '0.142*"ring" + 0.141*"power" + 0.039*"model" + 0.028*"amy" + 0.021*"frog" + '
  '0.021*"thread" + 0.018*"exhibit" + 0.013*"manual" + 0.012*"friendster" + '
  '0.011*"auto"'),
 (15,
  '0.165*"run" + 0.059*"race" + 0.030*"mile" + 0.029*"training" + 0.020*"fast" '
  '+ 0.019*"track" + 0.016*"train" + 0.013*"running" + 0.013*"swim" + '
  '0.013*"speed"'),
 (25,
  '0.023*"human" + 0.021*"earth" + 0.020*"plant" + 0.020*"animal" + '
  '0.018*"monkey" + 0.017*"space" + 0.016*"planet" + 0.013*"oil" + '
  '0.011*"garden" + 0.010*"energy"'),
 (27,
  '0.036*"doctor" + 0.030*"weight" + 0.029*"body" + 0.025*"hospital" + '
  '0.022*"pain" + 0.021*"drug" + 0.017*"health" + 0.014*"fat" + '
  '0.013*"patient" + 0.013*"blood"'),
 (10,
  '0.044*"church" + 0.022*"faith" + 0.021*"pray" + 0.018*"religion" + '
  '0.013*"prayer" + 0.013*"believe" + 0.012*"sin" + 0.011*"give" + 0.010*"man" '
  '+ 0.010*"christian"'),
 (28,
  '0.041*"use" + 0.034*"computer" + 0.020*"system" + 0.016*"file" + '
  '0.014*"i

In [32]:
# Compute Perplexity
print('\nPerplexity: ', lda_model_30.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_30 = CoherenceModel(model=lda_model_30, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda_30 = coherence_model_lda_30.get_coherence()
print('\nCoherence Score: ', coherence_lda_30)


Perplexity:  -9.17272438782127

Coherence Score:  0.5240132013321268


In [33]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_30, corpus, id2word)
vis

In [35]:
lda_model_10 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10000,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [37]:
pprint(lda_model.print_topics())
doc_lda = lda_model_10[corpus]

[(0,
  '0.073*"song" + 0.061*"music" + 0.040*"band" + 0.035*"play" + 0.024*"listen" '
  '+ 0.019*"rock" + 0.018*"sing" + 0.015*"dance" + 0.015*"album" + '
  '0.013*"hear"'),
 (1,
  '0.033*"get" + 0.022*"think" + 0.020*"people" + 0.019*"really" + 0.019*"say" '
  '+ 0.016*"go" + 0.014*"good" + 0.014*"guy" + 0.014*"look" + 0.013*"make"'),
 (2,
  '0.091*"game" + 0.066*"play" + 0.036*"team" + 0.025*"win" + 0.016*"player" + '
  '0.013*"season" + 0.012*"sport" + 0.012*"ball" + 0.012*"year" + '
  '0.011*"first"'),
 (3,
  '0.093*"book" + 0.085*"read" + 0.066*"write" + 0.030*"story" + 0.017*"word" '
  '+ 0.014*"page" + 0.013*"paper" + 0.012*"writer" + 0.012*"art" + '
  '0.011*"letter"'),
 (4,
  '0.102*"school" + 0.060*"class" + 0.027*"student" + 0.025*"test" + '
  '0.023*"study" + 0.023*"teacher" + 0.022*"year" + 0.016*"high" + '
  '0.014*"college" + 0.014*"grade"'),
 (5,
  '0.038*"eat" + 0.027*"wear" + 0.024*"food" + 0.023*"hair" + 0.015*"black" + '
  '0.014*"color" + 0.012*"shirt" + 0.011*"sho

In [36]:
# Compute Perplexity
print('\nPerplexity: ', lda_model_10.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_10 = CoherenceModel(model=lda_model_10, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda_10 = coherence_model_lda_10.get_coherence()
print('\nCoherence Score: ', coherence_lda_10)


Perplexity:  -8.292739346233734

Coherence Score:  0.4202358234135072


In [38]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_10, corpus, id2word)
vis

In [54]:
def format_topics_sentences(ldamodel, post):
    # Get main topic in each post
    post_bow = ldamodel[id2word.doc2bow(post)]
    row = sorted(post_bow[0], key=lambda x: x[1], reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
    # for j, (topic_num, prop_topic) in enumerate(row):
    #     topics = []
    #     if j == 0:  # => dominant topic
    topic_num = row[0][0]
    prop_topic = row[0][1]
    wp = ldamodel.show_topic(topic_num)
    topic_keywords = ", ".join([word for word, prop in wp])
    return int(topic_num), round(prop_topic,4), topic_keywords
    


In [58]:
blogs_df['dominant_topic'], blogs_df['perc_contribution'], blogs_df['topic_keywords'] = zip(*blogs_df['post_clean'].apply(lambda post: format_topics_sentences(lda_model_30, post)))


In [59]:
blogs_df.to_csv('post_with_topics.csv')

In [60]:
blogs_df.head(10)

Unnamed: 0,author_id,sex,age,occupation,zodiac_sign,post,post_clean,dominant_topic,perc_contribution,topic_keywords
0,4162441,male,16,Student,Sagittarius,\n\n\t \n DESTINY... you might n...,"[destiny, you, might, not, say, anything, but,...",7,0.3328,"think, know, say, thing, make, get, want, real..."
1,4162441,male,16,Student,Sagittarius,\n\n\t \n DEAR ANGEL.. you say it...,"[dear, angel, you, say, it, or, you, don, but,...",7,0.3392,"think, know, say, thing, make, get, want, real..."
2,4162441,male,16,Student,Sagittarius,\n\n\t \n MAIN AUR MERI TANHAI (jagjeet s...,"[main, aur, meri, tanhai, jagjeet, singh, awar...",14,0.2518,"people, many, become, world, make, point, also..."
3,4162441,male,16,Student,Sagittarius,\n\n\t \n mail addressrs(s) urlLink http...,"[mail, addressrs, urllink, http, rediff, com, ...",16,0.4209,"post, blog, site, comment, new, link, com, pic..."
4,4162441,male,16,Student,Sagittarius,\n\n\t \n RAP- ALLRISE so stand back caus...,"[rap, allrise, so, stand, back, cause, don, no...",7,0.3128,"think, know, say, thing, make, get, want, real..."
5,4162441,male,16,Student,Sagittarius,\n\n\t \n MISSING YOU BADLY. i am lonel...,"[missing, you, badly, am, lonely, here, search...",7,0.3192,"think, know, say, thing, make, get, want, real..."
6,4162441,male,16,Student,Sagittarius,\n\n\t \n HAZEL EYES. close ...,"[hazel, eyes, close, your, eyes, and, imagine,...",7,0.307,"think, know, say, thing, make, get, want, real..."
7,4162441,male,16,Student,Sagittarius,\n\n\t \n LET IT BE ME. a bird hibe...,"[let, it, be, me, bird, hibernated, for, year,...",7,0.2711,"think, know, say, thing, make, get, want, real..."
8,3489929,female,25,Student,Cancer,"\n\n\t \n It's been a long time coming, b...","[it, been, long, time, coming, but, have, made...",7,0.3275,"think, know, say, thing, make, get, want, real..."
9,3489929,female,25,Student,Cancer,\n\n\t \n urlLink \n,[urllink],7,0.2762,"think, know, say, thing, make, get, want, real..."


In [62]:
blogs_zodiac_df = blogs_df.groupby('zodiac_sign')

In [68]:
blogs_zodiac_df_list = [blogs_zodiac_df.get_group(x) for x in blogs_zodiac_df.groups]

In [75]:
zodiac_topic = []
for zodiac_post in blogs_zodiac_df_list:
    posts = np.hstack(zodiac_post.post_clean)
    zodiac = zodiac_post['zodiac_sign'].values[0]
    dominant_topic, perc_contribution, topic_keywords = format_topics_sentences(lda_model_30, posts)
    zodiac_topic.append([zodiac, dominant_topic, perc_contribution, topic_keywords])
zodiac_topic_df = pd.DataFrame(zodiac_topic, columns=['zodiac_sign', 'dominant_topic', 'perc_contribution', 'topic_keywords'])

In [76]:
zodiac_topic_df.to_csv('zodiac_topics.csv')

In [77]:
zodiac_topic_df

Unnamed: 0,zodiac_sign,dominant_topic,perc_contribution,topic_keywords
0,Aquarius,7,0.3538,"think, know, say, thing, make, get, want, real..."
1,Aries,7,0.3581,"think, know, say, thing, make, get, want, real..."
2,Cancer,7,0.3561,"think, know, say, thing, make, get, want, real..."
3,Capricorn,7,0.3595,"think, know, say, thing, make, get, want, real..."
4,Gemini,7,0.3584,"think, know, say, thing, make, get, want, real..."
5,Leo,7,0.3596,"think, know, say, thing, make, get, want, real..."
6,Libra,7,0.3612,"think, know, say, thing, make, get, want, real..."
7,Pisces,7,0.3562,"think, know, say, thing, make, get, want, real..."
8,Sagittarius,7,0.3509,"think, know, say, thing, make, get, want, real..."
9,Scorpio,7,0.3592,"think, know, say, thing, make, get, want, real..."


In [6]:
blogs_zodiac_df= pd.read_csv('processed_data/post_with_topics.csv')
blogs_zodiac_df.head(10)

Unnamed: 0.1,Unnamed: 0,author_id,sex,age,occupation,zodiac_sign,post,post_clean,dominant_topic,perc_contribution,topic_keywords
0,0,4162441,male,16,Student,Sagittarius,\n\n\t \n DESTINY... you might n...,"['destiny', 'you', 'might', 'not', 'say', 'any...",7,0.3328,"think, know, say, thing, make, get, want, real..."
1,1,4162441,male,16,Student,Sagittarius,\n\n\t \n DEAR ANGEL.. you say it...,"['dear', 'angel', 'you', 'say', 'it', 'or', 'y...",7,0.3392,"think, know, say, thing, make, get, want, real..."
2,2,4162441,male,16,Student,Sagittarius,\n\n\t \n MAIN AUR MERI TANHAI (jagjeet s...,"['main', 'aur', 'meri', 'tanhai', 'jagjeet', '...",14,0.2518,"people, many, become, world, make, point, also..."
3,3,4162441,male,16,Student,Sagittarius,\n\n\t \n mail addressrs(s) urlLink http...,"['mail', 'addressrs', 'urllink', 'http', 'redi...",16,0.4209,"post, blog, site, comment, new, link, com, pic..."
4,4,4162441,male,16,Student,Sagittarius,\n\n\t \n RAP- ALLRISE so stand back caus...,"['rap', 'allrise', 'so', 'stand', 'back', 'cau...",7,0.3128,"think, know, say, thing, make, get, want, real..."
5,5,4162441,male,16,Student,Sagittarius,\n\n\t \n MISSING YOU BADLY. i am lonel...,"['missing', 'you', 'badly', 'am', 'lonely', 'h...",7,0.3192,"think, know, say, thing, make, get, want, real..."
6,6,4162441,male,16,Student,Sagittarius,\n\n\t \n HAZEL EYES. close ...,"['hazel', 'eyes', 'close', 'your', 'eyes', 'an...",7,0.307,"think, know, say, thing, make, get, want, real..."
7,7,4162441,male,16,Student,Sagittarius,\n\n\t \n LET IT BE ME. a bird hibe...,"['let', 'it', 'be', 'me', 'bird', 'hibernated'...",7,0.2711,"think, know, say, thing, make, get, want, real..."
8,8,3489929,female,25,Student,Cancer,"\n\n\t \n It's been a long time coming, b...","['it', 'been', 'long', 'time', 'coming', 'but'...",7,0.3275,"think, know, say, thing, make, get, want, real..."
9,9,3489929,female,25,Student,Cancer,\n\n\t \n urlLink \n,['urllink'],7,0.2762,"think, know, say, thing, make, get, want, real..."


In [9]:
blogs_zodiac_group_df = blogs_zodiac_df.groupby('zodiac_sign')
blogs_zodiac_df_list = [blogs_zodiac_group_df.get_group(x) for x in blogs_zodiac_group_df.groups]

In [19]:
for zodiac_post in blogs_zodiac_df_list:
    topic_counts = zodiac_post.groupby('dominant_topic').count()
    print(zodiac_post['zodiac_sign'].values[0], zodiac_post.shape[0], topic_counts.post.sum())
    top_5 = topic_counts.sort_values('post',ascending = False).head(5)
    print(top_5.post)
    # print(topic_counts)

Aquarius 49172 49172
dominant_topic
7     43571
12     2173
14      716
8       585
16      321
Name: post, dtype: int64
Aries 64223 64223
dominant_topic
7     57168
12     3022
14      839
16      556
11      332
Name: post, dtype: int64
Cancer 61396 61396
dominant_topic
7     54065
12     2785
14      845
11      463
8       424
Name: post, dtype: int64
Capricorn 47001 47001
dominant_topic
7     41809
12     2494
14      468
16      316
21      257
Name: post, dtype: int64
Gemini 49204 49204
dominant_topic
7     43920
12     2525
14      464
16      324
21      272
Name: post, dtype: int64
Leo 53270 53270
dominant_topic
7     47612
12     2688
14      525
16      340
8       273
Name: post, dtype: int64
Libra 58419 58419
dominant_topic
7     52081
12     3242
14      667
21      390
16      379
Name: post, dtype: int64
Pisces 52511 52511
dominant_topic
7     46776
12     2462
14      592
16      492
21      289
Name: post, dtype: int64
Sagittarius 48859 48859
dominant_topic
7     432

In [14]:
blogs_zodiac_df[blogs_zodiac_df['zodiac_sign'] == 'Aquarius'].count()

Unnamed: 0           49172
author_id            49172
sex                  49172
age                  49172
occupation           49172
zodiac_sign          49172
post                 49172
post_clean           49172
dominant_topic       49172
perc_contribution    49172
topic_keywords       49172
dtype: int64

In [22]:
blogs_topic_keywords_df = blogs_zodiac_df.groupby(['dominant_topic','topic_keywords']).size().reset_index().rename(columns={0:'count'})

In [23]:
blogs_topic_keywords_df.to_csv('topics_keywords_df.csv')

In [24]:
blogs_zodiac_df.shape

(659355, 11)

In [25]:
blogs_zodiac_df.head(5)

Unnamed: 0.1,Unnamed: 0,author_id,sex,age,occupation,zodiac_sign,post,post_clean,dominant_topic,perc_contribution,topic_keywords
0,0,4162441,male,16,Student,Sagittarius,\n\n\t \n DESTINY... you might n...,"['destiny', 'you', 'might', 'not', 'say', 'any...",7,0.3328,"think, know, say, thing, make, get, want, real..."
1,1,4162441,male,16,Student,Sagittarius,\n\n\t \n DEAR ANGEL.. you say it...,"['dear', 'angel', 'you', 'say', 'it', 'or', 'y...",7,0.3392,"think, know, say, thing, make, get, want, real..."
2,2,4162441,male,16,Student,Sagittarius,\n\n\t \n MAIN AUR MERI TANHAI (jagjeet s...,"['main', 'aur', 'meri', 'tanhai', 'jagjeet', '...",14,0.2518,"people, many, become, world, make, point, also..."
3,3,4162441,male,16,Student,Sagittarius,\n\n\t \n mail addressrs(s) urlLink http...,"['mail', 'addressrs', 'urllink', 'http', 'redi...",16,0.4209,"post, blog, site, comment, new, link, com, pic..."
4,4,4162441,male,16,Student,Sagittarius,\n\n\t \n RAP- ALLRISE so stand back caus...,"['rap', 'allrise', 'so', 'stand', 'back', 'cau...",7,0.3128,"think, know, say, thing, make, get, want, real..."
