In [1]:
import pandas as pd
import numpy as np
import scipy as sp

import sklearn

import re
import pickle

import sys
import warnings

import nltk
import gensim.corpora

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import ldamodel,LsiModel
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

warnings.filterwarnings('ignore')
#nltk.download('stopwords')
#nltk.download('wordnet')

## Exploration

In [2]:
filename = 'data/raw_da_qs.csv'

In [3]:
data_total = pd.read_csv(filename)

list(data_total.columns)

['year', 'month', 'day', 'url', 'title', 'letterId', 'question_only']

In [4]:
def group_textOfSameTitle(df=data_total):
    res = df.groupby(["title"])['year', 'month', 'day', 'url','letterId', 'question_only']\
            .agg(lambda x: ','.join(x.astype(str)))\
            .sort_values('year')\
            .reset_index()
    return res

In [5]:
data_total = group_textOfSameTitle()

data_titles = data_total[['title']]

display(data_titles.head(8))

Unnamed: 0,title
0,A POINTED REMINDER ABOUT HOW CROWDED IT'S GETTING
1,AMERICAN SMOKEOUT DAY CHANCE TO SNUFF OUT HABIT
2,HOW DID 'PEGGY' BECOME A NICKNAME FOR 'MARGARET'?
3,HUSBAND CAN'T STOMACH WIFE'S VOW TO DONATE ORGANS
4,LATE ANSWER FOR EARLY GUEST
5,"BLOOD BANKS SAVE LIVES, NOT BLOOD"
6,TALK OF LONG WALKS WEARS THIN WITH AGE
7,HOLY COW! SOME READERS TAKE A BULL BY THE HORNS


In [6]:
data_titles = data_titles.astype('str')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '#', '...', 
                   '--', "'s", 'also', '&', '-', '–', '=', 'known', 'mi', 'km', '$'])

for idx in range(len(data_titles)):
    data_titles.iloc[idx]['title'] = [lemmatizer.lemmatize(word) for word in data_titles.iloc[idx]['title'].split(' ') if word not in stopwords.words()] #remove stopwords

pickle.dump(data_titles,
            open('data_questions.data', 
                 'wb')
            )

#tokenization
titles = [val[0] for val in data_titles.iloc[0:].values]

In [7]:
[val[0] for val in data_titles.iloc[0:].values]

[['A', 'POINTED', 'REMINDER', 'ABOUT', 'HOW', 'CROWDED', "IT'S", 'GETTING'],
 ['AMERICAN', 'SMOKEOUT', 'DAY', 'CHANCE', 'TO', 'SNUFF', 'OUT', 'HABIT'],
 ['HOW', 'DID', "'PEGGY'", 'BECOME', 'A', 'NICKNAME', 'FOR', "'MARGARET'?"],
 ['HUSBAND', "CAN'T", 'STOMACH', "WIFE'S", 'VOW', 'TO', 'DONATE', 'ORGANS'],
 ['LATE', 'ANSWER', 'FOR', 'EARLY', 'GUEST'],
 ['BLOOD', 'BANKS', 'SAVE', 'LIVES,', 'NOT', 'BLOOD'],
 ['TALK', 'OF', 'LONG', 'WALKS', 'WEARS', 'THIN', 'WITH', 'AGE'],
 ['HOLY',
  'COW!',
  'SOME',
  'READERS',
  'TAKE',
  'A',
  'BULL',
  'BY',
  'THE',
  'HORNS'],
 ['UNTANGLING', 'THE', 'GNARLED', 'ROOTS', 'OF', 'THE', 'WORD', "'OK'"],
 ['SNOW', 'JOBS', 'DESERVE', 'A', 'CHILLY', 'RECEPTION'],
 ['Dear', 'Abby:', 'Mock', 'Raid', 'Camp', 'Misses', 'Point'],
 ['READERS', 'ADD', 'TIDBITS', 'ABOUT', "'LET", 'THEM', 'EAT', "CAKE'"],
 ['TEEN-ABUSING', 'FATHER', 'NEEDS', 'HELP'],
 ['TEENAGER', 'FEELS', 'GUILTY', 'ABOUT', 'LYING', 'TO', 'THE', 'HOSPITAL'],
 ['YOUR', 'LIVING', 'WILL', 'MAY', 'EX

In [8]:
num_topics = 8

In [9]:
id2word = gensim.corpora.Dictionary(titles)
corpus = [id2word.doc2bow(text) for text in titles]

lda = ldamodel.LdaModel(corpus=corpus, 
                        id2word=id2word, 
                        num_topics=num_topics)
lsi = LsiModel(corpus=corpus, 
               num_topics=num_topics, 
               id2word=id2word)

In [10]:
def get_topics(model = lda, num_topics =num_topics):
    word_dict = {}
    for i in range(num_topics):
        words = model.show_topic(i, topn=12392)
        word_dict['Topic #'+'{:02d}'.format(i+1)] = [i[0] for i in words]
    return pd.DataFrame(word_dict,data_total[['title']])
    

In [11]:
topics = pd.DataFrame(get_topics())

topics_1 = pd.DataFrame(get_topics(model=lsi))

In [12]:
display(topics.head(4))

Unnamed: 0,Topic #01,Topic #02,Topic #03,Topic #04,Topic #05,Topic #06,Topic #07,Topic #08
"(A POINTED REMINDER ABOUT HOW CROWDED IT'S GETTING,)",More,May,Is,Is,Wedding,,After,Friend
"(AMERICAN SMOKEOUT DAY CHANCE TO SNUFF OUT HABIT,)",Dad's,Dad,Mom,Family,Wants,For,Home,Husband's
"(HOW DID 'PEGGY' BECOME A NICKNAME FOR 'MARGARET'?,)",Grandma,Sister,With,Husband,On,Is,Will,Of
"(HUSBAND CAN'T STOMACH WIFE'S VOW TO DONATE ORGANS,)",Return,To,Her,Wife,Leaves,Help,Gets,In


In [13]:
display(topics_1.head(4))

Unnamed: 0,Topic #01,Topic #02,Topic #03,Topic #04,Topic #05,Topic #06,Topic #07,Topic #08
"(A POINTED REMINDER ABOUT HOW CROWDED IT'S GETTING,)",Is,Is,A,Her,With,Woman,,
"(AMERICAN SMOKEOUT DAY CHANCE TO SNUFF OUT HABIT,)",Her,Her,TO,Wife,Wife,Mom,With,With
"(HOW DID 'PEGGY' BECOME A NICKNAME FOR 'MARGARET'?,)",With,With,OF,With,Woman,With,Mom,Be
"(HUSBAND CAN'T STOMACH WIFE'S VOW TO DONATE ORGANS,)",Wife,Woman,FOR,His,Her,Man,Be,Wife


In [14]:
lsi_topics = lsi.show_topics(num_topics=num_topics, formatted=False)

display(lsi_topics)

[(0,
  [('Is', -0.8093704576773872),
   ('Her', -0.22419011523203447),
   ('With', -0.19319732554477664),
   ('Wife', -0.16161221104176482),
   ('Woman', -0.14825929963921014),
   ('Mom', -0.12333710897430152),
   ('Man', -0.12230557461238145),
   ('His', -0.1108095934141122),
   ('Husband', -0.09151112632991577),
   ('To', -0.09088772376266073)]),
 (1,
  [('Is', 0.5395498587339791),
   ('Her', -0.5062516981204075),
   ('With', -0.33882691975262713),
   ('Woman', -0.25303327508109247),
   ('Wife', -0.21640902807264303),
   ('Mom', -0.16889128794996375),
   ('Man', -0.154403164772361),
   ('His', -0.11878784947789511),
   ('Husband', -0.11171353497580917),
   ('Who', -0.10895548603065583)]),
 (2,
  [('A', -0.9014315463924958),
   ('TO', -0.21840769554297954),
   ('OF', -0.13752210425933517),
   ('FOR', -0.12521042996781206),
   ('THE', -0.12005207411618979),
   ('Her', 0.11579410322968775),
   ('To', -0.104749475828675),
   ('BE', -0.08218643923101387),
   ('Is', 0.08079628338910155),
 