In [25]:
import importlib
import logging
import pandas as pd
import pickle
import re
import warnings

from glob import glob
import load
from tqdm import tqdm_notebook
import transform

# Re-import the project-local `load` and `transform` modules so that edits made
# to them are picked up without restarting the kernel.
importlib.reload(load)
importlib.reload(transform)

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# Only emit log records at ERROR level or above, and hide DeprecationWarnings.
logging.basicConfig(level=logging.ERROR)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# -1 is used here to stop pandas from truncating long cell values on display.
# NOTE(review): newer pandas releases deprecate -1 in favour of None — confirm
# against the installed pandas version before changing.
pd.set_option('display.max_colwidth', -1)
# Calls .pandas() on a freshly created tqdm_notebook bar; presumably this
# registers tqdm's progress hooks with pandas — TODO confirm. Creating the bar
# is likely what produces the progress widget shown below this cell.
tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [26]:
def get_model(num_topics, tag):
    """Load the pickled model stored for a run tag and topic count.

    The file is read from ``output/<tag>/<num_topics>/model.pkl``, resolved
    against the current working directory.

    Args:
        num_topics: Topic count; used as the second directory component.
        tag: Run identifier; used as the first directory component.

    Returns:
        Whatever object was pickled into the model file.
    """
    model_path = "output/{}/{}/model.pkl".format(tag, num_topics)
    # NOTE: unpickling executes arbitrary code; only load files you produced.
    with open(model_path, "rb") as handle:
        return pickle.load(handle)

def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table holding the dominant topic of every document.

    Args:
        ldamodel: Topic model supporting ``ldamodel[corpus]`` and
            ``ldamodel.show_topic(topic_num)``. Each item yielded by
            ``ldamodel[corpus]`` is treated as a sequence whose first
            non-empty element is the list of ``(topic_num, proportion)``
            pairs (presumably a gensim LdaModel trained with
            ``per_word_topics=True`` -- confirm against the training code).
        corpus: Sized collection of bag-of-words documents.
        texts: The tokenised documents, in the same order as ``corpus``.

    Returns:
        DataFrame with one row per document and the columns
        ``Dominant_Topic``, ``Percent_Contrib``, ``Topic_Keywords`` plus an
        unnamed column (``0``) holding the matching entry of ``texts``.
        Documents whose topics could not be determined keep their row, with
        missing values in the three topic columns, so that every topic row
        stays aligned with its own text.
    """
    columns = ['Dominant_Topic', 'Percent_Contrib', 'Topic_Keywords']
    missing_row = (float('nan'), float('nan'), None)
    keywords_by_topic = {}  # show_topic() is the same for every document
    records = []

    pbar = tqdm_notebook(total=len(corpus))
    try:
        for doc_idx, row in enumerate(ldamodel[corpus]):
            pbar.update(1)
            try:
                # First non-empty element of the row is the topic distribution.
                topic_dist = [part for part in row if len(part) > 0][0]
                # Dominant topic = pair with the largest proportion
                # (first one wins on ties, like a stable descending sort).
                topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
                if topic_num not in keywords_by_topic:
                    keywords_by_topic[topic_num] = ", ".join(
                        word for word, _ in ldamodel.show_topic(topic_num))
                records.append(
                    (int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))
            except Exception as err:
                # Best effort per document: report it, but keep a placeholder
                # row so later documents are not shifted onto the wrong text.
                print("document {}: {}".format(doc_idx, err))
                records.append(missing_row)
    except Exception as err:
        # Inference itself failed part-way; keep what was collected so far.
        # The remaining texts get missing topic values from the concat below.
        print(err)
    finally:
        pbar.close()

    # Build the frame once instead of appending row by row (quadratic, and
    # DataFrame.append no longer exists in pandas 2.x). This also works when
    # no rows were collected.
    sent_topics_df = pd.DataFrame(records, columns=columns)
    # Keep the topic id as float so the dtype does not depend on whether any
    # placeholder (NaN) rows are present.
    sent_topics_df['Dominant_Topic'] = sent_topics_df['Dominant_Topic'].astype(float)

    # Add original text to the end
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

def group_top_docs(df_topics_docs, top_n=5):
    """Return the ``top_n`` highest-contribution documents of every topic.

    Args:
        df_topics_docs: Four-column frame containing at least the columns
            ``Dominant_Topic`` and ``Percent_Contrib``.
        top_n: Maximum number of rows kept per topic.

    Returns:
        A new frame, grouped by topic and sorted by descending contribution
        inside each topic, with a fresh 0..n-1 index and the columns renamed
        to ``Topic_Num``, ``Topic_Perc_Contrib``, ``Keywords`` and ``Text``.
    """
    best_per_topic = [
        topic_rows.sort_values('Percent_Contrib', ascending=False).head(top_n)
        for _, topic_rows in df_topics_docs.groupby('Dominant_Topic')
    ]

    # Stack the per-topic slices and discard the original row labels.
    ranked = pd.concat(best_per_topic, axis=0).reset_index(drop=True)

    # Positional rename: assumes exactly four columns in the input order.
    ranked.columns = ['Topic_Num', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    return ranked


In [32]:
# TODO: Save the intermediate data so we don't have to reprocess the original data
observations = load.load_data_from_psql(["AskWomen"], table_name="preprocessed_posts")
logging.debug(observations.head())
observations = transform.transform(observations, ngrams=3, threshold=50)
texts = list(observations['process_body'].values)

# The bag-of-words corpus is built with the trained model's own dictionary
# (`model.id2word`), so the model has to be loaded before this point. Loading
# it here keeps the cell runnable on a fresh kernel instead of relying on a
# `model` variable left over from a cell executed out of order.
model = get_model(9, "AskWomen_ldafinal_2018_08_22_2352")
corpus = [model.id2word.doc2bow(text) for text in texts]
print(texts[:3])

HBox(children=(IntProgress(value=0, max=104071), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  observations['process_body'] = _make_ngrams_(data_words_nostops, ngrams, threshold)


[[], ['til', 'dentists', 'work', 'government'], ['taylor_swift']]


In [33]:
# Run tag naming the output directory of the trained model.
tag1 = "AskWomen_ldafinal_2018_08_22_2352"

# 9-topic model saved under that tag.
model = get_model(num_topics=9, tag=tag1)

# Dominant topic, its share and its keywords for every document.
df_topics_docs = format_topics_sentences(model, corpus, texts)
print(df_topics_docs.head())

# Keep the 5 highest-contribution documents of each topic.
sent_topics_sorteddf = group_top_docs(df_topics_docs, 5)
sent_topics_sorteddf.head()

HBox(children=(IntProgress(value=0, max=104071), HTML(value='')))

   Dominant_Topic  Percent_Contrib  \
0  0.0             0.3101            
1  2.0             0.2805            
2  0.0             0.3101            
3  2.0             0.2990            
4  0.0             0.3749            

                                                     Topic_Keywords  \
0  people, im, make, want, feel, things, something, see, good, much   
1  work, im, dog, take, kid, home, live, use, job, around             
2  people, im, make, want, feel, things, something, see, good, much   
3  work, im, dog, take, kid, home, live, use, job, around             
4  people, im, make, want, feel, things, something, see, good, much   

                                   0  
0  []                                 
1  [til, dentists, work, government]  
2  [taylor_swift]                     
3  [lose, shit, gumps, lol]           
4  [try, reply, someone]              


Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.8256,"people, im, make, want, feel, things, something, see, good, much","[mine, weird, doesnt, make, huge_difference, whether, im, sex, sexual, relationship, need, good, good, mean, partner, attentive, need, partner, doesnt, want, anything, sexual, im, sex, partner, isnt, overly, difficult, partner, isnt, seek, sexual, gratification, elsewhere, guess, mean, sex, isnt, important, absence, bad, sex, extremely, important, think, id, prefer, non, sexual, monogamous, romantic, relationship, relationship, involve, bad, sex]"
1,0.0,0.7748,"people, im, make, want, feel, things, something, see, good, much","[actually, become, good, nice, kind, person, someone, things, women, expectation, lead, sex, romance, whatever, get, bitter, angry, doesnt, honestly, ask, people, accept, normal, fine, say, everyone, world, automatically, return, feel, others, even, theyre, terrible, people]"
2,0.0,0.7614,"people, im, make, want, feel, things, something, see, good, much","[mind, worry, frequency, ones, seem, underlie, assumption, women, totally, different, men, like, completely, different, species, women, inner, self, come, mind, baffle, many, men, could, think, way, make, wonder, ever, interact, women, date, preferences, ones, less, worry, still, kind, sad, though, op, seem, genuinely, ignorant, thats, less, terrible, ops, belligerent, misogynistic]"
3,0.0,0.7567,"people, im, make, want, feel, things, something, see, good, much","[ive, get, better, grasp, view, various, subject, relate, sex, years, boundaries, sexual, deal_breakers, deal, makers, dont, know, anything, really, fundamentally, change, experience, sex, im, feminist, would, definitely, say, though, make, much, comfortable, define, sex, term, make, comfortable, idea, sex, find, pleasurable, rather, im, expect, find, pleasurable]"
4,0.0,0.7561,"people, im, make, want, feel, things, something, see, good, much","[gt, always, make, feel, better, gt, dont, know, guy, actually, feel, confident, mean, nice, thing, come, boyfriend, know, mean, make, happy, doesnt, change, way, view, wouldnt, say, make, feel, confident, come, anyone_else, hear, kind, make, feel, bite, uncomfortable, wary, general, feel, far, better, things, compliment, actually, make, feel, better, things, pertain, personality, accomplishments, skills, things, actually]"


In [35]:
# Show the single most representative document of every topic, i.e. the row
# with the highest contribution. A plain groupby(...).max() would take the
# maximum of each column independently, pairing the best score with the
# lexicographically largest Text rather than with the text it belongs to.
sent_topics_sorteddf.loc[
    sent_topics_sorteddf.groupby('Topic_Num')['Topic_Perc_Contrib'].idxmax()
].set_index('Topic_Num')

Unnamed: 0_level_0,Topic_Perc_Contrib,Keywords,Text
Topic_Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.8256,"people, im, make, want, feel, things, something, see, good, much","[mine, weird, doesnt, make, huge_difference, whether, im, sex, sexual, relationship, need, good, good, mean, partner, attentive, need, partner, doesnt, want, anything, sexual, im, sex, partner, isnt, overly, difficult, partner, isnt, seek, sexual, gratification, elsewhere, guess, mean, sex, isnt, important, absence, bad, sex, extremely, important, think, id, prefer, non, sexual, monogamous, romantic, relationship, relationship, involve, bad, sex]"
1.0,0.6051,"wear, buy, eat, food, dress, clothe, plus, short, size, fit","[right, watermelon, corn_cob, lime, lemons, fuji, apples, cilantro, green, onions, basil, jalepeno, plums, spinach, oregano, chicken_thighs, pork, shoulder, salsa, tortilla_chip, milk, greek_yogurt, cotija, cheese, beer, hot, pocket, couple, freeze, dinners]"
2.0,0.6391,"work, im, dog, take, kid, home, live, use, job, around","[room, eat, constantly, leave, doors, unlock, one, night, come, home, work, slide, glass, door, wide, open, laptop, right, front_door, im, mad, single, female, come, home, empty, house, midnight, bad, neighborhood, door, wide, open, hide, laptop, must, school, let, think, steal, keep, hide, awhile, let, stress, give, back, convince, parent, buy, another]"
3.0,0.5329,"askwomen, reddit, question, action, act, upon, reason, click, understand, give","[would, like, resubmit, edit, title, title_must, open, end, descriptive, searchable, question, https_www_reddit_com, askwomen_wiki, rule, include, personal, request, advice, without, wall, text, description, problem, literally, include, advice, please, may, otherwise, please, take, appropriate, sub, question, please, take, modmail, http_www_reddit_com, message_compose_askwomen, amp_subject_remove, amp_message_submission_https, www_reddit_com, askwomen, comment, fkvxz, remove_understand_reason_give, mod_act, upon]"
4.0,0.6518,"time, one, love, day, date, years, first, back, start, every","[start, date, meet, toronto, spend, weeks, together, months, relationship, travel, across, ocean, meet, foreign_country, week, say, love, months, together, ever, weeks, next]"
5.0,0.4937,"show, watch, read, face, book, eye, write, test, character, movie","[tamagotchis, pokemon, card, pogs, light, yoyos, beyblades, funfaxes, chatterboxes, alien, egg, baby, https_www_amazon, co_uk, ozbozz, sv, grossman, alien, egg, dp, ik, scooby, doos, http_www, hellokids, com, c_, kid, craft, activities, scooby, doo, bracelets]"
6.0,0.438,"guy, friends, call, play, school, shes, high, boyfriend, super, girl","[try, find, suggestions, havent, already, mention, two, edna, amp, harvey, http_store_steampowered, com, app, game, extremely, fun, point, amp, click, adventure, game, sci_fi, fps, horror, game, prey, http_store_steampowered, com, app, prey, similar, bioshock, system, shock, series, ftl, faster, light, http_store_steampowered, com, app, endlessly, enjoyable, space, strategy, game, crosscode, http_store_steampowered, com, app, crosscode, action, rpg, vr, world, transistor, http_store_steampowered, com, app, transistor, great, action, strategy, rpg, flame, flood, http_store_steampowered, com, app, survival, adventure, game, raft, flood, river, costume, quest, http_store_steampowered, com, app, costume_quest, amp, super, cheap, atm, great, funny, rpgs, undertale, http_store_steampowered, com, app, undertale, another, quirky, rpg, helens, mysterious, castle, http_store_steampowered, ...]"
7.0,0.6232,"hair, unless, beautiful, white, language, base, cancer, red, black, dark","[suzanne, wright, kresley, cole, nalini, singh, patricia, briggs, ilona, andrews, mariana, zapata, julia, quinn, lynsay, sand, debra, anastasia, maya, bank, laurann, dohner, jeaniene, frost, julie, garwood, molly, harper, thea, harrison, elizabeth, hoyt, joyce, penny, reid, mia, sheridan, gena, showalter, problem]"
8.0,0.5599,"periods, body, use, period, menopause, pain, compliment, skin, choice, touch","[stuff, animals, doodle, bear, https_www, google, com, search, doodle, bear, amp_tbm_isch_amp, imgil, ydw, uznmhidiym, bb, yrjwppmmim, bhttps, fwww, youtube_com, fwatch, fv, dphc_, amp, source, iu, amp, pf, amp, fir, ydw, uznmhidiym, cb, yrjwppmmim, c_, amp, usg, ygaafikmck, memyxdfzc, rs, amp_biw_amp_bih, amp_ved, ahukewjy, mkpjshuahvmct, kht, udxaqyjciqq, amp, ei, sszdwzjuoczi, qg, lwabw, imgrc, ydw, uznmhidiym]"


In [37]:
# Store the string form of each token list in its own column.
observations['process_body_str'] = observations['process_body'].map(str)
observations.head()

Unnamed: 0,author,body,subreddit,process_body,process_body_str
4,ffreudiannipss,No/no,AskWomen,[],[]
18,cafeteriastyle,TIL dentists work for the government.,AskWomen,"[til, dentists, work, government]","['til', 'dentists', 'work', 'government']"
19,Itsthelegendarydays_,Taylor Swift\n,AskWomen,[taylor_swift],['taylor_swift']
21,whitefox-blackfox,"I lost my shit at Gumps, lol.",AskWomen,"[lose, shit, gumps, lol]","['lose', 'shit', 'gumps', 'lol']"
22,RockysTurtle,were you trying to reply to someone?,AskWomen,"[try, reply, someone]","['try', 'reply', 'someone']"


In [46]:
# Original post text next to the stringified token list it was reduced to.
df = observations[['body', 'process_body_str']]

# One regular expression per topic (list position = topic number), each made
# of consecutive tokens taken from that topic's representative document.
top_keywords = [
    ".*monogamous.*romantic.*relationship.*relationship.*involve.*",
    ".*watermelon.*corn_cob.*lime.*lemons.*",
    ".*home.*empty.*house.*midnight.*bad.*neighborhood.*",
    ".*problem.*literally.*include.*advice",
    ".*toronto.*spend.*weeks.*together.*",
    ".*grossman.*alien.*egg.*dp.*",
    ".*app.*transistor.*great.*action.*",
    ".*suzanne.*wright.*kresley.*cole.*",
    ".*stuff.*animals.*doodle.*bear.*"
]

# Print the original body of every post whose token string matches the
# pattern of the given topic.
for topic_num, keywords in enumerate(top_keywords):
    print(topic_num, df[df['process_body_str'].str.match(keywords)].body)

0 8226    Mine is weird.  It doesn't make a huge difference to me whether or not I'm having sex, but if I am in a sexual relationship, it needs to be good.\n\nBy "good" I mean that my partner is attentive to my needs, my partner doesn't want to do anything sexual that I'm not into, having sex with my partner isn't overly difficult, and my partner isn't seeking sexual gratification elsewhere.\n\nSo I guess what this means is sex isn't all that important to me, but the absence of bad sex is extremely important. Now that I think about it, I'd prefer a non-sexual monogamous romantic relationship to a relationship that involved bad sex.
Name: body, dtype: object
1 102264    Right now:\n\n* Watermelon\n* Corn on the cob\n* limes\n* lemons\n* Fuji apples\n* cilantro\n* green onions\n* basil\n* jalepeno\n* plums\n* spinach\n* oregano\n* Chicken thighs\n* pork shoulder\n* salsa\n* tortilla chips\n* milk\n* greek yogurt\n* Cotija cheese\n* beer\n* hot pockets\n* a couple frozen dinners
Name: bod