In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import nltk, string

from collections import defaultdict
from pymongo import MongoClient
from textblob import TextBlob 
from nltk.corpus import stopwords
from sklearn.externals import joblib 
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

pd.options.display.max_rows = 20
pd.options.display.max_columns = 20

%matplotlib inline

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

def get_top_words(model, feature_names, n_top_words):
    message = {}
    for topic_idx, topic in enumerate(model.components_):
        message[topic_idx] = (" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    return message

def get_sub_category_names(topic, df, column, tfmodel, components): 
    """run the subtopic modeling, get the top words for each subtopic, 
    assign it to a dictionary with the key being the get_topics value"""
    dictionary = defaultdict(int) 
    df_new = df[df['topic']==topic]
    cat0 = tfmodel.transform(df_new[column])
    nmfnew = NMF(n_components = components).fit(cat0) #this runs the subtopic model
    subtopics = get_top_words(nmfnew, tfmodel.get_feature_names(), 3)
    for index, x in enumerate(subtopics):
        dictionary [index+ topic*components] = x
    return dictionary

def get_sub_category(topic, df, column, tfmodel, components):
    df_new = df[df['topic']==topic]
    cat0 = tfmodel.transform(df_new[column])
    nmfnew = NMF(n_components = components).fit(cat0)
    print_top_words(nmfnew, tfmodel.get_feature_names(), 10)
    df_new['subcategory'] = get_topics(nmfnew, cat0, topic, components)
    return df_new
    
def get_topics(NMFmodel, TFIDF, topic, components):
    nmf = NMFmodel.transform(TFIDF)
    topics = []
    for x in nmf:
        topics.append(np.argmax(x)+(topic*components))
    return topics

In [3]:
df = pd.read_pickle('df_posts') #Primary DataFrame, 25 topics
nmffit = joblib.load('nmffit') #Fitted TF-IDF vectors onto NMF model
tfmodel = joblib.load('tfmodel') #TF-IDF model 
tfvectors = joblib.load('tfvectors') #TF-IDF matrix

In [145]:
tfvectors

<36388x270552 sparse matrix of type '<class 'numpy.float64'>'
	with 2130600 stored elements in Compressed Sparse Row format>

In [10]:
cat0 = tfmodel.transform(df[df['topic']==0]['question'])


In [21]:
nmfnew = NMF(n_components = 5).fit(cat0)




In [133]:
print_top_words(nmffit, tfmodel.get_feature_names(), 10)


Topic #0: time first salt water best little used ingredients butter suggestions
Topic #1: chicken breast breasts chicken breast fried chicken chicken breasts fried stock thighs chicken stock
Topic #2: iron cast cast iron skillet iron skillet iron pan seasoning skillets lodge seasoned
Topic #3: rice rice cooker fried rice cooker brown rice fried white rice water white brown
Topic #4: pork pulled pork pulled belly pork belly shoulder pork shoulder chops pork chops fat
Topic #5: oil olive olive oil garlic frying canola oils deep vegetable oil salt
Topic #6: cream ice ice cream heavy heavy cream whipped whipped cream chocolate cream maker maker
Topic #7: sauce tomato tomato sauce soy soy sauce hot sauce hot tomatoes garlic sauces
Topic #8: pizza dough pizza dough bread flour stone yeast crust pizza stone homemade pizza
Topic #9: beef ground ground beef stew beef stew corned roast corned beef cut wellington
Topic #10: cheese mac mac cheese cheese sauce cheddar cream cheese macaroni cheeses 

In [57]:
df_test = get_sub_category(0, df, 'question', tfmodel, 5)
df_test = df_test.append(get_sub_category(1, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(2, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(3, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(4, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(5, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(6, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(7, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(8, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(9, df, 'question', tfmodel, 5))


Topic #0: garlic salt fresh pepper taste spices onion onions flavor wine
Topic #1: sugar butter baking flour bread cup soda pie water powder
Topic #2: chocolate cake cookies chocolate cake chocolate chip chip cocoa birthday mousse white chocolate
Topic #3: kitchen time new meals love home meal suggestions dishes dinner
Topic #4: favorite cookbook favorite cookbook pumpkin favorite thing love shows pie new share favorite



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Topic #0: chicken breast chicken breast roast chicken cooked roast sauce marinade time whole
Topic #1: fried chicken fried chicken buttermilk breading brine fry flour frying chicken buttermilk
Topic #2: stock chicken stock chicken bones whole chicken fat whole broth carcass stock chicken
Topic #3: breasts chicken breasts chicken boneless skinless marinate frozen chicken marinate chicken frozen boneless skinless
Topic #4: thighs chicken thighs wings chicken chicken wings skin boneless bone buffalo best

Topic #0: iron cast cast iron pans skillets iron skillets iron pans iron cookware cookware season
Topic #1: skillet iron skillet cast cast iron iron steak seasoned steaks oven 12
Topic #2: pan iron pan iron cast cast iron steak oil seasoning grill seasoned
Topic #3: steel wok carbon steel carbon iron stainless cast cast iron stainless steel pans
Topic #4: dutch dutch oven enameled oven cast iron cast iron dutch iron enameled cast pot

Topic #0: rice water cooked sticky grain white rice b

In [58]:
df_test = df_test.append(get_sub_category(10, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(11, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(12, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(13, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(14, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(15, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(16, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(17, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(18, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(19, df, 'question', tfmodel, 5))

Topic #0: cheese cheese sauce cheddar sauce blue blue cheese cheeses mozzarella grilled cheese grilled
Topic #1: mac mac cheese cheese baked mac homemade mac baked milk sauce homemade cheddar
Topic #2: cream cheese cream cheese frosting cheese frosting jalapeno stuffed dip jalapenos bacon
Topic #3: goat goat cheese cheese stuffed grits ash honey peach cheese stuffed goat milk
Topic #4: macaroni macaroni cheese cheese citrate sodium citrate sodium modernist redditors baked macaroni cheeses



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Topic #0: knife chef knife knife skills blade skills chefs knife kitchen knife set santoku paring knife
Topic #1: knives set kitchen knives kitchen set knives steak knives ceramic ceramic knives new knives sharpened
Topic #2: sharpening sharpen stone sharpener knife sharpening sharp whetstone knife stones sharpening stone
Topic #3: chef chef knife santoku chef table victorinox inch wusthof table netflix shun
Topic #4: honing steel honing steel rod honing rod whetstone amazon shun ceramic global

Topic #0: meat cut chicken cooked beef raw fat time steak cuts
Topic #1: meat grinder grinder meat ground burgers sausage ground meat grinding processor grind
Topic #2: thermometer meat thermometer meat grill accurate probe thermometer hi digital meat digital roasts
Topic #3: crab crab meat crab legs legs meat salad crab cakes crab salad lobster crabs
Topic #4: vegetarian vegan meat vegetarian vegan burrito filipino meals eat without meat favorite vegetarian

Topic #0: eggs poached poached eggs

In [59]:
df_test = df_test.append(get_sub_category(20, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(21, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(22, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(23, df, 'question', tfmodel, 5))
df_test = df_test.append(get_sub_category(24, df, 'question', tfmodel, 5))

Topic #0: potatoes roasted sweet potatoes roast baked oven baked potatoes onions carrots cooked
Topic #1: potato potato salad potato chips baked sweet potato salad chips baked potato mashed potato starch
Topic #2: mashed mashed potatoes potatoes garlic mashed mashed potato garlic dinner cheese gravy cream
Topic #3: sweet sweet potato sweet potatoes potato potato fries gnocchi potato casserole potato gnocchi casserole thanksgiving
Topic #4: fries french fries french crispy potato fries homemade french fries crispy fry polenta oil



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Topic #0: pan oil heat frying frying pan steak high bottom stove butter
Topic #1: steel stainless stainless steel steel pan carbon steel carbon steel pans wok steel cookware steel frying
Topic #2: non stick non stick stick pan ceramic teflon wok stick pans pan coating
Topic #3: pans pots pots pans nonstick set teflon stove electric heat aluminum
Topic #4: cookware clad set ply tri ply tri cookware set tramontina piece stainless

Topic #0: turkey bird oven deep year skin cooked turkey gravy time roasting
Topic #1: thanksgiving thanksgiving dinner year dinner thanksgiving meal thanksgiving year family thanksgiving turkey stuffing meal
Topic #2: brine brining dry brine dry bird salt brined turkey wet brine turkey brine
Topic #3: ground turkey turkey ground meat lean meats 20lbs ground turkey 90 gifted 20lbs meatloaf
Topic #4: turkey breast breast turkey chicken duck carcass chicken turkey smoked turkey netting meat

Topic #0: fish fish chips cod frozen seafood chips whole eat fishy macker

In [60]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36388 entries, 1 to 36365
Data columns (total 8 columns):
date              36388 non-null datetime64[ns]
score             36388 non-null int64
second_comment    36388 non-null object
top_comment       36388 non-null object
question          36388 non-null object
log_score         36388 non-null float64
topic             36388 non-null int64
subcategory       36388 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(3), object(3)
memory usage: 2.5+ MB


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36388 entries, 0 to 36387
Data columns (total 7 columns):
date              36388 non-null datetime64[ns]
score             36388 non-null int64
second_comment    36388 non-null object
top_comment       36388 non-null object
question          36388 non-null object
log_score         36388 non-null float64
topic             36388 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 1.9+ MB


In [131]:
df_test[df_test['topic'] == 1]


Unnamed: 0,date,score,second_comment,top_comment,question,log_score,topic,subcategory
2,2017-08-13 22:48:13,17,"Even when vanilla isn't accompanied by sugar, ...","yep. you could try to mask it, but honestly no...",Vanilla yogurt in chicken tandoori I'm in the ...,2.833213,1,5
15,2017-08-13 04:10:24,21,Tyler Florence has a good video\n\nhttp://www....,"Check with Julia Childs, she had that one on l...",My mother wishes to cook a chicken cordon bleu...,3.044522,1,5
25,2017-08-12 19:04:43,409,"Not saying you should roast a whole chicken, b...",When my wife is gone for the evening I get dr...,What do you cook when you're dining alone? Wif...,6.013715,1,5
41,2017-08-12 00:52:03,23,Just here to back up the other comments: bakin...,"First off, pat dry the chicken with paper towe...",How to get a lot of batter to stick to chicken...,3.135494,1,5
45,2017-08-11 18:01:24,54,"Just brainstorming here, would halving the pin...",Chicken? Why chicken? \n\nI'd use a pork loin...,Lets talk about baking a pineapple I want to h...,3.988984,1,5
53,2017-08-11 05:11:32,67,"If this dish seems to be common in London, you...","Maybe it is a variation on dak galbi, with a g...",Spicy chicken street food from a stall in Lond...,4.204693,1,5
75,2017-08-10 02:44:25,24,"Shred and put in salad, burritos tacos and que...",This won't earn me bonus points but I like to ...,"Leftover cooked chicken breast, easy uses? Tit...",3.178054,1,5
86,2017-08-09 19:40:31,12,Marinade is good. It almost doesn't matter how...,Totally! I work at a restaurant in Montana. an...,Can a buttermilk marinade be used to make chic...,2.484907,1,6
97,2017-08-08 14:04:49,7,Chicken is typically far cheaper so it's less ...,"Well, for one - Venison costs a lot more than ...","Why, when you are looking at a recipe for veni...",1.945910,1,5
98,2017-08-08 08:52:50,14,I asked something similar about a year ago. I...,The best butter chicken that I've tried to dat...,Looking for your favourite Butter Chicken reci...,2.639057,1,5


In [95]:
subtopiclabels = get_sub_category_names(0, df, 'question', tfmodel, 5)
subtopiclabels.update(get_sub_category_names(1, df, 'question', tfmodel, 5))


In [98]:
for x in range(2,25):
    subtopiclabels.update(get_sub_category_names(x, df, 'question', tfmodel, 5))

In [134]:
topiclabels = get_top_words(nmffit, tfmodel.get_feature_names(), 5)


In [135]:
topiclabels

{0: 'time first salt water best',
 1: 'chicken breast breasts chicken breast fried chicken',
 2: 'iron cast cast iron skillet iron skillet',
 3: 'rice rice cooker fried rice cooker brown rice',
 4: 'pork pulled pork pulled belly pork belly',
 5: 'oil olive olive oil garlic frying',
 6: 'cream ice ice cream heavy heavy cream',
 7: 'sauce tomato tomato sauce soy soy sauce',
 8: 'pizza dough pizza dough bread flour',
 9: 'beef ground ground beef stew beef stew',
 10: 'cheese mac mac cheese cheese sauce cheddar',
 11: 'knife knives chef chef knife sharpening',
 12: 'meat cut cooked thermometer cut meat',
 13: 'eggs egg scrambled whites scrambled eggs',
 14: 'oven dutch dutch oven stove roast',
 15: 'pasta pasta sauce fresh pasta water pasta water',
 16: 'cooker slow slow cooker pressure pressure cooker',
 17: 'sous vide sous vide steak steaks',
 18: 'soup stock broth bones onion',
 19: 'beans chili black black beans bean',
 20: 'potatoes potato mashed sweet mashed potatoes',
 21: 'pan stee

In [136]:
df_categories = df_test.groupby(['topic','subcategory'])['score'].agg(['size', 'sum']).reset_index()

In [137]:
df_categories['subcategory'] = df_categories['subcategory'].replace(subtopiclabels) 

In [138]:
df_categories['topic'] = df_categories['topic'].replace(topiclabels) 

In [140]:
df_categories.tail(10)

Unnamed: 0,topic,subcategory,size,sum
115,fish salmon fish sauce sushi fish tacos,fish fish chips cod,434,20526
116,fish salmon fish sauce sushi fish tacos,salmon smoked salmon skin,176,9173
117,fish salmon fish sauce sushi fish tacos,fish sauce fish sauce,48,2310
118,fish salmon fish sauce sushi fish tacos,sushi sushi grade grade,35,2814
119,fish salmon fish sauce sushi fish tacos,tacos fish tacos fish,34,1920
120,milk coconut curry coconut milk thai,milk almond almond milk,265,11096
121,milk coconut curry coconut milk thai,curry paste curry powder,185,8076
122,milk coconut curry coconut milk thai,coconut coconut milk milk,174,6277
123,milk coconut curry coconut milk thai,thai thai curry paste,63,4106
124,milk coconut curry coconut milk thai,chocolate milk chocolate cocoa,64,2089


In [141]:

lis = []
for topic in df_categories['topic'].unique():
    sub_list = []
    for sub in df_categories[df_categories['topic'] == topic]['subcategory']:  
        sub_list.append({"name" : sub, "size" : int(df_categories[df_categories['subcategory'] == sub]['size'])})
    lis.append({"name" : topic, "children" : sub_list})
topics = {'name' : 'Topics', 'children' : lis}

In [126]:
topics

{'children': [{'children': [{'name': 'time eat first', 'size': 3385},
    {'name': 'sugar butter baking', 'size': 1934},
    {'name': 'chocolate cake cookies', 'size': 553},
    {'name': 'kitchen favorite new', 'size': 7700},
    {'name': 'salt garlic pepper', 'size': 580}],
   'name': 'time eat first meals dinner'},
  {'children': [{'name': 'chicken breast chicken breast', 'size': 1066},
    {'name': 'fried chicken fried chicken', 'size': 159},
    {'name': 'stock chicken stock chicken', 'size': 162},
    {'name': 'breasts chicken breasts chicken', 'size': 179},
    {'name': 'thighs chicken thighs wings', 'size': 176}],
   'name': 'sugar butter baking flour bread'},
  {'children': [{'name': 'iron cast cast iron', 'size': 227},
    {'name': 'skillet iron skillet cast', 'size': 182},
    {'name': 'pan iron pan iron', 'size': 193},
    {'name': 'steel wok carbon steel', 'size': 88},
    {'name': 'dutch dutch oven enameled', 'size': 71}],
   'name': 'chocolate cake cookies chocolate cake 

In [129]:
import json

In [142]:
with open('topics.json', 'w') as f:
     json.dump(topics, f)

In [144]:
df_test.to_pickle('df_subcats')