In [1]:
import pickle

import numpy as np
from numpy.linalg import svd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from nltk.corpus import stopwords

In [3]:
from prettytable import PrettyTable

In [4]:
def print_topics(H, word_list, n_words=15, n_topics='all', max_width=80):
    """Print a table listing the highest-weighted terms of each topic.

    Parameters
    ----------
    H : sequence of 1-D weight vectors
        ``H[ix]`` holds the per-term weights of topic ``ix``.
    word_list : sequence of str
        Vocabulary; ``word_list[j]`` is the term scored by ``H[ix][j]``.
    n_words : int, default 15
        How many top-weighted terms to show per topic.
    n_topics : int or 'all', default 'all'
        Number of leading topics to print. ``'all'`` prints every row of
        ``H``; an integer larger than ``len(H)`` is clamped to ``len(H)``.
    max_width : int, default 80
        Assigned to the table's ``max_width`` attribute.

    Returns
    -------
    None
        The table is written to stdout.
    """
    pt = PrettyTable(['Topic', 'Words'])
    pt.valign['Topic'] = 'm'
    pt.align['Words'] = 'l'
    pt.max_width = max_width

    # Use the vocabulary supplied by the caller rather than a notebook global;
    # converting to an array allows indexing with the argsort result even
    # when a plain list is passed.
    vocab = np.asarray(word_list)

    total = len(H)
    nt = total if n_topics == 'all' else min(n_topics, total)

    for ix in range(nt):
        # argsort is ascending: take the last n_words indices and reverse
        # them so the heaviest term comes first.
        top = np.argsort(H[ix])[-n_words:][::-1]
        pt.add_row([ix, ', '.join(vocab[top])])
        # Blank spacer row between topics.
        pt.add_row(['', ''])
    print(pt)

In [5]:
# Load the previously pickled feature bundle from disk.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('../pickles/features.pkl', mode='rb') as pkl_file:
    features = pickle.load(pkl_file)

In [6]:
# Drill into the loaded bundle to reach the 'directions' entry.
components = features['components']
directions = components['directions']

# Each element of `directions` is space-joined into a single string
# (presumably one token sequence per recipe -- confirm against the pickle).
joined_directions = list(map(' '.join, directions))

In [7]:
# Two measurement words first, followed by NLTK's English stop word list.
stop_words = ['teaspoon', 'tablespoon']
stop_words.extend(stopwords.words('english'))

In [8]:
# TF-IDF over 2- and 3-word n-grams of the joined directions, with the
# custom stop word list applied.
tf = TfidfVectorizer(ngram_range=(2,3), stop_words=stop_words)
tf_idf = tf.fit_transform(joined_directions)

# Vocabulary as an array so it can be indexed with integer index arrays.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(tf, 'get_feature_names_out'):
    words = np.asarray(tf.get_feature_names_out())
else:
    words = np.array(tf.get_feature_names())

In [70]:
# Dimensions of the TF-IDF matrix returned by fit_transform:
# (number of input documents, number of n-gram features).
tf_idf.shape

(16991, 1110380)

In [27]:
# Factorise the TF-IDF matrix into 10 non-negative components
# (tf_idf ~= W @ H). random_state is fixed so a re-run of the notebook
# reproduces the same factors.
mf = NMF(max_iter=300, n_components=10, verbose=0, random_state=0)

# fit_transform returns the W found while fitting, which avoids the extra
# solver pass of calling fit() and then transform() on the same data.
# H is read afterwards from the fitted model's components_.
W, H = mf.fit_transform(tf_idf), mf.components_

In [19]:
# Shapes of the NMF factors: W comes from the transform of tf_idf, H from
# mf.components_.
# NOTE(review): this cell's execution count is lower than that of the NMF
# cell, and the recorded output shows 5 components while the NMF cell sets
# n_components=10 -- the output is presumably stale; re-run to refresh.
W.shape, H.shape

((16991, 5), (5, 1110380))

In [28]:
# Print the top n-grams for every row of H, passing the TF-IDF vocabulary.
print_topics(H, words)

+-------+----------------------------------------------------------------------------------+
| Topic | Words                                                                            |
+-------+----------------------------------------------------------------------------------+
|       | cook stirring, stirring often, cook stirring often, minutes add, oil medium,     |
|   0   | taste adjust, oil medium heat, salt taste, olive oil, large heavy, medium heat,  |
|       | heat olive oil, onion cook stirring, heat olive, onion cook                      |
|       |                                                                                  |
|       | preheat oven, 350 degrees, oven 350, oven 350 degrees, baking sheet, preheat     |
|   1   | oven 350, baking dish, baking powder, set aside, egg whites, large bowl, 30      |
|       | minutes, heat oven, oven bake, 15 minutes                                        |
|       |                                                             

In [43]:
# Draw 10 row indices into W uniformly at random (with replacement, so
# duplicates are possible). A seeded generator keeps the sample -- and
# everything derived from it below -- reproducible across kernel restarts.
match = np.random.RandomState(0).randint(0, len(W), size=10)

In [45]:
# Select the rows of W at the sampled indices in `match`.
recipes = W[match]

In [32]:
from sklearn.cluster import KMeans

In [47]:
# Cluster the sampled rows into 5 groups. random_state makes the
# initialisation reproducible; n_init is pinned explicitly because its
# default differs between scikit-learn versions.
km = KMeans(n_clusters=5, n_init=10, random_state=0)

# fit_transform fits the model and returns the cluster-distance matrix in
# one step: dist[i, j] is the distance from sample i to cluster centre j.
dist = km.fit_transform(recipes)

In [65]:
# Column 0 of `dist`: one value per sampled row, for the first cluster.
first_cluster_dist = dist[:, 0]
print(first_cluster_dist)
# Row indices ordered from smallest to largest value in that column.
print(np.argsort(first_cluster_dist))

[ 0.06314184  0.06772584  0.05869213  0.055518    0.06349057  0.06758303
  0.03195099  0.          0.07348587  0.07240054]
[7 6 3 2 0 4 5 1 9 8]


In [74]:
# Collect the distinct row indices that appear in the column-wise argsort
# of `dist`.
# NOTE(review): argsort along axis 0 yields a permutation of the row indices
# in every column, so this set is always {0, ..., len(dist) - 1} regardless
# of the data -- confirm what this check was meant to show.
set(np.argsort(dist, axis=0).ravel())

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}