In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np

In [2]:
# Path to the review dataset. The notebook reads its 'stars', 'text' and
# 'text_cleaned' columns.
DATASET_FILE = "./output.csv"

# Read the CSV once. df_initial keeps every row and column so the raw review
# text can be looked up later.
df_initial = pd.read_csv(DATASET_FILE)

# Keep only 1- and 2-star reviews and only the pre-cleaned text column.
# The selection preserves the original row labels, so df.index can still be
# used to find the matching rows in df_initial. .copy() makes df independent
# of df_initial, as it was when the file was read twice.
df = df_initial.loc[df_initial['stars'].isin([1, 2]), ['text_cleaned']].copy()
df.text_cleaned

0       food not_memorable curry balance flavor not_th...
1       not_return sit booth wait dinner come scurry m...
2       wish experience great din night week ago meal ...
3       rosemary grapefruit scone suppose taste like b...
4       Our takeout order half miss portion size If bu...
                              ...                        
9995            vegetable not_enough meat The broth plain
9996    work hear new place open co-worker decide try ...
9997    went dinner drink good food We order popcorn s...
9998    food mediocre not_horrible not_great I sausage...
9999    This second time think food decent The come se...
Name: text_cleaned, Length: 10000, dtype: object

In [3]:
# TF-IDF over unigrams. 'good' and 'great' are dropped as stop words; terms that
# occur in more than 80% or in fewer than 1% of the documents are ignored.
tv = TfidfVectorizer(stop_words=['good', 'great'], ngram_range=(1, 1), max_df=.8, min_df=.01)

# Cast every entry to a string first so non-string cells cannot break tokenisation.
data_tv = tv.fit_transform(df.text_cleaned.apply(lambda x: np.str_(x)))

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement and
# fall back to the old name only on versions that predate get_feature_names_out().
if hasattr(tv, "get_feature_names_out"):
    vocab_terms = tv.get_feature_names_out()
else:
    vocab_terms = tv.get_feature_names()

# Dense document-term matrix: one row per review, one column per vocabulary term.
data_dtm = pd.DataFrame(data_tv.toarray(), columns=vocab_terms)




In [4]:
def display_topics(model, feature_names, num_top_words, topic_names=None):
    """Print a heading and the highest-weighted words for every topic.

    Args:
        model: fitted model exposing ``components_`` (the topic-term matrix
            'H'), iterated one row per topic.
        feature_names: sequence mapping a column index of ``components_`` to
            its term.
        num_top_words: how many of the highest-weighted terms to print per
            topic.
        topic_names: optional mapping or sequence indexed by topic number.
            A missing or empty entry falls back to the numeric heading.
    """
    for topic_no, weights in enumerate(model.components_):
        # Use the custom label only when one was supplied and is non-empty.
        label = topic_names[topic_no] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_no)

        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = topic.argsort()[::-1][:num_top_words] if False else weights.argsort()[::-1][:num_top_words]
        top_terms = [feature_names[col] for col in ranked]
        print(", ".join(top_terms))

In [8]:
# Factorise the document-term matrix 'V' into 15 topics; random_state=2 fixes the seed.
nmf_model = NMF(n_components=15, random_state=2)
# fit_transform learns the model and returns the document-topic matrix 'W'.
doc_topic = nmf_model.fit_transform(data_dtm)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement and
# fall back to the old name only on versions that predate get_feature_names_out().
if hasattr(tv, "get_feature_names_out"):
    topic_terms = tv.get_feature_names_out()
else:
    topic_terms = tv.get_feature_names()

# Show the 5 highest-weighted words of each row of the topic-term matrix 'H'.
display_topics(nmf_model, topic_terms, 5)




Topic  0
table, we, come, server, waitress

Topic  1
like, taste, it, bland, flavor

Topic  2
service, food, slow, horrible, terrible

Topic  3
customer, tell, rude, work, ask

Topic  4
chicken, fry, cheese, order, meat

Topic  5
minute, wait, 10, 30, 15

Topic  6
price, small, portion, quality, high

Topic  7
restaurant, food, there, menu, chinese

Topic  8
place, this, star, if, eat

Topic  9
bar, drink, beer, bartender, night

Topic  10
hour, half, delivery, pizza, cold

Topic  11
bad, experience, review, attitude, terrible

Topic  12
time, use, year, location, love

Topic  13
not_be, return, will, decent, soon

Topic  14
they, charge, horrible, and, mess




In [25]:
# One column per topic, one row per filtered review, in the same row order as df.
dfr = pd.DataFrame(doc_topic)

# Raw text of the 3 most representative reviews for every topic.
# Named top_reviews rather than `dict` so the builtin is not shadowed.
top_reviews = {}
for topic_ix in dfr.columns:
    # Row positions of the 3 reviews weighted most heavily on this topic.
    # The default keep='first' never returns more than 3 rows, so every topic
    # contributes a list of equal length and the DataFrame below can be built
    # even when weights are tied.
    top_positions = dfr[topic_ix].nlargest(3).index
    # doc_topic rows are positional. Translate them back to df's original row
    # labels before looking up the raw text, so the right reviews are picked
    # even when the star filter dropped rows.
    source_labels = df.index.take(top_positions)
    top_reviews[topic_ix] = df_initial.loc[source_labels, 'text'].tolist()

dff = pd.DataFrame(top_reviews)
dff

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Wife and I went out for an early dinner and ar...,Worst chai latte I have ever had. It tasted li...,Combine overpriced yet insipid food with frank...,I called due to a ad on a website provided by ...,Two attempts and neither went well. Ate in an...,I ordered takeout and I was told it would be r...,"Crazy crazy high prices. Stunning, really. Bre...",They didn't accept restaurant com coupons.,Want good Italian food for the same price? Go ...,Bad pours on drinks and a rude bar manger. Ne...,This place is TERRIBLE!!! The food is just ave...,Pad Thai is sweet (not real pad thai) and quit...,I used to eat here all the time and LOVED it.....,Their beignets shouldn't be called beignets! I...,Watch out for the mice! They actually have a n...
1,Horrible experience. We have eaten here befor...,レビューが良かったので行って見たけど、残念でした。日本のみたい...なんて間違っても言えない...,Very mediocre Steaks. Horrible risotto. Great ...,"Just came to this restaurant, came in @ around...",Had a very disappointing experience. Ordered ...,The place was half full and they told us there...,Do not go here!!!! Super expensive and not go...,Purchased a www.restaraunt.com certificate for...,This place is closed and it's not mentioned on...,Go for the drinks at the bar. The food is med...,Worst Dominos. Over a two an a half hour wait...,Fake Chinese food...worst Chinese food experie...,Pretty decent sushi actually. It's funny - I u...,Boo drive all the way down to south austin and...,This place is spacious but so chaotic. They ne...
2,"THE SERVICE WAS HORRIBLE. Tara, my group's ser...","Disgusting and bland, their kitchen looks smal...",Do yourself a favor and go somewhere else. Fo...,Rosie's Bakery is the no good. I'm not an exce...,We have ordered here several times and my husb...,Waited over two hours and never got pizza. Tri...,nice atmosphere with the comic figurines. tast...,Chinese restaurants are known for not being to...,"If I could give 0 stars, i would. The only IPA...","Decent drinks, but one of the bartenders was a...",Worst place to order delivery from. Put an ord...,Their food was quiet good. I went there when i...,I used to love this shit as a kid. \n\nDrivin...,Chinese restaurants are known for not being to...,Don't waste your time stopping. They don't ha...


In [24]:

# Hand-assigned, human-readable label for each of the 15 NMF topics,
# keyed by topic number.
topics_dict = {
    0: "Slow table service",
    1: "Bland food",
    2: "Terrible food and service",
    3: "rude employees",
    4: "non delicious chicken and salad",
    5: "long wait time",
    6: "Expensive and small portions",
    7: "Restaurant.com certificates refused",
    8: "0 stars",
    9: "bad bar service",
    10: "Pizza delivery",
    11: "bad asian food",
    12: "shrimp/sushi",
    13: "dirty restaurant",
    14: "not organizes",
}
topics_dict

{0: 'Slow table service',
 1: 'Bland food',
 2: 'Terrible food and service',
 3: 'rude employees',
 4: 'non delicious chicken and salad',
 5: 'long wait time',
 6: 'Expensive and small portions',
 7: 'Restaurant.com certificates refused',
 8: '0 stars',
 9: 'bad bar service',
 10: 'Pizza delivery',
 11: 'bad asian food',
 12: 'shrimp/sushi',
 13: 'dirty restaurant',
 14: 'not organizes'}

In [26]:
import pickle

# Persist the fitted TF-IDF vectorizer. The context manager closes the file
# handle even if pickling fails part-way through.
with open("tfidf.vec", 'wb') as vec_file:
    pickle.dump(tv, vec_file)

In [27]:
# Persist the fitted NMF model. The context manager closes the file handle
# even if pickling fails part-way through.
with open("nmf.pkl", 'wb') as model_file:
    pickle.dump(nmf_model, model_file)