In [108]:
import pandas as pd
import KAsql2 as ka

# Number of "top bids" (highest-scoring BIDs per topic) to extract
BIDNUMS = 10

In [109]:
def find_top_body_topic(body_word, topic_word_df):
    """Locate the first topic whose word list contains ``body_word``.

    Every column after the first (column 0 holds the topic number) is scanned
    from left to right, and each column from top to bottom. The first cell
    that contains ``body_word`` as a substring wins, and the search stops.
    Presumably the word columns are ordered by rank, so the first hit is the
    topic in which the word is rated highest -- confirm against how
    ``topic_word_df`` is built.

    Parameters
    ----------
    body_word : str
        Keyword to look for, e.g. 'ankle' or 'knee'. Matching is by
        substring, so 'hip' also matches a cell such as 'hips'.
    topic_word_df : pandas.DataFrame
        Topic-by-word table (e.g. 30 topics by their top 10 words): one row
        per topic, column 0 is the topic number, remaining columns are words.

    Returns
    -------
    tuple
        ``(body_topic, body_word_col)``: the positional row number of the
        matching cell and the label of the column it was found in, or
        ``(-1, -1)`` when the word occurs nowhere.
    """
    for col in topic_word_df.columns[1:]:  # column 0 is the topic number
        for row, word in enumerate(topic_word_df[col].tolist()):
            # Cells that are not strings (e.g. NaN padding) have no .find and
            # cannot match; skip them instead of raising AttributeError.
            find = getattr(word, 'find', None)
            if find is not None and find(body_word) >= 0:
                return row, col
    return -1, -1

In [110]:
# Given a body part's topic, find the doctors (BIDs) most "talked about" with respect to that body part
def top_body_docs(body_topic, bid_df, nbids=None):
    """Return the highest-scoring BIDs for one topic.

    Parameters
    ----------
    body_topic : column label
        Column of ``bid_df`` holding the weights/"scores" of the topic of
        interest (e.g. the topic found for the key-word "hip").
    bid_df : pandas.DataFrame
        One row per BID (the BID is the index), one column per topic.
    nbids : int, optional
        How many top BIDs to keep. Defaults to the module-level ``BIDNUMS``.

    Returns
    -------
    pandas.Series
        Topic scores in descending order, indexed by BID, with at most
        ``nbids`` entries.
    """
    if nbids is None:
        nbids = BIDNUMS
    # DataFrame.sort() no longer exists in current pandas; sort_values() is
    # its replacement.
    ranked = bid_df.sort_values(by=body_topic, axis=0, ascending=False)
    # Slice by position rather than by index label, so that a BID appearing
    # more than once in the index cannot inflate the result.
    return ranked[body_topic].iloc[:nbids]

In [111]:
def get_nrevs_for_bid(body_score):
    """Look up the number of reviews for every BID in ``body_score``.

    Parameters
    ----------
    body_score : pandas.Series
        Series whose index holds the BIDs to look up (the values are unused).

    Returns
    -------
    dict
        Maps each BID to the ``nreviews`` value of its row in the
        ``business`` table.

    Raises
    ------
    IndexError
        If the query for a BID returns no rows.
    """
    nrevs = {}
    # Iterate over the BIDs actually present instead of a fixed count, so a
    # series shorter than the expected number of top bids does not raise
    # IndexError.
    for bid in body_score.index:
        # NOTE(review): the statement is assembled by string concatenation and
        # the id is not quoted. That is only safe while BIDs are trusted,
        # numeric-looking values; switch to a parameterised query if
        # ka.query_SQL supports one.
        sql = 'select nreviews from business where id=' + str(bid) + ';'
        rows = ka.query_SQL(sql)
        nrevs[bid] = rows[0][0]  # first column of the first returned row
    return nrevs  # dict, key=bid, value=nreviews

In [134]:
def main():
    """Report, for a fixed list of key-words, the best-reviewed top BIDs.

    For every key-word the function:
      1. finds the topic in which the word first appears
         (``find_top_body_topic``),
      2. takes the BIDs scoring highest on that topic (``top_body_docs``),
      3. looks up each BID's review count (``get_nrevs_for_bid``).

    The per-key-word tables are stacked, BIDs with fewer than ``MIN_REVIEWS``
    reviews are dropped, and the surviving BIDs plus the word -> topic mapping
    are printed. Nothing is returned.
    """
    MIN_REVIEWS = 12  # a BID is reported only if it has at least this many reviews

    # Load data.
    # NOTE(review): read_pickle unpickles arbitrary objects -- only load files
    # produced by this project.
    bid_df = pd.read_pickle('bid_tmeans.p')  # index of this df is the bid, but can't be indexed by 'BID'
    topic_word_df = pd.read_pickle('topic_word_df.p')

    word_topn_dict = {}  # key-word -> topic number
    bid_topn_dict = {}   # BID -> topic number (the last key-word wins if a BID recurs)
    nrev_frames = []     # one BID/nreviews frame per key-word, concatenated once at the end
    body_key = ['knee','hip','wrist','injection','mri','acl','best','care','minute','ive','appointment']

    # For each keyword, get the associated top topic and word-number in the topic-word df
    for key in body_key:
        # Row/topic and col/word_num in topic_word_df of the first occurrence of the word.
        body_topic, body_word_num = find_top_body_topic(key, topic_word_df)
        # Single pre-formatted string: prints identically on Python 2 and 3.
        print('myword:  %s .  topic:  %s .  word column:  %s' % (key, body_topic, body_word_num))
        word_topn_dict[key] = body_topic

        if body_topic == -1:
            # The key-word occurs in no topic, so there is no column to rank
            # BIDs on; skip it rather than use -1 as a column label.
            continue

        # Top N BIDs for this topic (series where BID is the index).
        top_bids = top_body_docs(body_topic, bid_df)

        # Map bids onto topic numbers
        for b in top_bids.index:
            bid_topn_dict[b] = body_topic

        # BID / nreviews table, sorted by highest nreviews. The sorted result
        # has to be kept: sorting returns a new frame, it does not sort in place.
        bid_revs_dict = get_nrevs_for_bid(top_bids)
        nrev_df = pd.DataFrame(list(bid_revs_dict.items()), columns=['BID', 'nreviews'])
        nrev_frames.append(nrev_df.sort_values('nreviews', ascending=False))

    # Build the master table in one go instead of appending inside the loop.
    if nrev_frames:
        master_df = pd.concat(nrev_frames, ignore_index=True)
    else:
        master_df = pd.DataFrame(columns=['BID', 'nreviews'])

    final_bids = master_df.BID[master_df.nreviews >= MIN_REVIEWS]

    print('\n')
    for myid in final_bids:
        print('bid %s topic num:  %s' % (myid, bid_topn_dict[myid]))
    for word, topic in word_topn_dict.items():
        print('topic num: %s word: %s' % (topic, word))
   
# Entry point: run the key-word -> topic -> top-BID report when this code is executed directly.
if __name__ == "__main__":
    main()

myword:  knee .  topic:  2 .  word column:  0
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
myword:  hip .  topic:  0 .  word column:  0
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
myword:  wrist .  topic:  19 .  word column:  0
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
myword:  injection .  topic:  16 .  word column:  4
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
myword:  mri .  topic:  3 .  word column:  1
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
myword:  acl .  topic:  24 .  word column:  2
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
USE ortho;
myword:  best .  topic:  21 .  word column:  0
USE ortho;


In [24]:
# Read the whole of Topic22_RR2.txt into memory; the `with` block closes the file afterwards.
with open("Topic22_RR2.txt", "r") as text_file:
    # NOTE: despite its name, `lines` is one string holding the entire file
    # (read() returns the full contents), not a list of lines.
    lines = text_file.read()  
#print lines