# Project: App-Review Miner
Team members: Shanshan Li, Yingyezhe Jin, Tianshu Chu, Xiao Huang

In [5]:
# Put all import here
import numpy as np
import lda
import lda.datasets

### NLP based preprocessing

    Inputs:  raw_data        
    Outputs: AllR      is an n*m np matrix, n is the number of total reviews, m is the length of vocabulary
             vocab     is the corresponding vocabulary list of length m

In [2]:
# Code here

### Naive Bayes based filtering

    Inputs:  AllR      is an n*m np matrix, n is the number of total reviews, m is the length of vocabulary
    Outputs: InfoR     is a k*m np matrix, k is the number of informative reviews, m is the length of vocabulary

In [3]:
# Code here

### LDA topic clustering

    Inputs:  InfoR    is the k*m np matrix, k is the number of informative reviews, m is the length of vocabulary
             vocab    is the corresponding vocabulary list of length m
             n_topics is the number of topics
    Outputs: doc_topi is a k*n_topics np matrix; entry (i, j) is the probability that review i belongs to topic j

In [8]:
# Inputs here
# Sample corpus loaded from lda.datasets; presumably a stand-in for the InfoR
# matrix until the upstream filtering step is implemented -- TODO confirm.
InfoM = lda.datasets.load_reuters()
k, m = InfoM.shape  # row count and column count of the input matrix
vocab = lda.datasets.load_reuters_vocab()
n_topics = 20

# Code here
# Fit the topic model; random_state is fixed so repeated runs use the same seed.
model = lda.LDA(n_topics, n_iter=1500, random_state=1, refresh=500)
model.fit(InfoM)
topic_word = model.topic_word_

# For every topic, print the n_top_words vocabulary entries with the largest
# weight, highest weight first.
n_top_words = 8
vocab_arr = np.array(vocab)
for topic_id, weights in enumerate(topic_word):
    # argsort is ascending, so reverse it and keep the leading n_top_words.
    best_first = np.argsort(weights)[::-1][:n_top_words]
    print('Topic {}: {}'.format(topic_id, ' '.join(vocab_arr[best_first])))

# Per-document topic matrix exposed by the fitted model (the section output).
doc_topi = model.doc_topic_

INFO:lda:n_documents: 395
INFO:lda:vocab_size: 4258
INFO:lda:n_words: 84010
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -1051748
INFO:lda:<500> log likelihood: -658407
INFO:lda:<1000> log likelihood: -655849
INFO:lda:<1499> log likelihood: -655858


Topic 0: british churchill sale million major letters west britain
Topic 1: church government political country state people party against
Topic 2: elvis king fans presley life concert young death
Topic 3: yeltsin russian russia president kremlin moscow michael operation
Topic 4: pope vatican paul john surgery hospital pontiff rome
Topic 5: family funeral police miami versace cunanan city service
Topic 6: simpson former years court president wife south church
Topic 7: order mother successor election nuns church nirmala head
Topic 8: charles prince diana royal king queen parker bowles
Topic 9: film french france against bardot paris poster animal
Topic 10: germany german war nazi letter christian book jews
Topic 11: east peace prize award timor quebec belo leader
Topic 12: n't life show told very love television father
Topic 13: years year time last church world people say
Topic 14: mother teresa heart calcutta charity nun hospital missionaries
Topic 15: city salonika capital buddhist c

In [11]:
# Assign every document to its single most probable topic.
# Each row of doc_topic is indexed per document and reduced with argmax,
# so rows are documents and columns are topics.
doc_topic = model.doc_topic_
# argmax along axis 1 picks, for each row, the column holding the largest
# value. tolist() keeps ClustNum a plain list (of Python ints), and the row
# count comes from doc_topic itself rather than a variable set in another cell.
ClustNum = doc_topic.argmax(axis=1).tolist()
# print() with parentheses is consistent with the topic-printing cell and
# is valid on both Python 2 and Python 3.
print(ClustNum)

[8, 13, 14, 8, 14, 14, 14, 14, 14, 8, 8, 13, 14, 14, 8, 8, 8, 8, 6, 17, 17, 19, 11, 16, 4, 4, 8, 8, 12, 8, 8, 1, 4, 8, 8, 8, 8, 6, 6, 13, 14, 13, 4, 13, 17, 1, 1, 17, 17, 3, 3, 17, 4, 16, 13, 9, 9, 13, 13, 1, 17, 11, 1, 8, 10, 4, 4, 10, 18, 4, 4, 17, 4, 12, 4, 4, 4, 16, 16, 11, 11, 11, 11, 13, 10, 4, 11, 4, 12, 12, 13, 10, 0, 0, 13, 4, 4, 1, 0, 12, 4, 10, 5, 13, 10, 4, 4, 4, 1, 3, 3, 3, 19, 1, 3, 3, 3, 5, 4, 17, 17, 17, 17, 17, 17, 13, 11, 14, 13, 17, 0, 6, 14, 14, 15, 6, 13, 2, 3, 17, 8, 9, 1, 14, 5, 11, 15, 15, 15, 3, 4, 11, 8, 6, 1, 15, 19, 3, 9, 5, 13, 11, 13, 16, 13, 0, 13, 0, 0, 14, 8, 8, 3, 3, 1, 3, 10, 10, 8, 13, 10, 13, 5, 8, 10, 13, 8, 13, 8, 6, 13, 2, 5, 5, 6, 9, 6, 11, 1, 18, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 6, 18, 18, 18, 18, 16, 16, 18, 13, 18, 5, 10, 13, 1, 12, 18, 18, 9, 10, 9, 3, 13, 2, 11, 7, 7, 9, 9, 10, 9, 13, 3, 19, 19, 3, 1, 13, 7, 9, 6, 7, 7, 1, 8, 7, 3, 0, 16, 7, 7, 13, 1, 7, 1, 11, 9, 6, 6, 16, 0, 13, 11, 13, 1, 19, 13, 3, 1, 19, 4, 13, 9, 8, 1, 12, 

### Ranking all the groups based on importance

    Inputs:  
    Outputs: 

### Ranking all the reviews in each group based on text rank

    Inputs:  
    Outputs: 

In [None]:
# Code here