# Project: App-Review Miner
Team members: Shanshan Li, Yingyezhe Jin, Tianshu Chu, Xiao Huang

In [1]:
# Put all import here
import numpy as np
import lda
import lda.datasets

### NLP based preprocessing

    Inputs:  datasetName
             rmStopWords controls whether stop words are removed
             rmRareWords controls whether rarely occurring words are removed
             
    Outputs: trainSet    is a list of training reviews
             testSet     is a list of testing reviews
             unlabelSet  is a list of unlabeled reviews
             vocabulary  is the corresponding vocabulary in dictionary form {word: index}

In [12]:
%run ./AR_util.py
%run ./AR_reviewInstance.py
datasetName = "templerun2" # four apps : facebook, templerun2, swiftkey, tapfish

rmStopWords = False # Removing stop words lead to information loss and bad f-score
rmRareWords = True

# trainSet/testSet/unlabel: dictionary of {label, reviews} for review data
# vocabulary: dictionary len = V and the positional index of each term in the doc vector
# set skParse True to directly read of the data that has been filtered out
skParse = False
if(skParse == False):
    # the vocabulary is the words on the training set!
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

Vocabulary size for templerun2 : 5527
Training set Size: 1000
Testing set Size: 2000
Unlabeling set Size: 57559


### Naive Bayes (EM-NB) or SVM based filtering

    Inputs:  train/test/unlabelSet   are the preprocessed reviews 
             vocabulary              is the corresponding vocabulary of the reviews
    Outputs: informMat    is a k*m np sparse matrix, k is the number of informative reviews, m is the length of vocabulary
             informRev    is a list of informative reviews

In [13]:
%run ./AR_classifier.py
# 1. Use the EM-NB or SVM to filter out the informative reviews
# informMat: the informative reviews in X x V sparse matrix from, X: documents size, V: vocabulary size
# informRev: corresponding reviews wrapped as a list of review instances
useSVM = True # SVM is way better than emnb in terms of the testing. 
               # But it may not filter out the information effectively
if(skParse == False):
    if(useSVM == False):
        informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
        informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)

    # write the result back to the file (optional)
    AR_writeReviews(informRev, datasetName)
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))

Average F-Score for the test data: 0.837037037037
Number of informative reviews: 14684


### LDA topic clustering

    Inputs:  informMat  is the k*m np sparse matrix, k is the number of informative reviews, m  is the length of vocabulary
             informRev  is the informative review list
             vocabulary is the corresponding vocabulary dictionary
             n_topics   is the number of topics
    Outputs: doc_topic  is a k*n_topics np matrix, which indicates the probability that each review belongs to each topic
             vocab      is the vocabulary in the list form

In [14]:
%run ./AR_lda.py
# 2. Use LDA to group the informative reviews by topic
# doc_topic: a k*n_topics np matrix giving the probability that each review belongs to each topic
# vocab: a list of vocabulary words
n_topics = 20
doc_topic, vocab = AR_lda(informRev, informMat, vocabulary, n_topics)


INFO:lda:n_documents: 14684
INFO:lda:vocab_size: 5527
INFO:lda:n_words: 238165
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -2449521
INFO:lda:<500> log likelihood: -1530662
INFO:lda:<1000> log likelihood: -1530186
INFO:lda:<1499> log likelihood: -1531980


Topic 0: it i to close and download time
Topic 1: it but game a lag is sometim
Topic 2: it i me play t game my
Topic 3: i and coin the for have but
Topic 4: run the templ but first i it
Topic 5: the i when game and me save
Topic 6: on my it phone galaxi not work
Topic 7: it star i fix 5 thi but
Topic 8: you to have if that a can
Topic 9: the up power you a and when
Topic 10: the is that onli a of problem
Topic 11: the updat on fix game lag it
Topic 12: the it screen to and then i
Topic 13: coin object the i 000 have but
Topic 14: i coin my and to all game
Topic 15: the and but a more game new
Topic 16: t i can it doesn the don
Topic 17: the to i and when turn jump
Topic 18: a it to but time take game
Topic 19: to the sensit tilt need be is


In [11]:
# Assign every informative review to its single most probable topic.
#
# This cell works on the doc_topic matrix returned by AR_lda(...) instead of
# reading `model` and `k`, which are not defined anywhere in this notebook and
# would raise NameError on a fresh top-to-bottom run.
# NOTE(review): assumes doc_topic has one row per review and one column per
# topic, as described in the LDA section -- confirm against AR_lda.
#
# np.asarray makes the row-wise argmax return a flat 1-D result even if
# doc_topic is an np.matrix; tolist() yields plain Python ints for printing.
ClustNum = np.asarray(doc_topic).argmax(axis=1).tolist()

# Function-call form of print, matching the other cells; the output is the
# same under Python 2 and Python 3.
print(ClustNum)

[8, 13, 14, 8, 14, 14, 14, 14, 14, 8, 8, 13, 14, 14, 8, 8, 8, 8, 6, 17, 17, 19, 11, 16, 4, 4, 8, 8, 12, 8, 8, 1, 4, 8, 8, 8, 8, 6, 6, 13, 14, 13, 4, 13, 17, 1, 1, 17, 17, 3, 3, 17, 4, 16, 13, 9, 9, 13, 13, 1, 17, 11, 1, 8, 10, 4, 4, 10, 18, 4, 4, 17, 4, 12, 4, 4, 4, 16, 16, 11, 11, 11, 11, 13, 10, 4, 11, 4, 12, 12, 13, 10, 0, 0, 13, 4, 4, 1, 0, 12, 4, 10, 5, 13, 10, 4, 4, 4, 1, 3, 3, 3, 19, 1, 3, 3, 3, 5, 4, 17, 17, 17, 17, 17, 17, 13, 11, 14, 13, 17, 0, 6, 14, 14, 15, 6, 13, 2, 3, 17, 8, 9, 1, 14, 5, 11, 15, 15, 15, 3, 4, 11, 8, 6, 1, 15, 19, 3, 9, 5, 13, 11, 13, 16, 13, 0, 13, 0, 0, 14, 8, 8, 3, 3, 1, 3, 10, 10, 8, 13, 10, 13, 5, 8, 10, 13, 8, 13, 8, 6, 13, 2, 5, 5, 6, 9, 6, 11, 1, 18, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 6, 18, 18, 18, 18, 16, 16, 18, 13, 18, 5, 10, 13, 1, 12, 18, 18, 9, 10, 9, 3, 13, 2, 11, 7, 7, 9, 9, 10, 9, 13, 3, 19, 19, 3, 1, 13, 7, 9, 6, 7, 7, 1, 8, 7, 3, 0, 16, 7, 7, 13, 1, 7, 1, 11, 9, 6, 6, 16, 0, 13, 11, 13, 1, 19, 13, 3, 1, 19, 4, 13, 9, 8, 1, 12, 

### Ranking all the groups based on importance

    Inputs:  
    Outputs: 

### Ranking all the reviews in each group based on text rank

    Inputs:  
    Outputs: 

In [None]:
# Code here