# Project: App-Review Miner
Team members: Shanshan Li, Yingyezhe Jin, Tianshu Chu, Xiao Huang

In [1]:
# Put all import here
import numpy as np
import lda
import lda.datasets

### NLP based preprocessing

    Inputs:  datasetName
             rmStopWords controls whether stop words are removed
             rmRareWords controls whether rarely occurring words are removed
             
    Outputs: trainSet    is a list of training reviews
             testSet     is a list of testing reviews
             unlabelSet  is a list of unlabeled reviews
             vocabulary  is the corresponding n vocabulary in a dictionary form {word, index}

In [2]:
%run ./AR_util.py
%run ./AR_reviewInstance.py
datasetName = "templerun2" # four apps : facebook, templerun2, swiftkey, tapfish

rmStopWords = False # Removing stop words lead to information loss and bad f-score
rmRareWords = True

# trainSet/testSet/unlabel: dictionary of {label, reviews} for review data
# vocabulary: dictionary len = V and the positional index of each term in the doc vector
# set skParse True to directly read of the data that has been filtered out
skParse = False
if(skParse == False):
    # the vocabulary is the words on the training set!
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

Vocabulary size for templerun2 : 5527
Training set Size: 1000
Testing set Size: 2000
Unlabeling set Size: 57559


### Naive Bayes (EM-NB) or SVM based filtering

    Inputs:  train/test/unlabelSet   are the preprocessed reviews 
             vocabulary              is the corresponding vocabulary of the reviews
    Outputs: informMat    is a k*m np sparse matrix, k is the number of informative reviews, m is the length of vocabulary
             informRev    is a list of informative reviews

In [3]:
%run ./AR_classifier.py
# 1. Use the EM-NB or SVM to filter out the informative reviews
# informMat: the informative reviews in X x V sparse matrix from, X: documents size, V: vocabulary size
# informRev: corresponding reviews wrapped as a list of review instances
useSVM = True # SVM is way better than emnb in terms of the testing. 
               # But it may not filter out the information effectively
if(skParse == False):
    if(useSVM == False):
        informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
        informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)

    # write the result back to the file (optional)
    AR_writeReviews(informRev, datasetName)
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))

Average F-Score for the test data: 0.837037037037
Number of informative reviews: 14684


### LDA topic clustering

    Inputs:  informMat  is the k*m np sparse matrix, k is the number of informative reviews, m  is the length of vocabulary
             informRev  is the informative review list
             vocabulary is the corresponding vocabulary dictionary
             n_topics   is the number of topics
    Outputs: doc_topic  is a k*n_topics np matrix, which indicates the probability that each review belongs to each topic
             vocab      is the vocabulary in the list form

In [4]:
%run ./AR_lda.py
# 2. Use LDA to group the informative reviews by topic
# doc_topic: a k*n_topics np matrix, which indicates the probability that each review belongs to each of the topics
# vocab: a list of vocabulary words
n_topics = 20
doc_topic, vocab = AR_lda(informRev, informMat, vocabulary, n_topics)


INFO:lda:n_documents: 14684
INFO:lda:vocab_size: 5527
INFO:lda:n_words: 238165
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -2449937
INFO:lda:<500> log likelihood: -1531270
INFO:lda:<1000> log likelihood: -1529038
INFO:lda:<1499> log likelihood: -1529411


Topic 0: it i time and close to forc
Topic 1: is the a game that but onli
Topic 2: t it won even the doesn i
Topic 3: on my lag it galaxi phone s
Topic 4: it a but game lag good s
Topic 5: i coin my and to all the
Topic 6: on my it phone download not fix
Topic 7: it i star 5 fix thi give
Topic 8: fix the updat game it pleas but
Topic 9: the to and more need of new
Topic 10: run the templ but first it a
Topic 11: i it but play game and wa
Topic 12: the it screen to and then i
Topic 13: the up power a and you it
Topic 14: i the game when me save and
Topic 15: the you and when a of on
Topic 16: i coin the object and 000 have
Topic 17: to the i and it turn when
Topic 18: i to t it can you if
Topic 19: the sensit to tilt need be is


In [5]:
"""
doc_topic = model.doc_topic_
ClustNum = []
# print doc_topic[1]
for i in range(k):
    ClustNum.append(doc_topic[i].argmax())
print ClustNum
"""

'\ndoc_topic = model.doc_topic_\nClustNum = []\n# print doc_topic[1]\nfor i in range(k):\n    ClustNum.append(doc_topic[i].argmax())\nprint ClustNum\n'

### Ranking all the groups based on importance

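    Inputs:  doc_topic  is the k*n_topics np matrix produced by the LDA topic clustering
             wg         is a list of two weights (0.5 each here) passed to group_rank
             informRev  is the informative review list
    Outputs: group_scores is the result returned by group_rank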

In [9]:
%run ./AR_ranker.py
# Weights handed to group_rank -- presumably one per ranking criterion; confirm in AR_ranker.py.
wg = [0.5, 0.5]
# NOTE(review): the output recorded for this cell is
# "TypeError: calc_volume() takes exactly 2 arguments (1 given)", so this step
# did not complete. The failing calc_volume() call is not in this cell
# (presumably it is inside AR_ranker.py); it needs its missing second argument
# before group_scores can be computed.
group_scores = group_rank(doc_topic, wg, informRev)

TypeError: calc_volume() takes exactly 2 arguments (1 given)

### Ranking all the reviews in each group based on text rank

    Inputs:  
    Outputs: 

In [None]:
# TODO: rank the reviews within each group using TextRank (not implemented yet)