# Project: App-Review Miner
Team members: Shanshan Li, Yingyezhe Jin, Tianshu Chu, Xiao Huang

In [1]:
# Put all import here
import numpy as np
import lda
import lda.datasets

### NLP based preprocessing

    Inputs:  datasetName
             rmStopWords controls whether stop words are removed
             rmRareWords controls whether rarely occurring words are removed
             
    Outputs: trainSet    is a list of training reviews
             testSet     is a list of testing reviews
             unlabelSet  is a list of unlabeled reviews
             vocabulary  is the corresponding vocabulary of n words in dictionary form {word: index}

In [2]:
%run ./AR_util.py
%run ./AR_reviewInstance.py
datasetName = "templerun2" # four apps : facebook, templerun2, swiftkey, tapfish

rmStopWords = False # Removing stop words lead to information loss and bad f-score
rmRareWords = True

# trainSet/testSet/unlabel: dictionary of {label, reviews} for review data
# vocabulary: dictionary len = V and the positional index of each term in the doc vector
# set skParse True to directly read of the data that has been filtered out
skParse = False
if(skParse == False):
    # the vocabulary is the words on the training set!
    trainSet, testSet, unlabelSet, vocabulary = AR_parse(datasetName, rmStopWords, rmRareWords)

Vocabulary size for templerun2 : 5527
Training set Size: 1000
Testing set Size: 2000
Unlabeling set Size: 57559


### Naive Bayes / SVM based filtering

    Inputs:  train/test/unlabelSet   are the preprocessed reviews 
             vocabulary              is the corresponding vocabulary of the reviews
    Outputs: informMat    is a k*m np sparse matrix, k is the number of informative reviews, m is the length of vocabulary
             informRev    is a list of informative reviews

In [3]:
%run ./AR_classifier.py
# 1. Use the EM-NB or SVM to filter out the informative reviews
# informMat: the informative reviews in X x V sparse matrix from, X: documents size, V: vocabulary size
# informRev: corresponding reviews wrapped as a list of review instances
useSVM = True # SVM is way better than emnb in terms of the testing. 
               # But it may not filter out the information effectively
if(skParse == False):
    if(useSVM == False):
        informRev, informMat = AR_emnb(trainSet, testSet, unlabelSet, vocabulary, datasetName)
    else:
        informRev, informMat = AR_svm(trainSet, testSet, unlabelSet, vocabulary, datasetName)

    # write the result back to the file (optional)
    AR_writeReviews(informRev, datasetName)
else:
    # directly read from the file
    informRev, informMat, vocabulary = AR_loadReviews(datasetName)

print("Number of informative reviews: " + str(len(informRev)))

Average F-Score for the test data: 0.837037037037
Number of informative reviews: 14684


### LDA topic clustering

    Inputs:  informMat  is the k*m np sparse matrix, k is the number of informative reviews, m  is the length of vocabulary
             informRev  is the informative review list
             vocabulary is the corresponding vocabulary dictionary
             n_topics   is the number of topics
    Outputs: doc_topic  is a k*n_topics np matrix whose entry (i, j) is the probability that review i belongs to topic j
             vocab      is the vocabulary in the list form

In [4]:
%run ./AR_lda.py
# 2. Use LDA to group the informative reviews by topic
# doc_topic: a k*n_topics np matrix; each row gives the probability of a review belonging to each topic
# vocab: a list of vocabulary words
n_topics = 20  # number of topics requested from LDA
doc_topic, vocab = AR_lda(informRev, informMat, vocabulary, n_topics)


INFO:lda:n_documents: 14684
INFO:lda:vocab_size: 5527
INFO:lda:n_words: 238165
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -2449521
INFO:lda:<500> log likelihood: -1530662
INFO:lda:<1000> log likelihood: -1530186
INFO:lda:<1499> log likelihood: -1531980


Topic 0: it i to close and download time
Topic 1: it but game a lag is sometim
Topic 2: it i me play t game my
Topic 3: i and coin the for have but
Topic 4: run the templ but first i it
Topic 5: the i when game and me save
Topic 6: on my it phone galaxi not work
Topic 7: it star i fix 5 thi but
Topic 8: you to have if that a can
Topic 9: the up power you a and when
Topic 10: the is that onli a of problem
Topic 11: the updat on fix game lag it
Topic 12: the it screen to and then i
Topic 13: coin object the i 000 have but
Topic 14: i coin my and to all game
Topic 15: the and but a more game new
Topic 16: t i can it doesn the don
Topic 17: the to i and when turn jump
Topic 18: a it to but time take game
Topic 19: to the sensit tilt need be is


In [None]:
# Assign every informative review to its single most probable topic.
#
# doc_topic is the k x n_topics matrix returned by AR_lda in the LDA cell.
# The LDA `model` object and the review count `k` are never defined in this
# notebook, so the cell would fail on a fresh kernel; everything needed is
# derived from doc_topic itself instead.
#
# ClustNum[i] is the index of the topic with the highest probability for review i.
ClustNum = [doc_topic[i].argmax() for i in range(len(doc_topic))]
print(ClustNum)

### Ranking all the groups based on importance

    Inputs:  
    Outputs: 

### Cluster the reviews using volume and use TextRank to rank each instance

    Inputs:  doc_topic  is a np matrix k*n_topic, where k is # of reviews
             informRev  is a list of all informative reviews
    Outputs: rankedInstance is a dict = {topic, list of ranked reviews with the score}

In [10]:
%run ./AR_textrank.py
# Run AR_tfIdf over the informative reviews before ranking.
# NOTE(review): the return value is unused, so this presumably stores tf-idf
# weights on the review instances in place — confirm in AR_textrank.py
AR_tfIdf(informRev)
# rankedInstance: per the section notes, a dict {topic: list of ranked reviews with their scores}
rankedInstance = AR_textrank(doc_topic, informRev)

In construct the graph of reviews ---- Nodes: 1021 Edges: 38072
In construct the graph of reviews ---- Nodes: 1012 Edges: 21804
In construct the graph of reviews ---- Nodes: 894 Edges: 37456
In construct the graph of reviews ---- Nodes: 503 Edges: 3184
In construct the graph of reviews ---- Nodes: 751 Edges: 4436
In construct the graph of reviews ---- Nodes: 834 Edges: 20164
In construct the graph of reviews ---- Nodes: 803 Edges: 11438
In construct the graph of reviews ---- Nodes: 592 Edges: 6674
In construct the graph of reviews ---- Nodes: 672 Edges: 1214
In construct the graph of reviews ---- Nodes: 617 Edges: 5092
In construct the graph of reviews ---- Nodes: 595 Edges: 1742
In construct the graph of reviews ---- Nodes: 625 Edges: 3630
In construct the graph of reviews ---- Nodes: 1043 Edges: 35468
In construct the graph of reviews ---- Nodes: 567 Edges: 7974
In construct the graph of reviews ---- Nodes: 805 Edges: 5196
In construct the graph of reviews ---- Nodes: 726 Edges: 2752

In [9]:
# For each topic group, show its ten highest-ranked entries.
# Topic groups are addressed by the integers 0 .. len(rankedInstance) - 1.
for i in range(len(rankedInstance)):
    header = "Instance review for topic group: " + str(i)
    print(header)
    top_ten = rankedInstance[i][:10]
    print(top_ten)

Instance review for topic group: 0
[(2829, 0.0010345541059186782), (4331, 0.0010345541059186782), (1137, 0.0010345541059186779), (8233, 0.0010345541059186775), (8510, 0.0010345541059186773), (8939, 0.0010345541059186773), (9388, 0.0010345541059186773), (615, 0.0010345541059186771), (1519, 0.0010345541059186771), (3660, 0.0010345541059186771)]
Instance review for topic group: 1
[(6882, 0.0010593220334581201), (4571, 0.0010593220334581199), (11393, 0.0010593220334581199), (13258, 0.0010593220334581199), (2417, 0.0010593220334581196), (3720, 0.0010593220334581196), (8428, 0.0010593220334581196), (12365, 0.0010593220334581196), (3109, 0.0010593220334581194), (6013, 0.0010593220334581194)]
Instance review for topic group: 2
[(3970, 0.0012181751711557894), (14265, 0.0012181751711557889), (45, 0.0012181751711557878), (753, 0.0012181751711557878), (12260, 0.0012181751711557876), (13207, 0.0012181751711557876), (3537, 0.0012181751711557874), (8642, 0.0012181751711557874), (5955, 0.0012181751711