In [1]:
import pandas as pd
import json
import pprint
import numpy as np
import nltk
import re
import heapq

In [2]:
user_reviews = {}

user_reviews['bug'] = {'data_train': 'Bug_Report_Data_Train.json', 
                      'not_data_train': 'Not_Bug_Report_Data_Train.json',
                      'data_test': 'Bug_Report_Data_Test.json',
                      'not_data_test': 'Not_Bug_Report_Data_Test.json',
                      'label': 'Bug',
                      'not_label': 'Not Bug'}

user_reviews['feature'] = {'data_train': 'Feature_OR_Improvment_Request_Data_Train.json', 
                          'not_data_train': 'Not_Feature_OR_Improvment_Request_Data_Train.json',
                          'data_test': 'Feature_OR_Improvment_Request_Data_Test.json',
                          'not_data_test': 'Not_Feature_OR_Improvment_Request_Data_Test.json',
                          'label': 'Feature',
                          'not_label': 'Not Feature'}

user_reviews['ux'] = {'data_train': 'UserExperience_Data_Train.json', 
                        'not_data_train': 'Not_UserExperience_Data_Train.json',
                        'data_test': 'UserExperience_Data_Test.json',
                        'not_data_test': 'Not_UserExperience_Data_Test.json',
                        'label': 'UserExperience',
                        'not_label': 'Not UserExperience'}

user_reviews['rating'] = {'data_train': 'Rating_Data_Train.json', 
                          'not_data_train': 'Not_Rating_Data_Train.json',
                          'data_test': 'Rating_Data_Test.json',
                          'not_data_test': 'Not_Rating_Data_Test.json',
                         'label': 'Rating',
                         'not_label': 'Not Rating'}

In [3]:
selected_review_type = user_reviews['bug'] # bug, feature, ux, rating

In [4]:
# Import data
with open('../RE2015_data/json_data/' + selected_review_type['data_train']) as data_file:    
    data_train = json.load(data_file)
    
with open('../RE2015_data/json_data/' + selected_review_type['not_data_train']) as data_file:    
    not_data_train = json.load(data_file)
    
with open('../RE2015_data/json_data/' + selected_review_type['data_test']) as data_file:    
    data_test = json.load(data_file)
    
with open('../RE2015_data/json_data/' + selected_review_type['not_data_test']) as data_file:    
    not_data_test = json.load(data_file)

In [5]:
# Prepare data frame
data_train = pd.DataFrame.from_dict(data_train, orient='columns')
data_train['label'] = selected_review_type['label']

data_test = pd.DataFrame.from_dict(data_test, orient='columns')
data_test['label'] = selected_review_type['label']

not_data_train = pd.DataFrame.from_dict(not_data_train, orient='columns')
not_data_train['label'] = selected_review_type['not_label']

not_data_test = pd.DataFrame.from_dict(not_data_test, orient='columns')
not_data_test['label'] = selected_review_type['not_label']

df_train = data_train.append(not_data_train, ignore_index=True)
df_test = data_test.append(not_data_test, ignore_index=True)

df = df_train.append(df_test, ignore_index=True)

In [6]:
def count_word_frequencies(formatted_text):
    word_frequencies = {}  
    for word in nltk.word_tokenize(formatted_text):  
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
                
    return word_frequencies

In [7]:
def count_sentence_scores(sentence_list, word_frequencies):
    sentence_scores = {}  
    for sent in sentence_list:  
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    
    return sentence_scores

In [8]:
def summarize(data_frame_column):
    for raw_text in data_frame_column:
        # Removing special characters and digits
        formatted_text = re.sub('[^a-zA-Z]', ' ', raw_text)  
        formatted_text = re.sub(r'\s+', ' ', formatted_text)

        sentence_list = nltk.sent_tokenize(raw_text)
        word_frequencies = count_word_frequencies(formatted_text)
        maximum_frequncy = max(word_frequencies.values())

        for word in word_frequencies.keys():  
            word_frequencies[word] = (word_frequencies[word] / maximum_frequncy)

        sentence_scores = count_sentence_scores(sentence_list, word_frequencies)

        summary_sentences = heapq.nlargest(1, sentence_scores, key=sentence_scores.get)
        summary = ' '.join(summary_sentences)  

        print('Raw Text:', raw_text)
        print()
        print('Summary:', summary)
        print('\n=======================================================================================\n')

In [9]:
stopwords = nltk.corpus.stopwords.words('english')

In [10]:
summarize(df['comment'])

Raw Text: Besides the occasional crash, this is an amazing product with tons of potential depending on how you work with it!

Summary: Besides the occasional crash, this is an amazing product with tons of potential depending on how you work with it!


Raw Text: This could be a great app if it was predictable, but it is full of bugs and is unpredictable.  if you are able to check in, take a screen shot of your boarding pass or print a backup copy, because you may not be able to access it when you need it most.

Summary: This could be a great app if it was predictable, but it is full of bugs and is unpredictable.


Raw Text: I can&#39t open since the last 2 updates Pop-ups go crazy on the iPhone 5! I&#39m uninstalling

Summary: I can&#39t open since the last 2 updates Pop-ups go crazy on the iPhone 5!


Raw Text: Use to love this app but it's not working after new update. Pages won't scroll up or down...none of the different tabs work...it's frozen! Please fix ASAP!!!

Summary: Pages won

Raw Text: I just tested this app out so that I could view Horrible Bosses as an UltraViolet Digital Copy, and I noticed that the movie was hard to see in some places because of what looked like really bad blurry areas right after the logos. This happened throughout the movie. This never happened to me with the regular digital copies that were compatible with iTunes. I could see everything clearly when I was using the regular digital copies. I think that if I have to buy these combo packs with UltraViolet Digital Copy, I will never use this app again. I'll stick to the UltraViolet/Flixster cloud because there it wasn't blurry at all. On top of this, since the update ALL the UltraViolet movies are buffering and skipping. This never happened with my last update. Please fix.

Summary: This never happened to me with the regular digital copies that were compatible with iTunes.


Raw Text: The app used to work great, but now it keeps freezing and I can&#39t get into my  notes It is very frust

Raw Text: Great UI that aggregates so much information into one place, great app! Super intuitive to use.

Summary: Great UI that aggregates so much information into one place, great app!


Raw Text: Great app. even for smaller regional airports.

Summary: even for smaller regional airports.


Raw Text: Great app

Summary: Great app


Raw Text: Idk	Why does it FOREVER to download!?! I have a galaxy 3 mini and its so long :'( Like i downloaded it cuz its says download over wifi and i accepted it and it was still downloading for two !@#$ing hrs!!!!!

Summary: Idk	Why does it FOREVER to download!?!


Raw Text: Awkward menus	I bought this app. ..quickly attempted to do my most common types of formatting, and found it very slowly and fiddly on my note 2.  Much better on the BlackBerry.  So, as quickly as I could, I attempt to un install and get a refund, can't!!! So am I stuck?

Summary: ..quickly attempted to do my most common types of formatting, and found it very slowly and fiddly on my 

Raw Text: Good app!	Useful for my needs.

Summary: Good app!


Raw Text: I love this app! 
Music on top of music ?

Summary: Music on top of music ?


Raw Text: Good	Outstanding

Summary: 


Raw Text: I keep tabs on my brother's flight info,  as he travels for work quite a bit. I'm a worrier by nature, so it's wonderful to be able to see the progress of his flights. I've always been able to pull them up easily & they've always been accurate. Great job!

Summary: I keep tabs on my brother's flight info,  as he travels for work quite a bit.


Raw Text: This app makeup so easy to take great notes and I really like being able to record audio of someone speaking wile I&#39m typing notes!

Summary: This app makeup so easy to take great notes and I really like being able to record audio of someone speaking wile I&#39m typing notes!


Raw Text: Okay	This game is great. Can't what for the new one

Summary: Okay	This game is great.


Raw Text: Hk$7.7 only	So cheap!!!!

Summary: Hk$7.7 only	So ch

ValueError: max() arg is an empty sequence

In [None]:
summarize(df['stopwords_removal'])

In [None]:
summarize(df['lemmatized_comment'])

In [None]:
summarize(df['stemmed'])

In [None]:
summarize(df['stopwords_removal_nltk'])

In [None]:
summarize(df['stopwords_removal_lemmatization'])