# AI6122 Text Management Review Summarizer
Task List:
- [ ] Define and justify what a summarizer is. E.g.
  + a list of keywords
  + a list of key phrases
  + a list of noun-adjective pairs 
  + a list of nounPhrase - adjectivePhrase pairs 
  + a list of representative sentences 
- [ ] Technical challenges to achieve ideal summarization and your solution.
- [ ] Justify why your approach is the best option for each component in your solution.
- [ ] Discuss the limitations of your approach.
- [ ] Evaluate solution with possible alternative solutions (baselines).
- [ ] Randomly choose 3 products to create product review summary. 



## Colab Configuration

In [0]:
# Mount Google Drive at /content/drive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os 
# Switch to the project folder on Drive so the relative paths used later
# (./logs/, ./data/...) resolve against it.
os.chdir('/content/drive/My Drive/Colab Notebooks/txtmgmt/')
# Notebook shell escape: print the working directory to confirm the change.
!pwd

/content/drive/My Drive/Colab Notebooks/txtmgmt


## Import Modules & Configurations

In [0]:
# Notebook shell escape: install spaCy into the current runtime.
!pip install spacy

In [0]:
from collections import OrderedDict
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
import multiprocessing as mp
import datetime
from helpers.duallogger import loggersetup
from helpers.filehelper import is_not_empty_file_exists, write_to_file, load_from_file
import logging

from nltk.corpus import stopwords
import nltk
import re
import heapq
import collections
import operator
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [0]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('punkt')
nltk.download('stopwords')
# Read the corpus through nltk.corpus rather than the imported name `stopwords`:
# this cell rebinds `stopwords` to a plain list below, so `stopwords.words(...)`
# would raise AttributeError the second time the cell is executed.
stopwords_nltk = nltk.corpus.stopwords.words('english')
stopwords_spacy = list(STOP_WORDS)
# Treat a bare newline token as a stopword as well.
stopwords_spacy.append('\n')
# Combined list: every NLTK stopword first, then the entries only spaCy has.
stopwords = stopwords_nltk + list(set(stopwords_spacy) - set(stopwords_nltk))

print("sw nltk: ", len(stopwords_nltk))
print("sw spacy: ", len(stopwords_spacy))
print("combined: ", len(stopwords))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
sw nltk:  179
sw spacy:  327
combined:  383


In [0]:
# Report the CPU count of the current runtime.
cores = mp.cpu_count()
print("Cores:", cores)
# Ask spaCy to use a GPU if it can; the returned flag is printed below.
gpu = spacy.prefer_gpu()
print("GPU:", gpu)

log_dir = './logs/'
# loggersetup is a project helper (helpers.duallogger); presumably it returns a
# logger that writes both to stdout and to a file under log_dir -- TODO confirm.
log = loggersetup(log_dir, stdout_level=logging.DEBUG, file_level=logging.DEBUG)

# log.debug('Debug message, should only appear in the file.')
# log.info('Info message, should appear in file and stdout.')
# log.warning('Warning message, should appear in file and stdout.')
# log.error('Error message, should appear in file and stdout.')

Cores: 2
GPU: False


In [0]:
# Notebook-wide settings.
#   json_file            raw review dump (one JSON object per line)
#   reload_* flags       when True and the matching cache file is usable, the
#                        cells below load the cached result instead of recomputing
#   *_path               location of each cache file
#   clean_reviews        whether the lemmatised / stopword-stripped text is used
parameters = OrderedDict([
    ('json_file', 'CellPhoneReview.json'),
    ('reload_prod_reviews', True),
    ('prod_reviews_path', './data/prod_reviews.data'),
    ('clean_reviews', True),
    ('reload_clean_reviews', True),
    ('cleaned_reviews_path', './data/prod_reviews_cleaned.data'),
    ('reload_collection_frequencies', True),
    ('collection_frequencies_path', './data/word_doc_frequencies.data'),
])

## Data Preprocessing

* convert_lower_case(data)
* lemma(data)
* remove_punctuation(data)
* remove_stop_words(data)

In [0]:
# Build (or reload) the per-product review table.
# The table is rebuilt when reloading is switched off or the cache file is not
# usable (is_not_empty_file_exists is a project helper -- presumably "file exists
# and is non-empty"; TODO confirm).
if not parameters['reload_prod_reviews'] or not is_not_empty_file_exists(parameters['prod_reviews_path']):
    data = pd.read_json(parameters['json_file'], lines = True)
    # One row per product id (asin): all of its review texts joined with spaces.
    prod_reviews = data.groupby(['asin'])['reviewText'].apply(' '.join).reset_index()
    log.info("Writing prod_reviews to %s" % parameters['prod_reviews_path'])
    write_to_file(parameters['prod_reviews_path'], prod_reviews)
else:
    log.info("Reloading prod_reviews from %s" % parameters['prod_reviews_path'])
    prod_reviews = load_from_file(parameters['prod_reviews_path'])

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)
# prod_reviews.head(3) 

[INFO] Reloading prod_reviews from ./data/prod_reviews.data


In [0]:
# Load spaCy's small English pipeline; this `nlp` object is used by the
# cleaning and tokenising code below.
nlp = spacy.load("en_core_web_sm")

# Show which pipeline components were loaded.
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


In [0]:
# Second pipeline used only to split text into sentences: the parser, NER and
# tagger are disabled and a 'sentencizer' component is added in their place.
sentnlp = spacy.load("en_core_web_sm", disable=['parser', 'ner', 'tagger'])
sentnlp.add_pipe(nlp.create_pipe('sentencizer'))

# Clean review text sentence by sentence: lemmatise, lower-case, and drop
# pronoun lemmas, stopwords and punctuation.
def cleanup_text(text, stopwords, punc):
    """Return the cleaned sentences of ``text`` as the ``str()`` of a list.

    ``text`` is split into sentences with the module-level ``sentnlp``
    pipeline. Each sentence is then run through ``nlp`` (parser and NER
    disabled); lemmas are lower-cased and stripped, '-PRON-' lemmas are
    skipped, and any lemma found in ``stopwords`` or in ``punc`` is removed.
    ``punc`` is tested with ``in``, so when it is a string this is a
    substring test (which also removes empty lemmas).

    The result is the string form of the list of cleaned sentences; callers
    in this notebook turn it back into a list with ``eval``.
    """
    cleaned_sentences = []
    for span in sentnlp(text).sents:
        analysed = nlp(span.text, disable=['parser', 'ner'])
        lemmas = [tok.lemma_.lower().strip() for tok in analysed if tok.lemma_ != '-PRON-']
        kept = [lemma for lemma in lemmas if not (lemma in stopwords or lemma in punc)]
        cleaned_sentences.append(' '.join(kept))
    return str(cleaned_sentences)

def tokenize_sentences(text):
    """Split ``text`` into sentences and return ``str()`` of the list.

    Sentence boundaries come from the module-level ``sentnlp`` pipeline; the
    sentence texts are kept unmodified. Callers in this notebook turn the
    returned string back into a list with ``eval``.
    """
    return str([span.text for span in sentnlp(text).sents])

# Add two columns to prod_reviews, both holding str(list-of-sentences):
#   'processedSentences' - cleaned sentences (or the raw ones when cleaning is off)
#   'reviewSentences'    - the original sentences
if parameters['clean_reviews']:
    # Recompute when reloading is off or the cache file is not usable.
    if not parameters['reload_clean_reviews'] or not is_not_empty_file_exists(parameters['cleaned_reviews_path']):
        # Characters removed as punctuation. '.' is not in this set, so full
        # stops survive in the cleaned sentences.
        punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~©'
        log.debug("reviews to be stripped away with stopwords and punctation")
        prod_reviews['processedSentences'] = prod_reviews['reviewText'].apply(lambda x: cleanup_text(x, stopwords, punctuations))
        prod_reviews['reviewSentences'] = prod_reviews['reviewText'].apply(lambda x: tokenize_sentences(x))
        log.info("Writing cleaned reviews to %s" % parameters['cleaned_reviews_path'])
        write_to_file(parameters['cleaned_reviews_path'], prod_reviews)
    else:
        # The cached table replaces prod_reviews entirely.
        log.info("Reloading cleaned reviews from %s" % parameters['cleaned_reviews_path'])
        prod_reviews = load_from_file(parameters['cleaned_reviews_path'])
        
else:
    # No cleaning: both columns hold the raw sentence split.
    log.info("unprocessed reviews will be used")
    prod_reviews['reviewSentences'] = prod_reviews['reviewText'].apply(lambda x: tokenize_sentences(x))
    prod_reviews['processedSentences'] = prod_reviews['reviewSentences']

[INFO] Reloading cleaned reviews from ./data/prod_reviews_cleaned.data


In [0]:
# Spot-check row 7: cleaned sentences first, then the original sentences.
print(prod_reviews['processedSentences'][7])
print(prod_reviews['reviewSentences'][7])

['great product order future .', 'money spend', 'need great car charger need', 'light port little rough charge phone fine .', 'wish little reach use phone passenger seat car comfortably big problem .', 'reach pretty far coil section stress charge port phone people realize eventually break .', 'purchase charger sister girlfriend .', 'samsung galaxy phone need car charger oem fanboy swoop couple remain slightly skeptical actually mail price .', 'happy report definitely receive 2 oem samsung car charger mail seal bag barcode kind normally receive new refurb phone .', 'wrong price certainly argue benefit oem vs. generic output reliability list .', 'buy samsung charge .', 'work perfectly build solidly 20 verizon charger .', 'highly recommend car charger thank great price', 'need pleased .', 'buy cheap lame .', 'charger work great .', 'issue .', 'wish retractable cord like charger use love .']
['This is another great product and i will always order from here in the future.', 'money spent wel

### Statistics

In [0]:
# Character length of each product's concatenated review text, in row order.
len_of_reviews = [len(review_text) for review_text in prod_reviews['reviewText']]

# print(len_of_reviews)
# Each line prints a (position, length) pair for the longest / shortest entry.
print("max len reviews: ", max(enumerate(len_of_reviews), key=(lambda pair: pair[1])))
print("min len reviews: ", min(enumerate(len_of_reviews), key=(lambda pair: pair[1])))


[987, 3055, 8737, 1546, 2601, 1044, 2809, 1594, 5286, 6421, 1145, 924, 1279, 2310, 1413, 5284, 1470, 2379, 2050, 1697, 2506, 2871, 3951, 4655, 1024, 4675, 2027, 23129, 5436, 3252, 13611, 3403, 7409, 1315, 13000, 6340, 3270, 6136, 6294, 6569, 2201, 3691, 7136, 24795, 10461, 3058, 12716, 2255, 4484, 5357, 1419, 5360, 32828, 33031, 7802, 14046, 2464, 6681, 2468, 14797, 2871, 5909, 17836, 7537, 856, 4221, 12838, 9358, 4278, 3261, 10272, 35690, 13419, 2009, 2875, 5043, 5916, 7122, 18592, 1970, 4958, 3667, 11916, 11827, 15578, 7076, 36989, 3015, 1873, 8841, 23967, 2369, 123938, 6587, 4184, 8726, 2876, 7447, 3107, 2407, 6709, 6950, 7313, 21558, 12508, 777, 1848, 6035, 11938, 9370, 20741, 2308, 1419, 4368, 2524, 8422, 10234, 4767, 16375, 17741, 1482, 2796, 8570, 25654, 3617, 9605, 7526, 3573, 6783, 10213, 10238, 11239, 7991, 1054, 3131, 2464, 6330, 2753, 11448, 4392, 11539, 4745, 3703, 4485, 1430, 4259, 3135, 1634, 3191, 12905, 1719, 3825, 6413, 1356, 14284, 1157, 8762, 4887, 13119, 9640, 3155

## Review Summarizer

### Collection Frequency

In [0]:
def collection_frequencies(prod_reviews, use_cleaned) -> dict:
    """Count, for every non-stopword token, how many rows contain it.

    Args:
        prod_reviews: table with 'processedSentences' and 'reviewSentences'
            columns, each cell being the ``str()`` of a list of sentences.
        use_cleaned: truthy to read 'processedSentences', otherwise
            'reviewSentences'.

    Returns:
        dict mapping token text to the number of rows in which it occurs at
        least once. Tokens found in the module-level ``stopwords`` are left out.
    """
    column = 'processedSentences' if use_cleaned else 'reviewSentences'
    doc_counts = {}
    for _, row in prod_reviews.iterrows():
        # NOTE(review): eval() executes the stored string; it is only safe
        # because the column is produced by this notebook itself.
        joined = ' '.join(str(sentence) for sentence in eval(row[column]))
        already_counted = set()
        for token in nlp.make_doc(joined):   # tokenize only
            word = token.text
            if word in already_counted:
                continue
            already_counted.add(word)
            if word in stopwords:
                continue
            doc_counts[word] = doc_counts.get(word, 0) + 1
    return doc_counts

# Compute the per-token document counts, or reload them from the cache file
# when reloading is enabled and the file is usable.
if not parameters['reload_collection_frequencies'] or not is_not_empty_file_exists(parameters['collection_frequencies_path']):
    word_doc_frequencies = collection_frequencies(prod_reviews, parameters['clean_reviews'])
    log.info("Writing collection frequencies to %s" % parameters['collection_frequencies_path'])
    write_to_file(parameters['collection_frequencies_path'], word_doc_frequencies)
else:
    # NOTE(review): a reloaded cache is not checked against the current
    # 'clean_reviews' setting -- confirm it was built from the same column.
    log.info("Reloading collection frequencies from %s" % parameters['collection_frequencies_path'])
    word_doc_frequencies = load_from_file(parameters['collection_frequencies_path'])

# print(word_doc_frequencies)

Output hidden; open in https://colab.research.google.com to view.

In [0]:
# Number of distinct tokens that have a document count.
len(word_doc_frequencies)

139844

### Tokenize Words & Calculate Word Frequency 

Only calculate word frequencies of non-stopwords



In [0]:
def _word_frequency(review) -> dict:
    """Return a dict of token text -> occurrence count for ``review``.

    The text is tokenised with ``nlp.make_doc``; tokens whose text appears in
    the module-level ``stopwords`` are not counted.
    """
    counts = {}
    for token in nlp.make_doc(review):   # tokenize only
        word = token.text
        if word in stopwords:
            continue
        counts[word] = counts.get(word, 0) + 1
    return counts

# wf = _word_frequency(prod_reviews['reviewText'][8])
# print(wf)

### Cosine Similarity

In [0]:
def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of NaN. This matters for ranking: a NaN
    score is sorted last by ``np.argsort`` and would therefore be picked as
    the "most similar" entry.
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return dot(a, b) / denominator

### Sentence-level Summarizer


In [0]:
  # Load the sentence lists of one product (row 10) for ad-hoc inspection.
  # Both columns store str(list), hence the eval() round-trip.
  index = 10
  processedSentences = eval(prod_reviews['processedSentences'][index])
  reviewSentences = eval(prod_reviews['reviewSentences'][index])

### Vector Space Summarizer

In [0]:
def generate_summary(prod_reviews, index, num_of_docs, word_doc_frequencies, num_of_sentences):
    """Pick the sentences most similar to the whole review (tf-idf + cosine).

    A tf-idf vector is built for the product's full cleaned text and for each
    cleaned sentence, over the vocabulary of that product. The sentences whose
    vectors have the highest cosine similarity to the full-text vector are
    returned in their original (uncleaned) form.

    Args:
        prod_reviews: table with 'processedSentences' and 'reviewSentences'
            columns (each cell is the ``str()`` of a list of sentences).
        index: row of ``prod_reviews`` to summarise.
        num_of_docs: number of documents used for the idf term.
        word_doc_frequencies: dict of word -> number of documents containing it.
        num_of_sentences: how many sentences to select.

    Returns:
        ``(summary_text, summary_sentences)``: the selected sentences joined by
        spaces and as a list, ordered from lowest to highest similarity among
        the selected ones. ``("", [])`` when ``num_of_sentences`` is not positive.

    Raises:
        KeyError: if a word of the product is missing from ``word_doc_frequencies``.
    """
    # NOTE(review): eval() executes the stored string; safe only because the
    # columns are produced by this notebook.
    processedSentences = eval(prod_reviews['processedSentences'][index])
    reviewSentences = eval(prod_reviews['reviewSentences'][index])

    # A slice of [-0:] would return every sentence, so handle it explicitly.
    if num_of_sentences <= 0:
        return "", []

    ## generate word frequencies in cleaned
    cleaned_reviews = ' '.join(str(x) for x in processedSentences)
    word_frequencies = _word_frequency(cleaned_reviews)

    # Position of every vocabulary word in the vectors, built once instead of
    # searching the key list for each word of each sentence.
    word_to_idx = {word: pos for pos, word in enumerate(word_frequencies)}

    def _tfidf(term_count, word):
        # Sub-linear term frequency times inverse document frequency.
        return (1 + np.log(term_count)) * np.log(num_of_docs / word_doc_frequencies[word])

    ## generate review vector of len of word frequencies
    review_tf_vec = np.zeros(len(word_frequencies))
    for word, pos in word_to_idx.items():
        review_tf_vec[pos] = _tfidf(word_frequencies[word], word)

    ## generate sentence vectors of len of word frequencies
    sent_tf_vecs = []
    for sentence in processedSentences:
        sent_tf_vec = np.zeros(len(word_frequencies))
        for word, term_count in _word_frequency(sentence).items():
            pos = word_to_idx.get(word)
            if pos is not None:
                sent_tf_vec[pos] = _tfidf(term_count, word)
        sent_tf_vecs.append(sent_tf_vec)

    ## compare cossim between review vector and sentence vectors
    cosine_similarity = []
    for sent_tf_vec in sent_tf_vecs:
        score = cos_sim(review_tf_vec, sent_tf_vec)
        # An all-zero sentence vector (e.g. a sentence made only of stopwords)
        # yields NaN, which argsort places last, i.e. it would be selected as
        # the best match. Score such sentences as 0 instead.
        cosine_similarity.append(score if np.isfinite(score) else 0.0)

    highest_cos_idx = np.argsort(cosine_similarity)[-num_of_sentences:]

    ## original sentences
    summary_sentences = [reviewSentences[idx] for idx in highest_cos_idx]

    return " ".join(summary_sentences), summary_sentences

### KMeans Summarizer

In [0]:
###### Kmeans algorithm ################
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

def kmeans_summary(prod_reviews, index, num_of_docs, word_doc_frequencies, num_of_sentences):
    """Summarise a product by clustering its sentences with KMeans.

    Each cleaned sentence becomes a tf-idf vector over the product's
    vocabulary. The vectors are clustered and, for every cluster, the sentence
    closest (Euclidean distance) to the centroid is selected.

    Args:
        prod_reviews: table with 'processedSentences' and 'reviewSentences'
            columns (each cell is the ``str()`` of a list of sentences).
        index: row of ``prod_reviews`` to summarise.
        num_of_docs: number of documents used for the idf term.
        word_doc_frequencies: dict of word -> number of documents containing it.
        num_of_sentences: number of clusters, i.e. sentences to select. Capped
            at the number of available sentences.

    Returns:
        list of the selected original sentences in their order of appearance;
        an empty list when there are no sentences or ``num_of_sentences`` is
        not positive.

    Raises:
        KeyError: if a word of the product is missing from ``word_doc_frequencies``.
    """
    # NOTE(review): eval() executes the stored string; safe only because the
    # columns are produced by this notebook.
    processedSentences = eval(prod_reviews['processedSentences'][index])
    reviewSentences = eval(prod_reviews['reviewSentences'][index])

    if not processedSentences or num_of_sentences <= 0:
        return []

    ## generate word frequencies in cleaned
    cleaned_reviews = ' '.join(str(x) for x in processedSentences)
    word_frequencies = _word_frequency(cleaned_reviews)

    # Position of every vocabulary word in the vectors, built once instead of
    # searching the key list for each word of each sentence.
    word_to_idx = {word: pos for pos, word in enumerate(word_frequencies)}

    sent_tf_vecs = []
    for sentence in processedSentences:
        sent_tf_vec = np.zeros(len(word_frequencies))
        for word, term_count in _word_frequency(sentence).items():
            pos = word_to_idx.get(word)
            if pos is not None:
                tfidf_value = (1 + np.log(term_count)) * np.log(num_of_docs / word_doc_frequencies[word])
                sent_tf_vec[pos] = tfidf_value
        sent_tf_vecs.append(sent_tf_vec)
    sent_matrix = np.array(sent_tf_vecs)

    # KMeans refuses more clusters than samples, so cap the request.
    n_clusters = min(num_of_sentences, len(sent_tf_vecs))
    est = KMeans(n_clusters=n_clusters, random_state=0).fit(sent_matrix)
    cluster_ids = est.labels_
    centroids = est.cluster_centers_

    # find sentences with shortest dist to centroids
    closest_centers_idx = []
    for cluster_id in range(len(centroids)):
        members = np.flatnonzero(cluster_ids == cluster_id)
        if members.size == 0:
            # No sentence was assigned to this cluster; nothing to pick.
            continue
        dists = euclidean_distances(sent_matrix[members], centroids[cluster_id].reshape(1, -1)).ravel()
        closest_centers_idx.append(int(members[dists.argmin()]))
    closest_centers_idx.sort()

    return [reviewSentences[idx] for idx in closest_centers_idx]

### Structured POS Summarizer

In [0]:
##############preprocessing for stru pos summary#####################################

# Load the raw reviews again (one row per review) for the POS-based summarizer.
data = pd.read_json('CellPhoneReview.json', lines=True)
prod_ids  = data['asin'].values
reviewTexts  = data['reviewText'].values
# Distinct product ids; the cell further down selects a product by its
# position in this array.
unique_id = data['asin'].unique()

# These rebind the module-level `nlp` / `sentnlp` created earlier with freshly
# loaded pipelines configured the same way.
nlp = spacy.load("en_core_web_sm")
sentnlp = spacy.load("en_core_web_sm", disable=['parser', 'ner', 'tagger'])
sentnlp.add_pipe(nlp.create_pipe('sentencizer'))
# Characters removed as punctuation by cleanup_text ('.' is not included).
punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~©'

def cleanup_text(text, stopwords, punc):
    """Return the cleaned sentences of ``text`` as the ``str()`` of a list.

    ``text`` is split into sentences with the module-level ``sentnlp``
    pipeline. Each sentence is then run through ``nlp`` (parser and NER
    disabled); lemmas are lower-cased and stripped, '-PRON-' lemmas are
    skipped, and any lemma found in ``stopwords`` or in ``punc`` is removed.
    ``punc`` is tested with ``in``, so when it is a string this is a
    substring test (which also removes empty lemmas).

    The result is the string form of the list of cleaned sentences; callers
    in this notebook turn it back into a list with ``eval``.
    """
    cleaned_sentences = []
    for span in sentnlp(text).sents:
        analysed = nlp(span.text, disable=['parser', 'ner'])
        lemmas = [tok.lemma_.lower().strip() for tok in analysed if tok.lemma_ != '-PRON-']
        kept = [lemma for lemma in lemmas if not (lemma in stopwords or lemma in punc)]
        cleaned_sentences.append(' '.join(kept))
    return str(cleaned_sentences)

def get_pos_tag(sent, noun_dict):
    """Collect the noun and adjective lemmas of one sentence.

    Args:
        sent: iterable of tokens exposing ``pos_`` and ``lemma_``.
        noun_dict: dict of noun lemma -> running count. It is updated in
            place: every noun occurrence in ``sent`` adds 1 to its lemma.

    Returns:
        ``(nouns, adj)``: lists of noun lemmas and adjective lemmas in token
        order, duplicates included.
    """
    nouns = []
    adj = []
    for tok in sent:
        pos = str(tok.pos_)
        if pos == 'NOUN':
            lemma = str(tok.lemma_)
            # Explicit default instead of a bare `except:` around the increment,
            # which would also have hidden unrelated errors.
            noun_dict[lemma] = noun_dict.get(lemma, 0) + 1
            nouns.append(lemma)
        elif pos == 'ADJ':
            adj.append(str(tok.lemma_))

    return nouns, adj

In [0]:
def get_tf_idf(sentences):
    """Return one tf-idf vector per sentence.

    Relies on module-level state set up by the cell that selects a product:
    ``word_frequencies`` (vocabulary of the product), ``all_words_ls`` (its
    keys as a list, giving each word its vector position), ``unique_id``
    (its length is the document count for the idf term) and
    ``word_doc_frequencies`` (word -> number of documents containing it).

    Words that are missing from either the vocabulary or
    ``word_doc_frequencies`` are left at 0.

    Args:
        sentences: iterable of cleaned sentence strings.

    Returns:
        list of numpy vectors of length ``len(word_frequencies)``, in the
        order of ``sentences``.
    """
    num_of_docs = len(unique_id)
    vocab_size = len(word_frequencies)

    # First position of every word, built once per call: avoids an O(V)
    # list.index search per word and the ValueError it raises for a word that
    # is not part of the vocabulary.
    word_to_idx = {}
    for pos, word in enumerate(all_words_ls):
        word_to_idx.setdefault(word, pos)

    sent_tf_vecs = []
    for sentence in sentences:
        sent_tf_vec = np.zeros(vocab_size)
        for word, term_count in _word_frequency(sentence).items():
            pos = word_to_idx.get(word)
            if pos is None or word not in word_doc_frequencies:
                continue
            tfidf_value = (1 + np.log(term_count)) * np.log(num_of_docs / word_doc_frequencies[word])
            sent_tf_vec[pos] = tfidf_value
        sent_tf_vecs.append(sent_tf_vec)

    return sent_tf_vecs

def get_center_vec(sent_tf_vecs):
    """Return the sentence vector nearest to the centre of ``sent_tf_vecs``.

    A single-cluster KMeans is fitted on the vectors and the vector with the
    smallest Euclidean distance to the resulting centroid is chosen. The
    chosen position is also printed.

    Args:
        sent_tf_vecs: non-empty list of equally sized numpy vectors.

    Returns:
        ``(vector, position)``: the chosen vector and its position in
        ``sent_tf_vecs`` as an int.
    """
    model = KMeans(n_clusters=1, random_state=0).fit(np.array(sent_tf_vecs))
    labels = model.labels_
    centers = model.cluster_centers_
    positions = np.arange(len(sent_tf_vecs))

    # distance of every sentence to the centroid of the cluster it belongs to
    distances = []
    for pos, vec in enumerate(sent_tf_vecs):
        own_center = centers[labels[[pos]], :]
        distances.append(euclidean_distances(vec.reshape(1, -1), own_center)[0][0])

    # one row per sentence: (distance, cluster id, position)
    table = np.concatenate(
        [np.array([distances]), labels.reshape(1, -1), positions.reshape(1, -1)],
        axis=0,
    ).T

    # per cluster, keep the position of the sentence with the smallest distance
    closest_centers_idx = []
    for cluster_id in range(len(centers)):
        members = table[table[:, 1] == cluster_id]
        nearest_row = members[:, 0].argmin()
        closest_centers_idx.append(members[nearest_row][-1])

    print('SELECTED SENT index:', closest_centers_idx)

    winner = int(closest_centers_idx[0])
    return sent_tf_vecs[winner], winner

def uncapitalize(s):
    """Return ``s`` with its first character lower-cased; '' stays ''."""
    if not len(s) > 0:
        return s
    head, tail = s[0], s[1:]
    return head.lower() + tail

In [0]:
# Choose the product to summarise: `index` is a position in `unique_id`.
# Alternatives tried earlier are kept commented out.
#index 90 has issue
# index = 3
index = 5
# index = 6649 #max len
# index = 4570 
# index = 311
print("INDEX: ", index)

# Positions (rows of sentence_frame) of the sentences picked for the summary.
chosen_sent_idx = []

# Parallel per-sentence lists that become the columns of sentence_frame.
ori_sent = []
processed_sent = []
last_sent = []       # 1 for the final sentence of a review, else 0
nouns_master = []
adj_master = []
len_adj = []

uni_id = unique_id[index]

# noun lemma -> number of occurrences across all reviews of this product
noun_dict = dict()
for idx in range(len(prod_ids)):
    if prod_ids[idx] == uni_id:
        text = reviewTexts[idx]
        sent_master = []
        # Full pipeline here: sentence boundaries and POS tags come from `nlp`.
        doc = nlp(text)
        for sent in doc.sents:
#             print(sent)
            ori_sent.append(str(sent))
            clean_sent = cleanup_text(str(sent),stopwords, punctuations)
            # cleanup_text returns str(list); only its first element is kept.
            # NOTE(review): if cleanup_text splits this sentence further, the
            # remaining pieces are dropped -- confirm this is intended.
            processed_sent.append(eval(clean_sent)[0])
            nouns_list, adj_list = get_pos_tag(sent, noun_dict)
            nouns_master.append(nouns_list)
            adj_master.append(adj_list)
            len_adj.append(len(adj_list))
            last_sent.append(0)
#         print('\n')
        # Mark the most recently added sentence as the last one of this review.
        last_sent[-1] = 1
    else:
        continue
        
sentence_frame = pd.DataFrame({'ori_sent': ori_sent, 'processed_sent':processed_sent, 'nouns':nouns_master,
                 'adj':adj_master, 'len_adj': len_adj, 'last_sent':last_sent, 'idx_sent': np.arange(len(ori_sent))})
                              
                              
# get word freq of all words in reviews
cleaned_reviews = ' '.join(str(x) for x in list(sentence_frame['processed_sent'].values)) 
word_frequencies = _word_frequency(cleaned_reviews)
# Position of a word in this list is its position in the tf-idf vectors.
all_words_ls = list(word_frequencies.keys())
# Summary budget: at most 5 sentences, fewer if the product has fewer.
num_of_sentences = min(len(sentence_frame['processed_sent']), 5)
                      
########################### GET  1st SENTENCE (best sent that represents most occurring noun) #############################################
# find sentences with the most occurring noun
# (raises ValueError if no noun was found at all, since noun_dict is then empty)
max_noun = max(noun_dict.items(), key=operator.itemgetter(1))[0]
clean_text_max_nouns = []
max_nouns_idx = []
for idx in range(len(sentence_frame)):
    if max_noun in sentence_frame['nouns'][idx]:
        clean_text_max_nouns.append(sentence_frame['processed_sent'][idx])
        max_nouns_idx.append(sentence_frame['idx_sent'][idx])
        
print('max_noun', max_noun)
                              
# construct tfidf of sentences with max noun
max_noun_tf_vecs = get_tf_idf(clean_text_max_nouns)
                              
# get the most central sentence among those containing the top noun;
# best_noun_sent_idx is a position within the filtered lists, so it is mapped
# back to a sentence_frame row through max_nouns_idx.
bext_noun_vec, best_noun_sent_idx = get_center_vec(max_noun_tf_vecs)
chosen_idx = max_nouns_idx[best_noun_sent_idx]
chosen_sent_idx.append(chosen_idx)
num_of_sentences = num_of_sentences - 1                              

################################################### GET  2nd and 3rd SENTENCE #############################################
#############################################sents that contain adj and are similar to 1st sent###############################
# get best adj sents: every not-yet-chosen sentence with at least one adjective
clean_text_best_adjs = []
best_adjs_idx = []
for idx in range(len(sentence_frame)):
    if idx not in chosen_sent_idx and len(sentence_frame['adj'][idx]) >= 1 :
        clean_text_best_adjs.append(sentence_frame['processed_sent'][idx])
        best_adjs_idx.append(sentence_frame['idx_sent'][idx])

# construct tfidf of sentences with best adj
best_adj_tf_vecs = get_tf_idf(clean_text_best_adjs)
                              
# compute cos sim between best_adj_tf_vec and max_noun_vec, also compute mean tfidf score for each sent
# NOTE(review): both dicts are keyed by the score, so two sentences with the
# same score overwrite each other and only the later one is kept.
adj_sim_dict = dict()
adj_tfidf_dict = dict()
for idx in range(len(best_adj_tf_vecs)):
    sim = cos_sim(best_adj_tf_vecs[idx], bext_noun_vec)
    adj_sim_dict[sim] = best_adjs_idx[idx]
    tfidf_score = np.mean(best_adj_tf_vecs[idx])
    adj_tfidf_dict[tfidf_score] = best_adjs_idx[idx]

# Take up to two sentences with the highest similarity to the first sentence,
# stopping early when the summary budget is used up. Slicing the ranked keys
# copes with fewer than two candidates (indexing [-2] on a single-entry list
# raised IndexError).
for key in sorted(adj_sim_dict, reverse=True)[:2]:
    if num_of_sentences <= 0:
        break
    chosen_sent_idx.append(adj_sim_dict[key])
    num_of_sentences = num_of_sentences - 1
                             
################################################### GET  4th SENTENCE based on pure tfidf score #############################################
################################################### This is to add some diversity/details to the summary###################################
# Add the not-yet-chosen sentence with the highest mean tf-idf score.
# Candidates are visited from best to worst so that an already chosen sentence
# is skipped; the previous loop re-read the top entry on every pass and never
# terminated when that entry had been chosen before.
if num_of_sentences > 0:
    for key in sorted(adj_tfidf_dict, reverse=True):
        idx = adj_tfidf_dict[key]
        if idx not in chosen_sent_idx:
            chosen_sent_idx.append(idx)
            break
################################################### GET LAST SENTENCE #############################################
################################################### Most representative concluding sentence#######################################

# get last sents: the final sentence of each review, unless already chosen
last_sents = []
last_sent_idx = []
for idx in range(len(sentence_frame)):
    if idx not in chosen_sent_idx and sentence_frame['last_sent'][idx] == 1 :
        last_sents.append(sentence_frame['processed_sent'][idx])
        last_sent_idx.append(sentence_frame['idx_sent'][idx])

# get central rep sent for best last sentence
last_sents_tf_vecs = get_tf_idf(last_sents)
# With no candidate left (e.g. a product with very few sentences) there is
# nothing to cluster; get_center_vec would fail on an empty list, so skip it.
if last_sents_tf_vecs:
    _, best_last_sent_idx = get_center_vec(last_sents_tf_vecs)
    # best_last_sent_idx is a position within last_sents; map it back to a
    # sentence_frame row.
    chosen_idx = last_sent_idx[best_last_sent_idx]
    chosen_sent_idx.append(chosen_idx)
                              
# Show the source sentences of the product, one per line, capped at 100.
print('\n')
print('[ORIGINAL]:')
max_ori = 100
full_summary = []
for i in sentence_frame['ori_sent'].values:
  if max_ori > 0:
      print(i.strip() + " ")
      full_summary.append(i.strip() + " ")
      max_ori = max_ori - 1

# Same (capped) sentences again as a single line.
print('\n')
print('[ORIGINAL ONELINE]:')
print("".join(full_summary))

# Reset the budget (it was decremented while selecting sentences above) and
# run the two baseline summarizers for comparison.
# NOTE(review): `index` was used above as a position in `unique_id` but is
# used here as a row of `prod_reviews`; both refer to the same product only if
# the two are ordered identically -- confirm.
num_of_sentences = min(len(sentence_frame['processed_sent']), 5)
_, summary_vector_space = generate_summary(prod_reviews, index=index, num_of_docs=len(prod_reviews), word_doc_frequencies=word_doc_frequencies, num_of_sentences=num_of_sentences)
summary_kmeans = kmeans_summary(prod_reviews, index=index, num_of_docs=len(prod_reviews), word_doc_frequencies=word_doc_frequencies, num_of_sentences=num_of_sentences)

# str.capitalize() upper-cases the first character and lower-cases the rest,
# so capitals inside a sentence (names, "I") are lost in the printed output.
print('\n')
print('[VECTOR SPACE SUMMARY]:')
for i in summary_vector_space:
    print(i.strip().capitalize() + " ")

print('\n')
print('[KMEANS SUMMARY]:')
for i in summary_kmeans:
    print(i.strip().capitalize() + " ")

print('\n')
print('[STRUC POS SUMMARY]:')
for idx in chosen_sent_idx:
    # The sentence chosen last is printed as the conclusion.
    if idx == chosen_sent_idx[-1]:
        # NOTE(review): these two assignments rebind `last_sent` (the per-sentence
        # flag list built earlier) to a string that is not used afterwards;
        # the printed text comes from uncapitalize() below.
        last_sent = sentence_frame['ori_sent'][idx]
        last_sent = last_sent[0].lower() + last_sent[1:]
        print('Overall, ' + uncapitalize(sentence_frame['ori_sent'][idx].strip()) + " ")
    else:
        print(sentence_frame['ori_sent'][idx].strip().capitalize() + " ")

# Baseline: pick num_of_sentences distinct sentences of the product at random.
print('\n')
print('[RANDOM CHOICE SUMMARY]:')
from random import choice
prod_reviews_texts = eval(prod_reviews['reviewSentences'][index])
range_of_indices = list(range(len(prod_reviews_texts)))
summary_sentences = []
# NOTE(review): num_of_sentences was derived from sentence_frame, while the
# candidates come from prod_reviews; choice() raises IndexError if this list
# runs out first -- confirm the two sentence counts always agree.
for i in range(num_of_sentences):
    randIdx = choice(range_of_indices)
    summary_sentences.append(prod_reviews_texts[randIdx])
    # Remove the picked position so no sentence is selected twice.
    range_of_indices.remove(randIdx)

for i in summary_sentences:
    print(i.strip().capitalize() + " ")

INDEX:  5
max_noun charger
SELECTED SENT index: [0.0]
SELECTED SENT index: [1.0]


[ORIGINAL]:
very good charger 
, it woks fine, no complaints!!! 
I would recommend it!! 
It was good, but not using it anymore! 
This product arrived when promised and in the condition promised. 
It is a genuine Blackberry charger. 
It was brand new and worked perfectly. 
For the price, I don't think you can ask for much more than that! 
Great product. 
Use this with my Galaxy S4. 
At this price I bought 2 more. 
One for work and one to keep in the car 
incase 
I ever travel and forget my charger. 
Honestly I have always loved this specific model, since you can just take it with you on the road not worrying if you end up losing it or leaving it where ever you go. 
Though it doesn't exactly meet the fast charging needs of high end phones, it works pretty well on all of them, you just need to be patient. 
I've had this charger well over a year now, and it still works perfectly. 
Most chargers would have br

In [0]:
# Mean tf-idf value of the first adjective-candidate sentence vector
# (the same quantity used as the key of adj_tfidf_dict).
np.mean(best_adj_tf_vecs[0])

0.007771149751624102