# 1.Extractive Summarization

### a>Sentence Scoring based on Word Frequency

In [18]:
import numpy as np
import pandas as pd
import re
import os
from tqdm import tqdm_notebook
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
##
stop_words.remove('no')
stop_words.remove('not')
from pickle import dump,load
import contractions
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer


In [3]:
reviews = pd.read_csv('Reviews.csv')
reviews.head(2)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [4]:
print(reviews.isnull().sum())

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


In [5]:
reviews.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time'], axis =1, inplace = True)

reviews.dropna(inplace = True)
reviews.head(2)

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


#### Tokenize the sentences

In [12]:
def create_frequency_table(text_string):
    '''
    Create the count of the no of words in the review
    '''
    ps = PorterStemmer()
    frequency_table = dict()
    words = word_tokenize(text_string)
    for word in words:
        word = ps.stem(word)
        if word in stop_words:
            continue
        elif word in frequency_table:
            frequency_table[word] +=1
        else:
            frequency_table[word]  =1
    return frequency_table

In [42]:
def score_sentences(sentences, frequency_table):
    '''
    Score each of the sentences of the review
    '''
    ps = PorterStemmer()
    sentence_value = dict()
    for sentence in sentences:
        sentence_length = len(word_tokenize(sentence))
        
        for word in word_tokenize(sentence):
            if ps.stem(word.lower()) in frequency_table:
                if sentence[:10] in sentence_value:
                    sentence_value[sentence[:10]] += frequency_table[ps.stem(word.lower())]
                else:
                    sentence_value[sentence[:10]]  = frequency_table[ps.stem(word.lower())]
        ## To take care of scenario where sentence is too long. I am taking only the dirst 10 words
        sentence_value[sentence[:10]] = sentence_value[sentence[:10]]/sentence_length ##float division
    return sentence_value

In [44]:
def find_average_score(sentence_value):
    '''
    Find average value of one review
    '''
    
    ## Here I wil divide by no of sentences in one review.
    ## So that I take care of scenario where reviews has too many lines
    sum_values = 0
    for _,count in sentence_value.items():
        sum_values += count
    avg_sum_values = sum_values/len(sentence_value)
    return avg_sum_values
    

In [45]:
def generate_summary(sentences, sentence_value, threshold):
    '''
    Generate Summary
    '''
    sentence_count = 0
    summary = ' '
    for sentence in sentences:
        if sentence[:10] in sentence_value and sentence_value[sentence[:10]] > threshold:
            summary = summary + sentence
            sentence_count += 1
    return summary

In [52]:
for i, review in enumerate(reviews['Text']):
    frequency_table = create_frequency_table(review)
    sentences  = sent_tokenize(review)
    sentence_value = score_sentences(sentences, frequency_table)
    threshold = find_average_score(sentence_value)
    ## here I am selecting the sentences where the score is > 1.1 times the average
    summary = generate_summary(sentences, sentence_value, threshold*1.1) 
    print('Text   :')
    print(review)
    print('Summary:')
    print(summary)
    print('*****************************************************************************')
    if i ==5:
        break

Text   :
I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
Summary:
 My Labrador is finicky and she appreciates this product better than  most.
*****************************************************************************
Text   :
Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
Summary:
 Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted.
*****************************************************************************
Text   :
This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and 

### b>Text Rank Algorithm

In [65]:
import numpy as np
import pandas as pd
import nltk
##nltk.download('punkt')
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import contractions
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.remove('no') ## reverify
stop_words.remove('not') ## reverify
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [54]:
reviews = pd.read_csv('Reviews.csv')
reviews.head(2)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [55]:
print(reviews.isnull().sum())

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64


In [56]:
reviews.drop(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time'], axis =1, inplace = True)

reviews.dropna(inplace = True)
reviews.head(2)

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [57]:
## Using 100D Glove Vectors

word_embeddings = {}
glove_dimension = 100
filename = "D:\Project data\glove_vectors\glove.6B.100d.txt"

with open(filename, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype= 'float32')
            if coefs.shape[0] == glove_dimension:
                word_embeddings[word] = coefs
            else:
                print(word,'  :Embedding Length Exception')
        except:
            print(word,'  :Embedding DataType Exception')
        
print('Total number of words in the Glove Embedding:', len(word_embeddings))

Total number of words in the Glove Embedding: 400000


In [66]:
def clean_text(sentences):
    clean_text = contractions.fix(sentences) 
    clean_text = clean_text.lower()
    clean_text = re.sub(r'[^a-z]', ' ', clean_text)
    clean_text = ' '.join([words for words in clean_text.split() if words not in stop_words])
    clean_text = clean_text.strip()
    return clean_text

In [67]:
## mean of the words vector for each sentences
def vectorize_sentence(clean_sentences):
    sentence_vector = [] 
    for sentence in clean_sentences:
        if len(sentence) != 0:
            vector = np.sum([word_embeddings.get(word, np.zeros((100,))) for word in sentence.split() ], axis =0)
            vector = vector/(len(sentence.split()) + .0001)  ##mean
        else:
            vector = np.zeros((100,))
        sentence_vector.append(vector)
    return sentence_vector

In [68]:
def similarity_matrix(sentence, clean_sentences, sentence_vector):
    ## Similarity Matrix

    sim_matrix = np.zeros([len(clean_sentences), len(clean_sentences)])

    for i in range(len(sentences)):
        for j in range(len(sentences)):
            sim_matrix[i][j] = cosine_similarity(sentence_vector[i].reshape(1,100), sentence_vector[j].reshape(1,100))[0,0]
    return sim_matrix

In [69]:
def page_rank_scores(sim_matrix):
    #Node is the sentence and the transition probability is the similarity matrix value

    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)
    return scores

In [70]:
def get_ranked_sentences(scores,sentences):
    ranked_sentences = []
    ##ranked_sentences =[(scores[i],s) for i,s in enumerate(sentences) ]
    for i, s in enumerate(sentences):
        #print(scores[i])
        #print(s)
        ranked_sentences.append((scores[i],s))

    ranked_sentences = sorted(ranked_sentences, reverse = True)
    return ranked_sentences

In [94]:
for i, review in enumerate(reviews['Text']):
    sentences = []
    #for sentence in  sent_tokenize(review):
        #sentences.append(sentence)
    sentences = sent_tokenize(review)
    clean_sentences = [ clean_text(sentence) for sentence in sentences]
    sentence_vector = vectorize_sentence(clean_sentences)
    sim_matrix = similarity_matrix(sentence, clean_sentences, sentence_vector)
    scores = page_rank_scores(sim_matrix)
    ranked_sentences = get_ranked_sentences(scores,sentences)
    print('Text   :')
    print(review)
    print('Summary:')
    print(ranked_sentences[0][0])
    print(ranked_sentences[0][1])
    print('*****************************************************************************')
    if i ==5:
        break

Text   :
I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
Summary:
0.3407312779435272
The product looks more like a stew than a processed meat and it smells better.
*****************************************************************************
Text   :
Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
Summary:
0.5000000181226637
Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted.
*****************************************************************************
Text   :
This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filber

### c>Unsupervised Learning using Skip-Thought Vectors