## Imports

In [11]:
import numpy as np
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import sent_tokenize

## Read dataset

In [2]:
df = pd.read_csv('../../WikiHow-Dataset/wikihowAll.csv', delimiter=',')

In [3]:
df.head()

Unnamed: 0,headline,title,text
0,"\nKeep related supplies in the same area.,\nMa...",How to Be an Organized Artist1,"If you're a photographer, keep all the necess..."
1,\nCreate a sketch in the NeoPopRealist manner ...,How to Create a Neopoprealist Art Work,See the image for how this drawing develops s...
2,"\nGet a bachelor’s degree.,\nEnroll in a studi...",How to Be a Visual Effects Artist1,It is possible to become a VFX artist without...
3,\nStart with some experience or interest in ar...,How to Become an Art Investor,The best art investors do their research on t...
4,"\nKeep your reference materials, sketches, art...",How to Be an Organized Artist2,"As you start planning for a project or work, ..."


In [4]:
df = df.dropna()

## Text preprocessing

In [5]:
headlines = []
articles = []
    
def preprocess_data(data_frame, doc_count = 100):
    i = 0

    for index, row in data_frame.iterrows():
        abstract = row['headline']
        article = row['text']
        
        if i > doc_count:
            break
    
        if isinstance(article, str) and isinstance(abstract, str):
            #  a threshold is used to remove short articles with long summaries as well as articles with no summary
            if len(abstract) < (0.75 * len(article)):
                # remove extra commas in abstracts
                abstract = re.sub(r'[.]+[,]+[\n]', ".\n", abstract)
                abstract = abstract.replace(".,", ".")
                # remove extra commas in articles
                article = re.sub(r'[.]+[\n]+[,]', ".\n", article)
                
                headlines.append(abstract)
                articles.append(article)
                i+=1

In [6]:
def sent_tokenize_summaries(summary):
    summary = re.sub(r'[.]+[,]+[\n]', ".\n", summary)
    return sent_tokenize(summary)

## LSA algorithm

In [7]:
def lsa(text, n = 3):
    summaries = []
    preprocessed_sentences = sent_tokenize(text)

    vectorizer = CountVectorizer(stop_words = 'english')
    bag_of_words = vectorizer.fit_transform(preprocessed_sentences)

    svd = TruncatedSVD(n_components = n)
    lsa = svd.fit_transform(bag_of_words)

    i = 0
    while i < n:
        iterator = list(map(lambda lsa_scores: lsa_scores[i], lsa))
        max_value = max(iterator)
        index = iterator.index(max_value)
        sentence = preprocessed_sentences[index]
        summaries.append(sentence)
        i += 1

    return summaries

In [8]:
def summarize(text, n = 3):
    return lsa(text, n)

## Evaluation

In [9]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_bleu_score(reference_summaries, generated_summaries):
    bleu_scores = []
    for reference_summary, generated_summary in zip(reference_summaries, generated_summaries):
        reference_summary = ' '.join(reference_summary)
        generated_summary = ' '.join(generated_summary)
        bleu_scores.append(sentence_bleu([reference_summary], generated_summary))
        
    return bleu_scores

## Generate and evaluate summaries on the WikiHow dataset¶

In [12]:
i = 100

preprocess_data(df, i)

print("Total number of documents: ", i)

Total number of documents:  100


In [13]:
reference_summaries = [sent_tokenize_summaries(summary) for summary in headlines]
print("Generating summaries")

generated_summaries = [summarize(text) for text in articles]

print("Evaluating BLEU scores")
# Evaluate the generated summaries using the BLEU score
bleu_scores = evaluate_bleu_score(reference_summaries, generated_summaries)

# Average BLEU score
avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
print("Average BLEU score:", avg_bleu_score)

Generating summaries
Evaluating BLEU scores
Average BLEU score: 0.2538879568251714


## Demo

In [14]:
print(headlines[0])
print("-------------------------------")
print(articles[0])


Keep related supplies in the same area.
Make an effort to clean a dedicated workspace after every session.
Place loose supplies in large, clearly visible containers.
Use clotheslines and clips to hang sketches, photos, and reference material.
Use every inch of the room for storage, especially vertical space.
Use chalkboard paint to make space for drafting ideas right on the walls.
Purchase a label maker to make your organization strategy semi-permanent.
Make a habit of throwing out old, excess, or useless stuff each month.
-------------------------------
 If you're a photographer, keep all the necessary lens, cords, and batteries in the same quadrant of your home or studio. Paints should be kept with brushes, cleaner, and canvas, print supplies should be by the ink, etc. Make broader groups and areas for your supplies to make finding them easier, limiting your search to a much smaller area. Some ideas include:


Essential supplies area -- the things you use every day.
Inspiration and 

In [15]:
reference_summaries = [sent_tokenize_summaries(df['headline'][0])]
generated_summaries = [summarize(df['text'][0])]

print(reference_summaries)
print("---------------------------------------------")
print(generated_summaries)
print("---------------------------------------------")
# Evaluate the generated summaries using the BLEU score
bleu_score = evaluate_bleu_score(reference_summaries, generated_summaries)[0]
print('BLEU score:', bleu_score)

[['\nKeep related supplies in the same area.', 'Make an effort to clean a dedicated workspace after every session.', 'Place loose supplies in large, clearly visible containers.', 'Use clotheslines and clips to hang sketches, photos, and reference material.', 'Use every inch of the room for storage, especially vertical space.', 'Use chalkboard paint to make space for drafting ideas right on the walls.', 'Purchase a label maker to make your organization strategy semi-permanent.', 'Make a habit of throwing out old, excess, or useless stuff each month.']]
---------------------------------------------
[["Cheap and easy, this is also a good way to handle papers and ideas you touch regularly or need to pin up and down for inspiration., Shelving is an artist's best friend and is a cheap and easy way to get more room in your studio or art space.", 'The upper reaches of the room are often the most under-utilized, but provide vital space for all your tools and materials., Turning one wall into a 