In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt') # one time execution
import re
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /home/vasil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the WikiHow dataset. The path is relative to the notebook's working
# directory, so the dataset checkout must sit two directories above it.
# Later cells read the 'headline' and 'text' columns of this frame.
df = pd.read_csv('../../WikiHow-Dataset/wikihowAll.csv', delimiter=',')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,headline,title,text
0,"\nKeep related supplies in the same area.,\nMa...",How to Be an Organized Artist1,"If you're a photographer, keep all the necess..."
1,\nCreate a sketch in the NeoPopRealist manner ...,How to Create a Neopoprealist Art Work,See the image for how this drawing develops s...
2,"\nGet a bachelor’s degree.,\nEnroll in a studi...",How to Be a Visual Effects Artist1,It is possible to become a VFX artist without...
3,\nStart with some experience or interest in ar...,How to Become an Art Investor,The best art investors do their research on t...
4,"\nKeep your reference materials, sketches, art...",How to Be an Organized Artist2,"As you start planning for a project or work, ..."


In [10]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2023-02-06 22:51:25--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-02-06 22:51:26--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-02-06 22:51:27--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [4]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# Stored as a set because preprocess_text tests every token for membership.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/vasil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def preprocess_text(text):
    """Lower-case ``text``, tokenize it and filter the tokens.

    A token is kept only when it is not in ``stop_words`` and consists solely
    of alphabetic characters. The surviving tokens are returned as a list, in
    their original order.
    """
    tokens = nltk.word_tokenize(text.lower())
    kept = []
    for token in tokens:
        if token in stop_words or not token.isalpha():
            continue
        kept.append(token)
    return kept

In [6]:
def similarity_matrix(documents):
    """Return the pairwise cosine-similarity matrix of ``documents``.

    Parameters
    ----------
    documents : sequence of array-like
        One vector per document. Every vector must contain the same number
        of elements; any length is accepted (the earlier implementation only
        worked for vectors of exactly 100 elements).

    Returns
    -------
    numpy.ndarray
        A float matrix of shape ``(n, n)`` whose entry ``[i][j]`` is the
        cosine similarity between document ``i`` and document ``j``. The
        diagonal is 0 (a document is not compared with itself), and every
        pair that involves an all-zero vector scores 0. An empty input
        yields a ``(0, 0)`` matrix.
    """
    if len(documents) == 0:
        return np.zeros((0, 0))

    # Flatten each document so (d,), (1, d) and (d, 1) inputs are all accepted.
    vectors = np.vstack([np.asarray(doc, dtype=float).ravel() for doc in documents])

    # Normalize every row to unit length; zero rows are left untouched so they
    # produce a similarity of 0 instead of a division by zero.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_vectors = vectors / norms

    # One matrix product replaces the n*n individual pairwise computations.
    similarity = unit_vectors @ unit_vectors.T
    np.fill_diagonal(similarity, 0.0)
    return similarity

In [7]:
# Build a {word: vector} lookup from the GloVe text file, where every line is
# a word followed by its coefficients separated by whitespace.
word_embeddings = {}
# The context manager closes the file even when a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [8]:
def generate_sentence_vectors(preprocessed_sentences, dim=100):
    """Turn tokenized sentences into vectors by averaging word embeddings.

    Parameters
    ----------
    preprocessed_sentences : iterable of list of str
        One token list per sentence.
    dim : int, optional
        Length of the vectors stored in ``word_embeddings``. Defaults to 100,
        the value that used to be hard-coded, so existing callers are
        unaffected.

    Returns
    -------
    list of numpy.ndarray
        One vector per sentence, in input order. Tokens missing from
        ``word_embeddings`` contribute a zero vector, and a sentence without
        tokens maps to a zero vector of length ``dim``.
    """
    # Placeholder for out-of-vocabulary tokens. It is only ever read: sum()
    # builds new arrays, so sharing one instance across tokens is safe.
    unknown = np.zeros((dim,))

    sentence_vectors = []
    for tokens in preprocessed_sentences:
        if len(tokens) != 0:
            # NOTE(review): the +0.001 is not needed to avoid a zero division
            # (tokens is non-empty here); it is kept so the vectors stay
            # identical to those produced before this change.
            v = sum([word_embeddings.get(w, unknown) for w in tokens]) / (len(tokens) + 0.001)
        else:
            # Fresh array per empty sentence so results never alias each other.
            v = np.zeros((dim,))
        sentence_vectors.append(v)
    return sentence_vectors

In [9]:
def text_rank(text):
    """Rank the sentences of ``text`` with PageRank on a similarity graph.

    The text is split into sentences, each sentence is converted to a vector,
    and the pairwise similarity matrix of those vectors is used as a weighted
    graph. Returns a list of ``(score, sentence)`` tuples sorted in descending
    order (tuples compare by score first, then by sentence text).
    """
    sentences = sent_tokenize(text)
    tokenized = list(map(preprocess_text, sentences))
    vectors = generate_sentence_vectors(tokenized)
    # Node k of the graph corresponds to sentences[k].
    sentence_graph = nx.from_numpy_array(similarity_matrix(vectors))
    scores = nx.pagerank(sentence_graph)
    ranked_sentences = [(scores[idx], sentence) for idx, sentence in enumerate(sentences)]
    ranked_sentences.sort(reverse=True)
    return ranked_sentences

In [10]:
def summarize(text, n=7):
    """Return up to ``n`` of the best-ranked sentences of ``text``, best first."""
    top_ranked = text_rank(text)[:n]
    summary = []
    for _score, sentence in top_ranked:
        summary.append(sentence)
    return summary

In [11]:
def evaluate_rouge(reference_summaries, generated_summaries):
    """Mean set overlap between paired reference and generated summaries.

    Despite the name, the value computed for each pair is the Jaccard index
    ``|reference & generated| / |reference | generated|`` over the unique
    items of both summaries, not a ROUGE n-gram score.

    Parameters
    ----------
    reference_summaries, generated_summaries : iterable of iterable
        Paired summaries; each summary is an iterable of hashable items.
        Pairing stops at the shorter of the two inputs.

    Returns
    -------
    float
        The mean of the per-pair scores. A pair in which both summaries are
        empty scores 0.0 (this used to raise ``ZeroDivisionError``). When no
        pairs are given the result is NaN.
    """
    rouge_scores = []
    for reference_summary, generated_summary in zip(reference_summaries, generated_summaries):
        reference_items = set(reference_summary)
        generated_items = set(generated_summary)
        union = reference_items | generated_items
        if not union:
            # Nothing to compare: count it as no overlap rather than dividing by 0.
            rouge_scores.append(0.0)
            continue
        rouge_scores.append(len(reference_items & generated_items) / len(union))
    if not rouge_scores:
        # Same value np.mean([]) would give, without its RuntimeWarning.
        return float('nan')
    return np.mean(rouge_scores)

In [12]:
from rouge import Rouge

def evaluate_rouge_score(reference_summaries, generated_summaries):
    """Score each generated summary against its reference with ``Rouge``.

    Parameters
    ----------
    reference_summaries, generated_summaries : iterable of iterable of str
        Paired summaries given as lists of sentences. The sentences of each
        summary are joined with single spaces before scoring. Pairing stops
        at the shorter of the two inputs.

    Returns
    -------
    list
        One ``Rouge.get_scores(..., avg=True)`` result per pair, in input
        order. Callers in this notebook index each result as
        ``result['rouge-1']['p']`` / ``['r']`` / ``['f']``.
    """
    # A single scorer is reused for every pair instead of building one per iteration.
    scorer = Rouge()
    rouge_scores = []
    for reference_summary, generated_summary in zip(reference_summaries, generated_summaries):
        reference_text = ' '.join(reference_summary)
        generated_text = ' '.join(generated_summary)
        # get_scores takes the hypothesis first and the reference second.
        rouge_scores.append(scorer.get_scores(generated_text, reference_text, avg=True))
    return rouge_scores


In [13]:
def sent_tokenize_summaries(summary):
    """Split a headline/summary string into a list of sentences.

    Before tokenizing, every run of periods followed by a run of commas and a
    newline is collapsed to a single period plus newline, which removes the
    separator commas that follow each line of the raw headline.
    """
    without_separator_commas = re.sub(r'[.]+[,]+[\n]', ".\n", summary)
    sentences = sent_tokenize(without_separator_commas)
    return sentences


In [14]:
# Build two parallel lists: cleaned reference abstracts and cleaned articles.
# Only the first MAX_ROWS rows of the dataset are examined, and rows that fail
# the filters below are dropped, so the lists may hold fewer than MAX_ROWS items.
MAX_ROWS = 5

headlines = []
articles = []
for index, row in df.head(MAX_ROWS).iterrows():
    abstract = row['headline']
    article = row['text']

    # Skip rows where either field is not a string (e.g. a missing value).
    if not (isinstance(article, str) and isinstance(abstract, str)):
        continue
    # Keep only rows whose abstract is shorter than 75% of the article.
    if len(abstract) >= (0.75 * len(article)):
        continue

    # remove extra commas in abstracts
    abstract = re.sub(r'[.]+[,]+[\n]', ".\n", abstract)
    abstract = abstract.replace(".,", ".")
    # remove extra commas in articles
    article = re.sub(r'[.]+[\n]+[,]', ".\n", article)

    headlines.append(abstract)
    articles.append(article)

In [18]:
reference_summaries = [sent_tokenize_summaries(summary) for summary in headlines]
print("Generating summaries")

generated_summaries = [summarize(text) for text in articles]

print("Evaluating rouge scores")
# Evaluate the generated summaries using the ROUGE score
rouge_scores = evaluate_rouge_score(reference_summaries, generated_summaries)

# Only the ROUGE-1 entry of every result is aggregated.
rouge_1_scores = [k['rouge-1'] for k in rouge_scores]

# Average over the number of pairs that were actually scored. A fixed divisor
# would under-report the means whenever rows were filtered out before scoring.
# max(..., 1) keeps the empty case at 0.0 instead of dividing by zero.
num_scored = max(len(rouge_1_scores), 1)

total_precision = sum(score['p'] for score in rouge_1_scores)
highest_precision = max((score['p'] for score in rouge_1_scores), default=0)

total_recall = sum(score['r'] for score in rouge_1_scores)
highest_recall = max((score['r'] for score in rouge_1_scores), default=0)

total_f = sum(score['f'] for score in rouge_1_scores)
highest_f = max((score['f'] for score in rouge_1_scores), default=0)

print('average rouge score precision:', total_precision / num_scored)
print('average rouge score recall:', total_recall / num_scored)
print('average rouge score f :', total_f / num_scored)

Generating summaries
Evaluating rouge scores
average rouge score precision: 0.17207013947594763
average rouge score recall: 0.3604788014169995
average rouge score f : 0.22896376363718915


### Tests

In [94]:
# Print the cleaned abstract followed by the raw dataset value so the effect
# of the comma clean-up can be compared by eye.
print(headlines[0])
print(df['headline'][0])


Keep related supplies in the same area.
Make an effort to clean a dedicated workspace after every session.
Place loose supplies in large, clearly visible containers.
Use clotheslines and clips to hang sketches, photos, and reference material.
Use every inch of the room for storage, especially vertical space.
Use chalkboard paint to make space for drafting ideas right on the walls.
Purchase a label maker to make your organization strategy semi-permanent.
Make a habit of throwing out old, excess, or useless stuff each month.

Keep related supplies in the same area.,
Make an effort to clean a dedicated workspace after every session.,
Place loose supplies in large, clearly visible containers.,
Use clotheslines and clips to hang sketches, photos, and reference material.,
Use every inch of the room for storage, especially vertical space.,
Use chalkboard paint to make space for drafting ideas right on the walls.,
Purchase a label maker to make your organization strategy semi-permanent.,
Make

In [46]:
# Spot check on the first dataset row only.
# NOTE: this rebinds reference_summaries and generated_summaries, replacing
# the lists produced by the evaluation cell above.
reference_summaries = [sent_tokenize_summaries(df['headline'][0])]
generated_summaries = [summarize(df['text'][0])]

# Dump the raw inputs and both summaries, separated by rulers, for inspection.
print(df['headline'][0])
print("---------------------------------------------")
print(df['text'][0])
print("---------------------------------------------")
print(reference_summaries)
print("---------------------------------------------")
print(generated_summaries)
print("---------------------------------------------")
# Evaluate the generated summaries using the ROUGE score
rouge_score = evaluate_rouge_score(reference_summaries, generated_summaries)
print('ROUGE score:', rouge_score)

[" If you're a photographer, keep all the necessary lens, cords, and batteries in the same quadrant of your home or studio. Paints should be kept with brushes, cleaner, and canvas, print supplies should be by the ink, etc. Make broader groups and areas for your supplies to make finding them easier, limiting your search to a much smaller area. Some ideas include:", "\nEssential supplies area -- the things you use every day.\nInspiration and reference area.\nDedicated work area .\nInfrequent or secondary supplies area, tucked out of the way.;\n, This doesn't mean cleaning the entire studio, it just means keeping the area immediately around the desk, easel, pottery wheel, etc. clean each night. Discard trash or unnecessary materials and wipe down dirty surfaces. Endeavor to leave the workspace in a way that you can sit down the next day and start working immediately, without having to do any work or tidying.", '\nEven if the rest of your studio is a bit disorganized, an organized workspac