In [3]:
# Importing Packages

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import requests
import pandas as pd
import io

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jules\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Import data

url = "https://raw.githubusercontent.com/julesz12345/Text-Summarizer/main/articles.csv"
download = requests.get(url).content
articles = pd.read_csv(io.StringIO(download.decode('utf-8')))

In [5]:
# Reading Article and Splitting it into Sentences

def read_article(articles):
    article = articles.split(". ")
    sentences = []
    for sentence in article:
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    return sentences

In [10]:
# Finding Similarity between Sentences

def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
        
    return 1 - cosine_distance(vector1, vector2)

In [23]:
# Building Similarity Matrix

def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences))) 
    for sent1 in range(len(sentences)):
        for sent2 in range(len(sentences)):
            if sent1 == sent2:
                continue 
            similarity_matrix[sent1][sent2] = sentence_similarity(sentences[sent1], sentences[sent2], stop_words)

    return similarity_matrix

In [30]:
# Extract Summary

def generate_summary(articles, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article(articles)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)       
    for i in range(top_n):
      summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize texr
    print("\n Summarized Text: \n \n", ". ".join(summarize_text))

In [36]:
# Summarize Article

choice = input("Choose a number between 1 and 13368 or paste full article: ")
if len(choice) > 6:
    text = articles['article'][int(choice)]
else:
    text = choice
print ("\n Full Article: \n \n" + text)
generate_summary(text, 1)

Choose a number between 1 and 13368 or paste full article: 12

 Full Article: 
 
12


IndexError: list index out of range

In [39]:
type(int(choice))

int