In [None]:
'''
Extractive summarization using unsupervised learning.
Cosine similarity is used because sentences are represented as word-count vectors.
Cosine similarity measures the angle between two vectors:
the smaller the angle (0 means the same direction), the more similar the sentences.

Input article → split into sentences → remove stop words → build a similarity matrix → 
generate rank based on matrix → pick top N sentences for summary.

'''

In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jolajakobowska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def prepare_article(file_name):
    """Read an article from disk and split it into tokenised sentences.

    Only the first line of the file is used as the article text. That line is
    split into sentences on ". " and every sentence is split into words on
    single spaces. The last element of the sentence list is discarded, and
    the raw lines read from the file are printed.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list of sentences, each one a list of word strings. An empty list
        is returned when the file has no content.
    """
    # The context manager closes the handle even if reading fails.
    with open(file_name, "r") as file:
        file_data = file.readlines()  # every line of the file, newlines included

    print("Given text: \n", file_data, "\n")

    # An empty file has no first line to index, so there is nothing to split.
    if not file_data:
        return []

    # NOTE(review): str.replace treats "[^a-zA-Z]" as literal text, not as a
    # regular expression, so this call leaves normal prose untouched. It is
    # kept as-is because actually stripping punctuation here would also strip
    # it from the summary that is later rebuilt from these tokens.
    sentences = [
        sentence.replace("[^a-zA-Z]", " ").split(" ")
        for sentence in file_data[0].split(". ")
    ]

    # The final element is dropped, as before (presumably the trailing
    # fragment left after the last ". " -- confirm against the input files).
    sentences.pop()
    return sentences
    

In [3]:
# Builds word-count vectors for two sentences and returns their cosine similarity (1 - cosine distance)
def vectors_from_sentences(sentence_1, sentence_2, stopwords=None):
    """Return the cosine similarity of two tokenised sentences.

    Both sentences are lower-cased and turned into word-count vectors over
    their combined vocabulary; words listed in ``stopwords`` are not counted.

    Args:
        sentence_1: First sentence as a list of word strings.
        sentence_2: Second sentence as a list of word strings.
        stopwords: Optional collection of words to ignore. They are compared
            against the lower-cased words. ``None`` means nothing is ignored.

    Returns:
        The cosine similarity of the two count vectors as a float. ``0.0`` is
        returned when either vector is all zeros (empty sentence or only stop
        words), where the cosine is undefined and would otherwise be NaN.
    """
    ignored = set(stopwords) if stopwords is not None else set()

    words_1 = [w.lower() for w in sentence_1]
    words_2 = [w.lower() for w in sentence_2]

    # Give every distinct word a fixed vector position; a dict lookup avoids
    # the linear list.index scan for each word.
    position = {word: i for i, word in enumerate(set(words_1 + words_2))}

    vector_1 = [0] * len(position)
    vector_2 = [0] * len(position)

    for w in words_1:
        if w not in ignored:
            vector_1[position[w]] += 1

    for w in words_2:
        if w not in ignored:
            vector_2[position[w]] += 1

    norm_product = np.sqrt(np.dot(vector_1, vector_1)) * np.sqrt(np.dot(vector_2, vector_2))
    if norm_product == 0:
        # A zero vector has no direction: treat it as "no similarity" rather
        # than dividing by zero and letting NaN leak into the ranking.
        return 0.0

    return float(np.dot(vector_1, vector_2) / norm_product)

In [4]:
# Build the matrix of pairwise cosine similarities between sentences
def do_matrix_of_similarity(sentences, stop_words):
    """Build the pairwise similarity matrix for a list of tokenised sentences.

    Args:
        sentences: List of sentences, each a list of word strings.
        stop_words: Collection of words to ignore when comparing sentences.
            It is forwarded to ``vectors_from_sentences``.

    Returns:
        A square numpy array with one row and column per sentence. Entry
        ``[i][j]`` holds the similarity of sentences ``i`` and ``j``; the
        diagonal stays 0.
    """
    count = len(sentences)
    similarity_matrix = np.zeros((count, count))  # diagonal is left at zero

    # Build the lookup collection once instead of once per sentence pair.
    ignored = set(stop_words) if stop_words else None

    for idx1 in range(count):
        # The similarity is symmetric, so compute each pair once and mirror it.
        for idx2 in range(idx1 + 1, count):
            # Previously stop_words was accepted but never passed on, so stop
            # words were still counted.
            score = vectors_from_sentences(sentences[idx1], sentences[idx2], ignored)
            # A sentence made only of stop words can give an undefined (NaN)
            # cosine; store 0 so the graph weights stay finite.
            if np.isnan(score):
                score = 0.0
            similarity_matrix[idx1][idx2] = score
            similarity_matrix[idx2][idx1] = score

    return similarity_matrix

In [5]:
def give_summary(file_name, top_n=5):
    """Print and return an extractive summary of the article in ``file_name``.

    The article is split into sentences, a pairwise similarity matrix is
    built, the sentences are ranked with PageRank over the graph made from
    that matrix, and the highest-ranked sentences are joined with ". ".

    Args:
        file_name: Path of the text file holding the article.
        top_n: Maximum number of sentences to include. When the article has
            fewer sentences, all of them are used.

    Returns:
        The summary text that was printed.
    """
    stop_words = stopwords.words('english')

    # Prepare the data.
    sentences = prepare_article(file_name)

    # Pairwise sentence similarities, then a graph whose edge weights are
    # those similarities.
    similarity_matrix = do_matrix_of_similarity(sentences, stop_words)
    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)

    # Highest score first.
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Clamp so that asking for more sentences than exist does not raise
    # IndexError; a negative top_n still selects nothing.
    limit = max(0, min(top_n, len(ranked_sentence)))
    summarize_text = [" ".join(words) for _, words in ranked_sentence[:limit]]

    ready_summary = ". ".join(summarize_text)
    print("Summarry: \n", ready_summary)
    return ready_summary


In [6]:
# Summarise the first sample article with its two highest-ranked sentences.
give_summary("test_text.txt", top_n=2)

Given text: 
 ['Financial markets come in a variety of flavors to accommodate the wide array of financial instruments or securities that have been found beneficial to both borrowers and lenders over the years. Primary markets are where newly created (issued) instruments are sold for the first time. Most securities are negotiable. In other words, they can be sold to other investors at will in what are called secondary markets. Stock exchanges, or secondary markets for ownership stakes in corporations called stocks (aka shares or equities), are the most well-known type, but there are also secondary markets for debt, including bonds (evidences of sums owed, IOUs), mortgages, and derivatives and other instruments. Not all secondary markets are organized as exchanges, centralized locations, like the New York Stock Exchange or the Chicago Board of Trade, for the sale of securities. Some are over-the-counter (OTC) markets run by dealers connected via various telecom devices (first by post and

In [7]:
# Summarise the second sample article with its single highest-ranked sentence.
give_summary("test_text2.txt", top_n=1)

Given text: 
 ["This article, delves into the topic of Support Vector Machines(SVM) in Machine Learning, covering the different types of SVM algorithms and how they function. SVM is a widely used supervised machine learning algorithm that can tackle classification and regression problems. To understand how SVM works, we first need to comprehend what supervised means — it involves providing label data as input to the algorithm. Classification problems, where the target has a finite number of possibilities, are tackled with SVM. For instance, if you want to determine whether the mail is a scam, there are only two possibilities. On the other hand, regression problems deal with continuous target labels. For example, if you need to predict an employee's salary increase based on their performance, the salary increase would be continuous. SVM algorithms are generally utilized for classification challenges in machine learning. The objective of the SVM algorithm is to create a decision boundary