In [4]:
import pandas as pd
from nltk import tokenize
df = pd.read_csv('sample_text.csv')

In [5]:
ls_text = []

for i in range(len(df)):
    ls_text.append(tokenize.sent_tokenize(df['context'][i]))

# Sentence Transformers method 1

In [35]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
model = SentenceTransformer('all-MiniLM-L6-v2')

ans_1 = []
for k in range(len(df)):
    #Compute embedding for both lists
    embeddings1 = model.encode(ls_text[k], convert_to_tensor=True)

    a = np.matmul(embeddings1,embeddings1.T)
    a = np.abs(a).numpy()
    a = np.sum(a, axis = 1)
    temp = []
    for i in np.argpartition(a, -10)[-10:]:
        # print(ls_text[k][i])
        temp.append(ls_text[k][i])
    ans_1.append('. '.join(temp))


    #Compute cosine-similarities
    # cosine_scores = util.cos_sim(embeddings1, embeddings2)

    # #Output the pairs with their score
    # for i in range(len(sentences1)):
    #     print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))

In [36]:
ans_1

['This type of symbiosis is relatively uncommon in rudimentary reference texts, but is omnipresent in the natural world.. An example of mutual symbiosis is the relationship between the ocellaris clownfish that dwell among the tentacles of Ritteri sea anemones.. Another non-obligate symbiosis is known from encrusting bryozoans and hermit crabs that live in a close relationship.. Symbiotic relationships include those associations in which one organism lives on another (ectosymbiosis, such as mistletoe), or where one partner lives inside the other (endosymbiosis, such as lactobacilli and other bacteria in humans or Symbiodinium in corals).. While historically, symbiosis has received less attention than other interactions such as predation or competition, it is increasingly recognized as an important selective force behind evolution, with many species having a long history of interdependent co-evolution.. Many biologists restrict the definition of symbiosis to close mutualist relationships

# Sentence Transformers method 2

In [37]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')

ans_2 = []
for i in range(len(df)):
    sents = ls_text[i]
    
    # Query sentences:
    queries = [df['title'][i]]
    corpus_embeddings = embedder.encode(sents, convert_to_tensor=True)



    # Find the closest 10 sentences of the corpus for each query sentence based on cosine similarity

    top_k = min(10, len(sents))
    for query in queries:
        query_embedding = embedder.encode(query, convert_to_tensor=True)

        # We use cosine-similarity and torch.topk to find the highest 1- scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)

        print("\n\n======================\n\n")
        print("Query:", query)
        print("\nTop 10 most similar sentences in corpus:")
        
        temp = []
        for score, idx in zip(top_results[0], top_results[1]):
            # print(sents[idx], "(Score: {:.4f})".format(score))
            temp.append(sents[idx])
        ans_2.append('. '.join(temp))





Query: Symbiosis

Top 10 most similar sentences in corpus:
The definition of symbiosis has varied among scientists. (Score: 0.7749)
Symbiosis (from Greek σύν "together" and βίωσις "living") is close and often long-term interaction between two different biological species. (Score: 0.7628)
This type of symbiosis is relatively uncommon in rudimentary reference texts, but is omnipresent in the natural world. (Score: 0.7476)
Some believe symbiosis should only refer to persistent mutualisms, while others believe it should apply to any type of persistent biological interaction (in other words mutualistic, commensalistic, or parasitic). (Score: 0.6967)
Many biologists restrict the definition of symbiosis to close mutualist relationships. (Score: 0.6674)
Symbiosis played a major role in the co-evolution of flowering plants and the animals that pollinate them. (Score: 0.6501)
This is also known as antagonistic or antipathetic symbiosis. (Score: 0.6460)
Symbiosis is also classified by physica

# TF-IDF method

In [38]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    lemma = WordNetLemmatizer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = lemma.lemmatize(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [39]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [40]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [41]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [42]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [43]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [44]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [45]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    ls = np.argpartition(list(sentenceValue.values()), -10)[-10:]
    threshold = list(sentenceValue.values())[ls[0]]
    # ans = []
    # for i in ls:
    #     ans.append(sentences)
    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [46]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mihir\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [47]:
import math

from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords    
    
'''
We already have a sentence tokenizer, so we just need 
to run the sent_tokenize() method to create the array of sentences.
'''

ans_3 = []

for i in range(len(ls_text)):

    sentences = ls_text[i]
    # 1 Sentence Tokenize
    # sentences = sent_tokenize(text)
    total_documents = len(sentences)
    #print(sentences)

    # 2 Create the Frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)
    #print(freq_matrix)

    '''
    Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
    '''
    # 3 Calculate TermFrequency and generate a matrix
    tf_matrix = _create_tf_matrix(freq_matrix)
    #print(tf_matrix)

    # 4 creating table for documents per words
    count_doc_per_words = _create_documents_per_words(freq_matrix)
    #print(count_doc_per_words)

    '''
    Inverse document frequency (IDF) is how unique or rare a word is.
    '''
    # 5 Calculate IDF and generate a matrix
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
    #print(idf_matrix)

    # 6 Calculate TF-IDF and generate a matrix
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
    #print(tf_idf_matrix)

    # 7 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(tf_idf_matrix)
    #print(sentence_scores)

    # 8 Find the threshold
    threshold = _find_average_score(sentence_scores)
    #print(threshold)

    # 9 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, threshold)
    # print(summary)
    ans_3.append(summary)

In [51]:
df['sentence_transformer_method1'] = ans_1
df['sentence_transformer_method2'] = ans_2
df['tf_idf'] = ans_3
df.to_csv('sample_text_results_1.csv', index=False)

In [51]:
from summa import keywords

In [52]:
TR_keywords = keywords.keywords(summary, scores=True)
print(TR_keywords[0:10])

[('adult', 0.3380449104576557), ('theme', 0.21790166957939905), ('themes', 0.21790166957939905), ('united', 0.21790166957939883), ('originally', 0.19383195948940687), ('original seven', 0.19383195948940676), ('bros', 0.1836023327011601), ('harry', 0.18360233270115994), ('film', 0.17862367456062173)]


In [6]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.5.1.tar.gz (19 kB)
Collecting rich>=10.4.0
  Downloading rich-12.5.1-py3-none-any.whl (235 kB)
Collecting commonmark<0.10.0,>=0.9.0
  Using cached commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
Building wheels for collected packages: keybert
  Building wheel for keybert (setup.py): started
  Building wheel for keybert (setup.py): finished with status 'done'
  Created wheel for keybert: filename=keybert-0.5.1-py3-none-any.whl size=21332 sha256=7f4da0a0827a80cbf122ed41c197f6d28d1deb5a869f7f4cff100b95bf72ddd3
  Stored in directory: c:\users\mihir\appdata\local\pip\cache\wheels\94\18\2a\f26bbcd25924aab452bb4bcc2345a55c07160823d196a264c7
Successfully built keybert
Installing collected packages: commonmark, rich, keybert
Successfully installed commonmark-0.9.1 keybert-0.5.1 rich-12.5.1


In [7]:
from keybert import KeyBERT

In [8]:
kw_model = KeyBERT(model='all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [49]:
keywords = kw_model.extract_keywords(summary, 

                                     keyphrase_ngram_range=(1, 3), 

                                     stop_words='english', 

                                     highlight=False,

                                     top_n=10)

keywords_list= list(dict(keywords).keys())

print(keywords_list)

['potter cursed child', 'harry potter cursed', 'rowling main theme', 'potter cursed', 'story written rowling', 'cursed child play', 'harry potter', 'cursed child', 'original seven books', 'written rowling']


# cdj

In [10]:

#### Pytextrank to find important phrases

import spacy
import pytextrank

# example text
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_lg")

# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")
doc = nlp(text)

# examine the top-ranked phrases in the document
for phrase in doc._.phrases:
    print(phrase.text)
    print(phrase.rank, phrase.count)
    print(phrase.chunks)

mixed types
0.18359439311764025 1
[mixed types]
systems
0.1784796193107821 3
[systems, systems, systems]
minimal generating sets
0.15037838042245094 1
[minimal generating sets]
nonstrict inequations
0.14740065982407313 1
[nonstrict inequations]
strict inequations
0.13946027725597837 1
[strict inequations]
linear Diophantine equations
0.1195023546245721 1
[linear Diophantine equations]
natural numbers
0.11450088293222845 1
[natural numbers]
solutions
0.10780718173686318 3
[solutions, solutions, solutions]
linear constraints
0.10529828014583348 1
[linear constraints]
all the considered types systems
0.1036960590708142 1
[all the considered types systems]
a minimal supporting set
0.08812713074893187 1
[a minimal supporting set]
a system
0.08243620500315359 1
[a system]
a minimal set
0.07944607954086784 1
[a minimal set]
algorithms
0.0763527926213032 1
[algorithms]
all types
0.07593126037016427 1
[all types]
Diophantine
0.07309361902551355 1
[Diophantine]
construction
0.0702090100898443 1
