**This Colab notebook is adapted from the original Feature Matrix Generation.ipynb. It performs extractive text summarization on a provided input paragraph and is structured for easy integration with Streamlit.**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import gensim.downloader as api

# Fetch the NLTK resources behind word_tokenize / sent_tokenize / pos_tag.
# 'punkt' and 'averaged_perceptron_tagger' are the ids older NLTK releases
# look up; NLTK 3.9+ looks up 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# instead, so both generations are requested to keep the notebook runnable
# on either.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Pre-trained word vectors used by the embedding helpers below; loaded once at
# module level so that every call shares the same model object.
word_embeddings_model = api.load("glove-wiki-gigaword-100")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.




In [None]:
def extract_tfidf(text):
    """Fit a TF-IDF vectorizer on ``text`` alone and return its output.

    The text is treated as a corpus containing a single document.

    Args:
        text: The document to vectorize.

    Returns:
        A ``(matrix, terms)`` pair: the result of ``fit_transform`` on the
        one-document corpus, and the fitted vocabulary as returned by
        ``get_feature_names_out``.
    """
    vectorizer = TfidfVectorizer()
    corpus = [text]
    weights = vectorizer.fit_transform(corpus)
    return weights, vectorizer.get_feature_names_out()


In [None]:

def extract_word_embeddings(text):
    """Average the embedding vectors of the known tokens in ``text``.

    The text is lower-cased and tokenized; tokens that are not present in
    ``word_embeddings_model`` are ignored.

    Args:
        text: The text to embed.

    Returns:
        The element-wise mean of the found token vectors, or a zero vector of
        length ``word_embeddings_model.vector_size`` when no token is known.
    """
    tokens = word_tokenize(text.lower())
    known_vectors = [
        word_embeddings_model[token]
        for token in tokens
        if token in word_embeddings_model
    ]
    if not known_vectors:
        return np.zeros(word_embeddings_model.vector_size)
    return np.mean(known_vectors, axis=0)



In [None]:

def extract_sentence_embeddings(text):
    """Return the mean of the per-sentence embedding vectors of ``text``.

    Each sentence is lower-cased, tokenized, and represented by the mean of
    the vectors of its tokens found in ``word_embeddings_model``. Sentences
    with no known token contribute nothing to the final average.

    Args:
        text: The text to embed.

    Returns:
        The element-wise mean over the sentence vectors, or a zero vector of
        length ``word_embeddings_model.vector_size`` when no sentence produced
        a vector.
    """
    per_sentence = []
    for sentence in sent_tokenize(text):
        known_vectors = [
            word_embeddings_model[token]
            for token in word_tokenize(sentence.lower())
            if token in word_embeddings_model
        ]
        if not known_vectors:
            # No known token in this sentence: leave it out of the average.
            continue
        per_sentence.append(np.mean(known_vectors, axis=0))

    if not per_sentence:
        return np.zeros(word_embeddings_model.vector_size)
    return np.mean(per_sentence, axis=0)


In [None]:

def perform_pos_tagging(text):
    """Tokenize ``text`` and return the result of ``nltk.pos_tag`` on the tokens.

    Args:
        text: The text to tag (its original casing is kept).

    Returns:
        Whatever ``nltk.pos_tag`` returns for the token list.
    """
    return nltk.pos_tag(word_tokenize(text))



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_title_content_similarity(title, content):
    """Return the cosine similarity between word-count vectors of two texts.

    A ``CountVectorizer`` is fitted on the title and the content together, and
    the cosine similarity of the two resulting count vectors is returned.

    Args:
        title: The title text.
        content: The body text.

    Returns:
        The similarity score, or ``0`` when either input is empty or when any
        exception is raised during vectorization (a message is printed in
        both cases).
    """
    if not (title and content):
        print("Error: Title or content is empty.")
        return 0

    documents = [title, content]
    try:
        count_model = CountVectorizer().fit(documents)
        title_vec, content_vec = count_model.transform(documents).toarray()
        return cosine_similarity([title_vec], [content_vec])[0][0]
    except Exception as e:
        # Best effort: report the problem and fall back to "no similarity".
        print("Error computing similarity:", e)
        return 0


In [None]:
def score_calculation(text, title):
    """Collect the available text features for ``text`` into a dictionary.

    The title/content similarity is always computed. Each remaining feature
    is extracted independently: when an extractor raises, the exception
    message is stored under that feature's ``*_error`` key and the other
    features are still attempted.

    Args:
        text: The body text to analyse.
        title: The title compared against ``text``.

    Returns:
        A dict with ``'similarity_score'`` plus, for each feature, either its
        value key (``'tfidf_matrix'``, ``'word_embeddings_vector'``,
        ``'sentence_embeddings_vector'``, ``'pos_tags'``) or the matching
        error key (``'tfidf_error'``, ``'word_embeddings_error'``,
        ``'sentence_embeddings_error'``, ``'pos_tagging_error'``).
    """
    results = {'similarity_score': compute_title_content_similarity(title, text)}

    def _tfidf_array():
        # Only the matrix is kept; the feature names are discarded.
        tfidf_matrix, _ = extract_tfidf(text)
        return tfidf_matrix.toarray()

    # (value key, error key, extractor) in the order the results are stored.
    extraction_steps = (
        ('tfidf_matrix', 'tfidf_error', _tfidf_array),
        ('word_embeddings_vector', 'word_embeddings_error',
         lambda: extract_word_embeddings(text)),
        ('sentence_embeddings_vector', 'sentence_embeddings_error',
         lambda: extract_sentence_embeddings(text)),
        ('pos_tags', 'pos_tagging_error',
         lambda: perform_pos_tagging(text)),
    )

    for value_key, error_key, extract in extraction_steps:
        try:
            results[value_key] = extract()
        except Exception as failure:
            results[error_key] = str(failure)

    return results


In [None]:

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from collections import Counter

def sentence_position_score(total_sentences):
    """Return a linearly decreasing score for each sentence position.

    Position ``i`` (0-based) scores ``1 - i / total_sentences``, so the first
    sentence scores 1.0 and later sentences score progressively less.

    Args:
        total_sentences: Number of sentences to score.

    Returns:
        A list of ``total_sentences`` floats (empty when the count is 0).
    """
    return [
        1 - (rank / total_sentences)
        for rank in range(total_sentences)
    ]

def proper_noun_score(text):
    """Return the fraction of tokens in ``text`` tagged ``'NNP'``.

    Only tokens whose POS tag is exactly ``'NNP'`` are counted (other tags,
    including ``'NNPS'``, are not).

    Args:
        text: The text to analyse.

    Returns:
        ``count('NNP' tokens) / count(all tokens)``, or ``0`` when the text
        contains no tokens (instead of raising ``ZeroDivisionError``).
    """
    # Blank input has no tokens: skip tagging and avoid dividing by zero.
    if not text.strip():
        return 0

    tagged_words = pos_tag(word_tokenize(text))
    if not tagged_words:
        return 0

    proper_noun_count = sum(1 for _, tag in tagged_words if tag == 'NNP')
    return proper_noun_count / len(tagged_words)

def compute_scores(text):
    """Compute the position scores and the proper-noun ratio for ``text``.

    Args:
        text: The text to analyse.

    Returns:
        A ``(position_scores, noun_score)`` pair: one position score per
        sentence (from ``sentence_position_score``) and the proper-noun ratio
        of the whole text (from ``proper_noun_score``).
    """
    # Only the sentence count is needed here; the whole text is not
    # word-tokenized a second time just to discard the result.
    total_sentences = len(sent_tokenize(text))
    position_scores = sentence_position_score(total_sentences)
    noun_score = proper_noun_score(text)
    return position_scores, noun_score


In [None]:
def sentence_scores(text):
    """Print and return the mean position score and proper-noun ratio.

    Args:
        text: The text to analyse.

    Returns:
        A ``(avg_position_score, avg_noun_score)`` pair. The first is the
        mean of the per-sentence position scores (``0.0`` when the text has
        no sentences); the second is the proper-noun ratio reported by
        ``compute_scores``.
    """
    position_scores, noun_score = compute_scores(text)

    # A text without sentences yields no position scores; report 0.0 instead
    # of dividing by zero.
    if position_scores:
        avg_position_score = sum(position_scores) / len(position_scores)
    else:
        avg_position_score = 0.0
    avg_noun_score = noun_score

    print("Average Sentence Position Score:", avg_position_score)
    print("Average Proper Noun Score:", avg_noun_score)
    print()
    return avg_position_score, avg_noun_score

In [None]:
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize

def feature_matrix_generation(title, text):
    """Build and print a single-row, document-level feature matrix.

    The four columns are, in order: title/content similarity, mean sentence
    position score, mean sentence length in tokens, and proper-noun ratio.

    Args:
        title: The title compared against ``text``.
        text: The body text to analyse.

    Returns:
        A float ``numpy`` array of shape ``(1, 4)``. When ``text`` contains no
        sentences, the last three columns are ``0.0`` (instead of raising
        ``ZeroDivisionError``).
    """
    title_similarity = compute_title_content_similarity(title, text)

    total_sentences = len(sent_tokenize(text))
    if total_sentences == 0:
        # Nothing to average over: use neutral values rather than dividing
        # by a zero sentence count.
        avg_position_score = 0.0
        avg_sentence_length = 0.0
        noun_score = 0.0
    else:
        # sentence_scores prints the averages and returns the proper-noun
        # ratio of the whole text; reusing it avoids a second POS-tagging
        # pass over the same text.
        _, noun_score = sentence_scores(text)
        avg_sentence_length = len(word_tokenize(text)) / total_sentences
        avg_position_score = np.mean(sentence_position_score(total_sentences))

    feature_matrix = np.array(
        [[title_similarity, avg_position_score, avg_sentence_length, noun_score]],
        dtype=float,
    )

    print("Feature Matrix:")
    print(feature_matrix)

    return feature_matrix


In [None]:
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

def sentence_position_score(total_sentences):
    """Score sentence positions so that earlier sentences rank higher.

    Args:
        total_sentences: Number of sentences to score.

    Returns:
        A list where entry ``i`` is ``1 - i / total_sentences``; empty when
        ``total_sentences`` is 0.
    """
    positions = range(total_sentences)
    return [1 - (position / total_sentences) for position in positions]

def proper_noun_score(text):
    """Return the share of tokens in ``text`` whose POS tag is ``'NNP'``.

    Args:
        text: The text to analyse.

    Returns:
        The number of ``'NNP'`` tokens divided by the total token count, or
        ``0`` when the text produces no tokens.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    token_count = len(tagged_tokens)
    if token_count == 0:
        return 0
    nnp_count = sum(1 for _, tag in tagged_tokens if tag == 'NNP')
    return nnp_count / token_count

def compute_title_content_similarity(title, content):
    """Return the fraction of distinct title words that also occur in content.

    Both texts are split on whitespace; the comparison is case-sensitive and
    punctuation stays attached to the words.

    Args:
        title: The title text.
        content: The body text.

    Returns:
        ``len(title_words & content_words) / len(title_words)``, with the
        denominator clamped to at least 1 so an empty title yields ``0.0``.
    """
    title_words = set(title.split())
    content_words = set(content.split())
    shared_words = title_words & content_words
    return len(shared_words) / max(len(title_words), 1)

def compute_sentence_matrix(paragraph):
    """Build a per-sentence feature matrix for ``paragraph``.

    Each row describes one sentence with four columns, in order: title
    similarity (a constant 1.0 for every sentence), position score, sentence
    length in tokens, and proper-noun ratio.

    Args:
        paragraph: The text to split into sentences and featurize.

    Returns:
        A 2-D ``numpy`` array with one row per sentence, or ``None`` when the
        paragraph contains no sentences.
    """
    sentences = sent_tokenize(paragraph)
    if not sentences:
        return None

    sentence_count = len(sentences)
    # No title is available here, so every sentence gets the same value.
    title_similarity = 1.0

    position_column = np.array(sentence_position_score(sentence_count))
    noun_column = np.array([proper_noun_score(item) for item in sentences])
    length_column = np.array([len(word_tokenize(item)) for item in sentences])
    title_column = [title_similarity] * sentence_count

    return np.column_stack(
        (title_column, position_column, length_column, noun_column)
    )

# Demo input used to exercise compute_sentence_matrix.
paragraph = """The internet has revolutionized the way we communicate, access information, and conduct business. With billions of users worldwide, it has become an essential part of modern life. From social media platforms connecting people across the globe to e-commerce websites offering a vast array of products and services, the internet has transformed various aspects of society. However, this rapid expansion and reliance on digital technology have also brought challenges such as cybersecurity threats, privacy concerns, and digital divide issues. As we continue to embrace the digital age, it's crucial to address these challenges while harnessing the immense potential of the internet for positive change and innovation."""

# Build the per-sentence matrix and show it, or report that there was nothing
# to featurize.
sentence_matrix = compute_sentence_matrix(paragraph)
if sentence_matrix is None:
    print("No sentences found in the paragraph.")
else:
    print("Sentence Matrix:")
    print(sentence_matrix)


Sentence Matrix:
[[ 1.   1.  16.   0. ]
 [ 1.   0.8 16.   0. ]
 [ 1.   0.6 30.   0. ]
 [ 1.   0.4 27.   0. ]
 [ 1.   0.2 30.   0. ]]


In [None]:
def rbm(sentence_matrix):
    """Transform each row of ``sentence_matrix`` through two stacked RBMs.

    For every row, a first ``BernoulliRBM`` is fitted on that single row and
    used to transform it; a second ``BernoulliRBM`` is then fitted on, and
    applied to, the first one's output. The per-row results are stacked.

    Args:
        sentence_matrix: Iterable of 1-D ``numpy`` rows (one per sentence),
            or ``None``.

    Returns:
        A 2-D array with one row per input row and 150 columns, or ``None``
        when ``sentence_matrix`` is ``None`` or has no rows. Returning
        ``None`` (rather than raising) lets the ``None`` check in
        ``summary_generator`` handle the no-input case.
    """
    # Nothing to enhance: iterating None would raise TypeError and
    # np.vstack([]) would raise ValueError.
    if sentence_matrix is None or len(sentence_matrix) == 0:
        return None

    import numpy as np
    from sklearn.neural_network import BernoulliRBM

    n_components = 150
    learning_rate = 0.1
    n_iter = 70
    enhanced_feature_matrices = []

    # NOTE(review): BernoulliRBM is documented for binary / [0, 1] inputs; if
    # the rows carry raw token counts they fall outside that range. Scaling
    # would change the ranking, so it is left for a deliberate follow-up.
    for sample in sentence_matrix:
        visible = sample.reshape(1, -1)

        # First layer: library-default learning rate and iteration count.
        rbm1 = BernoulliRBM(n_components=n_components, random_state=0)
        rbm1.fit(visible)
        s_prime = rbm1.transform(visible)

        # Second layer: explicit learning rate and iteration count.
        rbm2 = BernoulliRBM(
            n_components=n_components,
            learning_rate=learning_rate,
            n_iter=n_iter,
            random_state=1,
        )
        rbm2.fit(s_prime)
        s_double_prime = rbm2.transform(s_prime)

        enhanced_feature_matrices.append(s_double_prime)

    return np.vstack(enhanced_feature_matrices)


In [None]:
import numpy as np
from nltk.tokenize import sent_tokenize

def summary_generator(text, title, N=5):
    """Build an extractive summary made of the top-``N`` ranked sentences.

    Sentences are ranked by the row sums of the RBM-enhanced sentence matrix;
    the ``N`` highest-ranked sentences are emitted in their original order,
    each followed by a newline.

    Args:
        text: The text to summarize.
        title: The title, passed to ``feature_matrix_generation``.
        N: Maximum number of sentences in the summary; values <= 0 yield an
            empty summary.

    Returns:
        The summary string (``""`` when ``text`` is empty or contains no
        sentences), or ``None`` when the enhanced feature matrix is missing
        or not 2-D.
    """
    # Blank text has no sentences to rank; return an empty summary instead of
    # failing further down the pipeline.
    if not text or not text.strip():
        return ""

    # The document-level matrix is not used for ranking; the call is kept for
    # the diagnostic output it prints.
    feature_matrix_generation(title, text)

    sentence_matrix = compute_sentence_matrix(text)
    if sentence_matrix is None:
        return ""
    enhanced_feature_matrix = rbm(sentence_matrix)

    if enhanced_feature_matrix is None or enhanced_feature_matrix.ndim != 2:
        print("Error: Enhanced feature matrix is not in the expected 2D format.")
        return None

    feature_sums = np.sum(enhanced_feature_matrix, axis=1)
    sorted_indices = np.argsort(feature_sums)[::-1]

    # Keep the N best rows, then restore document order.
    selected_indices = np.sort(sorted_indices[:max(N, 0)])
    sentences = sent_tokenize(text)

    summary_parts = []
    for index in selected_indices:
        if index < len(sentences):
            summary_parts.append(sentences[index] + "\n")
        else:
            print("Index out of range:", index)

    return "".join(summary_parts)

# Demo run: summarize a sample paragraph down to (at most) three sentences.
title = "The Impact of the Internet on Modern Society"
# Note: the closing triple quote sits on its own line, so the text ends with a
# trailing newline.
text = """The internet has revolutionized the way we communicate, access information, and conduct business. With billions of users worldwide, it has become an essential part of modern life. From social media platforms connecting people across the globe to e-commerce websites offering a vast array of products and services, the internet has transformed various aspects of society. However, this rapid expansion and reliance on digital technology have also brought challenges such as cybersecurity threats, privacy concerns, and digital divide issues. As we continue to embrace the digital age, it's crucial to address these challenges while harnessing the immense potential of the internet for positive change and innovation.
"""

# summary_generator also prints intermediate diagnostics before returning.
generated_summary = summary_generator(text, title, N=3)
print("Generated Summary:", generated_summary)


Average Sentence Position Score: 0.6
Average Proper Noun Score: 0.0

Feature Matrix:
[[ 0.5  0.6 23.8  0. ]]
Generated Summary: From social media platforms connecting people across the globe to e-commerce websites offering a vast array of products and services, the internet has transformed various aspects of society.
However, this rapid expansion and reliance on digital technology have also brought challenges such as cybersecurity threats, privacy concerns, and digital divide issues.
As we continue to embrace the digital age, it's crucial to address these challenges while harnessing the immense potential of the internet for positive change and innovation.

