In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK resources if not already installed
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

print("Imported")

Imported


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gkath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gkath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gkath\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
# text = """ “Upon learning of the former president's request, we approached it through the lens of our core values: we open our doors to everyone,” McDonald's said in its message.
# McDonald's said franchisees — who independently own and operate more than 95 per cent of US locations — have also invited Harris and her running mate Tim Walz for a visit.
# Trump has sought to discredit, without evidence, Harris' claim about her job at the Golden Arches in the 1980s. McDonald's said that neither corporate nor franchisees have records for all positions going back to that time.
# The Wall Street Journal earlier reported on the message. """

text='''Delhi Police have sought information from Telegram regarding the "Justice League India" channel after a blast occurred near the CRPF School in Rohini, Delhi. The explosion damaged property but caused no injuries. Telegram has yet to respond, and investigations are ongoing, with no group claiming responsibility.

The inquiry follows an explosion outside the CRPF School in Prashant Vihar, Rohini, where a post containing CCTV footage of the incident was shared on the channel. The police are also looking into other social media platforms.

On Sunday, a mysterious explosion rocked the vicinity of the school, damaging the school wall, nearby shop windows, and a parked car. Fortunately, no injuries were reported. Police received a PCR call about the blast at approximately 7:47 AM and quickly responded to the scene.

Upon arrival, officers discovered a damaged school wall emitting a foul odor, along with shattered windows and a damaged vehicle. Investigations are ongoing, exploring all possible angles.



'''
text = text.replace("\n"," ")
documentspreprocessedData = text.split(".")
print(documentspreprocessedData)

['Delhi Police have sought information from Telegram regarding the "Justice League India" channel after a blast occurred near the CRPF School in Rohini, Delhi', ' The explosion damaged property but caused no injuries', ' Telegram has yet to respond, and investigations are ongoing, with no group claiming responsibility', '  The inquiry follows an explosion outside the CRPF School in Prashant Vihar, Rohini, where a post containing CCTV footage of the incident was shared on the channel', ' The police are also looking into other social media platforms', '  On Sunday, a mysterious explosion rocked the vicinity of the school, damaging the school wall, nearby shop windows, and a parked car', ' Fortunately, no injuries were reported', ' Police received a PCR call about the blast at approximately 7:47 AM and quickly responded to the scene', '  Upon arrival, officers discovered a damaged school wall emitting a foul odor, along with shattered windows and a damaged vehicle', ' Investigations are o

Preprocessing

In [11]:
def preprocess_text(text, use_stemming=False, use_lemmatization=True):
    # 1. Lowercase the text
    text = text.lower()

    # 2. Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)

    # 3. Tokenize the text
    tokens = word_tokenize(text)

    # 4. Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Stemming or Lemmatization
    if use_stemming:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    elif use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

# Example usage

# preprocessed_text = preprocess_text(text)
preprocessedData = [preprocess_text(sentence) for sentence in documentspreprocessedData]
print(preprocessedData)

['delhi police sought information telegram regarding justice league india channel blast occurred near crpf school rohini delhi', 'explosion damaged property caused injury', 'telegram yet respond investigation ongoing group claiming responsibility', 'inquiry follows explosion outside crpf school prashant vihar rohini post containing cctv footage incident shared channel', 'police also looking social medium platform', 'sunday mysterious explosion rocked vicinity school damaging school wall nearby shop window parked car', 'fortunately injury reported', 'police received pcr call blast approximately 747 quickly responded scene', 'upon arrival officer discovered damaged school wall emitting foul odor along shattered window damaged vehicle', 'investigation ongoing exploring possible angle', '']


TF-IDF Ranking

In [12]:
import math
from collections import Counter

In [13]:
def compute_tf(sentence):
    words = sentence.split()
    word_count = len(words)
    tf_values = Counter(words)
    for word in tf_values:
        tf_values[word] = tf_values[word] / word_count
    return tf_values

def compute_idf(sentences):
    num_sentences = len(sentences)
    idf_values = {}
    all_words = set(word for sentence in sentences for word in sentence.split())

    for word in all_words:
        containing_sentence_count = sum(1 for sentence in sentences if word in sentence.split())
        idf_values[word] = math.log(num_sentences / (1 + containing_sentence_count))  # Adding 1 to avoid division by zero

    return idf_values

def compute_tfidf(sentences):
    tf_list = [compute_tf(sentence) for sentence in sentences]

    idf_values = compute_idf(sentences)

    tfidf_list = []
    for tf_values in tf_list:
        tfidf = {}
        for word, tf_value in tf_values.items():
            tfidf[word] = tf_value * idf_values.get(word, 0)
        tfidf_list.append(tfidf)

    return tfidf_list

def score_sentences(sentences, tfidf_list):
    sentence_scores = []
    for i, tfidf in enumerate(tfidf_list):
        score = sum(tfidf.values())  # Sum up the TF-IDF values of all words in the sentence
        sentence_scores.append((sentences[i], score))

    sentence_scores.sort(key=lambda x: x[1], reverse=True)

    return sentence_scores


tfidf_list = compute_tfidf(preprocessedData)

ranked_sentences = score_sentences(preprocessedData, tfidf_list)

for sentence, score in ranked_sentences:
    print(f"Sentence: {sentence} | Score: {score}")


Sentence: police received pcr call blast approximately 747 quickly responded scene | Score: 1.5948868633716144
Sentence: police also looking social medium platform | Score: 1.5892235621451007
Sentence: fortunately injury reported | Score: 1.5695930562023703
Sentence: telegram yet respond investigation ongoing group claiming responsibility | Score: 1.5526986766978637
Sentence: investigation ongoing exploring possible angle | Score: 1.5425620489951595
Sentence: upon arrival officer discovered damaged school wall emitting foul odor along shattered window damaged vehicle | Score: 1.5355380146179711
Sentence: inquiry follows explosion outside crpf school prashant vihar rohini post containing cctv footage incident shared channel | Score: 1.5281335149410131
Sentence: delhi police sought information telegram regarding justice league india channel blast occurred near crpf school rohini delhi | Score: 1.490820830298724
Sentence: sunday mysterious explosion rocked vicinity school damaging school 

Paraphrasing

In [14]:
# pip install git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git

In [15]:
from parrot import Parrot
import torch
import warnings
warnings.filterwarnings("ignore")

In [16]:
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

In [28]:
# parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

# phrases = ["trump sought discredit without evidence harris claim job golden arch 1980s"]
phrases = [text[0] for text in ranked_sentences]
# print(phrases1)
# phrases = ["Can you recommend some upscale restaurants in Newyork?",
#            "What are the famous places we should not miss in Russia?"
# ]
# print(phrases)

content = []
for phrase in phrases:
  print("-"*100)
  print("Input_phrase: ", phrase)
  print("-"*100)
  para_phrases = parrot.augment(input_phrase=phrase, use_gpu=True)
  for para_phrase in para_phrases:
      print(para_phrase)
      content.append(para_phrase[0])

----------------------------------------------------------------------------------------------------
Input_phrase:  police received pcr call blast approximately 747 quickly responded scene
----------------------------------------------------------------------------------------------------
('police received pcr call blast approximately 747 quickly responded scene', 0)
----------------------------------------------------------------------------------------------------
Input_phrase:  police also looking social medium platform
----------------------------------------------------------------------------------------------------
('police also looking social medium platform', 0)
----------------------------------------------------------------------------------------------------
Input_phrase:  fortunately injury reported
----------------------------------------------------------------------------------------------------
('fortunately injury reported', 0)
----------------------------------------

TypeError: 'NoneType' object is not iterable

In [27]:
# print(content)
text = ". ".join(content)
# print(content)
print(text)

police received pcr call blast approximately 747 quickly responded scene. police also looking social medium platform. fortunately injury reported. telegram yet respond investigation ongoing group claiming responsibility. investigation ongoing exploring possible angle. upon arrival officer discovered damaged school wall emitting foul odor along shattered window damaged vehicle. inquiry follows explosion outside crpf school prashant vihar rohini post containing cctv footage incident shared channel. delhi police sought information telegram regarding justice league india channel blast occurred near crpf school rohini delhi. sunday an explosion rocked a nearby school damaging a school wall nearby a shop window a parked car. explosion damaged property caused injury


In [31]:
# parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

# phrases = ["trump sought discredit without evidence harris claim job golden arch 1980s"]
phrases = [text[0] for text in ranked_sentences]

# print(phrases)
for phrase in phrases:
  print("-"*100)
  print("Input_phrase: ", phrase)
  print("-"*100)
  para_phrases = parrot.augment(input_phrase=phrase,
                               use_gpu=False,
                               diversity_ranker="levenshtein",
                               do_diverse=False,
                               max_return_phrases = 10,
                               max_length=32,
                               adequacy_threshold = 0.99,
                               fluency_threshold = 0.90)

  for para_phrase in para_phrases:
   print(para_phrase)

----------------------------------------------------------------------------------------------------
Input_phrase:  trump sought discredit without evidence harris claim job golden arch 1980s
----------------------------------------------------------------------------------------------------
('trump sought discredit without evidence harris claim job golden arch 1980s', 0)
----------------------------------------------------------------------------------------------------
Input_phrase:  wall street journal earlier reported message
----------------------------------------------------------------------------------------------------
('wall street journal earlier reported message', 0)
----------------------------------------------------------------------------------------------------
Input_phrase:  upon learning former president request approached lens core value open door everyone mcdonalds said message
----------------------------------------------------------------------------------------

KeyboardInterrupt: 