In [19]:
import re
import nltk
import math
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np
nltk.download('punkt_tab')
import statistics 


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rajpu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [20]:
def clean_data(filename):
    # Open and read the entire file content
    with open(filename, "r",encoding='unicode-escape') as file:
        filedata = file.read()

    # Split the content into sentences
    article = re.split(r'\.\s+', filedata)

    # Initialize an empty list to hold cleaned sentences
    sent = []

    for sentence in article:
        # Remove non-alphabetic characters and extra spaces
        sentence = re.sub('[^a-zA-Z\s]', "", sentence)
        sentence = re.sub('\s+', " ", sentence).strip()
        if sentence:  # Append only non-empty sentences
            sent.append(sentence)

    # Join the cleaned sentences into a single string
    sent.pop()
    datas = " ".join(sent)
    print("Initial text:")
    print(filedata)
    print('\n')
    print("Cleaned text:")
    print(datas)
    print('\n')

    return sent,filedata

  sentence = re.sub('[^a-zA-Z\s]', "", sentence)
  sentence = re.sub('\s+', " ", sentence).strip()


In [21]:
def count(sent):
    cnt=0
    words=word_tokenize(sent)
    for word in words:
        cnt+=1
    return cnt

def cnt_inSent(sent):
    txt_data=[]
    i=0
    for s in sent:
        i+=1
        data=count(s)
        temp={"id":i, "word_cnt":data}
        txt_data.append(temp)
    return txt_data





In [22]:
def freq_dict(sent):
    i=0
    freq_list=[]
    for s in sent:
        i+=1
        freq={}
        words=word_tokenize(s)
        for char in words:
            char=char.lower()
            if char in freq:
                freq[char]+=1
            else:
                freq[char]=1
            temp={"id":1,"freq_dict":freq}
        freq_list.append(temp)
    return freq_list

In [23]:
def calculate_tf(txt_data,freq_list):
    tf_score=[]
    for item in freq_list:
        ID=item["id"]
        for k in item["freq_dict"]:
            temp={"id":ID,"key":k,"tf_score":item["freq_dict"][k]/txt_data[ID-1]["word_cnt"]}
            tf_score.append(temp)
    return  tf_score

In [24]:
def calculate_idf(txt_data, freq_list):
    idf_score = []
    cnt = 0
    for item in freq_list:
        cnt += 1
        for k in item["freq_dict"]:
            val = sum([k in it["freq_dict"] for it in freq_list])
            temp = {
                "id": cnt,
                "idf_score": math.log((len(txt_data) / (val + 1)) + 1),  # Ensure valid log value
                "key": k
            }
            idf_score.append(temp)
    return idf_score

In [25]:
def tfXidf(idf_score,tf_score):
    tf_idf_score=[]
    for j in idf_score:
        for i in tf_score:
            if j["key"]==i["key"] and j["id"]==i["id"]:
                temp={
                    "id":j["id"],
                    "tfXidf":i["tf_score"]*j["idf_score"],
                    "key":j["key"]
                }
                tf_idf_score.append(temp)
    return tf_idf_score

In [26]:
def ranking(tf_idf_score,sent,txt_data):
    # calculate the ranking of the sentence
    sent_data=[]
    for txt in txt_data:
        score=0
        for i in range(len(tf_idf_score)):
            t_dict=tf_idf_score[i]
            if txt["id"]==t_dict["id"]:
                score+=t_dict["tfXidf"]
        temp={
            "id":txt["id"],
            "score":score,
            "sentence":sent[txt["id"]-1]
        }
        sent_data.append(temp)
    return sent_data


In [27]:
def summary(sent_data):
    cnt=0
    summary=[]
    arr=[]
    for t_dict in sent_data:
        cnt+=t_dict['score']
    avg=cnt/len(sent_data)
    for  temp in sent_data:
        arr.append(temp['score'])
    stdev=statistics.stdev(arr)

    for sent in sent_data:
        if sent['score'] >=(avg + 3*stdev):
            summary.append(sent["sentence"])


    summary=" ".join(summary)
    return summary


In [28]:
tz=Tokenizer()


In [29]:
  
def complete_summary(sent_data, incomplete_summary):
    tz.fit_on_texts([sent_data])
    length = len(tz.word_index) + 1
    input_seq = []
    
    # Generating input sequences
    for sent in sent_data.split(" "):
        token_sent = tz.texts_to_sequences([sent])[0]
        for i in range(1, len(token_sent)):
            input_seq.append(token_sent[:i+1])
    
    max_len = max([len(x) for x in input_seq])
    padded_input_sequences = pad_sequences(input_seq, maxlen=max_len, padding='pre')
    X = padded_input_sequences[:, :-1]
    y = padded_input_sequences[:, -1]
    y = to_categorical(y, num_classes=length)
    z = X.shape[1]
    
    # Model definition
    model = Sequential()
    model.add(Embedding(input_dim=length, output_dim=100, input_length=z))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(length, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Model training
    model.fit(X, y, epochs=100)
    
    last = incomplete_summary[len(incomplete_summary) - 1]
    arr = [incomplete_summary]
    
    # Predicting new words
    while True:
        token_text = tz.texts_to_sequences([incomplete_summary])[0]
        padded_token_text = pad_sequences([token_text], maxlen=max_len, padding='pre')
        
        # Predict the next word
        pos = np.argmax(model.predict(padded_token_text))
        
        # Find the word corresponding to the predicted index
        for word, index in tz.word_index.items():
            if index == pos:
                last = word
                arr.append(word)
                print(word)
                
                # Exit the loop if the predicted word is a period (".")
                if word == ".":
                    break
                
                # Add the new word to the incomplete summary for the next prediction
                incomplete_summary += " " + word
        
        # Break the outer loop if the word is "."
        if last == ".":
            break
    
    # Return the completed summary
    completed_summ = " ".join(arr)
    return completed_summ



    

In [30]:
# Example of correct function calls assuming tf and idf are values
file,clean_text = clean_data(r"G:\text_summarier-main\text_summarier-main\text.txt")
text_data = cnt_inSent(file)
freq = freq_dict(file)

# Ensure tf and idf are calculated and not called as functions
tf_scores = calculate_tf(text_data, freq)
idf_scores = calculate_idf(text_data, freq)

# Multiplying TF and IDF
tfidf_scores = tfXidf(idf_scores, tf_scores)

# Ranking and summarizing
rankings = ranking(tfidf_scores, file, text_data)
result = summary(rankings)
print("Summary:")
print(result +".")


Initial text:
The flickering streetlight outside cast shadows on the walls, dancing to the rhythm of the autumn wind. The city was quiet, save for the occasional hum of distant traffic and the whispers of leaves brushing against the pavement.

It had been five years since Samira left her homeland, a small village nestled in the mountains, for the sprawling metropolis where opportunities were supposed to be endless. Back then, she had dreams of building a life that her family could be proud of. But the city, with its cold towers and relentless pace, had a way of swallowing dreams whole. The promises of success turned into long hours at a job that drained her spirit, far from the warmth and familiarity of home.

Tonight was different, though. Tonight, she was going back.

Samiraâs phone buzzed with a notification, pulling her out of her thoughts. It was from her mother, a short but heartfelt message, âWeâre waiting for you.â Her heart swelled with emotion. It had been too long si

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import numpy as np

def clean_and_tokenize(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    return " ".join([word for word in words if word.lower() not in stop_words])

def summarize_text(text, num_sentences=3):
    # Clean the text
    clean_text = clean_and_tokenize(text)
    
    # Split into sentences
    sentences = sent_tokenize(clean_text)
    
    # Calculate TF-IDF scores
    vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(sentences)
    sentence_scores = np.array(tfidf_matrix.sum(axis=1)).ravel()
    
    # Rank sentences by score
    ranked_sentences = [sent for sent, score in sorted(zip(sentences, sentence_scores), key=lambda x: x[1], reverse=True)]
    
    # Extract top sentences
    extractive_summary = " ".join(ranked_sentences[:num_sentences])
    
    # Optionally, use BERT for refinement
    summarizer = pipeline("summarization")
    abstractive_summary = summarizer(extractive_summary, max_length=27, min_length=20, do_sample=False)
    
    return abstractive_summary[0]['summary_text']

# Example usage
text = clean_text
summary = summarize_text(text)
print(summary)


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


 train finally arrived station, saw mother, brother, father, waiting open arms . long since seen family, felt embrace mother
