In [None]:
# Tokenize for Validation, annual_reports folder

In [5]:
import os
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Specifying the subfolders
annual_reports_folder = 'annual_reports'


# Initialize an empty list to store the DataFrame for each file
df_list = []

# Looping through each file in the annual reports folder
# Give correct folder path for validation folder
for annual_report_file_name in os.listdir(f"/home/student/Desktop/FNS_dataset/validation/{annual_reports_folder}"):
    # Constructing the file paths
    annual_report_path = f"/home/student/Desktop/FNS_dataset/validation/{annual_reports_folder}/{annual_report_file_name}"

    # Extract the base name without extension (e.g., '10' from '10.txt', in below [0] has root name and [1] will have extension part)
    base_name = os.path.splitext(annual_report_file_name)[0]

   

    # Reading the annual report text into a string
    with open(annual_report_path, 'r', encoding='utf-8') as file:
        annual_report_text = file.read()


        # Tokenizing the text into sentences
        annual_report_sentences = sent_tokenize(annual_report_text)
        

    # Filter out sentences with length less than 5
    filtered_sentences = [sentence for sentence in annual_report_sentences if len(word_tokenize(sentence)) >= 5]
   
    # Create a DataFrame with the filtered sentences and labels columns
    df = pd.DataFrame({'sentence': filtered_sentences})

    preprocessed_output_path = '/home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output'
    os.makedirs(preprocessed_output_path, exist_ok=True)

    # Save the DataFrame to a CSV file with an escape character '|'
    output_file_name = f"{base_name}_tokenized.csv"
    output_file_path = f"/home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output/{output_file_name}"
    df.to_csv(output_file_path, index=False, escapechar='|')

    # Display the processed(here labelling) DataFrame
    print(f"Processed data saved to {output_file_path}")

    # Append the DataFrame to the list
    df_list.append(df)

# Concatenate all DataFrames in the list
final_df = pd.concat(df_list, ignore_index=True)

# Save the final DataFrame to a CSV file
final_output_file_path = "/home/student/Desktop/NLP_Output/final_validation_sen_tokenize_data_combined.csv"
final_df.to_csv(final_output_file_path, index=False, escapechar='|')

# Display the final processed DataFrame
print(f"Final processed data saved to {final_output_file_path}")


Processed data saved to /home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output/32020_tokenized.csv
Processed data saved to /home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output/30962_tokenized.csv
Processed data saved to /home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output/32717_tokenized.csv
Processed data saved to /home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output/32248_tokenized.csv
Processed data saved to /home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output/31817_tokenized.csv
Processed data saved to /home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output/32771_tokenized.csv
Processed data saved to /home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output/31678_tokenized.csv
Processed data saved to /home/student/Desktop/NLP_Output/final_validation_sen_tokenize_output/31242_tokenized.csv
Processed data saved to /home/student/Desktop/NLP_Output/final_validation_sen_tokenize_o

In [None]:
# Preprocessing on tokenized validation data

In [6]:
import os
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
import nltk
import re

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

# Specify the subfolders
# labelled_output_folder is where sentences of anuual_reports are stores(this is output o first part(sentence tokenization of documents) of validation annual_reports)
labelled_output_folder = 'final_validation_sen_tokenize_output'
preprocessed_output_folder = 'final_validation_preprocess_output'

# Create the preprocessed output folder if it doesn't exist,
preprocessed_output_path = f"/home/student/Desktop/NLP_Output/{preprocessed_output_folder}"
os.makedirs(preprocessed_output_path, exist_ok=True)

# Punctuation and Special Character Removal 
def remove_punctuation(text):
    return ''.join(char for char in text if char.isalnum() or char.isspace())


lemmatizer = WordNetLemmatizer()

# POS Tagging
def pos_tag_sentence(sentence):
    try:
        tokens = word_tokenize(sentence)
        pos_tags = pos_tag(tokens)
        return pos_tags
    except Exception as e:
        print(f"Error in pos_tag_sentence: {e}")
        return []
# Lemmatization
def lemmatize_sentence(pos_tags):
    try:
        lemmatizer = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))
        lemmatized_tokens = [lemmatizer.lemmatize(word) for word, tag in pos_tags if word.lower() not in stop_words and word not in string.punctuation]
        cleaned_sentence = ' '.join(lemmatized_tokens)
        return cleaned_sentence
    except Exception as e:
        print(f"Error in lemmatize_sentence: {e}")
        return ''

# Remove non-printable characters using regex
def clean_text(text):
    return re.sub(r'[^ -~]', '', text)

# Stop Word Removal
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Loop through each file in the labelled_output folder
for labelled_file_name in os.listdir(f"/home/student/Desktop/NLP_Output/{labelled_output_folder}"):
    # Construct the file paths
    labelled_file_path = f"/home/student/Desktop/NLP_Output/{labelled_output_folder}/{labelled_file_name}"

    try:
        # Read the labelled output text into a DataFrame
        df_labelled = pd.read_csv(labelled_file_path, escapechar='|')

        # Drop rows with missing or empty sentences
        df_labelled = df_labelled.dropna(subset=['sentence'])
        df_labelled = df_labelled[df_labelled['sentence'] != '']

        # Perform preprocessing on each sentence
        df_labelled['pos_tags'] = df_labelled['sentence'].apply(pos_tag_sentence)
        df_labelled['cleaned_sentence'] = df_labelled['pos_tags'].apply(lemmatize_sentence)
        df_labelled['cleaned_sentence'] = df_labelled['cleaned_sentence'].apply(clean_text)
        df_labelled['preprocessed_sentence'] = df_labelled['cleaned_sentence'].apply(remove_punctuation)
        df_labelled['preprocessed_sentence'] = df_labelled['preprocessed_sentence'].apply(remove_stopwords)

        # Create a DataFrame for preprocessed output
        df_preprocessed = pd.DataFrame({
            'sentence': df_labelled['sentence'],
            # 'label': df_labelled['label'],
            'preprocessed_sentence': df_labelled['preprocessed_sentence'],
            # 'pos_tags': df_labelled['pos_tags'],
        })

        # Save the preprocessed DataFrame to a CSV file with an escape character '|'
        preprocessed_file_name = f"{os.path.splitext(labelled_file_name)[0]}_preprocess.csv"
        preprocessed_file_path = f"{preprocessed_output_path}/{preprocessed_file_name}"
        df_preprocessed.to_csv(preprocessed_file_path, index=False, escapechar='|')

        # Display the processed DataFrame
        print(f"Preprocessed data saved to {preprocessed_file_path}")

    except Exception as e:
        print(f"Error processing file: {labelled_file_name}")
        print(e)
        


[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/student/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessed data saved to /home/student/Desktop/NLP_Output/final_validation_preprocess_output/32879_tokenized_preprocess.csv
Preprocessed data saved to /home/student/Desktop/NLP_Output/final_validation_preprocess_output/31053_tokenized_preprocess.csv
Preprocessed data saved to /home/student/Desktop/NLP_Output/final_validation_preprocess_output/30966_tokenized_preprocess.csv
Preprocessed data saved to /home/student/Desktop/NLP_Output/final_validation_preprocess_output/32139_tokenized_preprocess.csv
Preprocessed data saved to /home/student/Desktop/NLP_Output/final_validation_preprocess_output/31462_tokenized_preprocess.csv
Preprocessed data saved to /home/student/Desktop/NLP_Output/final_validation_preprocess_output/30800_tokenized_preprocess.csv
Preprocessed data saved to /home/student/Desktop/NLP_Output/final_validation_preprocess_output/32349_tokenized_preprocess.csv
Preprocessed data saved to /home/student/Desktop/NLP_Output/final_validation_preprocess_output/31050_tokenized_preproc

In [7]:
# Feature extraction on preprocessed validation data

In [9]:
import os
import pandas as pd
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
import numpy as np
import re
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None




# Function to calculate Sentence Length
def calculate_sentence_length(sentence, longest_sentence_length):
    try:
        # Exclude numeric values from tokens
        tokens = [word for word in word_tokenize(str(sentence)) if not word.isnumeric()]

        if not tokens:  # Check if tokens list is empty after excluding numerics
            return format(0.0, '.4f')

        return format(len(tokens) / longest_sentence_length, '.4f')

    except (TypeError, AttributeError) as e:
        print(f"Error in tokenization for sentence: {sentence}. Error: {e}")
        return format(0.0, '.4f')  



# Function to calculate Sentence Position
def calculate_sentence_position(sentence_index, total_sentences):
    try:
        if sentence_index == 0 or sentence_index == total_sentences - 1:
            return 1
        else:
            return format((total_sentences - sentence_index) / total_sentences, '.4f')

    except (TypeError, AttributeError) as e:
        print(f"Error in calculating sentence position for index {sentence_index}. Error: {e}")
        return 0.0  


# Function to calculate Proper Nouns
def calculate_proper_nouns(sentence):
    try:
        tags = pos_tag(word_tokenize(str(sentence)))
        proper_noun_count = sum(1 for word, tag in tags if tag == 'NNP')
        return format(proper_noun_count / len(word_tokenize(str(sentence))),'.4f')

    except (TypeError, AttributeError) as e:
        print(f"Error in POS tagging for sentence: {sentence}. Error: {e}")
        return 0.0  # Handle it appropriately



# Function to calculate Adjectives
def calculate_adjectives(sentence):
    try:
        tags = pos_tag(word_tokenize(str(sentence)))
        adjective_count = sum(1 for word, tag in tags if tag.startswith('JJ'))
        return format(adjective_count / len(word_tokenize(str(sentence))),'.4f')

    except (TypeError, AttributeError) as e:
        print(f"Error in POS tagging for sentence: {sentence}. Error: {e}")
        return 0.0  # Handle it appropriately




# tf-isf
# word_dict contains the no. of times that word present in all sentences
def word_freq(sentences):
  word_dict = {}
  for sent in sentences:
    words = word_tokenize(sent)
    for word in words:
      if word not in word_dict:
        word_dict[word] = 1
      else:
        word_dict[word] += 1
  return word_dict

def tf_isf(sentences):
  word_dict = word_freq(sentences)
  n_sent = len(sentences)
  keys = word_dict.keys()
  tf_isf_score_pre_sentence = []
# isf is calculated below, later isf vales are stored back in word_dict
  for key in keys:
    x = word_dict[key]
    x = math.log(float(n_sent)/x)
    word_dict[key] = x
# temp, stores the no. of times a word appeared in sentence
  for i in range(n_sent):
    sent_val = 0
    sent = sentences[i]
    temp = {}
    words = word_tokenize(sent)
    for word in words:
      if word not in temp:
        temp[word] = 1
      else:
        temp[word] += 1

    keys = temp.keys()
    for key in keys:
      tf = float(temp[key])/len(sent)
      isf = word_dict[key]
      sent_val += (tf*isf)
    tf_isf_score_pre_sentence.append(format(sent_val,'.4f'))
    # print(tf_isf_score_pre_sentence)
  return tf_isf_score_pre_sentence
    

# Function to calculate Cosine Similarity
def cal_cosine_similarity(sentences):
    row_sum=[]
    vectorizer = TfidfVectorizer().fit_transform(sentences)
    similarity_matrix=cosine_similarity(vectorizer)
    for i in range(len(sentences)):
        temp=sum(similarity_matrix[i]) - similarity_matrix[i][i]
        row_sum.append(temp)
    max_similarity_score=max(row_sum)
    for i in range(len(row_sum)):
        
        row_sum[i]= format((float(row_sum[i])/max_similarity_score),'.4f')
    return row_sum
    
def calculate_path(x,preprocessed_file_name):
    number = re.match(r"(\d+)", preprocessed_file_name).group(1)
    return number


# Folder Paths
preprocessed_output_folder = '/home/student/Desktop/NLP_Output/final_validation_preprocess_output'
feature_output_folder = '/home/student/Desktop/NLP_Output/final_validation_feature_output'


df_feature_lis=[]


# Iterate through preprocessed files
for preprocessed_file_name in os.listdir(preprocessed_output_folder):
    preprocessed_file_path = os.path.join(preprocessed_output_folder, preprocessed_file_name)

    try:
        # Read preprocessed file into DataFrame
        df_preprocessed = pd.read_csv(preprocessed_file_path, escapechar='|', encoding='utf-8')


        # Check if the DataFrame is empty
        if df_preprocessed.empty:
            print(f"Warning: DataFrame is empty for {preprocessed_file_path}. Skipping processing.")
            continue


        # Calculate Longest Sentence Length
        longest_sentence_length =  max(df_preprocessed['preprocessed_sentence'].apply(lambda x: len(word_tokenize(x))))

        # Feature Extraction
        # print('BEFORE sentence_length nouns')
        df_preprocessed['sentence_length'] = df_preprocessed['preprocessed_sentence'].apply(
            lambda x: calculate_sentence_length(x, longest_sentence_length)
        )
        # print('AFTER sentence_length nouns')

        df_preprocessed['sentence_position'] = df_preprocessed.index.map(
            lambda x: calculate_sentence_position(x, len(df_preprocessed))
        )
        # print('AFTER sentence_position nouns')
        # print('Before proper nouns')
        df_preprocessed['proper_nouns'] = df_preprocessed['preprocessed_sentence'].apply(
            lambda x: calculate_proper_nouns(x)
        )

        # df_preprocessed['proper_noun'] =  df_preprocessed['preprocessed_sentence'].apply(lambda x: len([pos for word, pos in pos_tag(word_tokenize(x)) if pos == 'NNP']) / len(x.split()) if pd.notnull(x) and len(x.split()) != 0 else 0)

        df_preprocessed['adjectives'] = df_preprocessed['preprocessed_sentence'].apply(
            lambda x: calculate_adjectives(x)
        )

        # df_preprocessed['adjective'] = df_preprocessed.apply(lambda row: len([pos for word, pos in pos_tag(word_tokenize(row['preprocessed_sentence'])) if pos.startswith('JJ')]) / row['sentence_length'] if pd.notnull(row['preprocessed_sentence']) and row['sentence_length'] != 0 else 0, axis=1)
        
        df_preprocessed['tf_isf'] = tf_isf(df_preprocessed['preprocessed_sentence'])
        
        
        df_preprocessed['coisne_similarity']=cal_cosine_similarity(df_preprocessed['preprocessed_sentence'])
        # print('AFTER coisne_similarity nouns')
        df_preprocessed['file_name']= df_preprocessed['preprocessed_sentence'].apply(
            lambda x: calculate_path(x,preprocessed_file_name)
        )


        # Save Feature-Extracted DataFrame
        feature_file_name = f"{os.path.splitext(preprocessed_file_name)[0]}_features.csv"
        feature_file_path = os.path.join(feature_output_folder, feature_file_name)
        df_preprocessed.to_csv(feature_file_path, index=False, escapechar='|')

        print(f"{len(os.listdir(preprocessed_output_folder))}. Features extracted and saved to {feature_file_path}")
        df_feature_lis.append(df_preprocessed)

    
    except Exception as e:
        print(f"Error processing file {preprocessed_file_path}: {e}")



# Concatenate all DataFrames in the list
final_feature_df = pd.concat(df_feature_lis, ignore_index=True)

# Save the final DataFrame to a CSV file
final_feature_output_file_path = "/home/student/Desktop/NLP_Output/final_validation_feature_output_data_combined.csv"
final_feature_df.to_csv(final_feature_output_file_path, index=False, escapechar='|')

# Display the final processed DataFrame
print(f"Final processed data saved to {final_feature_output_file_path}")





363. Features extracted and saved to /home/student/Desktop/NLP_Output/final_validation_feature_output/32025_tokenized_preprocess_features.csv
363. Features extracted and saved to /home/student/Desktop/NLP_Output/final_validation_feature_output/31828_tokenized_preprocess_features.csv
363. Features extracted and saved to /home/student/Desktop/NLP_Output/final_validation_feature_output/31064_tokenized_preprocess_features.csv
363. Features extracted and saved to /home/student/Desktop/NLP_Output/final_validation_feature_output/32966_tokenized_preprocess_features.csv
363. Features extracted and saved to /home/student/Desktop/NLP_Output/final_validation_feature_output/32032_tokenized_preprocess_features.csv
Error processing file /home/student/Desktop/NLP_Output/final_validation_preprocess_output/30779_tokenized_preprocess.csv: expected string or bytes-like object
363. Features extracted and saved to /home/student/Desktop/NLP_Output/final_validation_feature_output/32657_tokenized_preprocess_fe

In [None]:
#  After finishing feature extraction, take the ouput and give to the trained model in file

In [None]:
# load the KNN model correctly, here this current ipynb file and knn model are inside same directory i.e, Home

In [4]:
import pandas as pd
import joblib


validation_df = pd.read_csv('/home/student/Desktop/NLP_Output/final_validation_feature_output_data_combined.csv')  # Replace with the actual file path
# Only numerical data should be given as input, so drop sentence, preprocessed_sentence
validation_df.pop('sentence')
validation_df.pop('preprocessed_sentence') 
# print(validation_df.head())
# Separate features (X) - assuming all relevant feature columns are present
X_validation = validation_df[['sentence_length', 'sentence_position', 'proper_nouns', 'adjectives', 'tf_isf', 'coisne_similarity','file_name']]

# Load the trained models

knn_model = joblib.load('knn_model.joblib')


# Make predictions using the loaded models

knn_predictions = knn_model.predict(X_validation)


# Add columns to the validation DataFrame with predicted labels

validation_df['predicted_label_knn'] = knn_predictions


# Display the modified DataFrame

validation_df.to_csv('/home/student/Desktop/NLP_Output/final_test_predicted_output.csv')



In [None]:
# Based on file_name as common column, we are merging validation_feature_combined, validation_fea_test_ouput_combined
# merging into validation_fea_test_ouput_combined, by this we have sentences along with predicted labels

In [None]:
# adding back sentence, preprocessed_sentence columns back
# kernel is getting crashed, for below part

In [None]:
import pandas as pd

# Assuming validation_df is already in memory
# Assuming validation_feature_combined.csv is a CSV file

# Load the CSV file into a DataFrame
validation_fea_combined = pd.read_csv('/home/student/Desktop/NLP_Output/final_validation_feature_output_data_combined.csv')  
validation_fea_combined.pop('preprocessed_sentence')
validation_fea_test_ouput_combined=pd.read_csv('/home/student/Desktop/NLP_Output/final_test_predicted_output.csv')
# validation_feature_test_ouput_combined.drop(columns=['Unnamed: 0'], inplace=True)
# Merge the dataframes based on a common column, for example, 'id'
merged_df = pd.merge(validation_fea_test_ouput_combined, validation_fea_combined[['file_name', 'sentence']], on='file_name', how='left')

# Display the merged dataframe
# print(merged_df)

merged_df.to_csv('/home/student/Desktop/NLP_Output/final_extractive_summary_output.csv')

In [None]:
# since kernel is getting crashes, we are doing it in chunks

In [1]:
import sys
sys.setrecursionlimit(10**6)

In [1]:
import pandas as pd

# Define the file paths
feature_combined_path = '/home/student/Desktop/NLP_Output/final_validation_feature_output_data_combined.csv'
test_output_combined_path = '/home/student/Desktop/NLP_Output/final_validation_test_predicted_output.csv'
output_csv_path = '/home/student/Desktop/NLP_Output/extractive_summary_output.csv'

# Define chunk size for reading CSV files
chunksize = 100  # Adjust this value based on your system's memory capacity

# Define an iterator to read the feature_combined CSV file in chunks
feature_combined_iter = pd.read_csv(feature_combined_path, chunksize=chunksize)

# Define an iterator to read the test_output_combined CSV file in chunks
test_output_combined_iter = pd.read_csv(test_output_combined_path, chunksize=chunksize)

# Initialize an empty list to store merged chunks
merged_chunks = []

# Iterate over both iterators simultaneously and merge the chunks
for feature_chunk, test_chunk in zip(feature_combined_iter, test_output_combined_iter):
    # Merge the chunks based on the common column 'file_name'
    merged_chunk = pd.merge(test_chunk, feature_chunk[['file_name', 'preprocessed_sentence', 'sentence']], on='file_name', how='left')
    # Append the merged chunk to the list
    merged_chunks.append(merged_chunk)

# Concatenate all merged chunks into a single DataFrame
merged_df = pd.concat(merged_chunks)

# Display the merged dataframe
print(merged_df)

# Save the merged dataframe to a new CSV file
merged_df.to_csv(output_csv_path, index=False)


      Unnamed: 0  sentence_length  sentence_position  proper_nouns  \
0              0           0.1143                1.0        0.5000   
1              0           0.1143                1.0        0.5000   
2              0           0.1143                1.0        0.5000   
3              0           0.1143                1.0        0.5000   
4              0           0.1143                1.0        0.5000   
...          ...              ...                ...           ...   
8095      686289           0.0078                1.0        0.6923   
8096      686289           0.0078                1.0        0.6923   
8097      686289           0.0078                1.0        0.6923   
8098      686289           0.0078                1.0        0.6923   
8099      686289           0.0078                1.0        0.6923   

      adjectives  tf_isf  coisne_similarity  file_name  predicted_label_knn  \
0         0.0385  0.4901             0.5990      32025                  0.0   
1

In [None]:
# take the predicted output, trying to generate summary

In [1]:
import pandas as pd
import os
import re
# Load the CSV file
csv_file_path = '/home/student/Desktop/NLP_Output/extractive_summary_output.csv'
df = pd.read_csv(csv_file_path)

word_number_pattern = re.compile(r'\b\w+\b')

# Filter rows where label is equal to 1
df_positive_labels = df[df['predicted_label_knn'] == 1]

# Create a directory to store the output files
output_directory = '/home/student/Desktop/NLP_Output/final_KNN_summary_word_res_(sent)'
os.makedirs(output_directory, exist_ok=True)

# Group by 'file_name' and iterate through each group
for file_name, group_df in df_positive_labels.groupby('file_name'):
    # Create a text file with the same name as the 'file_name' inside the output directory
    output_file_path = os.path.join(output_directory, 'fns'+str(file_name)+'_'+str(file_name)+'.txt')

    word_count = 0
    summary = ""
    with open(output_file_path, 'w', encoding='utf-8') as file:
      for sentence in group_df['sentence']:
        # Split the sentence into words including numbers
        words = word_number_pattern.findall(sentence)
        # Check if adding the current sentence would exceed the word limit
        for word in words:
            # Increment word count for each word including numbers
            word_count += 1
            if word_count <= 1000:
                summary += word + ' '  # Append the word to the summary
            else:
                break  # Exit the loop if the word limit is reached
        else:
             summary = summary.strip()+'. '
             continue  # Continue to the next sentence if the word limit is not reached
        break  # Exit the loop if the word limit is reached

    # Write the summary to the file
      file.write(summary)

# Print a message indicating the process is complete
print("Text files generated successfully in the 'output_files' directory.")


Text files generated successfully in the 'output_files' directory.


In [None]:
# /home/student/Desktop/FNS_dataset/validation/System , here inside System folder,upload the generated summaries of KNN

In [None]:
# Now run the rouge jar file and see the results in results.csv file inside v1.2.2

In [None]:
# calculating avg of rouge 

In [3]:
import pandas as pd



# Load the CSV file into a DataFrame
file_path = '/home/student/Desktop/rouge2_v1.2.2_runnable/v1.2.2/results.csv'
df = pd.read_csv(file_path)

# Filter rows where ROUGE-Type is 'ROUGE-L+StopWordRemoval'
rouge_l_stopword_removal_df = df[df['ROUGE-Type'] == 'ROUGE-L+StopWordRemoval']

# Calculate the average values
average_avg_recall = rouge_l_stopword_removal_df['Avg_Recall'].mean()
average_avg_precision = rouge_l_stopword_removal_df['Avg_Precision'].mean()
average_avg_f_score = rouge_l_stopword_removal_df['Avg_F-Score'].mean()

# Add new columns to the original DataFrame
df['Average of ROUGE-L+StopWordRemoval in Avg_Recall'] = average_avg_recall
df['Average of ROUGE-L+StopWordRemoval in Avg_Precision'] = average_avg_precision
df['Average of ROUGE-L+StopWordRemoval in Avg_F-Score'] = average_avg_f_score

# Save the modified DataFrame back to the original CSV file
output_file_path = '/home/student/Desktop/NLP_Output/final_avg_knn_rouge.csv'
df.to_csv(output_file_path, index=False)

print("Average Avg_Recall:", average_avg_recall)
print("Average Avg_Precision:", average_avg_precision)
print("Average Avg_F-Score:", average_avg_f_score)
print("New CSV file with additional average columns has been saved.")

Average Avg_Recall: 0.5319071428571428
Average Avg_Precision: 0.49219285714285715
Average Avg_F-Score: 0.5009657142857142
New CSV file with additional average columns has been saved.
