### Data Preparation

In [7]:
import pandas as pd
import torch
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning import Trainer
from torch.utils.data import Dataset, DataLoader


In [8]:
df = pd.read_csv('../result/ArgKP21+predictions(v2).csv')
print(df.columns)

Index(['topic', 'stance', 'arguments', 'kep_points', 'predict_kps'], dtype='object')


In [9]:
import ast
train_input = []
train_summary = []
val_input = []
val_summary = []
test_input = []
test_summary = []
for index,row in df.iterrows():
    input = ' '
    output = ' '
    arguments = df.at[index, 'arguments']
    summaries = df.at[index, 'predict_kps']
    arguments = ast.literal_eval(arguments)
    summaries = ast.literal_eval(summaries)
    for argument in arguments:
        if not argument.endswith('.'):
            argument += '.'
        input += argument
    for summary in summaries:
        if not summary.endswith('.'):
            summary += '.'
        output += summary
    if index in (56,57,58,59,60,61):
        test_input.append(input)
        test_summary.append(output)
    elif index in (8,9,16,17,18,19,30,31):
        val_input.append(input)
        val_summary.append(output)
    else:
        train_input.append(input)
        train_summary.append(output)

In [10]:
class SummarizationDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_input_length=512, max_target_length=128):
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.texts = texts
        self.summaries = summaries

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        input_text = self.texts[idx]
        target_summary = self.summaries[idx]

        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        targets = self.tokenizer.encode(
            target_summary,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets.flatten()
        }

In [11]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from transformers import AdamW

class SummarizationModel(pl.LightningModule):
    def __init__(self, model_name, tokenizer, learning_rate=2e-5):
        super(SummarizationModel, self).__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.tokenizer = tokenizer
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        self.log('test_loss', loss)
        return loss

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.learning_rate)


### Training Phase

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "hyunwoongko/ctrlsum-cnndm"
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_dataset = SummarizationDataset(train_input, train_summary, tokenizer)
val_dataset = SummarizationDataset(val_input, val_summary, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)
model = SummarizationModel(model_name=model_name, tokenizer=tokenizer)

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='./checkpoint',
    filename='nli_model-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,
    mode='min'
)

early_stopping = EarlyStopping(
    monitor="val_loss",
    min_delta=0.01,
    patience=3
)

# Determine the accelerator type based on GPU availability
accelerator = "gpu" if torch.cuda.is_available() else "cpu"

# Define trainer
trainer = Trainer(
    min_epochs=0,  # Adjust as needed
    max_epochs=20,
    callbacks=[checkpoint_callback, early_stopping],
    accelerator=accelerator  # Automatically selects GPU if available, otherwise uses CPU
)

# Train the model
trainer.fit(model, train_loader, val_loader)
print("Train finished")

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
G:\Program Files\KPA prompt-based\venv\lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
G:\Program Files\KPA prompt-based\venv\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:653: Checkpoint directory G:\Program Files\KPA prompt-based\ctrlsum\checkpoint exists and is not empty.

  | Name  | Type                         | Params
----------------------------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

G:\Program Files\KPA prompt-based\venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Train finished


G:\Program Files\KPA prompt-based\venv\lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


### Predict

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_checkpoint = './checkpoint/nli_model-epoch=02-val_loss=1.85.ckpt'
# Load the model from the checkpoint
model_name = "hyunwoongko/ctrlsum-cnndm"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
loaded_model = SummarizationModel.load_from_checkpoint(model_name=model_name,tokenizer = tokenizer, checkpoint_path=model_checkpoint )


# Set the model to evaluation mode
loaded_model.eval()
print("hello world")



hello world


### Test_data

In [75]:
import ast
test_argument_list = df.at[61,'arguments']
test_arguments = ast.literal_eval(test_argument_list)
test_arguments = test_arguments[:50]
print(len(test_arguments))

50


In [76]:
test_input = ' '
for argument in test_arguments:
    if not argument.endswith('.'):
        argument += '.'
    test_input += argument

In [77]:
print(test_input)

 he state of anguish that many of the large cities of the USA are experiencing goes against the quality of life. Laws in some inflexible cases go against ordinary citizens.healthcare and education are extremely expensive to middle class.High crime rates, racism, xenophobia, high tax rates, many negative points, it is not a good country to live in.I'm afraid there are some worries, because of the racism there, and that strangers might not get the same good treatment.If you have a dark skin color or you are Latin the opportunities are reduced to the minimum ... I consider that there are other countries like Europe for example ... the United States is not the chimera.in some parts you are not given the opportunity to work just because you are an immigrant or because of your skin color.in the USA the health system is very expensive and discriminates against the poor population.It could be said that they are also in the middle of the war and are prone to one day suffer a disaster.it doesn't

In [78]:
# Example text input
# input_text = "today plan => My name is Kevin. I love dogs. I loved dogs from 1996. Today, I'm going to walk on street with my dogs"
input_text = "USA good=> "+test_input
# Tokenize input text
inputs = tokenizer(input_text, return_tensors="pt")

# Generate summary
summary_ids = loaded_model.model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], num_beams=5,max_length=300, early_stopping=True)

# Decode and print generated summary
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", generated_summary)

Generated Summary:  The USA is not a good country to live in because of its high cost of living and lack of healthcare for all citizens.The USA has a high rate of violence and is often seen as a hotbed of racism against minority cultures.The country's political system is often viewed as being too restrictive, leading to a lack of political stability and a perceived lack of accomplishment.The U.S. does not provide universal health care or education, making it difficult for the middle class to access healthcare and education.The culture of unbridled consumerism, fueled by an endless search for money and status, makes the USA a difficult place to live.


In [None]:
###
'''
Generated Summary:  The USA is not a good country to live in because of its high cost of living and lack of healthcare for all citizens.The USA has a high rate of violence and is often seen as a hotbed of racism against minority cultures.The country's political system is often viewed as being too restrictive, leading to a lack of political stability and a perceived lack of accomplishment.The U.S. does not provide universal health care or education, making it difficult for the middle class to access healthcare and education.The culture of unbridled consumerism, fueled by an endless search for money and status, makes the USA a difficult place to live.
'''

In [90]:
import textstat
import language_tool_python
import numpy as np
import networkx as nx

# Example sentences
sentences = [
    "The USA is not a good country to live in because of its high cost of living and lack of healthcare for all citizens.",
    "The USA has a high rate of violence and is often seen as a hotbed of racism against minority cultures.",
    "The country's political system is often viewed as being too restrictive, leading to a lack of political stability and a perceived lack of accomplishment.",
    "The U.S. does not provide universal health care or education, making it difficult for the middle class to access healthcare and education.",
    "The culture of unbridled consumerism, fueled by an endless search for money and status, makes the USA a difficult place to live."
]

# Function to calculate length of sentences
def calculate_length(sentences):
    return np.array([len(sentence.split()) for sentence in sentences])


# Function to calculate grammar score using language_tool_python
def calculate_grammar_score(sentences):
    tool = language_tool_python.LanguageTool('en-US')
    grammar_scores = []
    for sentence in sentences:
        matches = tool.check(sentence)
        if len(matches) > 0:
            grammar_score = len(matches)  # Number of grammar errors found
        else:
            grammar_score = 0  # Default score when no errors found
        grammar_scores.append(grammar_score)
    return np.array(grammar_scores)

# Function to calculate FKGL scores using textstat library
def calculate_fkgl(sentences):
    return np.array([textstat.flesch_kincaid_grade(sentence) for sentence in sentences])

# Dummy function for GFI score (replace with real implementation)
def calculate_gfi(sentences):
    return np.array([textstat.gunning_fog(sentence) for sentence in sentences])  # Example: random scores between 0 and 1

# Step 1: Calculate metrics for each sentence
lengths = calculate_length(sentences)
grammar_scores = calculate_grammar_score(sentences)
fkgl_scores = calculate_fkgl(sentences)
gfi_scores = calculate_gfi(sentences)

# Step 2: Normalize metrics

def normalize_metric(metric):
    min_value = np.min(metric)
    max_value = np.max(metric)

    if max_value - min_value == 0:
        # Handle case where all values are the same (division by zero)
        return np.zeros_like(metric) if isinstance(metric, np.ndarray) else 0.0000

    normalized_values = (metric - min_value) / (max_value - min_value)
    return normalized_values

norm_lengths = normalize_metric(lengths)
norm_grammar_scores = normalize_metric(grammar_scores)
norm_fkgl_scores = normalize_metric(fkgl_scores)
norm_gfi_scores = normalize_metric(gfi_scores)

# Step 3: Print normalized metrics
for i, sentence in enumerate(sentences):
    print(f"Sentence: {sentence}")
    print(f"Normalized Length: {norm_lengths[i]:.4f}")
    print(f"Normalized Grammar Score: {norm_grammar_scores[i]:.4f}")
    print(f"Normalized FKGL Score: {norm_fkgl_scores[i]:.4f}")
    print(f"Normalized GFI Score: {norm_gfi_scores[i]:.4f}")


# Step 4: Construct composite similarity matrix (example: simple average)
combined_scores = (norm_lengths + norm_grammar_scores + norm_fkgl_scores + norm_gfi_scores) / 4
combined_sim_matrix = np.outer(combined_scores, combined_scores)

# Step 5: Build the graph
G = nx.from_numpy_array(combined_sim_matrix)

# Step 6: Calculate PageRank
pagerank_scores = nx.pagerank(G, alpha=0.85, max_iter=200)

# Step 7: Rank sentences based on PageRank scores
ranked_sentences = sorted(((pagerank_scores[i], sentence) for i, sentence in enumerate(sentences)), reverse=True)

# Step 8: Print ranked sentences
for score, sentence in ranked_sentences:
    print(f"PageRank Score: {score:.4f} - Sentence: {sentence}")


Sentence: The USA is not a good country to live in because of its high cost of living and lack of healthcare for all citizens.
Normalized Length: 1.0000
Normalized Grammar Score: 0.0000
Normalized FKGL Score: 0.0635
Normalized GFI Score: 0.0000
Sentence: The USA has a high rate of violence and is often seen as a hotbed of racism against minority cultures.
Normalized Length: 0.0000
Normalized Grammar Score: 0.0000
Normalized FKGL Score: 0.0000
Normalized GFI Score: 0.1460
Sentence: The country's political system is often viewed as being too restrictive, leading to a lack of political stability and a perceived lack of accomplishment.
Normalized Length: 1.0000
Normalized Grammar Score: 0.0000
Normalized FKGL Score: 1.0000
Normalized GFI Score: 1.0000
Sentence: The U.S. does not provide universal health care or education, making it difficult for the middle class to access healthcare and education.
Normalized Length: 0.5000
Normalized Grammar Score: 0.0000
Normalized FKGL Score: 0.6984
Norm