In [None]:
import pandas as pd

# Read the multilingual support-ticket export from the attached Kaggle dataset.
df = pd.read_csv("/kaggle/input/multilingual-customer-support-tickets/dataset-tickets-multi-lang-4-20k.csv")

# Keep only the rows whose language is "en" and only their "body" column,
# renumbering the rows from 0.
df_english = df.loc[df["language"] == "en", ["body"]].reset_index(drop=True)

In [9]:
df_english

Unnamed: 0,body
0,Seeking information on digital strategies that...
1,I am contacting you to request information on ...
2,"Dear Customer Support, I am reaching out to in..."
3,Inquiring about best practices for securing me...
4,"The integration stopped working unexpectedly, ..."
...,...
11918,Seeking details on securing medical data using...
11919,Can you provide information on digital strateg...
11920,Request for assistance in improving digital ma...
11921,I am facing integration problems with IFTTT Do...


In [None]:
from transformers import pipeline

# Imported here because the device choice below needs it.
import torch

# T5 paraphrasing pipeline. Use the first GPU when one is visible and fall back
# to CPU (-1) otherwise, so the cell also runs on a CPU-only kernel.
paraphrase_pipe = pipeline(
    "text2text-generation",
    model="Vamsi/T5_Paraphrase_Paws",
    tokenizer="Vamsi/T5_Paraphrase_Paws",
    device=0 if torch.cuda.is_available() else -1,
)


Device set to use cuda:0


In [None]:
def paraphrase_text(batch):
    """Paraphrase every entry of ``batch["body"]`` with ``paraphrase_pipe``.

    For each text, three sampled candidates are requested from the pipeline.
    Duplicates are removed while keeping the order in which the pipeline
    returned them, and the first candidate that differs from the input text is
    kept (falling back to the first candidate when all of them equal the
    input). Entries that are not strings, or that contain only whitespace, are
    passed through unchanged without calling the pipeline.

    Args:
        batch: Mapping with a "body" key holding the texts of one batch.

    Returns:
        ``{"paraphrased_body": [...]}`` with one entry per input text.
    """
    paraphrased_bodies = []

    for text in batch["body"]:
        if not isinstance(text, str) or not text.strip():
            # Nothing to paraphrase; keep the row aligned with its input.
            paraphrased_bodies.append(text)
            continue

        # NOTE(review): max_length=50 caps the generated length; a recorded
        # sample in this notebook ends mid-sentence, so longer tickets are
        # probably being cut off. Confirm whether a larger cap is wanted.
        results = paraphrase_pipe(
            text,
            max_length=50,
            num_return_sequences=3,
            temperature=0.9,
            top_p=0.85,
            do_sample=True,
        )
        candidates = _extract_generated_texts(results)
        paraphrased_bodies.append(_choose_paraphrase(text, candidates))

    return {"paraphrased_body": paraphrased_bodies}


def _extract_generated_texts(results):
    """Flatten the pipeline output (list of dicts, list of str, or other) into a list of str."""
    if isinstance(results, list) and results and isinstance(results[0], dict):
        return [res["generated_text"] for res in results]
    if isinstance(results, list) and results and isinstance(results[0], str):
        return list(results)
    return [str(results)]


def _choose_paraphrase(original, candidates):
    """Deterministically pick one candidate, preferring one that differs from ``original``."""
    # dict.fromkeys de-duplicates while preserving order; a set would make the
    # selection below depend on string hashing and change between runs.
    unique_candidates = list(dict.fromkeys(candidates))
    source = original.strip()
    for candidate in unique_candidates:
        if candidate.strip() != source:
            return candidate
    return unique_candidates[0]

In [31]:
from datasets import Dataset

# Wrap the DataFrame as a Hugging Face dataset, shuffle it with a fixed seed,
# then keep the last 2000 rows of the shuffled order.
dataset = Dataset.from_pandas(df_english)
dataset = dataset.shuffle(seed=40)
dataset = dataset.select(range(len(dataset) - 2000, len(dataset)))

In [32]:
dataset

Dataset({
    features: ['body'],
    num_rows: 2000
})

In [33]:
# Run paraphrase_text over the dataset in batches of 8 rows; the
# "paraphrased_body" list it returns shows up as a column in the result.
dataset_paraphrased = dataset.map(paraphrase_text, batched=True, batch_size=8)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
dataset_paraphrased[3]

{'answer': 'Dear [Name], thank you for reaching out to our customer support team regarding investment data analysis tools. We would be happy to provide guidance on how to effectively utilize these tools to make informed investment decisions. To better assist you, could you please provide us with information on the specific tools you are currently using and the type of investments you are looking to optimize? This will enable us to provide tailored recommendations. Would prefer to discuss this over the phone, please let us know a suitable time.',
 'paraphrased_answer': 'Dear [Name], thank you for reaching out to our customer support team regarding investment data analysis tools . We would be happy to provide guidance on how to effectively use these tools to make informed investment decisions . To better assist you, please provide us with information on the specific tools you are currently using and the type of investments you are looking to optimize , this will enable us to provide tail

In [34]:
# Convert to a pandas DataFrame only while the name still holds an object that
# offers to_pandas(). After the first run it is already a DataFrame, so
# re-running this cell must be a no-op instead of raising AttributeError.
if hasattr(dataset_paraphrased, "to_pandas"):
    dataset_paraphrased = dataset_paraphrased.to_pandas()

In [40]:
dataset_paraphrased

Unnamed: 0,body,paraphrased_body
0,Can you provide information on optimizing inve...,I am interested to know which tools can help m...
1,I am contacting you to inquire about optimizin...,I am contacting you to inquire about optimizin...
2,I am contacting you to ask about the analytics...,I am contacting you to ask about the analytics...
3,Seeking details on securing medical data using...,Seeking details on securing medical data using...
4,Our hospital systems are facing data security ...,Our hospital systems are facing data security ...
...,...,...
1995,The project management platform has encountere...,The project management platform has encountere...
1996,I noticed there were unexpected charges on my ...,I noticed that there were unexpected charges o...
1997,Customers are facing challenges integrating Zo...,Customers are facing challenges in integrating...
1998,I would like to know more about the data analy...,I would like to know more about the data analy...


In [None]:
from IPython.display import FileLink

# Write the frame (without its index column) to a CSV in the working directory.
csv_filename = "answers_paraphrased.csv"
dataset_paraphrased.to_csv(csv_filename, index=False)

# Leave the download link as the cell's displayed value.
FileLink(csv_filename)

In [44]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model from the attached Kaggle input directory
# (the path names it a fine-tuned KBQA SBERT checkpoint).
sbert_model = SentenceTransformer("/kaggle/input/sbert_v2/transformers/default/1/fine_tuned_kbqa_sbert")

In [45]:
# Pull the two text columns out as plain Python lists; position i in both
# lists comes from the same DataFrame row.
original_sentences = dataset_paraphrased["body"].tolist()
paraphrased_sentences = dataset_paraphrased["paraphrased_body"].tolist()

In [46]:
# Encode each list with the same model, asking for tensor output
# (convert_to_tensor=True) rather than the default return type.
embeddings_original = sbert_model.encode(original_sentences, convert_to_tensor=True)
embeddings_paraphrased = sbert_model.encode(paraphrased_sentences, convert_to_tensor=True)

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

In [48]:
from sentence_transformers import util
# Cosine similarity between the original and paraphrased embeddings; the next
# cell reads entry [i][i], i.e. original i against the paraphrase at position i.
cosine_scores = util.cos_sim(embeddings_original, embeddings_paraphrased)

In [50]:
# Take the diagonal entry for every position (sentence i vs. the paraphrase
# stored at position i) as a Python float, then average.
num_pairs = len(original_sentences)
scores = [cosine_scores[idx][idx].item() for idx in range(num_pairs)]
average_score = sum(scores) / len(scores)

In [51]:
print(f"Average Cosine Similarity: {average_score:.4f}")

Average Cosine Similarity: 0.9994


In [53]:
import random

# Build a deliberately mismatched pairing as a baseline.
# The list is rebuilt from the DataFrame first so that re-running this cell
# does not shuffle an already-shuffled list, and a dedicated seeded RNG makes
# the pairing (and the score computed from it) reproducible.
SHUFFLE_SEED = 40
paraphrased_sentences = dataset_paraphrased["paraphrased_body"].tolist()
random.Random(SHUFFLE_SEED).shuffle(paraphrased_sentences)

In [54]:
# Re-encode both lists after the shuffle in the previous cell, so position i of
# the paraphrase embeddings is no longer guaranteed to match original i.
embeddings_original = sbert_model.encode(original_sentences, convert_to_tensor=True)
embeddings_paraphrased = sbert_model.encode(paraphrased_sentences, convert_to_tensor=True)

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

In [55]:
# Same similarity computation as before, now on the shuffled pairing.
cosine_scores = util.cos_sim(embeddings_original, embeddings_paraphrased)

In [56]:
# Average of the diagonal entries for the shuffled pairing.
# NOTE(review): the recorded averages are 0.9994 for matched pairs and 0.9927
# for this shuffled baseline. Scores that close suggest the model's cosine
# similarity barely separates related from unrelated text; confirm before
# relying on it as a paraphrase-quality metric.
num_pairs = len(original_sentences)
scores = [cosine_scores[idx][idx].item() for idx in range(num_pairs)]
average_score = sum(scores) / len(scores)

In [57]:
print(f"Average Cosine Similarity: {average_score:.4f}")

Average Cosine Similarity: 0.9927


In [59]:
original_sentences[0]

'Can you provide information on optimizing investments through the use of data analytics services? I am interested in knowing which tools can assist in making informed decisions. Thanks for your assistance.'

In [60]:
paraphrased_sentences[0]

'I am writing to report that the login page is slow loading . I have tried troubleshooting by clearing my browser cache and verifying my network connection but the problem continues . I suspect the issue might be due to server overload or'

In [61]:
# Encode only the first entry of each list (a single string each, not a list).
# The names are reused, so the full-list embeddings from earlier cells are replaced.
embeddings_original = sbert_model.encode(original_sentences[0], convert_to_tensor=True)
embeddings_paraphrased = sbert_model.encode(paraphrased_sentences[0], convert_to_tensor=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [62]:
# Cosine similarity of the two single-sentence embeddings (a 1x1 tensor in the
# recorded output).
cosine_scores = util.cos_sim(embeddings_original, embeddings_paraphrased)

In [63]:
cosine_scores

tensor([[0.9818]], device='cuda:0')

In [112]:
# Probe the model with two short sentences that are not paraphrases of each other.
embeddings_original = sbert_model.encode("I love you", convert_to_tensor=True)
embeddings_paraphrased = sbert_model.encode("I want to buy a cookies", convert_to_tensor=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [113]:
# Cosine similarity for the two probe sentences encoded in the previous cell.
cosine_scores = util.cos_sim(embeddings_original, embeddings_paraphrased)

In [114]:
cosine_scores

tensor([[0.9061]], device='cuda:0')

In [88]:
# NOTE(review): this cell's execution count (88) is lower than that of the
# cells above that set the current embeddings (112), so the recorded output
# may have been computed from different inputs. Re-run before interpreting it.
score = util.euclidean_sim(embeddings_original, embeddings_paraphrased)

In [89]:
score

tensor([[-0.]])

In [None]:
# numpy is used below but was never imported in the notebook, so this cell
# raised NameError on a fresh kernel.
import numpy as np

vec1 = sbert_model.encode("I love you", convert_to_numpy=True)
vec2 = sbert_model.encode("I adore you", convert_to_numpy=True)

# Squared Euclidean distance (the form used by FAISS IndexFlatL2)
squared_euclidean_distance = np.sum((vec1 - vec2) ** 2)
print(f"Squared Euclidean Distance: {squared_euclidean_distance:.4f}")

# Plain Euclidean distance is the square root of the value above.
euclidean_distance = np.sqrt(squared_euclidean_distance)
print(f"Euclidean Distance: {euclidean_distance:.4f}")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Squared Euclidean Distance: 1.1294
Euclidean Distance: 1.0627


In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("/kaggle/input/multilingual-customer-support-tickets/dataset-tickets-multi-lang-4-20k.csv")

# Keep only the rows whose language is "en" and only their "answer" column,
# renumbering the rows from 0.
df_english = df.loc[df["language"] == "en", ["answer"]].reset_index(drop=True)

In [None]:
from transformers import pipeline

# Imported here because the device choice below needs it.
import torch

# T5 paraphrasing pipeline (same model and tokenizer as the pipeline created
# earlier in the notebook). Use the first GPU when one is visible and fall
# back to CPU (-1) otherwise, so the cell also runs on a CPU-only kernel.
paraphrase_pipe = pipeline(
    "text2text-generation",
    model="Vamsi/T5_Paraphrase_Paws",
    tokenizer="Vamsi/T5_Paraphrase_Paws",
    device=0 if torch.cuda.is_available() else -1,
)


Device set to use cuda:0


In [None]:
def generate_paraphrase_batch(batch):
    """Paraphrase a batch of answers and format each as "sentence: ...|paraphrase: ...".

    Args:
        batch: Mapping that is expected to hold the texts under the "answer" key.

    Returns:
        ``{"formatted": [...]}`` with one formatted string per input sentence.
        When the batch has no "answer" entries, a message is printed and an
        empty list is returned without calling the pipeline (the original code
        went on to call it with ``batch_size=0``).
    """
    sentences = list(batch.get("answer", []))
    if not sentences:
        # Message kept verbatim (Vietnamese for: key 'answer' not found in batch).
        print("Không tìm thấy key 'answer' trong batch!")
        return {"formatted": []}

    outputs = paraphrase_pipe(sentences, batch_size=len(sentences))

    # Accept both result shapes: one dict per input, or a list of candidate
    # dicts per input (the original indexing only handled the latter).
    paraphrases = [
        (out[0] if isinstance(out, list) else out)["generated_text"]
        for out in outputs
    ]
    formatted = [
        f"sentence: {sent}|paraphrase: {para}"
        for sent, para in zip(sentences, paraphrases)
    ]
    return {"formatted": formatted}

In [121]:
from datasets import Dataset

# Wrap the DataFrame as a Hugging Face dataset, shuffle it with a fixed seed,
# then keep the first 2000 rows of the shuffled order.
dataset = Dataset.from_pandas(df_english)
dataset = dataset.shuffle(seed=40)
dataset = dataset.select(range(2000))

In [129]:
dataset

Dataset({
    features: ['answer'],
    num_rows: 2000
})

In [None]:
def generate_paraphrase(example):
    """Paraphrase one row's "answer" and pack source and paraphrase into one string.

    Args:
        example: Mapping with the source text under the "answer" key.

    Returns:
        ``{"formatted": "sentence: <answer>|paraphrase: <generated text>"}``,
        where the generated text is taken from the first item the pipeline returns.
    """
    source_text = example["answer"]
    first_candidate = paraphrase_pipe(source_text)[0]
    paraphrase = first_candidate["generated_text"]
    return {"formatted": f"sentence: {source_text}|paraphrase: {paraphrase}"}

# Drop rows whose answer is missing or contains only whitespace.
dataset = dataset.filter(lambda row: bool(row["answer"] and row["answer"].strip()))

# Paraphrase one row at a time, then show the first formatted pair.
new_dataset = dataset.map(generate_paraphrase, batched=False)
first_row = new_dataset[0]
print(first_row["formatted"])


Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

sentence: <name> sincerely apologize for the inconvenience you are experiencing with system crashes during the analysis of investment data. We are happy to assist you in resolving this issue. Although we have already taken steps such as rebooting devices and reinstalling PowerDirector, further assistance would require gathering more information. Please provide the exact error message you receive during system crashes, along with the operating system version. Additionally, knowing your computer's processor and RAM specifications would be very helpful in narrowing down the cause of the issue.|paraphrase: Although we have already taken steps such as rebooting devices and reinstalling PowerDirector,


In [140]:
# torch is needed for the device check below; the notebook's only other
# `import torch` sits in a much later cell, so this cell raised NameError when
# run top to bottom on a fresh kernel.
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint directory attached as a Kaggle input.
model_path = "/kaggle/input/gpt2_v2/transformers/default/1"

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Prefer the GPU when available, then switch to inference mode.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
def generate_paraphrase_gpt2(prompt, max_length=1024):
    """Generate a paraphrase from GPT-2 for the given prompt.

    The prompt is encoded with the module-level ``tokenizer``, moved to
    ``device`` and passed to ``model.generate`` together with an all-ones
    attention mask and the beam-search settings below.

    Args:
        prompt: Text to condition the generation on.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first returned sequence, decoded with special tokens skipped.
    """
    prompt_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    generation_kwargs = {
        "attention_mask": torch.ones(prompt_ids.shape, device=device),
        "max_length": max_length,
        "num_beams": 5,
        "early_stopping": True,
        "no_repeat_ngram_size": 2,
        # Set pad_token_id explicitly (the end-of-sequence id is reused for it)
        "pad_token_id": tokenizer.eos_token_id,
    }
    sequences = model.generate(prompt_ids, **generation_kwargs)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)


def evaluate_gpt2(example):
    """Split a formatted row into GPT-2 prompt and reference, then generate.

    ``example["formatted"]`` has the form
    ``"sentence: {a sentence}|paraphrase: {a paraphrase}"``. Everything up to
    and including the first ``"|paraphrase:"`` marker becomes the prompt; the
    text after it, stripped, is the reference. When the marker is absent the
    whole string is used as the prompt body and the reference is empty.

    Args:
        example: Mapping with the formatted string under the "formatted" key.

    Returns:
        A dict with:
          * "gpt2_output": the text returned by ``generate_paraphrase_gpt2``
            (unchanged from the previous behaviour);
          * "reference": the reference paraphrase;
          * "gpt2_paraphrase": the part of the output after the first marker,
            stripped, which is the text directly comparable to "reference".
            If the output contains no marker, the whole output, stripped.
    """
    separator = "|paraphrase:"

    # partition() splits on the first marker only and never raises, so a
    # marker repeated inside the text no longer falls into an error path that
    # turned the entire row into the prompt.
    prompt_part, _, reference = example["formatted"].partition(separator)
    prompt = prompt_part + separator
    reference = reference.strip()

    output = generate_paraphrase_gpt2(prompt)

    # The recorded predictions still begin with the prompt text, so expose the
    # continuation on its own for scoring against the reference.
    _, marker, continuation = output.partition(separator)
    gpt2_paraphrase = continuation.strip() if marker else output.strip()

    return {
        "gpt2_output": output,
        "reference": reference,
        "gpt2_paraphrase": gpt2_paraphrase,
    }

In [148]:
# Apply evaluate_gpt2 across new_dataset.
# NOTE(review): the NameError recorded under this cell mentions `evaluate`, a
# name this line does not reference, so the saved output looks stale. Re-run
# to confirm.
eval_dataset = new_dataset.map(evaluate_gpt2)



Map:   0%|          | 0/1999 [00:00<?, ? examples/s]

NameError: name 'evaluate' is not defined

In [None]:
import evaluate


# `predictions` and `references` were printed here without ever being assigned
# in the notebook, so a fresh-kernel run raised NameError. Build them from the
# columns evaluate_gpt2 produces.
# NOTE(review): the token-list shape is reconstructed from this cell's recorded
# output and from the list handling in the following cells. Confirm it matches
# the cell that was removed.
predictions = [text.split() for text in eval_dataset["gpt2_output"]]
references = [[text.split()] for text in eval_dataset["reference"]]

print("Sample predictions:", predictions[:3])
print("Sample references:", references[:3])



Sample predictions: [['sentence:', '<name>', 'sincerely', 'apologize', 'for', 'the', 'inconvenience', 'you', 'are', 'experiencing', 'with', 'system', 'crashes', 'during', 'the', 'analysis', 'of', 'investment', 'data.', 'We', 'are', 'happy', 'to', 'assist', 'you', 'in', 'resolving', 'this', 'issue.', 'Although', 'we', 'have', 'already', 'taken', 'steps', 'such', 'as', 'rebooting', 'devices', 'and', 'reinstalling', 'PowerDirector,', 'further', 'assistance', 'would', 'require', 'gathering', 'more', 'information.', 'Please', 'provide', 'the', 'exact', 'error', 'message', 'you', 'receive', 'during', 'system', 'crashes,', 'along', 'with', 'the', 'operating', 'system', 'version.', 'Additionally,', 'knowing', 'your', "computer's", 'processor', 'and', 'RAM', 'specifications', 'would', 'be', 'very', 'helpful', 'in', 'narrowing', 'down', 'the', 'cause', 'of', 'the', 'issue.|paraphrase:', 'name>', 'is', 'sincerely', 'apologetic', 'and', 'sincerely', 'sorry', 'for', 'this', 'inconvenience', 'caused

In [165]:
# Collapse token lists back into space-joined strings; entries that are not
# lists pass through unchanged.
predictions = [
    pred if not isinstance(pred, list) else " ".join(pred)
    for pred in predictions
]

In [166]:
# Turn every entry into a one-element list holding a single string: join ref[0]
# when it is itself a list, otherwise join ref directly.
# NOTE(review): if ref were a plain string, " ".join(ref) would put a space
# between its characters. Confirm every ref is a list at this point.
references = [[" ".join(ref[0])] if isinstance(ref[0], list) else [" ".join(ref)] for ref in references]

In [167]:
!pip install nltk



In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Join any prediction that is still a token list; strings pass through.
predictions = [" ".join(pred) if isinstance(pred, list) else pred for pred in predictions]
# NOTE(review): when ref[0] is not a list the whole ref is wrapped in a new
# list, so a ref that is already [str] becomes [[str]], and running this line
# again turns it back. The nesting therefore depends on how many times the
# cell has run. Confirm that is intended.
references = [[" ".join(ref[0])] if isinstance(ref[0], list) else [ref] for ref in references]


In [None]:
def _as_sentence(item):
    """Collapse a string, a token list, or nested single-item lists into one string."""
    # Peel off wrapper lists until a string or a flat list of tokens is left.
    while isinstance(item, list) and item and isinstance(item[0], list):
        item = item[0]
    if isinstance(item, list):
        return " ".join(item)
    return item


# If references carry extra nesting, reduce every entry to [str]. Unlike the
# previous expression this never re-wraps an entry that is already [str], so
# the cell gives the same result no matter how often it is run.
references = [[_as_sentence(ref)] for ref in references]

# If predictions are still token lists, join them back into sentences.
predictions = [_as_sentence(pred) for pred in predictions]


In [171]:
print("Sample predictions:", predictions[:3])
print("Sample references:", references[:3])


Sample predictions: ["sentence: <name> sincerely apologize for the inconvenience you are experiencing with system crashes during the analysis of investment data. We are happy to assist you in resolving this issue. Although we have already taken steps such as rebooting devices and reinstalling PowerDirector, further assistance would require gathering more information. Please provide the exact error message you receive during system crashes, along with the operating system version. Additionally, knowing your computer's processor and RAM specifications would be very helpful in narrowing down the cause of the issue.|paraphrase: name> is sincerely apologetic and sincerely sorry for this inconvenience caused by the system crash of your system . We will work to resolve this problem as soon as possible . However, it is possible that we may need to gather additional information from you to further help you . If you wish to discuss this further, please let us know a convenient time for a call.",

In [172]:
from nltk.translate.bleu_score import corpus_bleu

# Whitespace-tokenize each prediction -> list[list[str]]
tokenized_predictions = [sentence.split() for sentence in predictions]

# Whitespace-tokenize the first reference of each entry and keep it wrapped in
# its own list -> list[list[list[str]]]
tokenized_references = [[ref_group[0].split()] for ref_group in references]

# Inspect the first few items of both structures
print("Sample tokenized predictions:", tokenized_predictions[:3])
print("Sample tokenized references:", tokenized_references[:3])


Sample tokenized predictions: [['sentence:', '<name>', 'sincerely', 'apologize', 'for', 'the', 'inconvenience', 'you', 'are', 'experiencing', 'with', 'system', 'crashes', 'during', 'the', 'analysis', 'of', 'investment', 'data.', 'We', 'are', 'happy', 'to', 'assist', 'you', 'in', 'resolving', 'this', 'issue.', 'Although', 'we', 'have', 'already', 'taken', 'steps', 'such', 'as', 'rebooting', 'devices', 'and', 'reinstalling', 'PowerDirector,', 'further', 'assistance', 'would', 'require', 'gathering', 'more', 'information.', 'Please', 'provide', 'the', 'exact', 'error', 'message', 'you', 'receive', 'during', 'system', 'crashes,', 'along', 'with', 'the', 'operating', 'system', 'version.', 'Additionally,', 'knowing', 'your', "computer's", 'processor', 'and', 'RAM', 'specifications', 'would', 'be', 'very', 'helpful', 'in', 'narrowing', 'down', 'the', 'cause', 'of', 'the', 'issue.|paraphrase:', 'name>', 'is', 'sincerely', 'apologetic', 'and', 'sincerely', 'sorry', 'for', 'this', 'inconvenience

In [173]:
# Corpus-level BLEU: the tokenized references are passed first and the
# tokenized predictions second; no other options are given.
bleu_score = corpus_bleu(tokenized_references, tokenized_predictions)

print(f"BLEU Score: {bleu_score:.4f}")

BLEU Score: 0.1179


In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# RougeScorer.score takes (target, prediction), so the reference goes first.
# The arguments were previously swapped. The F-measures reported below are
# symmetric and do not change, but precision and recall were exchanged.
rouge_scores = [scorer.score(ref[0], pred) for pred, ref in zip(predictions, references)]

# Average F-measure over all prediction/reference pairs, per ROUGE variant.
rouge1 = sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores)
rouge2 = sum(score['rouge2'].fmeasure for score in rouge_scores) / len(rouge_scores)
rougeL = sum(score['rougeL'].fmeasure for score in rouge_scores) / len(rouge_scores)

print(f"ROUGE-1: {rouge1:.4f}")
print(f"ROUGE-2: {rouge2:.4f}")
print(f"ROUGE-L: {rougeL:.4f}")


ROUGE-1: 0.3071
ROUGE-2: 0.2844
ROUGE-L: 0.3017


In [179]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Put the already-loaded model in evaluation mode (nothing is reloaded here)
model.eval()
def calculate_perplexity(text):
    """Return the perplexity of ``text`` under the module-level ``model``.

    The text is tokenized with the module-level ``tokenizer``, the token ids
    are passed to the model as both inputs and labels, and the exponential of
    the returned loss is reported.

    Args:
        text: The string to score.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    encodings = tokenizer(text, return_tensors="pt")
    # Follow the model's own device instead of hard-coding "cuda", so this
    # also works when the model was left on the CPU.
    input_ids = encodings.input_ids.to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)

    return torch.exp(outputs.loss).item()

# Compute the average perplexity over all predictions
ppl_scores = list(map(calculate_perplexity, predictions))
average_ppl = sum(ppl_scores) / len(ppl_scores)

print(f"Perplexity (PPL): {average_ppl:.4f}")

Perplexity (PPL): 4.8845
