In [None]:
!pip install evaluate



Import libraries

In [None]:
import os
from typing import List, Tuple
import nltk
import torch
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
import evaluate
import numpy as np
import datasets
from datasets import load_dataset, Dataset, concatenate_datasets
from nltk.tokenize import sent_tokenize
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
    DataCollatorForSeq2Seq,
)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

- Read the training data

In [None]:
def read_files(directory, label):
    """Read every ``.txt`` file in ``directory`` and tag each with ``label``.

    Args:
        directory: Path of the folder to scan (not recursive).
        label: Value recorded once for every file that is read.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists: ``texts[i]`` is the
        full content of the i-th file (in sorted filename order) and
        ``labels[i]`` is ``label``.
    """
    texts = []
    # os.listdir returns entries in arbitrary, platform-dependent order; sort
    # them so the seeded shuffle/split that follow are reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts, [label] * len(texts)

In [None]:
# Load both classes from disk, then stack them (spam first, notspam second)
# into a single two-column DataFrame.
spam_texts, spam_labels = read_files("TrainData/spam/", "spam")
notspam_texts, notspam_labels = read_files("TrainData/notspam/", "notspam")
all_texts = [*spam_texts, *notspam_texts]
all_labels = [*spam_labels, *notspam_labels]
df = pd.DataFrame(dict(text=all_texts, label=all_labels))

In [None]:
# Preview the first rows of the combined frame (still ordered spam-first here).
df.head()

Unnamed: 0,text,label
0,Subject: great part-time summer job !\n\n* * *...,spam
1,Subject: advertsing ? legal ! ! offer smtp ! !...,spam
2,Subject: lists software worldwide\n\norder for...,spam
3,Subject: free trial membership\n\nlatest adult...,spam
4,Subject: auto insurance rates too high ?\n\nde...,spam


- Shuffle data

In [None]:
# Shuffle the rows with a fixed seed so the two classes are interleaved, then
# drop the old index so rows are renumbered from 0.
df = shuffle(df, random_state=42).reset_index(drop=True)

In [None]:
# Preview the first rows again to confirm the order changed after shuffling.
df.head()

Unnamed: 0,text,label
0,"Subject: gent conference\n\n"" listserv "" inter...",notspam
1,Subject: psycholinguistics postdoc\n\npostdoct...,notspam
2,Subject: q : fundamental frequency software\n\...,notspam
3,Subject: re : 3 . 427 innateness\n\njoe stembe...,notspam
4,Subject: syntax textbooks - - summary\n\nweek ...,notspam


In [None]:
# Hold out 20% of the rows for validation. Stratify on the label so the
# minority "spam" class keeps the same share in both splits; a plain random
# split on this imbalanced data can leave validation with almost no spam.
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=43, stratify=df['label']
)

- Check label counts

In [None]:
# Show how many examples of each label ended up in the train and validation
# splits, to make any class imbalance visible.
print("train_df", train_df['label'].value_counts())
print("val_df", val_df['label'].value_counts())

label
notspam    151
spam        17
Name: count, dtype: int64
label
notspam    42
spam        1
Name: count, dtype: int64


- Convert dataframe to dataset

In [None]:
# Convert the pandas frames to Hugging Face datasets.
# preserve_index=False keeps the (shuffled, non-contiguous) pandas index of the
# split frames from being stored as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
dataset = Dataset.from_pandas(df, preserve_index=False)

# Load Model From Hugging Face Hub

In [None]:
# Hub identifier of the pretrained checkpoint to fine-tune; the tokenizer and
# the seq2seq model are both loaded from this same id.
MODEL_ID = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)

In [None]:
# Output/repository name: the model name without its organisation prefix,
# plus a task suffix. Used below as the training output directory.
REPOSITORY_ID = f"{MODEL_ID.split('/')[1]}-text-classification"
REPOSITORY_ID

'flan-t5-base-text-classification'

In [None]:
# Download the "punkt" tokenizer data; in this notebook only sent_tokenize
# (used by postprocess_text) relies on nltk.
# NOTE(review): nltk is already imported in the imports cell, so this import
# is redundant there — confirm before removing.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Training configuration for the Seq2SeqTrainer: checkpoints go to
# REPOSITORY_ID, and logging, evaluation and saving all happen once per epoch.
# NOTE(review): no metric_for_best_model is set, so with
# load_best_model_at_end=True the "best" checkpoint is presumably chosen by
# evaluation loss — confirm that is the intended criterion.
# NOTE(review): the training log saved in this notebook lists 4 epochs while
# num_train_epochs is 3 here, so that output was likely produced with a
# different value.
training_args = Seq2SeqTrainingArguments(
    output_dir=REPOSITORY_ID,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,  # evaluate on generated text so metrics can compare decoded labels
    fp16=False,     # kept off: the author observed overflows with fp16
    learning_rate=1e-3,
    num_train_epochs=3,
    logging_dir=f"{REPOSITORY_ID}/logs",    # logs are written under the output directory
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,  # keep at most two checkpoints on disk
    load_best_model_at_end=True,
)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of emails and their labels for seq2seq training.

    Args:
        examples: Batch mapping with a "text" list and a "label" list, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the prompted inputs (padded/truncated to 512
        tokens) with an added "labels" entry holding the label token ids
        (padded/truncated to 8 tokens).
    """
    # Prefix every email with the task instruction; inference must use the
    # same prompt for the model to see the format it was trained on.
    inputs = [f"Classify as spam or notspam: {text}" for text in examples["text"]]
    targets = examples["label"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing the decoder-side labels.
    # NOTE(review): labels are padded with the tokenizer's pad id here, so
    # those positions are presumably not ignored by the loss — confirm this is
    # intended before changing it (compute_metrics decodes these ids as-is).
    labels = tokenizer(text_target=targets, max_length=8, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
def postprocess_text(preds: List[str], labels: List[str]) -> Tuple[List[str], List[str]]:
    """Strip each text and put every sentence on its own line.

    Both lists receive the same treatment: surrounding whitespace is removed,
    the text is split into sentences with ``sent_tokenize`` and the sentences
    are re-joined with newlines (the layout rougeLSum expects).

    Returns:
        The processed ``(preds, labels)`` pair, in that order.
    """
    def _one_sentence_per_line(texts):
        trimmed = (text.strip() for text in texts)
        return ["\n".join(sent_tokenize(text)) for text in trimmed]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 on decoded labels.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays supplied
            by the trainer (predictions are generated ids).

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1".
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple (generated ids, extras); keep only the ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore sentinel, not a real token id, and cannot be
    # decoded; map it back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # Exact string match between the generated label and the reference label.
    accuracy = accuracy_score(decoded_labels, decoded_preds)

    # zero_division=0 yields the same values as the default but without the
    # UndefinedMetricWarning raised when a class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        decoded_labels, decoded_preds, average='weighted', zero_division=0
    )

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [None]:
# Apply preprocessing
train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/168 [00:00<?, ? examples/s]



Map:   0%|          | 0/43 [00:00<?, ? examples/s]

In [None]:
# Value the collator uses when it pads label sequences; -100 is the index the
# loss ignores, so collator-added padding does not contribute to training.
label_pad_token_id = -100
# Data collator: batches examples for the seq2seq model and pads sequence
# lengths up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tuning Flan-T5

In [None]:
# Run fine-tuning; the returned object is kept in train_results for later
# inspection.
train_results = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.0525,0.017617,0.976744,0.954029,0.976744,0.965253
2,0.04,0.000693,1.0,1.0,1.0,1.0
3,0.0033,8.7e-05,1.0,1.0,1.0,1.0
4,0.002,1.5e-05,1.0,1.0,1.0,1.0


  _warn_prf(average, modifier, msg_start, len(result))
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


# Save the model after fine-tuning

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained.
# NOTE(review): this is a relative path, so the files land under the current
# working directory — confirm that is where they should go.
save_directory = "flan_t5_model_fine_tuned"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('/content/drive/MyDrive/flan_t5_model_fine_tuned/tokenizer_config.json',
 '/content/drive/MyDrive/flan_t5_model_fine_tuned/special_tokens_map.json',
 '/content/drive/MyDrive/flan_t5_model_fine_tuned/spiece.model',
 '/content/drive/MyDrive/flan_t5_model_fine_tuned/added_tokens.json',
 '/content/drive/MyDrive/flan_t5_model_fine_tuned/tokenizer.json')

# Load the fine-tuned Flan-T5 model

In [None]:
# Reload the model and tokenizer from the saved directory.
# NOTE: this rebinds the names `model` and `tokenizer`; the `trainer` created
# earlier still holds the model object it was constructed with.
model = AutoModelForSeq2SeqLM.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Evaluate the model

In [None]:
# Evaluate the model on the validation split (metrics come from compute_metrics)
eval_results = trainer.evaluate()

# Print the evaluation metrics
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"Precision: {eval_results['eval_precision']:.4f}")
print(f"Recall: {eval_results['eval_recall']:.4f}")
print(f"F1 Score: {eval_results['eval_f1']:.4f}")

# Generate detailed classification report
predictions1, labels1, _1 = trainer.predict(val_dataset)
# -100 is a padding/ignore sentinel rather than a token id; replace it with
# the pad token so the arrays can be decoded safely.
predictions1 = np.where(predictions1 != -100, predictions1, tokenizer.pad_token_id)
labels1 = np.where(labels1 != -100, labels1, tokenizer.pad_token_id)
decoded_preds1 = [p.strip() for p in tokenizer.batch_decode(predictions1, skip_special_tokens=True)]
decoded_labels1 = [l.strip() for l in tokenizer.batch_decode(labels1, skip_special_tokens=True)]

# Pass the class names via `labels` so each row is tied to its actual class.
# `target_names` alone only renames whatever classes happen to be observed,
# which mislabels rows (or raises) if a class is missing or the model emits an
# unexpected string.
report1 = classification_report(
    decoded_labels1, decoded_preds1, labels=['notspam', 'spam'], zero_division=0
)
print(report1)



Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000




              precision    recall  f1-score   support

     notspam       1.00      1.00      1.00        42
        spam       1.00      1.00      1.00         1

    accuracy                           1.00        43
   macro avg       1.00      1.00      1.00        43
weighted avg       1.00      1.00      1.00        43



# Predict labels for TestData_nolabel

Read every file in the TestData_nolabel folder and predict a label for each text.

In [None]:
def classify(text_to_classify: str) -> str:
    """Classify a single text as "spam" or "notspam" with the fine-tuned model.

    Args:
        text_to_classify: Raw email text (without the instruction prefix).

    Returns:
        "spam" if the generated text is exactly "spam" (ignoring case and
        surrounding whitespace); any other generation maps to "notspam".
    """
    # Determine the device (GPU if available, otherwise CPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Use the same instruction prefix as preprocess_function so inference
    # inputs match the format the model was fine-tuned on.
    prompt = f"Classify as spam or notspam: {text_to_classify}"

    # Truncate to the 512-token training length; without truncation long
    # emails would be encoded at full length.
    inputs = tokenizer(
        prompt,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Move the model to the same device and make sure dropout is disabled
    model.to(device)
    model.eval()

    # Allow up to 8 new tokens, matching the label length used in training
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=8,
    )

    # Decode the generated output and map it onto the two allowed labels
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return "spam" if prediction.strip().lower() == "spam" else "notspam"

def predict_test_files(directory: str) -> str:
    """Classify every ``<number>_unknown.txt`` file found in ``directory``.

    Files are discovered instead of assuming a fixed count, and processed in
    ascending numeric order (0, 1, 2, ... rather than lexicographic order).

    Args:
        directory: Folder containing the unlabeled test files.

    Returns:
        One ``"<filename>,<label>\\n"`` line per file, concatenated into a
        single string (empty if no matching file exists).
    """
    suffix = "_unknown.txt"
    filenames = sorted(
        (
            name
            for name in os.listdir(directory)
            if name.endswith(suffix) and name[:-len(suffix)].isdecimal()
        ),
        key=lambda name: int(name[:-len(suffix)]),
    )

    lines = []
    for filename in filenames:
        with open(os.path.join(directory, filename), "r", encoding='utf-8') as f:
            lines.append(f"{filename},{classify(f.read())}\n")

    # Join once at the end instead of repeatedly concatenating strings.
    return "".join(lines)
result_predict = predict_test_files("TestData_nolabel")

In [None]:
# Show the "<filename>,<label>" lines produced for the unlabeled test files.
print(result_predict)

0_unknown.txt,notspam
1_unknown.txt,notspam
2_unknown.txt,notspam
3_unknown.txt,notspam
4_unknown.txt,notspam
5_unknown.txt,notspam
6_unknown.txt,notspam
7_unknown.txt,notspam
8_unknown.txt,notspam
9_unknown.txt,notspam
10_unknown.txt,notspam
11_unknown.txt,notspam
12_unknown.txt,notspam
13_unknown.txt,notspam
14_unknown.txt,notspam
15_unknown.txt,notspam
16_unknown.txt,notspam
17_unknown.txt,notspam
18_unknown.txt,notspam
19_unknown.txt,notspam
20_unknown.txt,notspam
21_unknown.txt,notspam
22_unknown.txt,notspam
23_unknown.txt,notspam
24_unknown.txt,notspam
25_unknown.txt,notspam
26_unknown.txt,notspam
27_unknown.txt,notspam
28_unknown.txt,notspam
29_unknown.txt,notspam
30_unknown.txt,notspam
31_unknown.txt,notspam
32_unknown.txt,notspam
33_unknown.txt,notspam
34_unknown.txt,notspam
35_unknown.txt,notspam
36_unknown.txt,notspam
37_unknown.txt,notspam
38_unknown.txt,notspam
39_unknown.txt,notspam
40_unknown.txt,notspam
41_unknown.txt,notspam
42_unknown.txt,notspam
43_unknown.txt,notspa

In [None]:
# Write the predictions to disk. Mode "w" (not "a") keeps the cell idempotent:
# re-running it replaces the file instead of appending duplicate rows. The
# context manager closes the file even if the write fails.
with open("predict_label.txt", "w", encoding="utf-8") as file1:
    file1.write(result_predict)