In [None]:
!pip install transformers datasets torch accelerate




Collecting datasets
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl

In [None]:
pip install transformers datasets torch scikit-learn google-colab wget




In [None]:

!pip install transformers datasets torch evaluate scikit-learn


import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


# Absolute path of the scored privacy-policy CSV.
# NOTE(review): "/content/..." looks like a Colab upload location — confirm the
# file is present there before running on a fresh runtime.
file_path = "/content/privacy_policy_large_dataset.csv"
# Load the raw table; later code reads its "text" column and the five score
# columns (clarity, transparency, accessibility, security, comprehensiveness).
dataset = pd.read_csv(file_path)

# Preprocess dataset
def preprocess_labels(df, threshold=3, label_columns=None):
    """
    Build a multi-hot ``labels`` column from per-category score columns.

    Each score strictly greater than ``threshold`` becomes ``1.0``; anything
    else (including missing values, which never compare greater) becomes
    ``0.0``. The result for every row is a plain list of Python floats, in the
    same order as ``label_columns``.

    Args:
        df: DataFrame holding one numeric column per category.
        threshold: Scores must be strictly above this value to count as positive.
        label_columns: Ordered score column names. Defaults to
            clarity, transparency, accessibility, security, comprehensiveness.

    Returns:
        A copy of ``df`` with an added (or replaced) ``labels`` column. The
        input frame is left untouched so the cell can be re-run safely.

    Raises:
        KeyError: If any of the score columns is absent from ``df``.
    """
    if label_columns is None:
        label_columns = [
            "clarity",
            "transparency",
            "accessibility",
            "security",
            "comprehensiveness",
        ]
    else:
        label_columns = list(label_columns)

    # Fail early with a readable message instead of a per-row lookup error.
    missing = [col for col in label_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Missing score column(s) required for labels: {missing}")

    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    result = df.copy()

    # Vectorized comparison over all score columns at once; this also handles
    # an empty frame, where a row-wise apply would not yield a usable column.
    label_rows = (
        result[label_columns].gt(threshold).astype(float).to_numpy().tolist()
    )

    # Object dtype keeps each row's label vector as a single list-valued cell.
    result["labels"] = pd.Series(label_rows, index=result.index, dtype=object)
    return result


# Add the multi-hot "labels" column using the default threshold of 3.
dataset = preprocess_labels(dataset)


# Keep only the model input ("text") and target ("labels") columns when
# converting the pandas frame into a Hugging Face Dataset.
hf_dataset = Dataset.from_pandas(dataset[["text", "labels"]])

# Split into train and test sets: 80% / 20%, with a fixed seed so the split is
# reproducible across runs.
dataset_split = hf_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_split["train"]
test_dataset = dataset_split["test"]

# Tokenizer setup: the same checkpoint name is reused below to load the model,
# so tokenizer and model stay matched.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_and_format(batch):
    """
    Tokenize one batch of examples and carry its label vectors along.

    Intended for ``Dataset.map(..., batched=True)``: ``batch["text"]`` is
    encoded with the notebook-level ``tokenizer`` (padding and truncation
    enabled), and ``batch["labels"]`` is copied unchanged onto the encoding.

    Args:
        batch: Mapping with a "text" entry and a "labels" entry.

    Returns:
        The tokenizer output with an extra "labels" entry.
    """
    encoded = tokenizer(batch["text"], truncation=True, padding=True)
    # Attach the targets so the returned mapping is self-contained.
    encoded["labels"] = batch["labels"]
    return encoded

# Tokenize both splits; batched mapping hands tokenize_and_format many rows at once.
train_dataset = train_dataset.map(tokenize_and_format, batched=True)
test_dataset = test_dataset.map(tokenize_and_format, batched=True)

# Define the model
# Category names in the same order preprocess_labels writes them into each
# label vector. Storing them in the model config means the saved and published
# model reports these names instead of the generic LABEL_0..LABEL_4.
label_names = [
    "clarity",
    "transparency",
    "accessibility",
    "security",
    "comprehensiveness",
]
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=dict(enumerate(label_names)),
    label2id={name: idx for idx, name in enumerate(label_names)},
    # Multi-label setup: each category is an independent yes/no decision.
    problem_type="multi_label_classification",
)

# Define metrics for evaluation
def compute_metrics(pred):
    """
    Compute multi-label evaluation metrics for the Trainer.

    Args:
        pred: A ``(logits, labels)`` pair as supplied by ``Trainer`` during
            evaluation. ``labels`` is expected to hold the 0/1 targets with
            one column per category.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1". Precision,
        recall and F1 are support-weighted averages over the categories.
        Accuracy comes from ``accuracy_score``, which for multi-label input is
        exact-match (all categories of a sample must be right).
    """
    logits, labels = pred
    # Independent sigmoid per category, then a 0.5 cut-off for the decision.
    probabilities = torch.sigmoid(torch.tensor(logits))
    predictions = (probabilities > 0.5).int().numpy()
    # zero_division=0 keeps the score at 0 for a category with no predicted
    # (or no true) positives, without emitting an undefined-metric warning on
    # every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Training arguments
# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The install cell does not pin a version, so pick whichever keyword
# the installed TrainingArguments actually accepts.
import inspect

_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # needs a checkpoint for every evaluation it compares.
    save_strategy="epoch",
    # Log once per epoch so the training loss shows up next to the validation
    # loss instead of "No log" on short runs.
    logging_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    **{_eval_strategy_kwarg: "epoch"},
)

# Initialize Trainer
# Newer transformers releases take the tokenizer as `processing_class` and warn
# on the old `tokenizer` keyword; older releases only know `tokenizer`. Choose
# the keyword from the installed Trainer signature so both keep working.
import inspect

_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # Passing the tokenizer lets the Trainer save it alongside the model, which
    # the later AutoTokenizer.from_pretrained("./fine_tuned_model") relies on.
    **{_tokenizer_kwarg: tokenizer},
)


# Fine-tune the model with the arguments configured above.
trainer.train()


# Score the model on the held-out split (eval_dataset) and show the metrics dict.
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)


# Write the fine-tuned model to disk; this directory is reloaded further down
# before pushing to the Hub.
trainer.save_model("./fine_tuned_model")


# Run a quick sanity-check prediction on one hand-written sentence.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Make inference mode explicit (disables dropout) rather than relying on
# whatever mode the last Trainer call left the model in.
model.eval()

sample_texts = [
    "your data is secured and not shared.",
]
tokenized_samples = tokenizer(sample_texts, padding=True, truncation=True, return_tensors="pt")
# Inputs must live on the same device as the model.
tokenized_samples = {key: value.to(device) for key, value in tokenized_samples.items()}


# No gradients are needed for prediction; skipping them saves memory and time.
with torch.no_grad():
    output = model(**tokenized_samples)
# Per-category probabilities, one row per input text.
predictions = torch.sigmoid(output.logits).cpu().numpy()
print("Predictions:", predictions)





Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.288652,0.86,0.941906,1.0,0.968743
2,No log,0.167796,0.87,0.971654,1.0,0.984842
3,No log,0.125129,0.95,0.98815,1.0,0.993909
4,No log,0.106475,1.0,1.0,1.0,1.0
5,No log,0.100672,1.0,1.0,1.0,1.0


Evaluation Metrics: {'eval_loss': 0.10067180544137955, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.3806, 'eval_samples_per_second': 525.498, 'eval_steps_per_second': 34.157, 'epoch': 5.0}
Predictions: [[0.94735694 0.9572821  0.93929577 0.94128674 0.9507809 ]]


In [None]:
!pip install transformers huggingface_hub





In [None]:
from huggingface_hub import login


# Authenticate with the Hugging Face Hub before uploading. Called without
# arguments, so no access token is stored in the notebook source; the saved
# output shows it rendering an interactive widget instead.
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Local directory written by trainer.save_model, and the Hub repository that
# receives the upload.
model_path = "./fine_tuned_model"
repo_id = "ayaalhaj/privacy-policy-analyzer_multilabel"

# Reload both artifacts from disk so exactly what was saved is what gets published.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)


# Upload the weights first, then the tokenizer files, into the same repository.
# The tokenizer push stays last so its commit info is the cell's displayed result.
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ayaalhaj/privacy-policy-analyzer_multilabel/commit/05c8903cd1a1474b01f3ef3191e62cbd1f0ddb06', commit_message='Upload tokenizer', commit_description='', oid='05c8903cd1a1474b01f3ef3191e62cbd1f0ddb06', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ayaalhaj/privacy-policy-analyzer_multilabel', endpoint='https://huggingface.co', repo_type='model', repo_id='ayaalhaj/privacy-policy-analyzer_multilabel'), pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import snapshot_download


# Hub repository that the model and tokenizer were pushed to above.
repo_id = "ayaalhaj/privacy-policy-analyzer_multilabel"
# Download every file of the repository into a local folder; as the last
# expression of the cell, the returned folder path is displayed.
# NOTE(review): the folder name suggests Transformers.js use, but the recorded
# output lists model.safetensors and no conversion step is visible here —
# confirm whether an ONNX export is still needed.
snapshot_download(repo_id=repo_id, local_dir="./transformers-js-model")


Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

'/content/transformers-js-model'