In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from datasets import load_dataset
from transformers import AutoTokenizer
import pandas as pd
import re

# Load dataset
dataset = load_dataset("lex_glue", "unfair_tos")

# Convert to DataFrame
train_df = pd.DataFrame(dataset['train'])
validation_df = pd.DataFrame(dataset['validation'])
test_df = pd.DataFrame(dataset['test'])


In [22]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from datasets import load_dataset
from transformers import AutoTokenizer
import re

# Load dataset
print("Loading dataset...")
dataset = load_dataset("lex_glue", "unfair_tos")

# Convert dataset to format suitable for training
def prepare_data_for_training(dataset):
    print("Converting dataset to arrays...")
    
    # Get all texts
    all_texts = dataset['text']
    
    # Tokenize all texts
    tokenized = tokenizer(all_texts, 
                         padding='max_length', 
                         truncation=True, 
                         max_length=256,
                         return_tensors='np')
    
    # Get input IDs
    input_ids = tokenized['input_ids']
    
    # Handle labels
    all_labels = []
    for item in dataset:
        # Convert each label list to a numpy array
        label_array = np.zeros(8)  # Initialize array with zeros
        for idx in item['labels']:  # item['labels'] contains indices of positive classes
            label_array[idx] = 1
        all_labels.append(label_array)
    
    # Convert to final numpy array
    labels = np.array(all_labels)
    
    print(f"Input shape: {input_ids.shape}")
    print(f"Labels shape: {labels.shape}")
    
    return input_ids, labels

# Initialize tokenizer
print("\nInitializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def create_bilstm_model(vocab_size, num_labels=8, embedding_dim=100, max_len=256):
    # Input layer
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    
    # Embedding layer
    embedding = layers.Embedding(vocab_size, embedding_dim, input_length=max_len)(input_ids)
    
    # Add dropout to prevent overfitting
    x = layers.Dropout(0.2)(embedding)
    
    # BiLSTM layers
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Bidirectional(layers.LSTM(64))(x)
    
    # Dense layers
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    
    # Output layer
    output = layers.Dense(num_labels, activation='sigmoid')(x)
    
    # Create and compile model
    model = models.Model(inputs=input_ids, outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Create model
print("\nCreating model...")
model = create_bilstm_model(
    vocab_size=30522,  # BERT vocab size
    num_labels=8,      # Number of label categories
    embedding_dim=100, # Size of embedding vectors
    max_len=256       # Maximum sequence length
)

# Print model summary
model.summary()

# Prepare data
print("\nPreparing training data...")
train_features, train_labels = prepare_data_for_training(train_dataset)
print("\nPreparing validation data...")
val_features, val_labels = prepare_data_for_training(validation_dataset)
print("\nPreparing test data...")
test_features, test_labels = prepare_data_for_training(test_dataset)

# Create callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ModelCheckpoint(
        filepath='best_model.weights.h5',  # Fixed filepath
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=True
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2
    )
]

# Train the model
print("\nStarting training...")
history = model.fit(
    train_features,
    train_labels,
    validation_data=(val_features, val_labels),
    epochs=20,
    batch_size=32,
    callbacks=callbacks
)

# Evaluate on test set
print("\nEvaluating on test set...")
test_results = model.evaluate(test_features, test_labels)
print(f"Test loss: {test_results[0]:.4f}")
print(f"Test accuracy: {test_results[1]:.4f}")

# Function to make predictions
def make_predictions(text, model, tokenizer):
    # Preprocess the text
    cleaned_text = text.strip()
    
    # Tokenize
    tokens = tokenizer(
        cleaned_text,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='np'
    )
    
    # Make prediction
    prediction = model.predict(tokens['input_ids'])
    return prediction[0]

# Example prediction
sample_text = "Your data may be shared with third parties without explicit consent."
predictions = make_predictions(sample_text, model, tokenizer)
print("\nSample prediction:")
print("Text:", sample_text)
print("Predicted probabilities:", predictions)

Loading dataset...

Initializing tokenizer...

Creating model...



Preparing training data...
Converting dataset to arrays...
Input shape: (5532, 256)
Labels shape: (5532, 8)

Preparing validation data...
Converting dataset to arrays...
Input shape: (2275, 256)
Labels shape: (2275, 8)

Preparing test data...
Converting dataset to arrays...
Input shape: (1607, 256)
Labels shape: (1607, 8)

Starting training...
Epoch 1/20
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 764ms/step - accuracy: 0.1033 - loss: 0.4611 - val_accuracy: 0.0246 - val_loss: 0.0730 - learning_rate: 1.0000e-04
Epoch 2/20
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 694ms/step - accuracy: 0.2475 - loss: 0.1003 - val_accuracy: 0.0246 - val_loss: 0.0712 - learning_rate: 1.0000e-04
Epoch 3/20
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 685ms/step - accuracy: 0.2305 - loss: 0.0941 - val_accuracy: 0.9284 - val_loss: 0.0705 - learning_rate: 1.0000e-04
Epoch 4/20
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1

In [23]:
def make_predictions(text, model, tokenizer):
    # Mapping of indices to unfair clause categories
    category_mapping = {
        0: "Jurisdiction",
        1: "Choice of Law",
        2: "Limitation of Liability",
        3: "Unilateral Change",
        4: "Content Removal",
        5: "Contract by Using",
        6: "Unilateral Termination",
        7: "Arbitration"
    }
    
    # Tokenize
    tokens = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='np'
    )
    
    # Make prediction
    predictions = model.predict(tokens['input_ids'], verbose=0)
    probabilities = predictions[0]
    
    # Create readable output
    results = []
    for idx, prob in enumerate(probabilities):
        percentage = prob * 100
        category = category_mapping[idx]
        results.append({
            "category": category,
            "probability": percentage
        })
    
    # Sort by probability in descending order
    results.sort(key=lambda x: x["probability"], reverse=True)
    
    # Print formatted results
    print("\nPrediction Results:")
    print("-" * 50)
    for result in results:
        print(f"{result['category']:25} : {result['probability']:.2f}%")
    
    # Return the structured results
    return results

# Example usage
sample_text = "Your data may be shared with third parties without explicit consent."
results = make_predictions(sample_text, model, tokenizer)


Prediction Results:
--------------------------------------------------
Jurisdiction              : 0.17%
Limitation of Liability   : 0.16%
Content Removal           : 0.10%
Choice of Law             : 0.09%
Contract by Using         : 0.07%
Unilateral Change         : 0.07%
Arbitration               : 0.06%
Unilateral Termination    : 0.02%


In [3]:
# Inspect the dataset structure
print(dataset['train'].column_names)


['text', 'labels']


In [4]:
# Print the first few rows of the train dataset
print(dataset['train'][:5])


{'text': ['notice to california subscribers : you may cancel your subscription , without penalty or obligation , at any time prior to midnight of the third business day following the date you subscribed . \n', 'if you subscribed using your apple id , refunds are handled by apple , not tinder . \n', 'if you wish to request a refund , please visit https://getsupport.apple.com . \n', 'if you subscribed using your google play store account or through tinder online : contact customer support \n', "key changes in this version : we 've included a legal notice required under california law regarding refunds and updated our legal name to match group , llc \n"], 'labels': [[], [], [], [], []]}
