In [1]:
from tools.utils import *
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch
import spacy

  from tqdm.autonotebook import tqdm, trange


# Prepare data for annotation

In [2]:
# Load the small English spaCy model, switch off the components that are not
# used here, and append a sentencizer component to the pipeline.
nlp = spacy.load("en_core_web_sm")
_unused_components = ("tagger", "parser", "attribute_ruler", "lemmatizer")
nlp.disable_pipes(*_unused_components)
nlp.add_pipe('sentencizer')


<spacy.pipeline.sentencizer.Sentencizer at 0x29ce0bf80>

In [3]:
# Directory that holds the precomputed files read by later cells
# (embedding.tsv and metadata.tsv).
processed_data_dir = Path('processed_data')

In [4]:
# Load the precomputed sentence embeddings and the per-sentence metadata table.
embeddings = np.loadtxt(processed_data_dir / 'embedding.tsv')
metadata = pd.read_csv(processed_data_dir / 'metadata.tsv', sep='\t')
# Later cells take row positions computed on `embeddings` and use them with
# `metadata.iloc`, so the two must line up row for row. Fail here instead of
# silently selecting the wrong sentences.
assert len(embeddings) == len(metadata), (
    f'embedding.tsv has {len(embeddings)} rows but metadata.tsv has {len(metadata)}'
)
print(len(embeddings), len(metadata))

105245 105245


In [5]:
# Clause family to work on; it is interpolated into every annotation file name below.
clause_type = 'arbitration' #'arbitration' | 'opt-out' | 'class_waver' | 'anti-scraping'
# Hand-picked example clauses for that family, read from the sheet named 'Sheet1'.
examples_df = pd.read_excel(
    f'annotations/{clause_type}_clauses.xlsx',
    sheet_name='Sheet1',
)

In [6]:
def _prefix_masked_text(text):
    """Parse `text` with spaCy, pass it through `replace_named_entities`, and prepend 'clustering: '."""
    return 'clustering: ' + replace_named_entities(nlp(text))


examples_df['processed_text'] = examples_df['Examples'].apply(_prefix_masked_text)

In [7]:
examples_df['processed_text']

0     clustering: please read the following arbitrat...
1     clustering: [mask] [mask] [mask] applies to an...
2     clustering: in the unlikely event that [mask] ...
3     clustering: at the company ’s or your election...
4     clustering: you and [mask] agree that [mask] [...
5     clustering: any controversy or claim arising o...
6     clustering: you and [mask] each agree that any...
7     clustering: any dispute relating in any way to...
8     clustering: you and company agree that , [mask...
9     clustering: except as set forth in the paragra...
10    clustering: you and [mask] agree to resolve an...
11    clustering: you agree that any dispute between...
12    clustering: you and [mask] agree to resolve an...
13    clustering: either [mask] or you may demand th...
14    clustering: by agreeing to this agreement , yo...
15    clustering: you agree that all disputes betwee...
16    clustering: except as provided below , you and...
17    clustering: you agree that disputes betwee

In [8]:
# Use the MPS backend when PyTorch reports it as available, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
# Uncomment the next line to use a CUDA GPU instead when one is available.
#device = 'cuda' if torch.cuda.is_available() else device

# Load the nomic embedding model; trust_remote_code=True allows the custom code
# shipped with the model repository to be loaded.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
# Explicitly moving the model to `device` is currently disabled.
#model.to(device)
#print(device)

<All keys matched successfully>


In [9]:
# Embed every preprocessed example clause with the embedding model loaded above.
target_embeddings = model.encode(examples_df['processed_text'].tolist())

In [10]:
# Average over the examples (axis 0) to get one centroid vector for this clause type.
average_embedding = target_embeddings.mean(axis=0)

In [11]:
# Rank corpus sentences by cosine similarity to the centroid of the example
# clauses and export the closest ones for annotation.
n_candidates = min(500, len(embeddings))  # never ask for more rows than exist
similarity_scores = cosine_similarity(embeddings, [average_embedding]).reshape(-1)
# argpartition only guarantees that the last n positions hold the n largest
# scores, in arbitrary order; sort that slice explicitly so the export really
# is most-similar-first.
top_n_position = np.argpartition(similarity_scores, -n_candidates)[-n_candidates:]
top_n_position = top_n_position[np.argsort(similarity_scores[top_n_position])[::-1]]
# unique() keeps first occurrences, so the descending order survives de-duplication.
out_csv = pd.DataFrame(metadata.iloc[top_n_position].sentence.unique())
out_csv.columns = ['sentence']
out_csv['label'] = 0  # placeholder label column
out_csv.to_csv(f'annotations/{clause_type}_most_similar_to_average.csv')

In [12]:
# Rank corpus sentences by their best cosine similarity to ANY single example
# clause and export the closest ones for annotation.
n_candidates = min(500, len(embeddings))  # never ask for more rows than exist
similarity_scores = cosine_similarity(embeddings, target_embeddings).max(axis=1)
# argpartition only guarantees that the last n positions hold the n largest
# scores, in arbitrary order; sort that slice explicitly so the export really
# is most-similar-first.
top_n_position = np.argpartition(similarity_scores, -n_candidates)[-n_candidates:]
top_n_position = top_n_position[np.argsort(similarity_scores[top_n_position])[::-1]]
# unique() keeps first occurrences, so the descending order survives de-duplication.
out_csv = pd.DataFrame(metadata.iloc[top_n_position].sentence.unique())
out_csv.columns = ['sentence']
out_csv['label'] = 0  # placeholder label column
out_csv.to_csv(f'annotations/{clause_type}_most_similar_to_individual.csv')

## Annotate selected examples with ChatGPT

In [13]:
# prepare examples for annotation: read both candidate files and keep each sentence once
_candidate_files = (
    f'annotations/{clause_type}_most_similar_to_individual.csv',
    f'annotations/{clause_type}_most_similar_to_average.csv',
)
df1, df2 = (pd.read_csv(path, index_col=0) for path in _candidate_files)
print(len(df1) + len(df2))
df = pd.concat([df1, df2], axis=0).drop_duplicates(subset='sentence')
print(df.shape)


287
(184, 2)


In [14]:
# prepare prompts for annotation
sentences = df.sentence.to_list()

_classification_template = """Below are ten examples of {0} clauses.\n\n{1}\n\n Is the following sentence a {0} clause? Please select only yes or no.
 {2}"""


def _classification_prompt(sentence):
    """Build one yes/no prompt for `sentence`, drawing a fresh random set of ten examples."""
    shots = examples_df['Examples'].sample(10).str.cat(sep='\n')
    return _classification_template.format(clause_type, shots, sentence)


prompts = [_classification_prompt(s) for s in sentences]
print(len(prompts),prompts[0])

184 Below are ten examples of arbitration clauses.

All claims arising out of or relating to these Terms (including their formation, performance and breach), the parties’ relationship with each other, and/or your use of the Services (including the Site, the App, and any wagering transactions) shall be finally settled by binding arbitration
BY AGREEING TO THIS AGREEMENT, YOU AND GRINDR HEREBY IRREVOCABLY WAIVE ANY CONSTITUTIONAL AND STATUTORY RIGHTS TO SUE IN COURT AND HAVE A TRIAL IN FRONT OF A JUDGE OR A JURY (OTHER THAN SMALL CLAIMS COURT AS PROVIDED ABOVE. You and Grindr are instead electing that all Disputes shall be resolved by arbitration under this arbitration provision.
This Arbitration Agreement applies to any disputes or claims of any kind whatsoever (whether based in contract, tort, statute, regulation, ordinance, fraud, misrepresentation or any other legal or equitable theory) between you and the Bumble Group arising out of or relating to the Terms, prior versions of the Te

In [15]:
sentences = df.sentence.to_list()

_generation_template = """Below are ten examples of {0} clauses.\n\n{1}\n\nGenerate a new sentence similar but not identical to these examples and has the same function as {0} clause.
 """


def _generation_prompt():
    """Build one generation prompt from a fresh random draw of ten example clauses."""
    shots = examples_df['Examples'].sample(10).str.cat(sep='\n')
    return _generation_template.format(clause_type, shots)


# 100 prompts, each with its own random selection of examples.
syntethic_prompts = [_generation_prompt() for _ in range(100)]
print(len(syntethic_prompts),syntethic_prompts[0])

100 Below are ten examples of arbitration clauses.

Except as set forth in the paragraph below, you agree that all claims and disputes between you and Facebook that arise out of or relate in any way to the Terms or your use of the Facebook Service will be resolved either by (a) binding arbitration by a single arbitrator in Santa Clara County, California or (b) binding non-appearance based arbitration conducted by telephone, online or based solely on written submission. 
For any claim (excluding claims for injunctive or other equitable relief) where the total amount of the award sought is less than $10,000, the party requesting relief may elect to resolve the dispute in a cost effective manner through binding non-appearance-based arbitration
Either Grindr or you may demand that any dispute or claim between Grindr and you about or involving the Grindr Services must be settled by arbitration utilizing the dispute resolution procedures of the American Arbitration Association
Any controvers

In [16]:
# Set up OpenAI API credentials
from openai import OpenAI
client = OpenAI()


def generate_response(prompt, model='gpt-4o-mini', max_tokens=100, temperature=.0):
    """Send `prompt` as a single user message and return the text of the reply.

    Args:
        prompt: Text placed in the one user message of the request.
        model: Name of the chat model to call.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to the API.

    Returns:
        The message content of the first (and only requested) choice.
    """
    user_turn = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_turn],
        max_tokens=max_tokens,
        temperature=temperature,
        n=1,
        stop=None,
        timeout=10
    )

    # Exactly one choice is requested (n=1), so read the first entry.
    first_choice = completion.choices[0]
    return first_choice.message.content



In [17]:
import openai
openai.__version__

'1.43.0'

In [None]:
# Ask the model for a yes/no verdict on every candidate sentence.
# `total` is passed explicitly because tqdm cannot infer a length from a zip
# (or enumerate) iterator and would otherwise show a bar without total or ETA.
responses = [
    (sentence, generate_response(prompt, model='gpt-4o', max_tokens=2))
    for sentence, prompt in tqdm(zip(sentences, prompts), total=len(prompts))
]
responses_df = pd.DataFrame(responses, columns=['text','response'])

In [None]:
# Mark these rows as sentences taken from the corpus (0 = not synthetic).
responses_df['synthetic'] = 0

In [None]:
responses_df.value_counts('response')

In [None]:
responses_df.response.unique()


In [None]:
# Generate one synthetic clause per prompt. Iterating the list directly (instead
# of an enumerate iterator whose index was never used) lets tqdm show a total.
syntethic_data = [
    generate_response(prompt, max_tokens=200, temperature=.3)
    for prompt in tqdm(syntethic_prompts)
]

In [None]:
# One row per generated clause: lower-cased text, positive label, synthetic flag set.
syntethic_data_df = pd.DataFrame(syntethic_data, columns=['text']).assign(
    text=lambda frame: frame['text'].str.lower(),
    labels=1,
    synthetic=1,
)
# Show the second generated row as a spot check.
syntethic_data_df.iloc[1]

In [None]:
# Convert the model's yes/no answers into integer labels.
_YES_NO_TO_LABEL = {'yes': 1, 'no': 0}


def _answer_to_label(answer):
    """Map an answer such as 'Yes', 'yes.' or ' No\\n' to 1/0; return anything else unchanged."""
    # Normalise case, surrounding whitespace and a trailing period so harmless
    # variants are not left behind as strings in the label column.
    key = str(answer).strip().rstrip('.').strip().lower()
    return _YES_NO_TO_LABEL.get(key, answer)


# Guard keeps the cell safe to re-run after the column has already been renamed.
if 'response' in responses_df.columns:
    responses_df['response'] = responses_df['response'].map(_answer_to_label)
    responses_df = responses_df.rename(columns={'response': 'labels'})


In [None]:
# Stack the GPT-labelled corpus sentences on top of the generated clauses.
# The original row indices of both frames are kept, so index values repeat.
annotations_gpt = pd.concat([responses_df, syntethic_data_df], axis=0)

In [None]:
# Write the combined annotations to disk, then display the frame's shape.
annotations_gpt.to_csv(f'annotations/{clause_type}_annotations_gpt4.csv')
annotations_gpt.shape

In [None]:
annotations_gpt

# Load annotations and train a classifier

In [None]:
import pandas as pd
from torch.utils.data import DataLoader
from pathlib import Path
import evaluate
from datasets import Dataset
from transformers import AdamW
from tqdm.auto import tqdm
from transformers import get_scheduler
from torch.nn.functional import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoTokenizer, DataCollatorWithPadding

In [None]:

# Re-read the per-sentence metadata table from the processed data directory.
processed_data_dir = Path('processed_data')
metadata = pd.read_csv(processed_data_dir / 'metadata.tsv',sep='\t')

In [None]:
# Load the data from CSV
# `annotator` is the file-name suffix that selects which annotation file to load.
annotator = '_gpt4'
# The first CSV column is the saved row index, so it is read back as the index.
df_annotations = pd.read_csv(f'annotations/{clause_type}_annotations{annotator}.csv', index_col=0) 
df_annotations

In [None]:
df_annotations.value_counts('labels')

In [None]:
# Draw random corpus sentences to use as additional negative examples (label 0).
# NOTE(review): these rows are assumed to be negatives without being checked,
# and the draw is unseeded, so it differs between runs.
df_sample = metadata.sample(n=200).reset_index(drop=True)
# Build the lookup once; previously the annotation texts were converted to a
# list again for every sampled sentence.
already_annotated = set(df_annotations.text)
sents = [s for s in df_sample.sentence.to_list() if s not in already_annotated]
df_sample = pd.DataFrame(sents, columns=['text'])
df_sample['labels'] = 0
df_sample.shape


In [None]:
# Training table: the annotated rows plus the random negatives, restricted to
# the 'text' and 'labels' columns and given a fresh 0..n-1 index.
data = pd.concat([df_annotations[['text','labels']], df_sample[['text','labels']] ], axis=0).reset_index(drop=True)

In [None]:
data

In [None]:


checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Collator built on the same tokenizer; it is handed to the data loaders later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(example):
    """Tokenize the "text" field of `example` with truncation enabled."""
    texts = example["text"]
    return tokenizer(texts, truncation=True)


# Wrap the training table in a Dataset and tokenize it in batches.
dataset = Dataset.from_pandas(data)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
tokenized_datasets

In [None]:
# Hold out 20% of the rows as the test split and drop the raw text column.
tokenized_datasets = (
    tokenized_datasets
    .train_test_split(test_size=0.2)
    .remove_columns(["text"])
)
# Have the datasets return torch tensors.
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

In [None]:
tokenized_datasets

In [None]:


# Both loaders use batches of 8 assembled by the padding collator; only the
# training loader shuffles.
_loader_options = dict(batch_size=8, collate_fn=data_collator)
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **_loader_options)
eval_dataloader = DataLoader(tokenized_datasets["test"], **_loader_options)

In [None]:


# Fine-tune a two-class sequence classifier starting from `checkpoint`,
# evaluating on the held-out split after every epoch.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# Put the model on the same `device` the batches are moved to below. A
# hard-coded 'mps' fails on machines without MPS and does not match the
# batches when `device` is 'cpu'.
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW; the
# hyper-parameters are unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.2)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
# Linear decay of the learning rate over all training steps, without warm-up.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

progress_bar = tqdm(range(num_training_steps))
metric = evaluate.load("glue", "mrpc")

for epoch in range(num_epochs):
    # Re-enter training mode at the start of EVERY epoch: the evaluation pass
    # below switches the model to eval mode, which would otherwise keep
    # dropout disabled from the second epoch onwards.
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluate on the held-out split without tracking gradients.
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    # compute() returns the metric over the batches added in this epoch.
    print(metric.compute())

In [None]:
# Store the fine-tuned classifier and its tokenizer in the same directory.
_model_dir = f"./{clause_type}_model"
model.save_pretrained(_model_dir)
tokenizer.save_pretrained(_model_dir)

# Apply classifier to all examples

In [None]:

# Reload the saved classifier and tokenizer from the clause-specific directory.
_model_dir = f'{clause_type}_model'
model = AutoModelForSequenceClassification.from_pretrained(_model_dir)
tokenizer = AutoTokenizer.from_pretrained(_model_dir)

In [None]:
tqdm.pandas()
# Make sure dropout is off while scoring.
model.eval()


def _class_probabilities(row):
    """Return the softmax over the classifier's logits for `row.sentence` (one row per input)."""
    encoded = tokenizer(row.sentence, return_tensors='pt', truncation=True)
    # Inference only: no_grad avoids building an autograd graph on every
    # forward pass, which the previous .detach() call did not prevent.
    with torch.no_grad():
        return softmax(model(**encoded).logits, dim=1)


# NOTE: despite its name, the 'logits' column stores probabilities. The name is
# kept because later cells read it.
metadata['logits'] = metadata.progress_apply(_class_probabilities, axis=1)

In [None]:
metadata['logits'].iloc[0][0][1].item()

In [None]:
# Extract the class-1 entry of each stored tensor (row 0, column 1) as a plain Python float.
metadata['prob_1'] = metadata['logits'].apply(lambda x: x[0][1].item())

In [None]:
metadata['prob_1'].plot(kind='density')

In [None]:
metadata.sort_values('prob_1', ascending=False).head(10)

In [None]:
# Keep one row per distinct sentence. The explicit .copy() makes the frame
# independent of `metadata`, so the column assignments below do not raise
# SettingWithCopyWarning.
df_deduplicated = metadata.drop_duplicates(subset=['sentence']).copy()
# True for sentences that also appear in the annotation set.
df_deduplicated['annotated'] = df_deduplicated.sentence.isin(df_annotations.text)
# (probability interval, category name), ordered from most to least positive.
int_labels = [((0.95,1.0),'confident_positive'),( (0.80,.95), 'sure_positive'),((0.60,.80), 'leaning_positive'),
                   ((0.50,.60), 'borderline_positive'),((0.40,.50), 'borderline_negative'),
                   ((0.20,.40), 'leaning_negative'),((0.05,.20), 'sure_negative'),((0.0,.05), 'confident_negative')]
for interval, label in int_labels:
    # between() includes both end points, so a probability exactly on a shared
    # boundary ends up with the label of the later (lower) interval.
    df_deduplicated.loc[df_deduplicated.prob_1.between(*interval),'category']  = label



In [None]:
# Export up to ten random sentences from every probability category for manual review.
category_samples = []
for _, label in int_labels:
    in_category = df_deduplicated[df_deduplicated.category == label]
    # sample(10) raises ValueError when a category has fewer than ten rows, so
    # cap the request at the number of rows available.
    category_samples.append(in_category.sample(min(10, len(in_category))))
pd.concat(category_samples, axis=0)[['sentence','category']].to_csv(
    f'annotations/{clause_type}_automatic_annotations_by_category.csv')


In [None]:
# Export every row whose class-1 probability is above 0.5.
_is_positive = metadata.prob_1 > .5
metadata.loc[_is_positive].to_csv(f'annotations/{clause_type}_inference.csv')

## Fin