In [1]:
import pandas as pd
# show full text
pd.set_option('display.max_colwidth', None)

In [2]:
claudette = pd.read_csv('../combined.csv')
ToSDR = pd.read_csv('../tosdr_data.csv')

claudette = claudette[["clause", "severity"]]
claudette = claudette[(claudette.severity == 1) | (claudette.severity == 2) | (claudette.severity == 3)]
#rename columns
claudette = claudette.rename(columns={"clause": "title", "severity": "classification"})
ToSDR = ToSDR[["title", "classification"]]

In [3]:
claudette
# 1 -> clearly fair (good)
# 2 -> potentially unfair (nuetral)
# 3 -> clearly unfair (bad)


Unnamed: 0,title,classification
43,"By signing up or otherwise using the Spotify service, websites, and software applications (together, the “Spotify Service” or “Service”), or accessing any content or material that is made available by Spotify through the Service (the “Content”) you are entering into a binding contract with the Spotify entity indicated at the bottom of this document.",2
75,"Occasionally we may, in our discretion, make changes to the Agreements.",2
76,"In some cases, we will notify you in advance, and your continued use of the Service after the changes have been made will constitute your acceptance of the changes.",2
97,"<ter3><ltd2>Spotify reserves the right, in its absolute discretion, to determine your eligibility for a Trial, and, subject to applicable laws, to withdraw or to modify a Trial at any time without prior notice and with no liability, to the greatest extent permitted under the law.</ltd2></ter3>",2
125,"In all cases, Spotify reserves the right to remove or disable access to any User Content for any or no reason, including but not limited to, User Content that, in Spotify’s sole discretion, violates the Agreements.",3
...,...,...
12289,All disputes or claims arising out of or relating to these Terms of Use shall be subject to the exclusive jurisdiction of the English Courts to which the Parties irrevocably submit.,3
12298,"Vivino reserves the right, at its sole discretion, to modify or replace any of these Terms of Use, or change, suspend, or discontinue the Service (including without limitation, the availability of any feature, database, or content) at any time by posting a notice on the Site or by sending you notice through the Service or via email.",2
12299,<ltd2>Vivino may also impose limits on certain features and services or restrict your access to parts or all of the Service without notice or liability.</ltd2>,2
12300,Your continued use of the Service following the posting of any changes to these Terms of Use constitutes acceptance of those changes.,2


In [4]:
#in ToSDR, rename good, bad, and neutral to 1, 3, and 2 respectively drop the rest

ToSDR["classification"] = ToSDR["classification"].replace("good", 1)
ToSDR["classification"] = ToSDR["classification"].replace("bad", 3)
ToSDR["classification"] = ToSDR["classification"].replace("neutral", 2)
ToSDR = ToSDR[(ToSDR.classification == 1) | (ToSDR.classification == 2) | (ToSDR.classification == 3)]
ToSDR

Unnamed: 0,title,classification
0,You can delete your content from this service,1
1,A license is kept on user-generated content even after you close your account,3
2,This service tracks you on other websites,3
3,You agree to comply with the law of the service's country,2
4,This service retains rights to your content even after you stop using your account,3
...,...,...
246,This Service does not keep any logs.,1
248,"You agree to defend, indemnify, and hold the service harmless in case of a claim related to your use of the service",2
249,This service assumes no liability for any losses or damages resulting from any matter relating to the service,2
250,Information is provided about what kind of information they collect,1


In [5]:
# concat both dataframes

frames = [claudette, ToSDR]
result = pd.concat(frames)
result.reset_index(drop=True, inplace=True)
result

Unnamed: 0,title,classification
0,"By signing up or otherwise using the Spotify service, websites, and software applications (together, the “Spotify Service” or “Service”), or accessing any content or material that is made available by Spotify through the Service (the “Content”) you are entering into a binding contract with the Spotify entity indicated at the bottom of this document.",2
1,"Occasionally we may, in our discretion, make changes to the Agreements.",2
2,"In some cases, we will notify you in advance, and your continued use of the Service after the changes have been made will constitute your acceptance of the changes.",2
3,"<ter3><ltd2>Spotify reserves the right, in its absolute discretion, to determine your eligibility for a Trial, and, subject to applicable laws, to withdraw or to modify a Trial at any time without prior notice and with no liability, to the greatest extent permitted under the law.</ltd2></ter3>",2
4,"In all cases, Spotify reserves the right to remove or disable access to any User Content for any or no reason, including but not limited to, User Content that, in Spotify’s sole discretion, violates the Agreements.",3
...,...,...
1236,This Service does not keep any logs.,1
1237,"You agree to defend, indemnify, and hold the service harmless in case of a claim related to your use of the service",2
1238,This service assumes no liability for any losses or damages resulting from any matter relating to the service,2
1239,Information is provided about what kind of information they collect,1


In [6]:
X = result["title"]
y = result["classification"]

In [7]:
# make model to predict classification based on title

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Convert labels to integers
y = y.astype(int)

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# create pipeline
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None))])

# train model
text_clf.fit(X_train, y_train)

# predict
predictions = text_clf.predict(X_test)

# evaluate
print("Accuracy: ", accuracy_score(y_test, predictions))

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

# cross validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(text_clf, X, y, cv=5)
print("Cross Validation Scores: ", scores)
print("Mean Cross Validation Score: ", scores.mean())


Accuracy:  0.8192771084337349
[[ 10  11   7]
 [  1 176   3]
 [  1  22  18]]
              precision    recall  f1-score   support

           1       0.83      0.36      0.50        28
           2       0.84      0.98      0.90       180
           3       0.64      0.44      0.52        41

    accuracy                           0.82       249
   macro avg       0.77      0.59      0.64       249
weighted avg       0.81      0.82      0.80       249

Cross Validation Scores:  [0.7751004  0.81854839 0.80241935 0.75806452 0.60887097]
Mean Cross Validation Score:  0.7526007254825754


In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

train_features, train_labels = extract_features(train_data_loader, bert_model, device)
test_features, test_labels = extract_features(test_data_loader, bert_model, device)

# Train a simple classifier on the extracted features
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
clf.fit(train_features, train_labels)

# Predict and evaluate
predictions = clf.predict(test_features)
print("Accuracy: ", accuracy_score(test_labels, predictions))
print(confusion_matrix(test_labels, predictions))
print(classification_report(test_labels, predictions))

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/homebrew/Cellar/python@3.11/3.11.4_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/spawn.py", line 120, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.11/3.11.4_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/multiprocessing/spawn.py", line 130, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'ToSDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 