# Distil-RoBERTa

In [1]:
# Packages
# SentenceTransformer loads the embedding model; cos_sim scores embedding pairs.
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
# NLTK for sentence tokenization
import nltk
# Fetch the 'punkt' data used by nltk.sent_tokenize; when it is already
# installed the call just reports that the package is up to date.
# NOTE(review): newer NLTK releases may look for a differently named resource
# ('punkt_tab') -- confirm against the installed NLTK version.
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ijyli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Pre-provided Code

In [2]:
# Instantiate the sentence-embedding model from the
# 'sentence-transformers/all-distilroberta-v1' identifier; every encode()
# call in this notebook goes through this single `model` object.
model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

In [3]:
# Sanity check: embed two example sentences and report how similar they are.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]
embeddings = model.encode(sentences)

# One embedding per input sentence, in the same order as `sentences`.
first_embedding, second_embedding = embeddings
pair_similarity = cos_sim(first_embedding, second_embedding)
print('cosine similarity:', pair_similarity)

cosine similarity: tensor([[0.2988]])


## A Terms of Service Document

In [4]:
# Read the Apple privacy policy from the Terms of Service corpus.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding (which differs between Windows and
# Linux/macOS). Bytes that cannot be decoded become U+FFFD instead of raising
# UnicodeDecodeError; the ASCII filter below removes those again.
apple_privacy_policy_path = '../Text Data/Terms of Service Corpus/text/AppleServices_PrivacyPolicy.txt'
with open(apple_privacy_policy_path, 'r', encoding='utf-8', errors='replace') as file:
    apple_privacy_policy_text = file.read()

# Reduce the text to plain ASCII. Non-ASCII characters are *dropped*, not
# transliterated, so e.g. typographic quotes vanish rather than turning into
# their ASCII look-alikes.
apple_privacy_policy_text = apple_privacy_policy_text.encode('ascii', 'ignore').decode('ascii')

print(apple_privacy_policy_text)


         Privacy Policy The Apple Privacy Policy was updated on December 31, 2019.
Your privacy is important to Apple so weve developed a Privacy Policy that covers how we collect, use, disclose, transfer, and store your personal information.
In addition to this Privacy Policy, we provide data and privacy information imbedded in our products connected with our Data &amp.
Privacy Icon for certain features that ask to use your personal information.



You can review this information before enabling these features, in Settings related to those features and/or online at apple.com/legal/privacy.
Please take a moment to familiarize yourself with our privacy practices and contact us if you have any questions.

Your California Privacy Disclosures
Information Regarding Commercial Electronic Messages in Canada
Apple Health Research Apps Privacy Policy




 Collection and Use of PersonalInformation
Personal information is data that can be used to identify or contact a single person.
You may be a

In [5]:
# Break the policy text into sentences with NLTK's sentence tokenizer.
tokenized_sentences = nltk.sent_tokenize(apple_privacy_policy_text)

# Prefix every sentence with "Apple " so the company name is part of each one.
# Sentences are kept exactly as tokenized: no extra split on '\n' is applied,
# so a single entry may still contain embedded newlines.
apple_sentences = [f'Apple {text}' for text in tokenized_sentences]
apple_sentences

['Apple \n         Privacy Policy The Apple Privacy Policy was updated on December 31, 2019.',
 'Apple Your privacy is important to Apple so weve developed a Privacy Policy that covers how we collect, use, disclose, transfer, and store your personal information.',
 'Apple In addition to this Privacy Policy, we provide data and privacy information imbedded in our products connected with our Data &amp.',
 'Apple Privacy Icon for certain features that ask to use your personal information.',
 'Apple You can review this information before enabling these features, in Settings related to those features and/or online at apple.com/legal/privacy.',
 'Apple Please take a moment to familiarize yourself with our privacy practices and contact us if you have any questions.',
 'Apple Your California Privacy Disclosures\nInformation Regarding Commercial Electronic Messages in Canada\nApple Health Research Apps Privacy Policy\n\n\n\n\n Collection and Use of PersonalInformation\nPersonal information is d

In [6]:
# Encode every "Apple "-prefixed policy sentence with the model loaded above;
# the search cells compare each query embedding against these.
apple_embeddings = model.encode(apple_sentences)

In [7]:
# Basic semantic search: which policy sentences are closest to the question
# "Will Apple disclose non-personal information?"
query = "Will Apple disclose non-personal information?"
query_embedding = model.encode(query)

# Score the single query against every sentence embedding at once.
similarities = cos_sim(query_embedding, apple_embeddings)
print('similarities:', similarities)

# The first (and only) row holds the query's score for each sentence.
sims = similarities[0].tolist()
print(sims)

# Sentence indices ordered by descending score; keep the five best.
top5_idx = sorted(range(len(sims)), key=sims.__getitem__, reverse=True)[:5]
print(top5_idx)

# Show each selected sentence followed by its similarity score.
for idx in top5_idx:
    print(apple_sentences[idx], sims[idx], sep='\n')

similarities: tensor([[0.5190, 0.6058, 0.6155, 0.4627, 0.5742, 0.6465, 0.4646, 0.7226, 0.7096,
         0.5025, 0.6357, 0.5928, 0.6975, 0.6630, 0.4704, 0.5351, 0.5881, 0.6134,
         0.2595, 0.5861, 0.6487, 0.5125, 0.5770, 0.4959, 0.6023, 0.4976, 0.6341,
         0.5774, 0.6284, 0.5272, 0.4798, 0.6240, 0.5080, 0.5019, 0.6333, 0.5696,
         0.5041, 0.5130, 0.4226, 0.4331, 0.3960, 0.7379, 0.6068, 0.5257, 0.4695,
         0.6202, 0.5512, 0.5084, 0.5751, 0.5543, 0.6788, 0.3079, 0.3747, 0.6991,
         0.6043, 0.6747, 0.3344, 0.3745, 0.3715, 0.3514, 0.4498, 0.5594, 0.4016,
         0.4927, 0.2755, 0.5802, 0.6312, 0.2267, 0.3358, 0.2600, 0.1927, 0.4243,
         0.4800, 0.5611, 0.4673, 0.6352, 0.3318, 0.2534, 0.4197, 0.3075, 0.2423,
         0.4222, 0.6295, 0.4960, 0.7116, 0.4985, 0.6407, 0.4343, 0.5734, 0.6529,
         0.7687, 0.7840, 0.5961, 0.6203, 0.4726, 0.3421, 0.5756, 0.3978, 0.6671,
         0.6536, 0.6727, 0.4066, 0.6061, 0.4917, 0.4058, 0.4737, 0.6467, 0.4650,
         0.519

In [8]:
# Basic semantic search for the question "Is my privacy important to Apple?":
# embed the query, score it against all sentence embeddings, list the best five.
query = "Is my privacy important to Apple?"
query_embedding = model.encode(query)

similarities = cos_sim(query_embedding, apple_embeddings)
print('similarities:', similarities)

# Flatten the one-row score matrix into a plain list of floats.
sims = similarities.tolist()[0]
print(sims)

# Pair each score with its sentence index, order best-first, keep five indices.
ranked_pairs = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
top5_idx = [position for position, _ in ranked_pairs[:5]]
print(top5_idx)

# Print every top sentence with its score on the following line.
for idx in top5_idx:
    print(f"{apple_sentences[idx]}\n{sims[idx]}")

similarities: tensor([[0.5079, 0.7376, 0.5850, 0.5126, 0.5818, 0.6851, 0.4637, 0.6751, 0.6373,
         0.4056, 0.5695, 0.4729, 0.5967, 0.5065, 0.4194, 0.5704, 0.5442, 0.5591,
         0.3365, 0.5717, 0.6882, 0.4826, 0.5403, 0.4222, 0.5690, 0.3495, 0.5820,
         0.6023, 0.5605, 0.4120, 0.3696, 0.4831, 0.5084, 0.4685, 0.5618, 0.4778,
         0.3948, 0.4040, 0.3150, 0.3586, 0.3595, 0.6463, 0.4585, 0.4015, 0.4035,
         0.5486, 0.4949, 0.4122, 0.5382, 0.5470, 0.5603, 0.3465, 0.3736, 0.6562,
         0.5435, 0.6084, 0.2771, 0.4106, 0.3613, 0.3581, 0.4405, 0.5795, 0.4403,
         0.4299, 0.2682, 0.5364, 0.6196, 0.3299, 0.3618, 0.3477, 0.2729, 0.3992,
         0.3969, 0.4475, 0.4553, 0.5202, 0.2541, 0.2019, 0.3598, 0.3584, 0.1958,
         0.3919, 0.5434, 0.4285, 0.6489, 0.4238, 0.6578, 0.4277, 0.6129, 0.5721,
         0.7152, 0.7136, 0.4854, 0.5334, 0.5595, 0.3729, 0.5454, 0.3665, 0.6243,
         0.6359, 0.5602, 0.4708, 0.5562, 0.4620, 0.4045, 0.5180, 0.6534, 0.4417,
         0.610