In [1]:
import pandas as pd
import numpy as np
import preprocess
import matplotlib.pyplot as plt

In [2]:
col_names = ['Data Retention', 'Data Security', 'Do Not Track',
       'First Party Collection/Use', 'International and Specific Audiences',
       'Introductory/Generic', 'Policy Change', 'Practice not covered',
       'Privacy contact information', 'Third Party Sharing/Collection',
       'User Access, Edit and Deletion', 'User Choice/Control']

In [3]:
majority_path = r'C:\Users\IsmailKaraman\workspace\data\privacy_policy_data\OPP-115_v2\majority.csv'

In [4]:
df = pd.read_csv(majority_path)

In [5]:
df.head(3)

Unnamed: 0,text,Data Retention,Data Security,Do Not Track,First Party Collection/Use,International and Specific Audiences,Introductory/Generic,Policy Change,Practice not covered,Privacy contact information,Third Party Sharing/Collection,"User Access, Edit and Deletion",User Choice/Control
0,"""""Contact Us"""" Link If you contact us through...",0,0,0,1,0,0,0,0,0,0,0,0
1,(b) Information automatically collected There...,0,0,0,1,0,0,0,0,0,0,0,0
2,(ii) You have entered a contest or sweepstake...,0,0,0,0,0,0,0,0,0,1,0,0


In [6]:
data_dist = df[col_names].sum()
data_dist

Data Retention                            78
Data Security                            207
Do Not Track                              31
First Party Collection/Use              1181
International and Specific Audiences     296
Introductory/Generic                     378
Policy Change                            116
Practice not covered                     129
Privacy contact information              202
Third Party Sharing/Collection           931
User Access, Edit and Deletion           147
User Choice/Control                      352
dtype: int64

In [7]:
# preprocessing text data
df['preprocessed_text'] = df['text'].apply(preprocess.preprocess_text)

In [10]:
X = df['preprocessed_text']
y = df[col_names]

# bag of words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X).toarray()

In [12]:
len(vectorizer.get_feature_names())

3686

In [13]:
from nltk.corpus import words

In [14]:
corpus = vectorizer.get_feature_names()

In [15]:
set(corpus)

{'connectivity',
 'america',
 'effectuate',
 'mapping',
 'loss',
 'readership',
 'campaigns',
 'nais',
 'address',
 'ohio',
 'offensive',
 'sexual',
 'disclosure',
 'dlc',
 'coverage',
 'subcontractor',
 'barrier',
 'constantly',
 'sustain',
 'excite',
 'fridays',
 'interview',
 'entry',
 'divestiture',
 'concert',
 'willfully',
 'june',
 'dangerous',
 'dollar',
 'technician',
 'vault',
 'ass',
 'habit',
 'present',
 'upload',
 'accepted',
 'permanent',
 'female',
 'removal',
 'solid',
 'educate',
 'divulge',
 'level',
 'destination',
 'position',
 'vikingsnewsletter',
 'enthusiast',
 'usability',
 'transact',
 'collaborator',
 'update',
 'limiting',
 'leisure',
 'power',
 'effectiveness',
 'specialized',
 'guarantor',
 'requirement',
 'terminate',
 'elapsed',
 'directed',
 'like',
 'check',
 'pursue',
 'click',
 'delaware',
 'interchange',
 'concepts',
 'kid',
 'measurement',
 'issuer',
 'apparent',
 'terminal',
 'memory',
 'share',
 'united',
 'way',
 'spam',
 'cleartrust',
 'fax',
 

In [16]:
difference = set(corpus).difference(set(words.words()))
intersection = set(corpus).intersection(set(words.words()))

In [17]:
len(difference), len(intersection), len(corpus)

(853, 2833, 3686)

In [18]:
difference

{'aaa',
 'aaas',
 'aboutadsinfo',
 'abused',
 'acc',
 'accessed',
 'accessing',
 'acknowledgement',
 'acknowledgment',
 'acsi',
 'acted',
 'acura',
 'adap',
 'adchoices',
 'addtionally',
 'adheres',
 'admin',
 'administers',
 'administrated',
 'adopts',
 'adsense',
 'adults',
 'advertised',
 'advertisements',
 'advertize',
 'adzerk',
 'affiliated',
 'africa',
 'aggregated',
 'agrees',
 'ahfc',
 'airline',
 'alerting',
 'alexandria',
 'alienware',
 'allstate',
 'alphanumeric',
 'alphanumerical',
 'altered',
 'america',
 'american',
 'analyzed',
 'anonymize',
 'anonymized',
 'answered',
 'anticipated',
 'anytime',
 'aol',
 'api',
 'apis',
 'app',
 'apples',
 'applications',
 'applies',
 'apps',
 'april',
 'archived',
 'arises',
 'arizona',
 'arlington',
 'artists',
 'artwork',
 'asked',
 'asks',
 'aspects',
 'assigns',
 'att',
 'attendee',
 'attendees',
 'attitudes',
 'attn',
 'audiencemanager',
 'audiences',
 'auditing',
 'austin',
 'australia',
 'automate',
 'automated',
 'autoplay',
 

In [38]:
with open('difference.txt', 'w') as f:
    for item in difference:
        f.write("%s\n" % item)

In [23]:
lemmatizer = WordNetLemmatizer()

In [25]:
lemm = [lemmatizer.lemmatize(word) for word in difference]

In [26]:
set(lemm).intersection(set(words.words()))

{'achievement',
 'acquisition',
 'activist',
 'activity',
 'actor',
 'ad',
 'addition',
 'adjuster',
 'adjustment',
 'administrator',
 'admission',
 'adult',
 'advertisement',
 'advertiser',
 'advisor',
 'affair',
 'agency',
 'agent',
 'aggregator',
 'agreement',
 'allegation',
 'allergy',
 'alliance',
 'alternative',
 'amendment',
 'anecdote',
 'annotation',
 'announcement',
 'apple',
 'applicant',
 'application',
 'appointment',
 'approval',
 'area',
 'arrangement',
 'art',
 'artist',
 'aspect',
 'assessment',
 'association',
 'assurance',
 'atlantic',
 'attitude',
 'audience',
 'auditor',
 'authority',
 'auto',
 'avenue',
 'backup',
 'banner',
 'barrier',
 'belief',
 'bit',
 'bookmark',
 'bookseller',
 'boy',
 'brochure',
 'brokerage',
 'browser',
 'bureau',
 'business',
 'buyer',
 'calculator',
 'camera',
 'cancellation',
 'capability',
 'car',
 'cardholder',
 'carrier',
 'category',
 'characteristic',
 'child',
 'choice',
 'circumstance',
 'citizen',
 'classified',
 'clause',
 'cl

In [17]:
difference

{'interfaces',
 'webbeacons',
 'ios',
 'acknowledgment',
 'autoplay',
 'employers',
 'worldwide',
 'valueclick',
 'telecollege',
 'disney',
 'cybersavvy',
 'lenders',
 'divestitures',
 'inquiries',
 'topics',
 'crimes',
 'tcp',
 'initiatives',
 'methods',
 'hasoffers',
 'toolbar',
 'webpage',
 'hd',
 'functionalities',
 'laboratories',
 'wilmington',
 'discussions',
 'citizens',
 'scorecard',
 'kid',
 'firewall',
 'webmasters',
 'westport',
 'companiesexcept',
 'acsi',
 'metro',
 'offhttp',
 'productions',
 'websites',
 'adsense',
 'mclean',
 'cpl',
 'timestamp',
 'info',
 'morehead',
 'mfa',
 'minnesota',
 'snea',
 'notifications',
 'guidelines',
 'languages',
 'metadata',
 'cvv',
 'october',
 'facts',
 'trustmark',
 'enhancements',
 'mozilla',
 'penalties',
 'contributions',
 'alliances',
 'monday',
 'usa',
 'interactions',
 'consumers',
 'earlier',
 'javascript',
 'overs',
 'buymytronics',
 'app',
 'kelowna',
 'employees',
 'anecdotes',
 'procedures',
 'omissions',
 'tangeroutlets',

In [None]:
for row in df['text']:
    print('len of row', len(row))
    intersection = set(row).intersection(set(words.words()))
    print('len of words', len([word for word in row if word in intersection]))
    print('----------------------------------------------')

len of row 102
len of words 89
----------------------------------------------
len of row 337
len of words 295
----------------------------------------------
len of row 312
len of words 272
----------------------------------------------
len of row 630
len of words 544
----------------------------------------------
len of row 20
len of words 18
----------------------------------------------
len of row 48
len of words 42
----------------------------------------------
len of row 310
len of words 269
----------------------------------------------
len of row 307
len of words 263
----------------------------------------------
len of row 19
len of words 17
----------------------------------------------
len of row 125
len of words 111
----------------------------------------------
len of row 638
len of words 558
----------------------------------------------
len of row 159
len of words 139
----------------------------------------------
len of row 652
len of words 568
---------------------------

len of words 230
----------------------------------------------
len of row 245
len of words 214
----------------------------------------------
len of row 164
len of words 143
----------------------------------------------
len of row 273
len of words 237
----------------------------------------------
len of row 99
len of words 88
----------------------------------------------
len of row 621
len of words 539
----------------------------------------------
len of row 140
len of words 120
----------------------------------------------
len of row 278
len of words 245
----------------------------------------------
len of row 707
len of words 620
----------------------------------------------
len of row 198
len of words 172
----------------------------------------------
len of row 232
len of words 200
----------------------------------------------
len of row 306
len of words 269
----------------------------------------------
len of row 143
len of words 125
-------------------------------------

len of words 160
----------------------------------------------
len of row 303
len of words 265
----------------------------------------------
len of row 328
len of words 288
----------------------------------------------
len of row 194
len of words 172
----------------------------------------------
len of row 113
len of words 100
----------------------------------------------
len of row 85
len of words 74
----------------------------------------------
len of row 170
len of words 150
----------------------------------------------
len of row 104
len of words 91
----------------------------------------------
len of row 181
len of words 159
----------------------------------------------
len of row 179
len of words 158
----------------------------------------------
len of row 362
len of words 318
----------------------------------------------
len of row 247
len of words 217
----------------------------------------------
len of row 280
len of words 245
--------------------------------------

len of words 113
----------------------------------------------
len of row 197
len of words 176
----------------------------------------------
len of row 136
len of words 119
----------------------------------------------
len of row 192
len of words 163
----------------------------------------------
len of row 263
len of words 231
----------------------------------------------
len of row 139
len of words 123
----------------------------------------------
len of row 227
len of words 199
----------------------------------------------
len of row 477
len of words 417
----------------------------------------------
len of row 496
len of words 437
----------------------------------------------
len of row 362
len of words 316
----------------------------------------------
len of row 138
len of words 123
----------------------------------------------
len of row 293
len of words 258
----------------------------------------------
len of row 375
len of words 330
-----------------------------------

len of words 189
----------------------------------------------
len of row 345
len of words 303
----------------------------------------------
len of row 430
len of words 377
----------------------------------------------
len of row 60
len of words 53
----------------------------------------------
len of row 263
len of words 233
----------------------------------------------
len of row 81
len of words 73
----------------------------------------------
len of row 212
len of words 188
----------------------------------------------
len of row 339
len of words 302
----------------------------------------------
len of row 264
len of words 238
----------------------------------------------
len of row 164
len of words 146
----------------------------------------------
len of row 139
len of words 120
----------------------------------------------
len of row 86
len of words 77
----------------------------------------------
len of row 178
len of words 159
-----------------------------------------

len of words 96
----------------------------------------------
len of row 510
len of words 444
----------------------------------------------
len of row 224
len of words 198
----------------------------------------------
len of row 419
len of words 371
----------------------------------------------
len of row 147
len of words 131
----------------------------------------------
len of row 413
len of words 357
----------------------------------------------
len of row 131
len of words 113
----------------------------------------------
len of row 341
len of words 297
----------------------------------------------
len of row 235
len of words 208
----------------------------------------------
len of row 87
len of words 76
----------------------------------------------
len of row 123
len of words 107
----------------------------------------------
len of row 368
len of words 313
----------------------------------------------
len of row 310
len of words 277
--------------------------------------

len of words 42
----------------------------------------------
len of row 353
len of words 308
----------------------------------------------
len of row 222
len of words 194
----------------------------------------------
len of row 268
len of words 238
----------------------------------------------
len of row 184
len of words 162
----------------------------------------------
len of row 468
len of words 407
----------------------------------------------
len of row 235
len of words 206
----------------------------------------------
len of row 229
len of words 201
----------------------------------------------
len of row 141
len of words 123
----------------------------------------------
len of row 682
len of words 601
----------------------------------------------
len of row 221
len of words 194
----------------------------------------------
len of row 307
len of words 270
----------------------------------------------
len of row 204
len of words 175
------------------------------------

len of words 137
----------------------------------------------
len of row 171
len of words 147
----------------------------------------------
len of row 193
len of words 169
----------------------------------------------
len of row 26
len of words 23
----------------------------------------------
len of row 279
len of words 245
----------------------------------------------
len of row 328
len of words 287
----------------------------------------------
len of row 230
len of words 202
----------------------------------------------
len of row 58
len of words 51
----------------------------------------------
len of row 253
len of words 225
----------------------------------------------
len of row 802
len of words 706
----------------------------------------------
len of row 145
len of words 127
----------------------------------------------
len of row 142
len of words 126
----------------------------------------------
len of row 634
len of words 560
---------------------------------------

In [31]:
intersection

{'family',
 'frame',
 'eliminate',
 'specificity',
 'reception',
 'correction',
 'securely',
 'regulate',
 'collectively',
 'hotel',
 'locally',
 'automobile',
 'conspicuous',
 'main',
 'fulfillment',
 'materially',
 'practitioner',
 'entertainment',
 'obtain',
 'mean',
 'reduction',
 'reveal',
 'mutually',
 'fact',
 'label',
 'geo',
 'inundate',
 'integrity',
 'random',
 'fortune',
 'rest',
 'regulation',
 'days',
 'highly',
 'weather',
 'numerical',
 'priest',
 'conformance',
 'involvement',
 'kiosk',
 'broadcast',
 'decorate',
 'judicial',
 'confine',
 'type',
 'venture',
 'motley',
 'correct',
 'cash',
 'congress',
 'pay',
 'daughter',
 'inspect',
 'revision',
 'zip',
 'love',
 'dissemination',
 'sex',
 'accessibility',
 'avenue',
 'mind',
 'chapter',
 'contradict',
 'short',
 'mortgage',
 'interpret',
 'reminder',
 'layout',
 'independent',
 'premier',
 'nat',
 'lend',
 'internally',
 'consideration',
 'strong',
 'implementation',
 'tamper',
 'throw',
 'incur',
 'yahoo',
 'residen

# Tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X).toarray()

# Word2vec

In [None]:
from transformers import GensimWord2VecVectorizer

gensim_word2vec_tr = GensimWord2VecVectorizer(size=50, min_count=3, sg=1, alpha=0.025, iter=10)

In [None]:
gensim_word2vec_tr.fit

# glove

# Bert

# Fastext

# Elmo

# XLNet

# Transformers 