In [42]:
import pandas as pd
import numpy as np
import preprocess

In [43]:
# The twelve label columns of the dataset, in the order they appear in the CSV header.
col_names = [
    'Data Retention',
    'Data Security',
    'Do Not Track',
    'First Party Collection/Use',
    'International and Specific Audiences',
    'Introductory/Generic',
    'Policy Change',
    'Practice not covered',
    'Privacy contact information',
    'Third Party Sharing/Collection',
    'User Access, Edit and Deletion',
    'User Choice/Control',
]

In [44]:
# Location of the annotated dataset CSV.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
path = r'C:\Users\IsmailKaraman\workspace\data\privacy_policy_data\OPP-115_v2\Polisis_Benchmark-master\datasets\majority.csv'
# NOTE(review): the alias is named union_path but the file is majority.csv — confirm which dataset is intended.
union_path = path

In [45]:
# Load the CSV at `path` into a DataFrame.
df = pd.read_csv(path)

In [9]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,text,Data Retention,Data Security,Do Not Track,First Party Collection/Use,International and Specific Audiences,Introductory/Generic,Policy Change,Practice not covered,Privacy contact information,Third Party Sharing/Collection,"User Access, Edit and Deletion",User Choice/Control
0,"""""Contact Us"""" Link If you contact us through...",0,0,0,1,0,0,0,0,0,0,0,0
1,(b) Information automatically collected There...,0,0,0,1,0,0,0,0,0,0,0,0
2,(ii) You have entered a contest or sweepstake...,0,0,0,0,0,0,0,0,0,1,0,0


In [58]:
# Column-wise totals of the label columns, i.e. how many rows carry each label.
data_dist = df.loc[:, col_names].sum()
data_dist

Data Retention                            78
Data Security                            207
Do Not Track                              31
First Party Collection/Use              1181
International and Specific Audiences     296
Introductory/Generic                     378
Policy Change                            116
Practice not covered                     129
Privacy contact information              202
Third Party Sharing/Collection           931
User Access, Edit and Deletion           147
User Choice/Control                      352
dtype: int64

In [14]:
# (rows, columns) of the loaded frame.
df.shape

(3399, 13)

In [18]:
# preprocessing text data
# Applies preprocess.preprocess_text to every value of the 'text' column and
# overwrites the column with the result.
# NOTE(review): because the column is overwritten in place, re-running this cell
# feeds already-processed text through preprocess_text again.
df['text'] = df['text'].apply(preprocess.preprocess_text)

In [24]:
# Features are the text column; targets are the label columns listed in col_names.
X, y = df['text'], df[col_names]

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF feature matrix from the text column.
# The text is read from df['text'] rather than from X so that this cell can be
# re-run: X is rebound here from the text Series to the numeric matrix.
# NOTE(review): the vectorizer is fit on every row before train_test_split, so
# the vocabulary/IDF weights also see the rows that later become the test set —
# consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text']).toarray()

# a base classifier

In [73]:
def print_losses(y_test, preds, clf):
    """Print multi-label evaluation metrics for one classifier.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True label indicators. ``y_test.values`` is passed to the metric
        functions and the column names are used as ``target_names`` in the
        classification report.
    preds : array-like
        Predicted label indicators, compared against ``y_test.values``.
    clf : str
        Display name of the classifier, printed in the header line.

    Returns
    -------
    None
        All results are written to stdout.
    """
    # "\033[1m" / "\033[0m" switch bold text on and off in the output.
    print("\033[1m" + clf + ' results: ' + "\033[0m")
    print('----------------------')
    hamLoss = hamming_loss(y_test.values, preds)
    print('hamLoss: {:.2f}'.format(hamLoss))
    acc_score = accuracy_score(y_test.values, preds)
    print('Exact Match Ratio: {:.2f}'.format(acc_score))
    print('-----------------------------------------------')
    print("\033[1m" + 'Classification Report' + "\033[0m")
    # zero_division=0 reports 0.0 for labels with no predicted/true samples,
    # the same value as the default but without the undefined-metric warning.
    print(classification_report(
        y_test.values,
        preds,
        target_names=list(y_test.columns),
        zero_division=0,
    ))
    print('--------------------------------------------------------------------------------------------')
    print()

In [74]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [75]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [77]:
# Linear SVM wrapped in one-vs-rest, i.e. one binary classifier per label column.
# random_state is fixed so that repeated runs report the same scores.
svmClassifier = OneVsRestClassifier(LinearSVC(random_state=0), n_jobs=-1)
svmClassifier.fit(X_train, y_train.values)
svmPreds = svmClassifier.predict(X_test)
print_losses(y_test, svmPreds, 'SVM Classifier')

# Random Forest fitted directly on the multi-label target matrix.
# random_state is fixed for the same reason: the forest is randomized, so
# unseeded runs give different metrics each time.
rfc = RandomForestClassifier(n_jobs=-1, random_state=0)
rfc.fit(X_train, y_train.values)
rfc_preds = rfc.predict(X_test)
print_losses(y_test, rfc_preds, 'Random Forest Classifier')

[1mSVM Classifier results: [0m
----------------------
hamLoss: 0.04
Exact Match Ratio: 0.61
-----------------------------------------------
[1mClassification Report[0m
                                      precision    recall  f1-score   support

                      Data Retention       0.67      0.18      0.29        11
                       Data Security       0.93      0.64      0.76        44
                        Do Not Track       1.00      0.75      0.86         8
          First Party Collection/Use       0.84      0.80      0.82       237
International and Specific Audiences       0.95      0.88      0.91        60
                Introductory/Generic       0.77      0.55      0.64        75
                       Policy Change       0.95      0.78      0.86        27
                Practice not covered       0.17      0.04      0.07        23
         Privacy contact information       0.84      0.66      0.74        32
      Third Party Sharing/Collection       0.84

  _warn_prf(average, modifier, msg_start, len(result))


[1mRandom Forest Classifier results: [0m
----------------------
hamLoss: 0.05
Exact Match Ratio: 0.51
-----------------------------------------------
[1mClassification Report[0m
                                      precision    recall  f1-score   support

                      Data Retention       0.00      0.00      0.00        11
                       Data Security       1.00      0.45      0.62        44
                        Do Not Track       1.00      0.12      0.22         8
          First Party Collection/Use       0.89      0.76      0.82       237
International and Specific Audiences       1.00      0.52      0.68        60
                Introductory/Generic       0.92      0.32      0.48        75
                       Policy Change       1.00      0.59      0.74        27
                Practice not covered       0.00      0.00      0.00        23
         Privacy contact information       1.00      0.50      0.67        32
      Third Party Sharing/Collection 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# K-means

In [None]:
# Work on a copy of df; cluster/topic label columns are added to this copy.
df_clustering = df.copy()

In [100]:
from sklearn.cluster import KMeans

# Cluster the feature matrix into 12 groups and store each row's cluster id.
kmeans = KMeans(n_clusters=12, random_state=0)
kmeans.fit(X)
df_clustering['kmeans_label'] = kmeans.labels_

array([1, 6, 9, ..., 3, 4, 0])

In [108]:
# Number of rows assigned to each K-means cluster.
df_clustering['kmeans_label'].value_counts()

4     936
3     346
1     319
9     312
5     305
8     289
6     244
2     190
0     163
7     133
11    110
10     52
Name: kmeans_label, dtype: int64

In [102]:
# Show the per-label totals again, for comparison with the cluster sizes.
data_dist

Data Retention                            78
Data Security                            207
Do Not Track                              31
First Party Collection/Use              1181
International and Specific Audiences     296
Introductory/Generic                     378
Policy Change                            116
Practice not covered                     129
Privacy contact information              202
Third Party Sharing/Collection           931
User Access, Edit and Deletion           147
User Choice/Control                      352
dtype: int64

In [103]:
# For each K-means cluster id (0..11), print how many rows in that cluster
# carry each label. The cluster ids live in df_clustering['kmeans_label'].
for i in range(12):
    print(df_clustering[df_clustering['kmeans_label']==i][col_names].sum())
    print('-'*50)

Data Retention                           0
Data Security                            1
Do Not Track                             1
First Party Collection/Use              47
International and Specific Audiences     1
Introductory/Generic                     8
Policy Change                            1
Practice not covered                     7
Privacy contact information              1
Third Party Sharing/Collection          93
User Access, Edit and Deletion           3
User Choice/Control                     33
dtype: int64
--------------------------------------------------
Data Retention                            6
Data Security                             3
Do Not Track                              0
First Party Collection/Use               56
International and Specific Audiences      7
Introductory/Generic                      6
Policy Change                             2
Practice not covered                      6
Privacy contact information             106
Third Party Sharing/Coll

# LDA topic models

In [104]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 12-topic LDA model on X and label every row with the index of the
# largest value in its row of the fit_transform output.
lda = LatentDirichletAllocation(n_components=12, random_state=0)
df_clustering['lda_labels'] = np.argmax(lda.fit_transform(X), axis=1)

In [109]:
# Number of rows assigned to each LDA topic.
df_clustering['lda_labels'].value_counts()

2     1191
5      853
11     613
10     301
3      140
6      123
7       72
1       68
8       12
4       11
0        9
9        6
Name: lda_labels, dtype: int64

In [105]:
# For each LDA topic id (0..11), print how many rows in that topic carry each label.
for topic_id in range(12):
    in_topic = df_clustering['lda_labels'] == topic_id
    print(df_clustering.loc[in_topic, col_names].sum())
    print('-' * 50)

Data Retention                          1
Data Security                           0
Do Not Track                            0
First Party Collection/Use              3
International and Specific Audiences    1
Introductory/Generic                    2
Policy Change                           0
Practice not covered                    0
Privacy contact information             2
Third Party Sharing/Collection          0
User Access, Edit and Deletion          1
User Choice/Control                     0
dtype: int64
--------------------------------------------------
Data Retention                           1
Data Security                           42
Do Not Track                             0
First Party Collection/Use               2
International and Specific Audiences    13
Introductory/Generic                     5
Policy Change                            0
Practice not covered                     1
Privacy contact information              5
Third Party Sharing/Collection           1
Us

# SVD

In [110]:
from sklearn.decomposition import TruncatedSVD

In [None]:
from scipy.sparse import csr_matrix

# Convert the dense feature matrix held in X (produced with .toarray()) to
# SciPy CSR form. X is used as the input because no X_dense variable is defined
# anywhere in the notebook; csr_matrix also accepts an already-sparse X, so the
# cell is safe to re-run.
X = csr_matrix(X)

In [111]:
# Truncated SVD configured for 12 output components, 7 iterations and a fixed seed.
svd = TruncatedSVD(n_components=12, n_iter=7, random_state=42)

In [113]:
# Fit the SVD on X and keep the reduced representation of X in `aa`.
aa = svd.fit_transform(X)

In [115]:
# Inspect the reduced vector for the row at position 2.
aa[2]

array([ 0.31054595, -0.01281775, -0.11068104,  0.26388503,  0.02019724,
        0.06006941, -0.03946336,  0.06881817, -0.01857787, -0.08771558,
       -0.02826877,  0.02575193])

In [94]:
# Print, one per line: the per-component explained-variance ratios, their total,
# and the singular values of the fitted SVD.
print(
    svd.explained_variance_ratio_,
    svd.explained_variance_ratio_.sum(),
    svd.singular_values_,
    sep='\n',
)

array([0.01111642, 0.01111646, 0.02454429, 0.0111171 , 0.01111659,
       0.8316533 , 0.01111735, 0.01111643, 0.01111973, 0.01112826,
       0.04370891, 0.01114516])

In [None]:
# TODO: try hierarchical clustering

In [None]:
# TODO: search for text clustering approaches

In [None]:
# TODO: search for multi-label clustering approaches

In [16]:
# Display the DataFrame (pandas abbreviates long frames in the rendered output).
df

Unnamed: 0,text,Data Retention,Data Security,Do Not Track,First Party Collection/Use,International and Specific Audiences,Introductory/Generic,Policy Change,Practice not covered,Privacy contact information,Third Party Sharing/Collection,"User Access, Edit and Deletion",User Choice/Control
0,contact us link contact us contact us link sit...,0,0,0,1,0,0,0,0,0,0,0,0
1,information automatically collect circumstance...,0,0,0,1,0,0,0,0,0,0,0,0
2,ii enter contest sweepstakes sponsor third par...,0,0,0,0,0,0,0,0,0,1,0,0
3,web beacon military web page web page partner ...,0,0,0,1,0,0,0,0,0,1,0,0
4,information collect collect follow type inform...,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3394,may combine information collect receive forego...,0,0,0,0,0,0,0,1,0,0,0,0
3395,good faith belief emergency pose threat health...,0,0,0,0,0,0,0,0,0,1,0,0
3396,good faith belief require disclose information...,0,0,0,0,0,0,0,0,0,1,0,0
3397,believe sit service use commission crime inclu...,0,0,0,0,0,0,0,0,0,1,0,0
