In [1]:
import random

indexToLabel = {0: 'Amazon Pay', 1: 'Device Address', 2: 'Device country and postal code', 
                3: 'Email Address', 4: 'Location Services', 5: 'Mobile Number', 6: 'Name', 
                7: 'Personal Information', 8: 'Skill Personisation', 9:'None'}
labelToindex = {v:k for (k,v) in indexToLabel.items()}

print(indexToLabel)

{0: 'Amazon Pay', 1: 'Device Address', 2: 'Device country and postal code', 3: 'Email Address', 4: 'Location Services', 5: 'Mobile Number', 6: 'Name', 7: 'Personal Information', 8: 'Skill Personisation', 9: 'None'}


# Prepare dataset

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score
import pickle

'''
prepare dataset
1. Jide Alexa's excel file
'''
path = 'data/Annotated_Policies_Alexa.csv'
df = pd.read_csv(path)
df = df[pd.notnull(df['Permission'])]
df = df[pd.notnull(df['Sentence'])]
df.drop('Similarities_Score',axis=1,inplace=True)


for c in df['Permission'].unique():
    df.loc[ (df['Permission'] == c), 'Label'] = labelToindex[c]
df['Label'] = np.int64(df['Label'])

for index in indexToLabel.keys():
    df.loc[df['Label'] == index, 'Label'+str(index)] = 0
    df.loc[df['Label'] != index, 'Label'+str(index)] = 1

In [3]:
df.head()

Unnamed: 0,Permission,Sentence,Label,Label0,Label1,Label2,Label3,Label4,Label5,Label6,Label7,Label8,Label9
0,Amazon Pay,"when you make a donation, amazon pay will proc...",0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Amazon Pay,"however, the amazon pay a-to-z guarantee cover...",0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,Amazon Pay,some other alexa skills allow you to purchase ...,0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,Amazon Pay,we will ask for standard credit card informati...,0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,Amazon Pay,transaction information : payment information ...,0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
df.loc[df['Sentence'] == 'zip code']

Unnamed: 0,Permission,Sentence,Label,Label0,Label1,Label2,Label3,Label4,Label5,Label6,Label7,Label8,Label9
467,Device country and postal code,zip code,2,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Classifier for class 'Amazon Pay' - 0.95+

In [5]:
columnToUse = 'Label0'

classproportion = 2
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_alpha = 0.001
svm_loss = 'modified_huber'

In [6]:
X0 = df.loc[df[columnToUse] == 0]['Sentence'].tolist()
X0 = X0*4
xn0 = len(X0)
X1 = df.loc[df[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')

print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')

X = X0+X1

ORIGINAL DATASET SAMPLING
	 692 sentences of class 0;  10371 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 90 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 1 is larger than 0, applying subsampling
	 692 sentences of class 0;  1384 sentence for class 1


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, random_state=42,
                          max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.30, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

>Train: 0=484, 1=969, Test: 0=208, 1=415
CM [[208   0]
 [  0 415]]
F1 1.0
Acc 1.0


### Test without ignoring 'None' sentences

In [49]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = df.loc[df['Permission'] != 'None', 'Sentence']
#y = df.loc[df['Permission'] != 'None', columnToUse]
X0 = df.loc[df[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = df.loc[df[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')

#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 10544 instances to classify,  173 for class 0,  10371 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 90 repeated sentences from X1
REMOVING SENTENCES USED TO TRAIN THE MODEL THAT BELONG TO CLASS 0
	Removed 1385 sentences used to train the model; sentences from test set X
CM [[  21    0]
 [  65 8983]]
F1 0.3925233644859813
Acc 0.9928327268717609


### Test ignoring None sentences

In [50]:

#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = df.loc[df['Permission'] != 'None', 'Sentence']
#y = df.loc[df['Permission'] != 'None', columnToUse]
X0 = df.loc[(df[columnToUse] == 0)&(df['Permission'] != 'None')]['Sentence'].tolist()
xn0 = len(X0)
X1 = df.loc[(df[columnToUse] == 1)&(df['Permission'] != 'None')]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]


#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 4079 instances to classify,  173 for class 0,  3906 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 90 repeated sentences from X1
REMOVING SENTENCES USED TO TRAIN THE MODEL THAT BELONG TO CLASS 0
	Removed 652 sentences used to train the model; sentences from test set X
CM [[  21    0]
 [  49 3267]]
F1 0.4615384615384615
Acc 0.9853161522325442


# Classifier for class 'Device Address' - 0.80

In [9]:
categoryname = 'Device Address'
labelindex = labelToindex[categoryname]
columnToUse = 'Label'+str(labelindex)
print(categoryname, ' - ', labelindex, ' - ', columnToUse)

classproportion = 3
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_loss = 'modified_huber'
svm_alpha = 0.001


Device Address  -  1  -  Label1


In [10]:
dfaux = df

X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')

print('INCREASING DATASET WITH APP350')
relevantfiles = ['data/App350/Contact_Postal_Address_3rdParty.txt', 'data/App350/Contact_Postal_Address_1stParty.txt']
for filepath in relevantfiles:
    with open(filepath, encoding='utf8') as f:   #utf8      
        text = f.readlines()
        text = [k.replace('\n', '').strip() for k in text]
        print('\t', len(text), 'new sentences found in ', filepath)
        for t in text:
            dfaux = dfaux.append({'Permission':categoryname, 'Sentence':t, 'Label':labelindex, columnToUse:0},  ignore_index=True)
            
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
X0 = X0*2
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
            
            
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
X = X0+X1

ORIGINAL DATASET SAMPLING
	 279 sentences of class 0;  10265 sentence for class 1
INCREASING DATASET WITH APP350
	 79 new sentences found in  data/App350/Contact_Postal_Address_3rdParty.txt
	 431 new sentences found in  data/App350/Contact_Postal_Address_1stParty.txt
	 1578 sentences of class 0;  10265 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 256 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 1 is larger than 0, applying subsampling
	 1578 sentences of class 0;  4734 sentence for class 1


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.05, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

>Train: 0=1499, 1=4497, Test: 0=79, 1=237
CM [[ 75   4]
 [ 12 225]]
F1 0.9036144578313253
Acc 0.9493670886075949
*Model for  Label1 saved in  models/Label1_new_model.pkl


### Test without ignoring 'None' sentences

In [12]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 11054 instances to classify,  789 for class 0,  10265 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 256 repeated sentences from X1
CM [[ 784    5]
 [ 534 9475]]
F1 0.7441860465116279
Acc 0.9500833487682904


### Test ignoring None sentences

In [13]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[(dfaux[columnToUse] == 0)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[(dfaux[columnToUse] == 1)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 4589 instances to classify,  789 for class 0,  3800 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 252 repeated sentences from X1
CM [[ 784    5]
 [ 503 3045]]
F1 0.7552986512524086
Acc 0.8828683421720083


# Classifier for class 'Device country and postal code' - 0.88

In [7]:
categoryname = 'Device country and postal code'
labelindex = labelToindex[categoryname]
columnToUse = 'Label'+str(labelindex)
print(categoryname, ' - ', labelindex, ' - ', columnToUse)

classproportion = 2
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_loss = 'modified_huber'
svm_alpha = 0.0001

Device country and postal code  -  2  -  Label2


In [8]:
dfaux = df

X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')

print('INCREASING DATASET WITH APP350')
relevantfiles = ['data/App350/Contact_City_1stParty.txt',
                 'data/App350/Contact_City_3rdParty.txt',
                 'data/App350/Contact_ZIP_1stParty.txt',
                 'data/App350/Contact_ZIP_3rdParty.txt']
for filepath in relevantfiles:
    with open(filepath, encoding='utf8') as f:   #utf8      
        text = f.readlines()
        text = [k.replace('\n', '').strip() for k in text]
        print('\t', len(text), 'new sentences found in ', filepath)
        for t in text:
            dfaux = dfaux.append({'Permission':categoryname, 'Sentence':t, 'Label':labelindex, columnToUse:0},  ignore_index=True)
            
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
X0 = X0*4
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
            
            
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
X = X0+X1

ORIGINAL DATASET SAMPLING
	 251 sentences of class 0;  10293 sentence for class 1
INCREASING DATASET WITH APP350
	 93 new sentences found in  data/App350/Contact_City_1stParty.txt
	 19 new sentences found in  data/App350/Contact_City_3rdParty.txt
	 140 new sentences found in  data/App350/Contact_ZIP_1stParty.txt
	 22 new sentences found in  data/App350/Contact_ZIP_3rdParty.txt
	 2100 sentences of class 0;  10293 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 92 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 1 is larger than 0, applying subsampling
	 2100 sentences of class 0;  4200 sentence for class 1


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.05, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

>Train: 0=965, 1=1930, Test: 0=51, 1=102
CM [[51  0]
 [ 4 98]]
F1 0.9622641509433962
Acc 0.9738562091503268
*Model for  Label2 saved in  models/Label2_new_model.pkl


### Test without ignoring 'None' sentences

In [13]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 10547 instances to classify,  254 for class 0,  10293 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 92 repeated sentences from X1
CM [[  254     0]
 [  170 10031]]
F1 0.7492625368731564
Acc 0.983739837398374


### Test ignoring 'None' sentences

In [14]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[(dfaux[columnToUse] == 0)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[(dfaux[columnToUse] == 1)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 4082 instances to classify,  254 for class 0,  3828 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 91 repeated sentences from X1
CM [[ 254    0]
 [ 145 3592]]
F1 0.7779479326186829
Acc 0.9636682535705337


# Classifier for class 'Email Address' - 0.9+

In [24]:
categoryname = 'Email Address'
labelindex = labelToindex[categoryname]
columnToUse = 'Label'+str(labelindex)
print(categoryname, ' - ', labelindex, ' - ', columnToUse)

classproportion = 2
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_loss = 'modified_huber'
svm_alpha = 0.0001

Email Address  -  3  -  Label3


In [25]:
dfaux = df

X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')

print('INCREASING DATASET WITH APP350')
relevantfiles = ['data/App350/Contact_E_Mail_Address_1stParty.txt',
                 'data/App350/Contact_E_Mail_Address_3rdParty.txt']
for filepath in relevantfiles:
    with open(filepath, encoding='utf8') as f:   #utf8      
        text = f.readlines()
        text = [k.replace('\n', '').strip() for k in text]
        print('\t', len(text), 'new sentences found in ', filepath)
        for t in text:
            dfaux = dfaux.append({'Permission':categoryname, 'Sentence':t, 'Label':labelindex, columnToUse:0},  ignore_index=True)
            
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
#X0 = X0*2
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
            
            
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
X = X0+X1

ORIGINAL DATASET SAMPLING
	 232 sentences of class 0;  10312 sentence for class 1
INCREASING DATASET WITH APP350
	 1804 new sentences found in  data/App350/Contact_E_Mail_Address_1stParty.txt
	 172 new sentences found in  data/App350/Contact_E_Mail_Address_3rdParty.txt
	 2208 sentences of class 0;  10312 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 270 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 1 is larger than 0, applying subsampling
	 2208 sentences of class 0;  4416 sentence for class 1


In [26]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.05, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

>Train: 0=2097, 1=4195, Test: 0=111, 1=221
CM [[ 97  14]
 [ 13 208]]
F1 0.8778280542986425
Acc 0.9186746987951807
*Model for  Label3 saved in  models/Label3_new_model.pkl


### Test without ignoring 'None' sentences

In [27]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 12520 instances to classify,  2208 for class 0,  10312 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 270 repeated sentences from X1
CM [[2191   17]
 [ 498 9544]]
F1 0.8948335715744333
Acc 0.9579591836734694


### Test ignoring 'None' sentences

In [28]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[(dfaux[columnToUse] == 0)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[(dfaux[columnToUse] == 1)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 6055 instances to classify,  2208 for class 0,  3847 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 270 repeated sentences from X1
CM [[2191   17]
 [ 288 3289]]
F1 0.9349263921484958
Acc 0.9472774416594641


# Classifier for class 'Location Services' - 0.9+

In [40]:
categoryname = 'Location Services'
labelindex = labelToindex[categoryname]
columnToUse = 'Label'+str(labelindex)
print(categoryname, ' - ', labelindex, ' - ', columnToUse)

classproportion = 4
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_loss = 'modified_huber'
svm_alpha = 0.0001

Location Services  -  4  -  Label4


In [41]:
dfaux = df

X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')

print('INCREASING DATASET WITH APP350')
relevantfiles = ['data/App350/Location_GPS_1stParty.txt',
                 'data/App350/Location_GPS_3rdParty.txt']
for filepath in relevantfiles:
    with open(filepath, encoding='utf8') as f:   #utf8      
        text = f.readlines()
        text = [k.replace('\n', '').strip() for k in text]
        print('\t', len(text), 'new sentences found in ', filepath)
        for t in text:
            dfaux = dfaux.append({'Permission':categoryname, 'Sentence':t, 'Label':labelindex, columnToUse:0},  ignore_index=True)
            
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
#X0 = X0*2
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
            
            
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
X = X0+X1

ORIGINAL DATASET SAMPLING
	 327 sentences of class 0;  10524 sentence for class 1
INCREASING DATASET WITH APP350
	 278 new sentences found in  data/App350/Location_GPS_1stParty.txt
	 69 new sentences found in  data/App350/Location_GPS_3rdParty.txt
	 674 sentences of class 0;  10524 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 110 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 1 is larger than 0, applying subsampling
	 674 sentences of class 0;  2696 sentence for class 1


In [42]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.05, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

>Train: 0=640, 1=2561, Test: 0=34, 1=135
CM [[ 28   6]
 [  0 135]]
F1 0.9032258064516129
Acc 0.9644970414201184
*Model for  Label4 saved in  models/Label4_new_model.pkl


### Test without ignoring 'None' sentences

In [43]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 11198 instances to classify,  674 for class 0,  10524 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 110 repeated sentences from X1
CM [[  668     6]
 [   45 10369]]
F1 0.9632299927901946
Acc 0.9954004329004329


### Test ignoring 'None' sentences

In [44]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[(dfaux[columnToUse] == 0)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[(dfaux[columnToUse] == 1)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 4745 instances to classify,  674 for class 0,  4071 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 109 repeated sentences from X1
CM [[ 668    6]
 [  22 3940]]
F1 0.9794721407624634
Acc 0.993960310612597


# Classifier for class 'Mobile Number' - 0.9+

In [13]:
categoryname = 'Mobile Number'
labelindex = labelToindex[categoryname]
columnToUse = 'Label'+str(labelindex)
print(categoryname, ' - ', labelindex, ' - ', columnToUse)

classproportion = 2
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_loss = 'modified_huber'
svm_alpha = 0.0001

Mobile Number  -  5  -  Label5


In [14]:
dfaux = df

X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')

print('INCREASING DATASET WITH APP350')
relevantfiles = ['data/App350/Contact_Phone_Number_1stParty.txt',
                 'data/App350/Contact_Phone_Number_3rdParty.txt']
for filepath in relevantfiles:
    with open(filepath, encoding='utf8') as f:   #utf8      
        text = f.readlines()
        text = [k.replace('\n', '').strip() for k in text]
        print('\t', len(text), 'new sentences found in ', filepath)
        for t in text:
            dfaux = dfaux.append({'Permission':categoryname, 'Sentence':t, 'Label':labelindex, columnToUse:0},  ignore_index=True)
            
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
X0 = X0*2
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
            
            
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
X = X0+X1




from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.05, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

ORIGINAL DATASET SAMPLING
	 97 sentences of class 0;  10447 sentence for class 1
INCREASING DATASET WITH APP350
	 772 new sentences found in  data/App350/Contact_Phone_Number_1stParty.txt
	 72 new sentences found in  data/App350/Contact_Phone_Number_3rdParty.txt
	 1882 sentences of class 0;  10447 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 197 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 1 is larger than 0, applying subsampling
	 1882 sentences of class 0;  3764 sentence for class 1
>Train: 0=1788, 1=3575, Test: 0=94, 1=189
CM [[ 94   0]
 [  6 183]]
F1 0.9690721649484536
Acc 0.9787985865724381
*Model for  Label5 saved in  models/Label5_new_model.pkl


### Without ignoring None sentence

In [15]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 11388 instances to classify,  941 for class 0,  10447 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 197 repeated sentences from X1
CM [[ 940    1]
 [ 270 9980]]
F1 0.8740120874012087
Acc 0.9757841122330444


### Ignoring None sentences

In [16]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
#X = dfaux.loc[dfaux['Permission'] != 'None', 'Sentence']
#y = dfaux.loc[dfaux['Permission'] != 'None', columnToUse]
X0 = dfaux.loc[(dfaux[columnToUse] == 0)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[(dfaux[columnToUse] == 1)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 4923 instances to classify,  941 for class 0,  3982 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 193 repeated sentences from X1
CM [[ 940    1]
 [ 225 3564]]
F1 0.8926875593542261
Acc 0.9522198731501057


# Classifier for class 'Name' - 0.9+

In [5]:
categoryname = 'Name'
labelindex = labelToindex[categoryname]
columnToUse = 'Label'+str(labelindex)
print(categoryname, ' - ', labelindex, ' - ', columnToUse)

classproportion = 5
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_loss = 'modified_huber'
svm_alpha = 0.0001

Name  -  6  -  Label6


In [7]:
dfaux = df

X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
            
            
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
X = X0+X1




from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.05, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

ORIGINAL DATASET SAMPLING
	 424 sentences of class 0;  10120 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 334 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 1 is larger than 0, applying subsampling
	 424 sentences of class 0;  2120 sentence for class 1
>Train: 0=403, 1=2013, Test: 0=21, 1=107
CM [[ 17   4]
 [  1 106]]
F1 0.8717948717948718
Acc 0.9609375
*Model for  Label6 saved in  models/Label6_new_model.pkl


### Without ignoring None sentences

In [8]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 10544 instances to classify,  424 for class 0,  10120 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 334 repeated sentences from X1
CM [[ 420    4]
 [ 148 9638]]
F1 0.8467741935483871
Acc 0.9851126346718903


### Ignoring None sentences

In [9]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
X0 = dfaux.loc[(dfaux[columnToUse] == 0)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[(dfaux[columnToUse] == 1)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 4079 instances to classify,  424 for class 0,  3655 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 330 repeated sentences from X1
CM [[ 420    4]
 [ 133 3192]]
F1 0.8597748208802457
Acc 0.9634569218458255


# Classifier for class 'Personal Information' - 99+

In [53]:
categoryname = 'Personal Information'
labelindex = labelToindex[categoryname]
columnToUse = 'Label'+str(labelindex)
print(categoryname, ' - ', labelindex, ' - ', columnToUse)

classproportion = 1
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_loss = 'modified_huber'
svm_alpha = 0.0001

Personal Information  -  7  -  Label7


In [55]:
dfaux = df

X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
            
            
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
X = X0+X1




from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.05, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

ORIGINAL DATASET SAMPLING
	 2367 sentences of class 0;  8484 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 357 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 1 is larger than 0, applying subsampling
	 2367 sentences of class 0;  2367 sentence for class 1
>Train: 0=2249, 1=2248, Test: 0=118, 1=119
CM [[118   0]
 [  1 118]]
F1 0.9957805907172996
Acc 0.9957805907172996
*Model for  Label7 saved in  models/Label7_new_model.pkl


### Without ignoring None sentences

In [56]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 10851 instances to classify,  2367 for class 0,  8484 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 357 repeated sentences from X1
CM [[2367    0]
 [  54 8073]]
F1 0.9887218045112781
Acc 0.9948542024013722


### Ignoring None sentences

In [57]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
X0 = dfaux.loc[(dfaux[columnToUse] == 0)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[(dfaux[columnToUse] == 1)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 4398 instances to classify,  2367 for class 0,  2031 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 349 repeated sentences from X1
CM [[2367    0]
 [  52 1630]]
F1 0.9891349770162976
Acc 0.9871573227957521


# Classifier for class 'Skill Personisation' - 0.99+

In [58]:
categoryname = 'Skill Personisation'
labelindex = labelToindex[categoryname]
columnToUse = 'Label'+str(labelindex)
print(categoryname, ' - ', labelindex, ' - ', columnToUse)

classproportion = 4
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_loss = 'modified_huber'
svm_alpha = 0.0001

Skill Personisation  -  8  -  Label8


In [59]:
dfaux = df

X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
X0 = X0 * 10
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')

            
            
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
X = X0+X1




from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.05, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

ORIGINAL DATASET SAMPLING
	 970 sentences of class 0;  10754 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 16 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 1 is larger than 0, applying subsampling
	 970 sentences of class 0;  3880 sentence for class 1
>Train: 0=921, 1=3686, Test: 0=49, 1=194
CM [[ 49   0]
 [  0 194]]
F1 1.0
Acc 1.0
*Model for  Label8 saved in  models/Label8_new_model.pkl


### Without ignoring None sentences

In [60]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 10851 instances to classify,  97 for class 0,  10754 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 16 repeated sentences from X1
CM [[   97     0]
 [    1 10737]]
F1 0.9948717948717948
Acc 0.9999077065066913


### Ignoring None sentences

In [61]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
X0 = dfaux.loc[(dfaux[columnToUse] == 0)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[(dfaux[columnToUse] == 1)&(dfaux['Permission'] != 'None')]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 4398 instances to classify,  97 for class 0,  4301 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 16 repeated sentences from X1
CM [[  97    0]
 [   0 4285]]
F1 1.0
Acc 1.0


# Classifier for class 'None' - 0.99 
training with 80% of class 1, and 3k instnces of class 0 (which has almost 30k)

In [15]:
categoryname = 'None'
labelindex = labelToindex[categoryname]
columnToUse = 'Label'+str(labelindex)
print(categoryname, ' - ', labelindex, ' - ', columnToUse)

classproportion = 1
ngrams = (1,3)
apply_tfidf = False
apply_binary = True
svm_loss = 'modified_huber'
svm_alpha = 0.0001

None  -  9  -  Label9


In [16]:
dfaux = df

X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
X0 = X0 * 10
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)

print('ORIGINAL DATASET SAMPLING')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')

#print('INCREASING DATASET WITH APP350')
#relevantfiles = ['data/App350/Contact_Phone_Number_1stParty.txt',
#                 'data/App350/Contact_Phone_Number_3rdParty.txt']
#for filepath in relevantfiles:
#    with open(filepath, encoding='utf8') as f:   #utf8      
#        text = f.readlines()
#        text = [k.replace('\n', '').strip() for k in text]
#        print('\t', len(text), 'new sentences found in ', filepath)
#        for t in text:
#            dfaux = dfaux.append({'Permission':categoryname, 'Sentence':t, 'Label':labelindex, columnToUse:0},  ignore_index=True)
#            
#X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
#X0 = X0*20
#xn0 = len(X0)
#X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
#xn1 = len(X1)
#print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
            
            
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')

#subsampling (random) of largest class
print('CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS')
if(len(X1)>len(X0)*classproportion):
    X1 = random.sample(X1, xn0*classproportion)
    xn1 = len(X1)
    print('\tclass 1 is larger than 0, applying subsampling')
elif(len(X0)>len(X1)*classproportion):
    X0 = random.sample(X0, xn1*classproportion)
    xn0 = len(X0)
    print('\tclass 0 is larger than 1')
print('\t',xn0, 'sentences of class 0; ', xn1, 'sentence for class 1')
X = X0+X1
xn0 = len(X0)
xn1 = len(X1)



from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])

labels = xn0*[0] + xn1*[1]
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.20, stratify=labels)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

#save model
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(text_clf, file)
    print('*Model for ', columnToUse, 'saved in ', pkl_filename)

ORIGINAL DATASET SAMPLING
	 64650 sentences of class 0;  4082 sentence for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 13 repeated sentences from X1
CHECKING THE PROPORTION OF INSTANCES IN EACH CLASS
	class 0 is larger than 1
	 4082 sentences of class 0;  4082 sentence for class 1
>Train: 0=3265, 1=3255, Test: 0=817, 1=814
CM [[815   2]
 [ 11 803]]
F1 0.9920876445526476
Acc 0.9920294297976702
*Model for  Label9 saved in  models/Label9_new_model.pkl


### Without ignoring None sentences

In [17]:
#Load classifier for 'amazon pay' and test it on the entire dataset
pkl_filename = 'models/'+columnToUse+"_new_model.pkl"
with open(pkl_filename, 'rb') as file:
    text_clf = pickle.load(file)
    
#get the data
#ignoring None sentences
X0 = dfaux.loc[dfaux[columnToUse] == 0]['Sentence'].tolist()
xn0 = len(X0)
X1 = dfaux.loc[dfaux[columnToUse] == 1]['Sentence'].tolist()
xn1 = len(X1)
print('ORIGINAL DATASET SAMPLING')
print('\t',len(X0+X1), 'instances to classify, ',xn0, 'for class 0, ', xn1, 'for class 1' )
print('REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0')
#then check for repeated sentences belonging to both X0 and X1. If it belongs to X0, we remove it from X1.
#This happens, sentences are repeated in the dataset as belonging to more than one privacy category, hence we clean it
#before letting the classifier learn
totalr = 0
for s0 in X0:
    s0 = s0.strip()
    ir = [index for index, s in enumerate(X1) if s.strip()==s0]
    ir = sorted(ir, reverse = True) #need to sort in reverse so when deleting an index does not affect the rest of indices
    for index in ir:
        del X1[index]
    totalr += len(ir)
print('\tRemoved', totalr, 'repeated sentences from X1')
X = X0+X1
xn0 = len(X0);xn1 = len(X1)
y = xn0*[0] + xn1*[1]

#test the data
pred_y = text_clf.predict(X)
cm = confusion_matrix(y, pred_y)
f1 = f1_score(y, pred_y, average=None)[0]
acc = np.mean(pred_y == y)

print('CM', cm)
print('F1', f1)
print('Acc', acc)

ORIGINAL DATASET SAMPLING
	 10547 instances to classify,  6465 for class 0,  4082 for class 1
REMOVING REPEATED SENTENCES FROM CLASS 1 THAT ALSO BELONG TO CLASS 0
	Removed 13 repeated sentences from X1
CM [[6426   39]
 [  11 4058]]
F1 0.9961246318400248
Acc 0.9952534649705714


# Quick classifier test

In [18]:
X = [
    'zip code',
    'zip',
    'zip address',
    'email',
    'other',
    'name'
]

Y = [1,1,1,0,0,0]

In [19]:
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=ngrams, binary=apply_binary)),
    ('tfidf', TfidfTransformer(use_idf=apply_tfidf)),
    ('clf', SGDClassifier(loss=svm_loss, penalty='l2',
                          alpha=svm_alpha, max_iter=10000, tol=None,  class_weight='balanced')),
    ])
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, stratify=Y)

#print proportion for training and testing
X_train=np.array(X_train);X_test=np.array(X_test);y_train=np.array(y_train);y_test=np.array(y_test);
train_0, train_1 = len(y_train[y_train==0]), len(y_train[y_train==1])
test_0, test_1 = len(y_test[y_test==0]), len(y_test[y_test==1])
print('>Train: 0=%d, 1=%d, Test: 0=%d, 1=%d' % (train_0, train_1, test_0, test_1))


#train and test
text_clf.fit(X_train, y_train)          
pred_y = text_clf.predict(X_test)
#plot_confusion_matrix(clf, test_X, test_y)
cm = confusion_matrix(y_test, pred_y)
f1 = f1_score(y_test, pred_y, average=None)[0]
acc = np.mean(pred_y == y_test) 

print('CM', cm)
print('F1', f1)
print('Acc', acc)

>Train: 0=2, 1=2, Test: 0=1, 1=1
CM [[1 0]
 [0 1]]
F1 1.0
Acc 1.0


In [40]:
test_sentences = ['we collect zip addres', 'postal address', 'zip code', 'email', 'test']
pred_y = text_clf.predict(test_sentences)
print(pred_y)

[1 1 1 0 0]
