# Test all ML methods to classify user policies

In [73]:
import ast
import os, re, pickle, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Load the annotated policy segments; the first CSV column is used as the index.
df = pd.read_csv('./Cleaned_Segments_Annotations.csv', index_col=0)
df.head()

Unnamed: 0,segment_id,text,annotations,policy
0,0,PRIVACY POLICY This privacy policy (hereafter ...,{},1
1,1,1. ABOUT OUR PRODUCTS 1.1 Our products offer a...,{},1
2,2,'2. THE INFORMATION WE COLLECT The information...,{'Identifier_Cookie_or_similar_Tech_1stParty':...,1
3,3,"'2.2 In addition, we store certain information...",{'Identifier_Cookie_or_similar_Tech_1stParty':...,1
4,4,(c) to remember your preferences and registrat...,{},1


In [8]:
df.iloc[1]['text']

'1. ABOUT OUR PRODUCTS 1.1 Our products offer a diverse, current, and    exciting mix of games created by 6677g, as well as games created by independent    developers and 6677g partners. Players can access our products to play games without    registering; however, they may choose to register to create a public or semi-public    profile and to save games, high scores, and comments on this profile. Also, visitors    to our products will be able to rate games and browse for the newly added, top-rated,    and most-popular games.'

In [7]:
df.iloc[2]['text']

"'2. THE INFORMATION WE COLLECT The information that our products collect    includes (among others) the following: A) IP ADDRESS, COOKIES, AND WEB BEACONS    2.1 When you visit our products, our servers automatically save your computer''s    IP address. IP addresses will be collected, along with information about the actual    web pages that you visit on our products. If you arrive at our products via a    link from another product, the URL of the linking product and the URL of any product    that you link to next will also be collected.'"

In [6]:
df.iloc[2]['annotations']

"{'Identifier_Cookie_or_similar_Tech_1stParty': 'PERFORMED', 'Identifier_IP_Address_1stParty': 'PERFORMED'}"

## Labeling Strategy

### Segments

+ Try multilabel on full segments

### Sentences

+ Try multilabel on individual sentences

+ Break down into sentences
+ Anything with an annotation becomes a sentence to identify policy
+ Anything without an annotation is split into sentences and tagged as 'non-policy'

**TRY SKLEARN Multilabel classification**

+ this works 
+ but should we do full documents, segments, or sentences?

In [10]:
# read in policy names
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# only load pickles produced by this project. The object appears to be a plain
# list of strings - a JSON file would be a safer format; confirm before changing.
with open('../pickles/policy_list.pckl', 'rb') as p:
    policy_list = pickle.load(p)
    
print(len(policy_list))

58


In [15]:
modality_labels = ['PERFORMED','NOT_PERFORMED','NO_POLICY_MENTIONED']

In [12]:
policy_list[0:2]

['Contact_1stParty', 'Contact_3rdParty']

In [24]:
def create_target_array(policy_list):
    """Lazily yield the two modality target names for every policy.

    For each name in ``policy_list`` (in order) this yields
    ``<name>_PERFORMED`` followed by ``<name>_NOT_PERFORMED``.
    """
    suffixes = ('_PERFORMED', '_NOT_PERFORMED')
    for policy_name in policy_list:
        yield from (policy_name + suffix for suffix in suffixes)

# Two modality targets per policy, followed by one trailing catch-all target
# used for segments that carry no annotation.
policy_targets = [*create_target_array(policy_list), 'NO_POLICY_MENTIONED']
len(policy_targets)

117

In [28]:
policy_targets[-3:]

['SSO_PERFORMED', 'SSO_NOT_PERFORMED', 'NO_POLICY_MENTIONED']

In [36]:
# Scratch check: an empty dict is recognised as empty.
d = {}
if not d:
    print('moo')

moo


In [78]:
def create_targets_from_annotation(annotation_dict, policies=policy_targets):
    """Encode one segment's annotations as a multi-hot target vector.

    Input
    + annotation_dict - mapping of policy name -> modality string
    + policies - ordered target names; position i of the result corresponds
      to policies[i], and the last position marks "no annotation"

    Returns
    + 1-D float numpy array of length len(policies) holding 0.0 / 1.0
    """
    target_vector = np.zeros(len(policies))
    target_names = np.array(policies)

    # No annotations at all: flag only the final slot and stop.
    if len(annotation_dict) == 0:
        target_vector[-1] = 1
        return target_vector

    for policy in annotation_dict:
        wanted = policy + '_' + annotation_dict[policy]
        # np.where finds nothing for a name that is missing from `policies`,
        # so such an annotation leaves the vector untouched.
        hits = np.where(target_names == wanted)
        target_vector[hits] = 1
    return target_vector

In [79]:
df.columns

Index(['segment_id', 'text', 'annotations', 'policy'], dtype='object')

In [80]:
# The annotations column holds Python dict reprs (single-quoted strings), so
# parse them with ast.literal_eval. Swapping quotes and calling json.loads
# breaks as soon as a key or value contains a quote character.
df['target'] = df.annotations.apply(
    lambda x: create_targets_from_annotation(ast.literal_eval(x), policies=policy_targets)
)

In [88]:
df.iloc[2]['target']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [91]:
len(df)

15507

In [96]:
#with open('../pickles/df_multilabel_target.pckl', 'wb') as p:
#    pickle.dump(df, p, protocol=pickle.HIGHEST_PROTOCOL)

In [97]:
# Reload the frame (including its 'target' column) from the pickle cache.
# NOTE(review): the cell that writes this file is commented out, so the pickle
# must already exist on disk. pickle.load should only be used on trusted files.
with open('../pickles/df_multilabel_target.pckl', 'rb') as p:
    df_unpickled = pickle.load(p)

In [98]:
df_unpickled.shape

(15507, 5)

In [100]:
df_unpickled.iloc[0]['target']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [93]:
df.shape

(15507, 5)

In [103]:
df.head()

Unnamed: 0,segment_id,text,annotations,policy,target
0,0,PRIVACY POLICY This privacy policy (hereafter ...,{},1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,1. ABOUT OUR PRODUCTS 1.1 Our products offer a...,{},1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,'2. THE INFORMATION WE COLLECT The information...,{'Identifier_Cookie_or_similar_Tech_1stParty':...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,"'2.2 In addition, we store certain information...",{'Identifier_Cookie_or_similar_Tech_1stParty':...,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,(c) to remember your preferences and registrat...,{},1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [112]:
# Renumber the rows from 0 and keep only the model input and its label vector.
df = df.reset_index(drop=True)[['text', 'target']]
df.head()

Unnamed: 0,text,target
0,PRIVACY POLICY This privacy policy (hereafter ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1. ABOUT OUR PRODUCTS 1.1 Our products offer a...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,'2. THE INFORMATION WE COLLECT The information...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"'2.2 In addition, we store certain information...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,(c) to remember your preferences and registrat...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## NLP PIPELINE

In [114]:
def standardize_text(df, text_field):
    '''Clean-up text column to prepare for tokenization
    
    Removes unwanted characters &
    Replaces them with spaces or blanks, then lower-cases the text.
    The column is overwritten in place on the frame that is passed in,
    so pass a copy if the original text must be preserved.
    --
    Input
    + pandas dataframe
    + name of text column
    
    Returns
    + the same pandas dataframe object, with the cleaned column
    '''
    # (pattern, replacement) pairs, applied in this order.
    substitutions = [
        (r"http\S+", ""),                            # URLs
        (r"http", ""),                               # any leftover bare "http"
        (r"@\S+", ""),                               # @handles / e-mail tails
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),      # everything outside the whitelist
        (r"@", "at"),                                # remaining stand-alone "@"
    ]

    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default, which would silently skip
        # every substitution above that relies on regex syntax.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    df[text_field] = cleaned.str.lower()
    return df

In [115]:
# Clean a copy so that `df` keeps the original, unmodified text.
df_nlp = standardize_text(df.copy(), 'text')
df_nlp.head()

Unnamed: 0,text,target
0,privacy policy this privacy policy (hereafter ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1 about our products 1 1 our products offer a...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,'2 the information we collect the information...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"'2 2 in addition, we store certain information...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,(c) to remember your preferences and registrat...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# serialize df_nlp

In [136]:
# Same frame as df_nlp, but with the last slot of every target vector removed
# (the slot that corresponds to the trailing 'NO_POLICY_MENTIONED' target).
df_nlp_no_negative_class = df_nlp.assign(
    target=df_nlp['target'].apply(lambda x: x[:-1])
)
#df_nlp_no_negative_class.iloc[0]['target']

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [137]:
#with open('../pickles/df_nlp_multilabel_target.pckl', 'wb') as p:
#    pickle.dump(df_nlp, p, protocol=pickle.HIGHEST_PROTOCOL)
    
#with open('../pickles/df_nlp_no_negative_multilabel_target.pckl', 'wb') as p:
#    pickle.dump(df_nlp_no_negative_class, p, protocol=pickle.HIGHEST_PROTOCOL)

In [134]:
len(df_nlp.iloc[0]['target'])
#len(df_nlp.iloc[0]['target'][:-1])

117

In [116]:
# Plain Python lists of the cleaned texts (X) and their target vectors (y);
# these are what gets passed to train_test_split.
list_corpus = df_nlp['text'].tolist()
list_labels = df_nlp['target'].tolist()

# Sanity check: both lists must have the same number of entries.
print('X: {}'.format(len(list_corpus)))
print('y: {}'.format(len(list_labels)))

X: 15507
y: 15507


In [117]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [118]:
def fit_vectorizer(data, vec_type='count'):
    '''Create and fit a vectorizer
    
    Options:
    + count -> count_vectorizer 
    + tfidf -> tfidf_vectorizer
    
    Input:
    + data - X data to fit the model
    + vec_type - name of vectorizer to use
    
    Returns:
    + Document-term matrix or Tf-idf-weighted document-term matrix
    + vectorizer - fitted model
    
    Raises:
    + ValueError - if vec_type is neither 'count' nor 'tfidf'
    '''
    if vec_type == 'count':
        vectorizer = CountVectorizer()
    elif vec_type == 'tfidf':
        vectorizer = TfidfVectorizer()
    else:
        # Fail loudly here; printing and carrying on would only crash a line
        # later with an UnboundLocalError on `vectorizer`.
        raise ValueError(
            "Please select an appropriate option: vec_type must be "
            "'count' or 'tfidf', got {!r}".format(vec_type)
        )

    emb = vectorizer.fit_transform(data)

    return emb, vectorizer

In [119]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

def get_metrics(y_test, y_predicted):
    '''Score predictions against the true labels.

    Input:
    + y_test - true labels (1-D class labels or 2-D multilabel indicator rows)
    + y_predicted - predicted labels in the same layout as y_test

    Returns:
    + accuracy, precision, recall, f1 - precision / recall / f1 are
      support-weighted averages over the labels
    '''
    # Coerce both inputs to ndarrays. A Python list of per-sample label arrays
    # (e.g. what train_test_split returns when given a list) is not always
    # recognised as a multilabel indicator matrix by scikit-learn, which then
    # raises "can't handle a mix of ... targets".
    y_test = np.asarray(y_test)
    y_predicted = np.asarray(y_predicted)

    # true positives / (true positives+false positives)
    precision = precision_score(y_test, y_predicted, pos_label=None,
                                    average='weighted')             
    # true positives / (true positives + false negatives)
    recall = recall_score(y_test, y_predicted, pos_label=None,
                              average='weighted')
    
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted')
    
    # (true positives + true negatives) / total
    # NOTE: for multilabel input scikit-learn documents accuracy_score as
    # subset accuracy, i.e. a sample counts only if every label matches.
    accuracy = accuracy_score(y_test, y_predicted)
    return accuracy, precision, recall, f1

In [120]:
# Hold out 20% of the segments for evaluation; the seed is fixed so the split
# is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    list_corpus,
    list_labels,
    test_size=0.2,
    random_state=40,
)

# vectorize word counts: fit the vectorizer on the training texts only, then
# reuse the fitted vectorizer to encode the held-out texts
X_train_counts, count_vectorizer = fit_vectorizer(X_train, vec_type='count')
X_test_counts = count_vectorizer.transform(X_test)

In [123]:
from sklearn.ensemble import RandomForestClassifier

In [125]:
# train & test random forest model
# (logistic regression alternative, currently disabled)
#clf = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', 
#                         multi_class='ovr', n_jobs=-1, random_state=40)
#clf.fit(X_train_counts, y_train)

# Seed the forest (same value as the train/test split) so that a fresh
# Restart & Run All reproduces the same model and predictions.
rfc = RandomForestClassifier(random_state=40)
rfc.fit(X_train_counts, y_train)

y_predicted = rfc.predict(X_test_counts)



NameError: name 'clf' is not defined

In [129]:
len(y_predicted)

3102

In [126]:
#y_predicted = rfc.predict(X_test_counts)

In [127]:
# check performance
# NOTE(review): the recorded output of this cell is a ValueError about mixed
# target types. y_test comes from a Python list of label arrays - confirm that
# both arguments reach the metric functions as 2-D arrays.
accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted)
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and multilabel-indicator targets

In [138]:
df_nlp.shape

(15507, 2)

In [139]:
df_nlp_no_negative_class.shape

(15507, 2)