In [2]:
import os
import json
import string
import re
import textstat
import spacy
from src import common

## -- define

In [3]:
# Word lists: one entry per line, surrounding whitespace stripped.
with open('../resources/easywords.txt', 'r') as easyfile:
    easywords = [entry.strip() for entry in easyfile]

with open('../resources/ambiguouswords.txt', 'r') as ambifile:
    ambiwords = [entry.strip() for entry in ambifile]

# Privacy-policy URLs: use the corrected file when it exists, otherwise
# fall back to the 20210316 file.
ppurls_path = '../data/policies/urls/privacy_policy_urls_corrected.json'
if not os.path.isfile(ppurls_path):
    ppurls_path = '../data/policies/urls/privacy_policy_urls_20210316.json'

with open(ppurls_path, 'r') as infile:
    ppurls = json.load(infile)

In [5]:
# Loaded spaCy pipelines keyed by model name, so a model is read from disk
# only once per kernel session instead of once per analysed document.
_SPACY_PIPELINES = {}

# Matches every run of characters that is not an ASCII letter or digit.
_NON_ALNUM = re.compile("[^0-9a-zA-Z]+")


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline ``model_name``, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def convert_to_sentences(text, nlp=None):
    """Split ``text`` into sentences of cleaned, lower-cased tokens.

    Parameters
    ----------
    text : str
        Raw document text.
    nlp : callable, optional
        Pipeline applied to ``text``. It must return an object whose ``sents``
        attribute iterates over sentences, each iterating over tokens that
        expose a ``text`` attribute. Defaults to the cached ``en_core_web_sm``
        spaCy pipeline.

    Returns
    -------
    list[list[str]]
        One list of tokens per sentence. Each token is lower-cased and
        stripped of every character that is not an ASCII letter or digit.
        Tokens that end up empty or that are 25 characters or longer are
        dropped, and sentences left without any token are dropped as well.
    """
    if nlp is None:
        nlp = _get_spacy_pipeline()
    doc = nlp(text)

    document_sentences_tokens = []
    for sentence in doc.sents:
        sentence_tokens = []
        for token in sentence:
            # The regex removes punctuation and whitespace alike, so no
            # separate punctuation-to-space translation is needed first.
            token_clean = _NON_ALNUM.sub("", token.text.lower())
            if 0 < len(token_clean) < 25:
                sentence_tokens.append(token_clean)
        if sentence_tokens:
            document_sentences_tokens.append(sentence_tokens)

    return document_sentences_tokens

def convert_to_tokens(sentences):
    """Flatten tokenised sentences and pick out the difficult tokens.

    Parameters
    ----------
    sentences : iterable of iterables
        Tokenised sentences; every token is converted with ``str``.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(tokens, difficult_tokens)``. ``tokens`` holds every token in
        document order. ``difficult_tokens`` holds, in the same order, the
        tokens that are not in the module-level ``easywords`` collection and
        for which ``count_token_syllables`` reports at least two syllables.
    """
    tokens = [str(token) for sentence in sentences for token in sentence]

    # Build the lookup set once per call: testing every token against the
    # ``easywords`` list directly is a linear scan per token.
    easy_lookup = set(easywords)

    # The cheap membership test runs first, so syllables are only counted
    # for tokens that are not already known to be easy.
    difficult_tokens = [
        token for token in tokens
        if token not in easy_lookup and count_token_syllables(token) >= 2
    ]

    return tokens, difficult_tokens

def count_token_syllables(word):
    """Return the syllable count that ``textstat.syllable_count`` reports for ``word``."""
    n_syllables = textstat.syllable_count(word)
    return n_syllables

def statistics(sentences):
    """Count sentences, tokens and distinct tokens of a tokenised document.

    Returns a dict with the keys ``n_sentences`` (``len(sentences)``),
    ``n_tokens`` (number of tokens returned by ``convert_to_tokens``) and
    ``n_unique_tokens`` (number of distinct tokens among them).
    """
    all_tokens = convert_to_tokens(sentences)[0]
    vocabulary = set(all_tokens)

    summary = {}
    summary['n_sentences'] = len(sentences)
    summary['n_tokens'] = len(all_tokens)
    summary['n_unique_tokens'] = len(vocabulary)
    return summary

def gunning_fog_index(sentences):
    """Compute a Gunning-fog style readability score for a tokenised document.

    The score is ``0.4 * (words per sentence + percentage of difficult
    words)``, rounded to 10 decimals. Words and difficult words are the two
    lists returned by ``convert_to_tokens``.

    Raises
    ------
    ZeroDivisionError
        If ``sentences`` is empty or contains no tokens.
    """
    all_tokens, hard_tokens = convert_to_tokens(sentences)
    word_total = len(all_tokens)

    # Average sentence length in words.
    mean_sentence_length = word_total / len(sentences)
    # Share of difficult words, expressed as a percentage.
    hard_word_percentage = 100 * (len(hard_tokens) / word_total)

    fog = 0.4 * (mean_sentence_length + hard_word_percentage)
    return round(fog, 10)

def ambiquity_score(sentences, ambiwords):
    """Return the fraction of tokens that appear in ``ambiwords``.

    Tokens come from ``convert_to_tokens(sentences)``; the fraction is
    rounded to 10 decimals.

    Raises
    ------
    ZeroDivisionError
        If the document contains no tokens.
    """
    all_tokens = convert_to_tokens(sentences)[0]
    n_ambiguous = sum(1 for token in all_tokens if token in ambiwords)
    return round(n_ambiguous / len(all_tokens), 10)

def run_text_analysis(text):
    """Run the full feature extraction on one document.

    Parameters
    ----------
    text : str
        Raw document text to analyse.

    Returns
    -------
    dict
        The dict returned by ``statistics`` extended with the keys
        ``gunning_fog`` (from ``gunning_fog_index``) and ``ambiquity_score``
        (from ``ambiquity_score`` using the module-level ``ambiwords``).

    Exceptions raised by the helpers propagate to the caller unchanged.
    """
    # Analyse the ``text`` argument itself; the previous version read the
    # notebook global ``policytext`` and silently ignored its parameter.
    sentences = convert_to_sentences(text)

    results = statistics(sentences)
    results['gunning_fog'] = gunning_fog_index(sentences)
    results['ambiquity_score'] = ambiquity_score(sentences, ambiwords)

    return results

## -- execute

In [None]:
textstat.set_lang('en')

# Maps firm hash -> outcome record ('firm', 'ppurl', 'succeeded' plus either
# 'features' on success or 'error' on failure).
full_text_features = {}

verbose = False
count = 0
total_success = 0
total_error = 0
for k, v in ppurls.items():
    count += 1

    firmhash = common.__hash(k)

    # Outcome record for this firm. 'succeeded' is flipped to True and the
    # 'features' / 'error' key is appended once the outcome is known.
    entry = {
        'firm': k,
        'ppurl': v['ppurl'],
        'succeeded': False
    }

    # The manually collected policy text takes precedence over the scraped one.
    policy_paths = (
        f'../data/policies/manual/{firmhash}_privacy_policy.txt',
        f'../data/policies/scraped/{firmhash}_privacy_policy.txt',
    )
    policy_path = next((path for path in policy_paths if os.path.isfile(path)), None)
    go = policy_path is not None

    if go:
        with open(policy_path, 'r') as infile:
            policytext = infile.read()

        try:
            result = run_text_analysis(policytext)
        except (ValueError, ZeroDivisionError) as err:
            # Record the base error class name so the stored values stay
            # exactly 'ValueError' or 'ZeroDivisionError'.
            error_name = 'ValueError' if isinstance(err, ValueError) else 'ZeroDivisionError'
            if verbose:
                print(f'{k} ==> {error_name}: {err}')
            total_error += 1
            entry['error'] = error_name
        else:
            entry['succeeded'] = True
            entry['features'] = result

            total_success += 1
            if verbose:
                print(f'{k} ==> done')
                # Report the sentence count; this line previously printed
                # the unique-token count by mistake.
                print(f' - Number of sentences: {result["n_sentences"]}')
                print(f' - Number of words: {result["n_tokens"]}')
                print(f' - Number of unique words: {result["n_unique_tokens"]}')
                print(f' - Ambiguous words: {round(result["ambiquity_score"] *100, 3)}%')
                print(f' - Fog readability score: {result["gunning_fog"]}')
    else:
        entry['error'] = 'NoFile'

    full_text_features[firmhash] = entry

    # The complete JSON file is rewritten after every firm, so the results
    # collected so far are on disk if the loop is interrupted.
    with open('../data/policies/features/firm_pp_features_0.1.0.json', 'w') as outfile:
        json.dump(full_text_features, outfile)

Ally Financial ==> ZeroDivisionError: division by zero
Lithia Motors ==> ZeroDivisionError: division by zero
L Brands ==> ZeroDivisionError: division by zero
Ameriprise Financial ==> ZeroDivisionError: division by zero
Lincoln National ==> ZeroDivisionError: division by zero
EOG Resources ==> ZeroDivisionError: division by zero
Danaher ==> ZeroDivisionError: division by zero
Broadcom ==> ZeroDivisionError: division by zero
Dollar Tree ==> ZeroDivisionError: division by zero
Enterprise Products Partners ==> ZeroDivisionError: division by zero
Energy Transfer ==> ZeroDivisionError: division by zero


In [None]:
# Number of policies analysed successfully, and number whose analysis raised
# a ValueError or ZeroDivisionError.
print(f'Total successes: {total_success}', f'Total errors: {total_error}', sep='\n')

## -- checks + to pandas

In [None]:
import csv
import pandas as pd

# Keep only the firms whose policy was analysed successfully.
full_text_features_checked = {
    firmhash: entry
    for firmhash, entry in full_text_features.items()
    if entry['succeeded']
}

checked_entries = list(full_text_features_checked.values())


def _feature_column(entries, key):
    """Collect ``entry['features'][key]`` for every entry, in order."""
    return [entry['features'][key] for entry in entries]


# One row per successfully analysed firm, one column per feature.
df_features = pd.DataFrame({
    'firmhash': list(full_text_features_checked),
    'firm': [entry['firm'] for entry in checked_entries],
    'n_sentence': _feature_column(checked_entries, 'n_sentences'),
    'number_of_words': _feature_column(checked_entries, 'n_tokens'),
    'number_of_unique_words': _feature_column(checked_entries, 'n_unique_tokens'),
    'ambiquity_score': _feature_column(checked_entries, 'ambiquity_score'),
    'gunning_fog_index': _feature_column(checked_entries, 'gunning_fog')
})

# Non-numeric fields are quoted in the CSV; the index is not written.
df_features.to_csv(
    '../data/policies/features/firm_pp_features_0.2.0.csv',
    index=False,
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
)

In [None]:
# Records of the firms for which no features were produced.
{
    firmhash: entry
    for firmhash, entry in full_text_features.items()
    if not entry['succeeded']
}

In [None]:
# Successfully analysed policies with fewer than 400 tokens.
{
    firmhash: entry
    for firmhash, entry in full_text_features_checked.items()
    if entry['features']['n_tokens'] < 400
}