In [None]:
import os
import json
import string
import textstat
# from readability import Readability
from src import common

In [None]:
# TODO: check manual folder, if not then check scraper folder
# TODO: remove double spaces in cleaning
# TODO: implement tries
# TODO: make custom fog index

## -- define

In [None]:
# Vocabulary of vague / hedging words used for the ambiguity score.
# Every entry is lower-case because tokens are lower-cased by clean_text()
# before they are compared against this list; duplicates have been removed.
AMBI_WORDS = [
    'occasional', 'will', 'perhaps', 'such', 'some', 'certain', 'various',
    'reasonable', 'like', 'example', 'sometimes', 'depending', 'necessary',
    'appropriate', 'inappropriate', 'generally', 'mostly', 'widely', 'general',
    'commonly', 'usually', 'normally', 'typically', 'largely', 'often', 'may',
    'might', 'can', 'could', 'would', 'likely', 'possible', 'possibly',
    'unsure', 'anyone', 'everyone', 'numerous', 'most', 'few', 'much', 'many',
]

In [None]:
# Use the "corrected" URL file when it is present on disk; otherwise fall back
# to the dated (20210316) URL file. Either way the JSON content ends up in ppurls.
ppurls_path = '../data/policies/urls/privacy_policy_urls_corrected.json'
if not os.path.isfile(ppurls_path):
    ppurls_path = '../data/policies/urls/privacy_policy_urls_20210316.json'

with open(ppurls_path, 'r') as infile:
    ppurls = json.load(infile)

In [None]:
def clean_text(text):
    """Lower-case ``text`` and replace each ASCII punctuation character with a space.

    Each character of ``string.punctuation`` becomes exactly one space, so the
    length of the text is unchanged and adjacent punctuation/spaces yield runs
    of spaces.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The lower-cased text with punctuation blanked out.
    """
    # Translation table: code point of every punctuation character -> a space.
    punctuation_to_space = dict.fromkeys(map(ord, string.punctuation), ' ')
    return text.lower().translate(punctuation_to_space)

def score_readability(text, score_methods):
    """Compute the requested readability scores for ``text``.

    Each entry of ``score_methods`` is looked up by name on the ``textstat``
    module (e.g. ``'gunning_fog'``) and called with ``text``. Previously every
    requested method was silently filled with the Gunning Fog score.

    Parameters
    ----------
    text : str
        The text to score.
    score_methods : iterable of str
        Names of ``textstat`` scoring functions.

    Returns
    -------
    dict
        Maps each method name to the value returned by that textstat function.

    Raises
    ------
    ValueError
        If a name in ``score_methods`` is not a callable attribute of ``textstat``.
    """
    scores = {}
    for m in score_methods:
        # Dispatch on the method name instead of hard-coding gunning_fog.
        scorer = getattr(textstat, m, None)
        if not callable(scorer):
            raise ValueError(f'unknown readability method: {m!r}')
        scores[m] = scorer(text)

    return scores

def score_tokens(text, ambi_words=None):
    """Count tokens in ``text`` and compute the share of ambiguous words.

    Tokens are the whitespace-separated pieces of ``text`` that are longer than
    one character. ``text`` is expected to be cleaned already (lower-cased,
    punctuation removed); tokens are compared as-is against the vocabulary.

    Parameters
    ----------
    text : str
        Cleaned text to tokenise.
    ambi_words : iterable of str, optional
        Vocabulary of ambiguous words. Defaults to the module-level
        ``AMBI_WORDS``. Entries are lower-cased before matching.

    Returns
    -------
    dict
        ``n_tokens`` (int), ``n_unique_tokens`` (int) and ``ambiquity_score``
        (float, ambiguous tokens divided by all tokens).

    Raises
    ------
    ZeroDivisionError
        If ``text`` contains no token longer than one character.
    """
    if ambi_words is None:
        ambi_words = AMBI_WORDS
    # A lower-cased set gives O(1) lookups and lets capitalised vocabulary
    # entries match the lower-cased tokens.
    vocabulary = {word.lower() for word in ambi_words}

    # split() without an argument splits on any whitespace run, so words
    # separated by newlines or tabs are no longer glued into one token.
    tokens = [t for t in text.split() if len(t) > 1]
    unique_tokens = set(tokens)
    ambi_tokens_count = sum(1 for t in tokens if t in vocabulary)
    # Deliberately unguarded: callers handle ZeroDivisionError for empty texts.
    score = ambi_tokens_count / len(tokens)

    return {
        'n_tokens': len(tokens),
        'n_unique_tokens': len(unique_tokens),
        'ambiquity_score': score
    }

def run_text_analysis(text, score_methods):
    """Run the token and readability analysis on one text and merge the results.

    Token statistics are computed on the cleaned text (see ``clean_text``),
    while readability scores and the sentence count use the raw ``text``.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    score_methods : iterable of str
        Readability method names, passed through to ``score_readability``.

    Returns
    -------
    dict
        The keys returned by ``score_tokens``, one key per readability method,
        and ``n_sentence`` from ``textstat.sentence_count``.
    """
    # Start from the token statistics, then layer the readability scores on top.
    features = dict(score_tokens(clean_text(text)))
    features.update(score_readability(text, score_methods))
    features['n_sentence'] = textstat.sentence_count(text)
    return features
    

## -- execute

In [None]:
textstat.set_lang('en')

# Text features per firm, keyed by the firm hash that also names the scraped file.
full_text_features = {}

verbose = False  # set to True to print the individual features of every firm
count = 0  # number of firms visited, whether or not a scraped policy exists
total_success = 0
total_error = 0
for k, v in ppurls.items():
    count += 1

    # NOTE(review): common.__hash is defined outside this notebook; presumably
    # it maps the firm key to the hash used in the scraped file name.
    firmhash = common.__hash(k)
    policy_path = f'../data/policies/scraped/{firmhash}_privacy_policy.txt'

    # Firms without a scraped policy are skipped silently; they count as
    # neither a success nor an error.
    if not os.path.isfile(policy_path):
        continue

    with open(policy_path, 'r') as infile:
        policytext = infile.read()

    try:
        result = run_text_analysis(policytext, score_methods=['gunning_fog'])

        full_text_features[firmhash] = {
            'firm': k,
            'ppurl': v['ppurl'],
            'features': result
        }

        total_success += 1

        print(f'{k} ==> done')
        if verbose:
            print(f' - Number of words: {result["n_tokens"]}')
            print(f' - Number of unique words: {result["n_unique_tokens"]}')
            print(f' - Ambiguous words: {round(result["ambiquity_score"] *100, 3)}%')
            # Fixed: this line used to print the unique-word count.
            print(f' - Number of sentences: {result["n_sentence"]}')
            print(f' - Fog readability score: {result["gunning_fog"]}')

    except ValueError as err:
        print(f'{k} ==> ValueError: {err}')
        total_error += 1

    except ZeroDivisionError as err:
        # Raised by the token scoring when the policy text contains no tokens.
        print(f'{k} ==> ZeroDivisionError: {err}')
        total_error += 1

# Persist once, after all firms are processed. The dump used to sit inside the
# loop, rewriting the complete file for every firm.
with open('../data/policies/features/firm_pp_features_0.1.0.json', 'w') as outfile:
    json.dump(full_text_features, outfile)

In [None]:
# Summary of the run: one line per counter.
print(
    f'Total successes: {total_success}',
    f'Total errors: {total_error}',
    sep='\n',
)

## -- checks and to pandas

In [None]:
import csv
import pandas as pd

In [None]:
# Output column name -> key inside each entry's 'features' dict.
# The insertion order here is the column order of the resulting frame.
feature_columns = {
    'number_of_words': 'n_tokens',
    'number_of_unique_words': 'n_unique_tokens',
    'n_sentence': 'n_sentence',
    'ambiquity_score': 'ambiquity_score',
    'gunning_fog_score': 'gunning_fog',
}

# One entry per successfully analysed firm, in the same order as the keys.
feature_entries = list(full_text_features.values())

df_features = pd.DataFrame({
    'firmhash': list(full_text_features),
    'firm': [entry['firm'] for entry in feature_entries],
    **{
        column: [entry['features'][key] for entry in feature_entries]
        for column, key in feature_columns.items()
    },
})

In [None]:
# Export the feature table without the index; every non-numeric field is quoted.
df_features.to_csv(
    '../data/policies/features/firm_pp_features_0.1.0.csv',
    index=False,
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
)