# Analysing Privacy Policies

In [1]:
import os
import pandas as pd

# Inputs: directory holding the scraped policy text files, and the app catalogue.
policy_dir = '../data/privacy_policies'
mhealth_apps = pd.read_csv('../data/mhealth_apps.csv')

# Collect the file stem (name without extension) of every .txt file in the directory.
policy_files = set()
for entry in os.listdir(policy_dir):
    if entry.endswith('.txt'):
        policy_files.add(os.path.splitext(entry)[0])

# Mark each app 'Yes'/'No' depending on whether a file named after its app_id exists.
has_policy_file = mhealth_apps['app_id'].map(str).isin(policy_files)
mhealth_apps['privacy_policy_file'] = has_policy_file.map({True: 'Yes', False: 'No'})

# Report the number of policy files and that number relative to the number of apps.
txt_file_count = len(policy_files)
extracted_share = txt_file_count / len(mhealth_apps['app_id']) * 100
print(f"Number of privacy policies extracted: {txt_file_count} ({extracted_share:.2f}%)")

Number of privacy policies extracted: 691 (80.72%)


In [2]:
import os

policy_dir = '../data/privacy_policies'

# Phrases whose presence in a saved file marks it as an error page or
# placeholder rather than an actual privacy policy.
error_phrases = [
    "Connection timed out",
    "Can't Find That Page",
    "Privacy error",
    "doesn't work properly without JavaScript enabled",
    "This site can’t be reached",
    "This document does not exist",
    "The request is blocked.",
    "AccessDenied",
    "no privacy policy"
]

# File stems (app ids) of policy files containing at least one error phrase.
# Each file is recorded at most once, even when several phrases match.
error_policy = []
for filename in os.listdir(policy_dir):
    file_path = os.path.join(policy_dir, filename)
    if not os.path.isfile(file_path):
        continue

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except (OSError, UnicodeDecodeError) as e:
        # Stay best-effort, but report the file: a silently skipped file would
        # otherwise be treated downstream as a valid policy.
        print(f"Skipping unreadable file {filename}: {e}")
        continue

    matched_phrases = [phrase for phrase in error_phrases if phrase in content]
    for phrase in matched_phrases:
        print(f"{phrase}")
    if matched_phrases:
        error_policy.append(os.path.splitext(filename)[0])

The request is blocked.
This site can’t be reached
Privacy error
AccessDenied
Can't Find That Page
Connection timed out
doesn't work properly without JavaScript enabled
This site can’t be reached
no privacy policy


In [3]:
# Apps for which no policy file was found on disk.
apps_no_policy = mhealth_apps[mhealth_apps['privacy_policy_file'] == 'No']

# Apps whose policy file exists but contains an error phrase.
# error_policy holds file stems (strings), so app_id is compared as a string,
# matching how the 'privacy_policy_file' flag is derived from str(app_id).
apps_with_errors = mhealth_apps[mhealth_apps['app_id'].map(str).isin(error_policy)]

# Both groups together: apps without a usable policy file.
apps_no_policy = pd.concat([apps_no_policy, apps_with_errors])
print(f'Number of apps with no policy files: {len(apps_no_policy)}')


Number of apps with no policy files: 174


In [4]:
# Rows where 'is_privacy_policy' is 'Yes' while no policy file was found.
declared_but_missing = (
    (mhealth_apps['privacy_policy_file'] == 'No')
    & (mhealth_apps['is_privacy_policy'] == 'Yes')
)
print("Number of apps 'is_privacy_policy' is 'Yes' but with no policy files:")
print(len(mhealth_apps[declared_but_missing]))


Number of apps 'is_privacy_policy' is 'Yes' but with no policy files:
165


In [5]:
app_data_shared = pd.read_csv('../data/app_data_shared.csv')
app_data_collected = pd.read_csv('../data/app_data_collected.csv')
app_security_practices = pd.read_csv('../data/app_security_practices.csv')


def _join_per_app(frame, column):
    """Return a Series indexed by app_id with `column`'s values comma-joined per app.

    Values are cast with astype(str) before joining.
    NOTE(review): missing entries, if any, are stringified rather than dropped;
    confirm that is intended.
    """
    return frame.groupby('app_id')[column].apply(lambda values: ','.join(values.astype(str)))


grouped_shared = _join_per_app(app_data_shared, 'data_shared')
grouped_shared_purpose = _join_per_app(app_data_shared, 'shared_purpose')

grouped_collected = _join_per_app(app_data_collected, 'data_collected')
grouped_collected_purpose = _join_per_app(app_data_collected, 'collected_purpose')

grouped_security_practices = _join_per_app(app_security_practices, 'security_practices')

# Target column in mhealth_apps -> per-app joined values that fill it.
# Dict order fixes the order in which missing columns are created.
joined_by_column = {
    'data_shared': grouped_shared,
    'shared_purpose': grouped_shared_purpose,
    'data_collected': grouped_collected,
    'collected_purpose': grouped_collected_purpose,
    'security_practices': grouped_security_practices,
}

for column, joined in joined_by_column.items():
    # Create the column as empty strings only if it is not already present.
    if column not in mhealth_apps.columns:
        mhealth_apps[column] = ""

    # Vectorised lookup by app_id; apps absent from `joined` come back as NaN
    # and keep whatever value the column already held.
    looked_up = mhealth_apps['app_id'].map(joined)
    mhealth_apps[column] = looked_up.where(looked_up.notna(), mhealth_apps[column])

mhealth_apps.to_csv('../data/updated_mhealth_apps.csv', index=False)

In [6]:
# Reusable row masks: no policy file on disk / non-blank shared data / non-blank collected data.
lacks_policy_file = mhealth_apps['privacy_policy_file'] == 'No'
has_shared_data = (
    mhealth_apps['data_shared'].notnull()
    & (mhealth_apps['data_shared'].str.strip() != '')
)
has_collected_data = (
    mhealth_apps['data_collected'].notnull()
    & (mhealth_apps['data_collected'].str.strip() != '')
)

apps_no_policy_with_shared_data = mhealth_apps[lacks_policy_file & has_shared_data]
apps_no_policy_with_collected_data = mhealth_apps[lacks_policy_file & has_collected_data]
apps_no_policy_with_shared_and_collected_data = mhealth_apps[
    lacks_policy_file & has_shared_data & has_collected_data
]


def _mean_downloads(frame):
    """Return the mean of `downloads` (cast to int) rounded to two decimals."""
    return round(frame['downloads'].astype(int).mean(), 2)


print(f"Number of apps without a privacy policy but with shared data: {len(apps_no_policy_with_shared_data)}")
print('Average downloads:', _mean_downloads(apps_no_policy_with_shared_data))

print(f"Number of apps without a privacy policy but with collected data: {len(apps_no_policy_with_collected_data)}")
print('Average downloads:', _mean_downloads(apps_no_policy_with_collected_data))

print(f"Number of apps without a privacy policy but with both shared and collected data: {len(apps_no_policy_with_shared_and_collected_data)}")
print('Average downloads:', _mean_downloads(apps_no_policy_with_shared_and_collected_data))


Number of apps without a privacy policy but with shared data: 33
Average downloads: 4129539.06
Number of apps without a privacy policy but with collected data: 3
Average downloads: 937374.33
Number of apps without a privacy policy but with both shared and collected data: 3
Average downloads: 937374.33


In [7]:
# Count, per category, the apps that lack a policy file but declare shared data.
shared_categories = apps_no_policy_with_shared_data['categories']
apps_by_category = shared_categories.value_counts()
print(apps_by_category)

categories
Health & Fitness    19
Medical              7
Parenting            3
Education            2
Sports               1
Trivia               1
Name: count, dtype: int64


In [8]:
# Category counts for apps that lack a policy file but declare collected data.
collected_categories = apps_no_policy_with_collected_data['categories']
apps_by_category = collected_categories.value_counts()

# Mean downloads for the two categories of interest, in this fixed order.
for category_name in ('Health & Fitness', 'Medical'):
    in_category = apps_no_policy_with_collected_data[collected_categories == category_name]
    print('Average downloads:', round(in_category['downloads'].astype(int).mean(), 2))

apps_by_category

Average downloads: 1167247.5
Average downloads: 477628.0


categories
Health & Fitness    2
Medical             1
Name: count, dtype: int64

In [9]:
# Category counts for apps that lack a policy file but declare BOTH shared
# and collected data.
both_categories = apps_no_policy_with_shared_and_collected_data['categories']
apps_by_category = both_categories.value_counts()

# Mean downloads per category, computed from the same shared-and-collected
# subset as the counts above (not from the collected-only subset).
for category_name in ('Health & Fitness', 'Medical'):
    in_category = apps_no_policy_with_shared_and_collected_data[both_categories == category_name]
    print('Average downloads:', round(in_category['downloads'].astype(int).mean(), 2))

apps_by_category

Average downloads: 1167247.5
Average downloads: 477628.0


categories
Health & Fitness    2
Medical             1
Name: count, dtype: int64