# Analysing mHealth Apps compliance with privacy laws or regulations

In [1]:
import pandas as pd

merged_df = pd.read_csv('../data/merged_df.csv')
all_answers = pd.read_csv('../data/RQ_answers.csv')

mhealth_apps_final = pd.merge(merged_df, all_answers, on='app_id', how='left')
mhealth_apps_final['a12'] = mhealth_apps_final['a12'].fillna('Omitted') # apps lacking privacy policy

mhealth_apps_final.to_csv('../data/mhealth_apps_final.csv', index=False)

# Q1: Does the privacy policy specify the types of data collected?

In [2]:
app_data_collected = pd.read_csv('../data/app_data_collected.csv')

print('Number of Apps mentioning collecting data in Data Safety:', len(app_data_collected['app_id'].unique()))

print('\nNumber of Apps mentioning collecting data in Privacy Policy:')
a1_data_collected = mhealth_apps_final['a1'].value_counts()
print(a1_data_collected)

print('\nVariances:', len(app_data_collected['app_id'].unique()) - len(mhealth_apps_final[mhealth_apps_final['a1'] == 'Specified']))

Number of Apps mentioning collecting data in Data Safety: 695

Number of Apps mentioning collecting data in Privacy Policy:
a1
Specified        756
Not mentioned     88
Name: count, dtype: int64

Variances: -61


# Q2: Does the policy explain data collection practices for all, partial, or none of the collected data?

In [3]:
print('\nNumber of Apps explaining data collection:')
a2_collection_purpose = mhealth_apps_final['a2'].value_counts()
a2_collection_purpose


Number of Apps explaining data collection:


a2
Partial           775
Not applicable     55
All                14
Name: count, dtype: int64

# Q3: Does the privacy policy specify the types of data shared with third parties?

In [4]:
app_data_shared = pd.read_csv('../data/app_data_shared.csv')
print('Number of Apps mentioning sharing data in Data Safety:', len(app_data_shared['app_id'].unique()))

print('\nNumber of Apps mentioning sharing data in Privacy Policy:')
a3_data_shared = mhealth_apps_final['a3'].value_counts()
print(a3_data_shared)

print('\nVariances:', len(app_data_shared['app_id'].unique()) - len(mhealth_apps_final[mhealth_apps_final['a3'] == 'Specified']))

Number of Apps mentioning sharing data in Data Safety: 373

Number of Apps mentioning sharing data in Privacy Policy:
a3
Specified        642
Not mentioned    202
Name: count, dtype: int64

Variances: -269


# Q4: Does the policy explain data sharing practices for all, partial, or none of the shared data?

In [5]:
print('\nNumber of Apps explaining data sharing:')
a4_sharing_purpose = mhealth_apps_final['a4'].value_counts()
a4_sharing_purpose


Number of Apps explaining data sharing:


a4
Partial           648
Not applicable    194
All                 2
Name: count, dtype: int64

# Q5: Does the policy explain who is receiving the shared data?

In [6]:
print('\nNumber of Apps explaining receipients of sharing data:')
a5_sharing_recipients = mhealth_apps_final['a5'].value_counts()
print(a5_sharing_recipients)

print('\nNumber of Apps sharing data but failed to explain recipients of shared data:')
print(len(mhealth_apps_final[(mhealth_apps_final['a3'] == 'Specified') & (mhealth_apps_final['a5'] == 'No')]))


Number of Apps explaining receipients of sharing data:
a5
Yes               615
Not applicable    178
No                 47
Partial             4
Name: count, dtype: int64

Number of Apps sharing data but failed to explain recipients of shared data:
30


# Q6: Does the policy explain users’ rights to access, correct, delete, or opt out of marketing communications?

In [7]:
a6_user_rights = mhealth_apps_final['a6'].value_counts()
a6_user_rights

a6
Partial       630
All           115
Not at all     99
Name: count, dtype: int64

# Q7: Does the policy mention if the data is encrypted in transit?

In [8]:
app_security_practices = pd.read_csv('../data/app_security_practices.csv')

print('Number of Apps mentioning data encrypted in Data Safety:', len(app_security_practices[app_security_practices['security_practices'] == 'Data is encrypted in transit']))

a7_encrypted = mhealth_apps_final['a7'].value_counts()
a7_encrypted

Number of Apps mentioning data encrypted in Data Safety: 698


a7
Not mentioned    599
Encrypted        243
Not encrypted      1
Partial            1
Name: count, dtype: int64

# Q8: Does the policy outline security measures against unauthorized access or breaches?

In [9]:
a8_security_measures = mhealth_apps_final['a8'].value_counts()
a8_security_measures

a8
Brief            705
Not mentioned     93
Detailed          46
Name: count, dtype: int64

# Q9: Is contact information provided for questions or complaints? 

In [10]:
print('\nNumber of Apps provide information for questions or complaints:')
a9_contract_info = mhealth_apps_final['a9'].value_counts()
a9_contract_info


Number of Apps provide information for questions or complaints:


a9
Yes    798
No      46
Name: count, dtype: int64

# Q10: Does the policy address children’s data and parental consent? 

In [11]:
a10_children = mhealth_apps_final['a10'].value_counts()
a10_children

a10
Brief            405
Not mentioned    328
Detailed         111
Name: count, dtype: int64

# Q11: Does the policy explain the compliance with privacy laws?

In [12]:
import pandas as pd

usa = ['us']

# source: https://www.gdpradvisor.co.uk/gdpr-countries
GDPR_countries = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia', 'Denmark', 'Estonia', 'Finland', 'France', 
                  'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 
                  'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'United Kingdom']

country_list = pd.read_csv("../data/country_list.csv")

europe = []

for country in GDPR_countries:
    matching_countries = country_list[country_list['country_name'].str.contains(country, na=False)]
    europe.extend(matching_countries['country_code'].tolist())

print(len(europe)-1)
print(europe)

28
['at', 'be', 'bg', 'hr', 'cy', 'cz', 'dk', 'ee', 'fi', 'fr', 'de', 'gr', 'hu', 'ie', 'it', 'lv', 'lt', 'lu', 'mt', 'nl', 'pl', 'pt', 'ro', 'sk', 'si', 'es', 'se', 'uk', 'gb']


In [13]:
def filter_data(df):

    def country_filter(countries, country_set):
        return any(country.lower() in country_set for country in countries)

    countries_list = df['countries'].str.split(', ').explode()
    country_counts = countries_list.value_counts()
    top_5_countries = country_counts.head(5)
    print("Top 5 countries:")
    print(top_5_countries)
    
    usa_europe = set(usa + europe)
    rest_of_world = set(countries_list) - usa_europe
    
    df = df.copy()
    df['country_split'] = df['countries'].str.split(', ')

    # Apply filters
    us_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, usa))]
    europe_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, europe))]
    row_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, rest_of_world))]

    print('Number of apps available in USA:', len(us_filtered_df))
    print('Number of apps available in Europe:', len(europe_filtered_df))
    print('Number of apps available in RoW:', len(row_filtered_df))
    
    return us_filtered_df, europe_filtered_df, row_filtered_df

# Inconsistency Assessment

In [14]:
# Apps claim to have a privacy policy but fail to provide one
total_inconsist_apps = mhealth_apps_final[
    (mhealth_apps_final['is_privacy_policy'] != mhealth_apps_final['privacy_policy_file']) |
    ((mhealth_apps_final['a1'] == 'Not mentioned') & mhealth_apps_final['data_collected'].notnull()) |
    ((mhealth_apps_final['a3'] == 'Not mentioned') & mhealth_apps_final['data_shared'].notnull()) |
    ((mhealth_apps_final['security_practices'].fillna('').str.contains('Data is encrypted')) & (mhealth_apps_final['a7'] != 'Encrypted')) |
    ((mhealth_apps_final['data_shared'].isnull() | mhealth_apps_final['data_collected'].isnull()) & 
     (mhealth_apps_final['PII'].notnull() | 
      mhealth_apps_final['PHI'].notnull() | 
      mhealth_apps_final['PII&PHI'].notnull()))
]

print(f'Apps contain inconsistency: {len(total_inconsist_apps)} ({len(total_inconsist_apps)/len(mhealth_apps_final)*100:.1f}%)')

Apps contain inconsistency: 665 (77.7%)


In [15]:
import numpy as np
import pandas as pd

relevant_data = set(
    item.strip().lower()
    for col in ['data_shared', 'data_collected', 'a1_note', 'a3_note']
    for sublist in mhealth_apps_final[col].dropna().astype(str).str.split(',')
    for item in sublist
)

# Check for inconsistencies with case-insensitive comparison
inconsistent_rows = mhealth_apps_final[
    mhealth_apps_final['PII'].apply(
        lambda pii: pd.notna(pii) and any(
            pii_item.strip().lower() not in relevant_data 
            for pii_item in pii.split(',')
        )
    )
]

inconsistent_apps = inconsistent_rows
print('Number of apps collecting/sharing PII that is not mentioned in their Data Safety section or privacy policies')
print('PII Inconsistency:', len(inconsistent_apps))

Number of apps collecting/sharing PII that is not mentioned in their Data Safety section or privacy policies
PII Inconsistency: 52


In [16]:
using_other_ids = inconsistent_apps[
    (inconsistent_apps['data_shared'].str.contains("other IDs", case=False, na=False) |
     inconsistent_apps['data_collected'].str.contains("other IDs", case=False, na=False) |
     inconsistent_apps['a1_note'].str.contains("other IDs", case=False, na=False) |
     inconsistent_apps['a3_note'].str.contains("other IDs", case=False, na=False))
    &
    (inconsistent_apps['PII'].str.contains("aaid|aid|imsi|mac", case=False, na=False))
]
print(len(using_other_ids))

36


In [17]:
inconsistent_apps = inconsistent_apps.merge(using_other_ids, how='outer', indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
print(len(inconsistent_apps))

16


In [18]:
cdc_apps = inconsistent_apps[inconsistent_apps['app_id'].str.contains('cdc', na=False)]
print('Number of CDC apps:', len(cdc_apps))

Number of CDC apps: 0


In [19]:
category_counts = inconsistent_apps.groupby('categories').size().reset_index(name='total_count')
category_counts = category_counts.sort_values(by='total_count', ascending=False).reset_index(drop=True)

category_counts

Unnamed: 0,categories,total_count
0,Health & Fitness,10
1,Medical,3
2,Books & Reference,1
3,Education,1
4,Lifestyle,1


In [20]:
us_filtered_df, europe_filtered_df, row_filtered_df = filter_data(inconsistent_apps)
print('Average downloads in USA:', us_filtered_df['downloads'].mean())
print('Average downloads in Europe:', europe_filtered_df['downloads'].mean())
print('Average downloads in RoW:', row_filtered_df['downloads'].mean())


Top 5 countries:
countries
uk    10
gb     9
au     9
bs     8
am     8
Name: count, dtype: int64
Number of apps available in USA: 8
Number of apps available in Europe: 10
Number of apps available in RoW: 15
Average downloads in USA: 6490670.375
Average downloads in Europe: 5222438.2
Average downloads in RoW: 3597793.2


# Noncompliance Assessment

In [21]:
import re

def extract_unique_item(dataframe):
    unique_items = set()
    for lists in dataframe.dropna():
        item_list = lists.split(',')
        for items in item_list:
            items = items.split(',')
        unique_items.update(item.strip() for item in items if item)
    return unique_items

unique_laws = extract_unique_item(mhealth_apps_final['a11'])
print('Total unique laws:', len(unique_laws))
unique_laws


Total unique laws: 108


{'152-FZ',
 'AO',
 'APEC CBPR',
 'APEC Cross Border Privacy Rules System',
 'Act on the Protection of Personal Information',
 'Australian Privacy Act 1988',
 'BDSG',
 'CAN-SPAM',
 'CAN-SPAM Act',
 'CCPA',
 'CCPA (mentioned but not subject to)',
 'CCPA/CPRA',
 'COPPA',
 'CPA',
 'CPRA',
 'CalOPPA',
 'California Civil Code Section 1798.83',
 'California Privacy Rights Act',
 'Consumer Protection Act 2019',
 'Consumer Protection Regulations',
 'DPA',
 'DPDPA',
 'DSGVO',
 'Data Privacy Act of 2012 of the Republic of the Philippines',
 'Data Protection Act',
 'Data Protection Act 2018',
 'Data Protection Act 2019',
 'Digital Personal Data Protection Act 2023',
 'EU GDPR',
 'EU regulations',
 'EU-DSGVO',
 'EU-US Privacy Shield',
 'Electronic Financial Transaction Act',
 'FDPA',
 'FERPA',
 'Federal Law on the Protection of Personal Data Held by Private Parties',
 'Federal Law on the Protection of Personal Data held by Private Parties in Mexico',
 'French Data Protection Act (Law no 78-17 of 6 

In [22]:
no_compliance = mhealth_apps_final[mhealth_apps_final['a11'] == 'Not mentioned']
print("Apps do not mentioned laws or regulations:", len(no_compliance))
no_compliance_count = no_compliance.groupby('categories').size()
no_compliance_count

Apps do not mentioned laws or regulations: 367


categories
Beauty                 1
Books & Reference      1
Business               2
Education             16
Food & Drink           3
Health & Fitness     213
Lifestyle             14
Medical               91
Music & Audio          1
Parenting             12
Productivity           5
Shopping               3
Simulation             2
Sports                 1
Tools                  2
dtype: int64

In [23]:
us_filtered_df, europe_filtered_df, row_filtered_df = filter_data(no_compliance)
filtered_no_compliance = pd.concat([us_filtered_df, europe_filtered_df], ignore_index=True)
print('Average downloads in USA:', us_filtered_df['downloads'].mean())
print('Average downloads in Europe:', europe_filtered_df['downloads'].mean())
print('Average downloads in RoW:', row_filtered_df['downloads'].mean())

Top 5 countries:
countries
cf    184
ls    182
sz    179
sd    178
mm    178
Name: count, dtype: int64
Number of apps available in USA: 134
Number of apps available in Europe: 150
Number of apps available in RoW: 362
Average downloads in USA: 19749363.074626867
Average downloads in Europe: 18926558.006666668
Average downloads in RoW: 9335681.26519337


In [24]:
noncompliance_apps = filtered_no_compliance[
    filtered_no_compliance[['data_shared', 'data_collected', 'PII', 'PHI', 'PII&PHI']].isnull().all(axis=1)
]

In [25]:
print("Apps with no compliance:\n")
apps_sharing = filtered_no_compliance[filtered_no_compliance['data_shared'].notnull()]
print('Sharing data:', len(apps_sharing))

apps_collecting = filtered_no_compliance[filtered_no_compliance['data_collected'].notnull()]
print('Collecting data:', len(apps_collecting))

apps_pii = filtered_no_compliance[filtered_no_compliance['PII'].notnull()]
print('Collecting/sharing PII:', len(apps_pii))

apps_phi = filtered_no_compliance[filtered_no_compliance['PHI'].notnull()]
print('Collecting/sharing PHI:', len(apps_phi))

apps_pii_phi = filtered_no_compliance[filtered_no_compliance['PII&PHI'].notnull()]
print('Collecting/sharing PII&PHI:', len(apps_pii_phi))

Apps with no compliance:

Sharing data: 114
Collecting data: 223
Collecting/sharing PII: 41
Collecting/sharing PHI: 28
Collecting/sharing PII&PHI: 198


In [26]:
law_mentioned_apps = mhealth_apps_final[mhealth_apps_final['a11'] != 'Not mentioned']
print(f'Number of apps mentioning laws in their privacy policies: {len(law_mentioned_apps)}\n')

us_filtered_df, europe_filtered_df, row_filtered_df = filter_data(law_mentioned_apps)
law_mentioned_apps = pd.concat([us_filtered_df, europe_filtered_df], ignore_index=True)
print('Average downloads in USA:', us_filtered_df['downloads'].mean())
print('Average downloads in Europe:', europe_filtered_df['downloads'].mean())
print('Average downloads in RoW:', row_filtered_df['downloads'].mean())

Number of apps mentioning laws in their privacy policies: 489

Top 5 countries:
countries
ca    244
uk    240
th    239
il    234
ch    233
Name: count, dtype: int64
Number of apps available in USA: 226
Number of apps available in Europe: 240
Number of apps available in RoW: 472
Average downloads in USA: 16938676.03539823
Average downloads in Europe: 45203543.925
Average downloads in RoW: 33254273.129237287


In [27]:
unique_laws_us = extract_unique_item(us_filtered_df['a11'])
print('Total unique laws:', len(unique_laws_us))

no_HIPAA = us_filtered_df[~us_filtered_df['a11'].str.contains('HIPAA', case=False, na=False)]
print(f'Apps that do not mention HIPAA: {len(no_HIPAA)} ({len(no_HIPAA) / len(us_filtered_df) * 100:.2f}%)')


Total unique laws: 44
Apps that do not mention HIPAA: 190 (84.07%)


In [28]:
print("Apps accessible in USA:\n")

apps_sharing = no_HIPAA[no_HIPAA['data_shared'].notnull()]
print('Sharing data:', len(apps_sharing))

apps_collecting = no_HIPAA[no_HIPAA['data_collected'].notnull()]
print('Collecting data:', len(apps_collecting))

apps_pii = no_HIPAA[no_HIPAA['PII'].notnull()]
print('Collecting/sharing PII:', len(apps_pii))

apps_phi = no_HIPAA[no_HIPAA['PHI'].notnull()]
print('Collecting/sharing PHI:', len(apps_phi))

apps_pii_phi = no_HIPAA[no_HIPAA['PII&PHI'].notnull()]
print('Collecting/sharing PII&PHI:', len(apps_pii_phi))

Apps accessible in USA:

Sharing data: 93
Collecting data: 168
Collecting/sharing PII: 36
Collecting/sharing PHI: 12
Collecting/sharing PII&PHI: 135


In [29]:
unique_laws_europe = extract_unique_item(europe_filtered_df['a11'])
print('Total unique laws:', len(unique_laws_europe))
no_GDPR = europe_filtered_df[~europe_filtered_df['a11'].str.contains('GDPR', case=False, na=False)]
print(f'Apps that do not mention GDPR: {len(no_GDPR)} ({len(no_GDPR) / len(europe_filtered_df) * 100:.2f}%)')


Total unique laws: 49
Apps that do not mention GDPR: 63 (26.25%)


In [30]:
print("Apps accessible in Europe:\n")

apps_sharing = no_GDPR[no_GDPR['data_shared'].notnull()]
print('Sharing data:', len(apps_sharing))

apps_collecting = no_GDPR[no_GDPR['data_collected'].notnull()]
print('Collecting data:', len(apps_collecting))

apps_pii = no_GDPR[no_GDPR['PII'].notnull()]
print('Collecting/sharing PII:', len(apps_pii))

apps_phi = no_GDPR[no_GDPR['PHI'].notnull()]
print('Collecting/sharing PHI:', len(apps_phi))

apps_pii_phi = no_GDPR[no_GDPR['PII&PHI'].notnull()]
print('Collecting/sharing PII&PHI:', len(apps_pii_phi))

Apps accessible in Europe:

Sharing data: 30
Collecting data: 55
Collecting/sharing PII: 15
Collecting/sharing PHI: 2
Collecting/sharing PII&PHI: 39


# COPPA Compliance Assessment

In [31]:
irrelevant_data = ['Teen', 'Mature 17+', 'Rated 12+', 'Rated for 12+', 'Rated for 18+']
apps_coppa = mhealth_apps_final[
    mhealth_apps_final['content_rating'].notna() & 
    ~mhealth_apps_final['content_rating'].isin(irrelevant_data)
]
print('Number of apps under COPPA:', len(apps_coppa))

Number of apps under COPPA: 798


In [32]:
a10_children = apps_coppa['a10'].value_counts()

print('Average downloads of Detailed:', apps_coppa[apps_coppa['a10'] == 'Detailed']['downloads'].mean())
print('Average downloads of Brief:', apps_coppa[apps_coppa['a10'] == 'Brief']['downloads'].mean())
print('Average downloads of Not mentioned:', apps_coppa[apps_coppa['a10'] == 'Not mentioned']['downloads'].mean())

a10_children

Average downloads of Detailed: 14519967.282828283
Average downloads of Brief: 37507605.5026178
Average downloads of Not mentioned: 9926064.360655738


a10
Brief            382
Not mentioned    305
Detailed          99
Name: count, dtype: int64

In [33]:
no_coppa_apps = apps_coppa[
    (apps_coppa['a10'] == 'Not mentioned') & 
    (
        (apps_coppa['data_shared'].notnull()) | 
        (apps_coppa['data_collected'].notnull()) | 
        (apps_coppa['a1'] == 'Yes') | 
        (apps_coppa['a3'] == 'Yes') | 
        (apps_coppa['PII'].notnull()) | 
        (apps_coppa['PHI'].notnull()) | 
        (apps_coppa['PII&PHI'].notnull())
    )
]

print('Number of apps sharing/collecting data but failed to address COPPA:', len(no_coppa_apps))
us_filtered_df, europe_filtered_df, row_filtered_df= filter_data(no_coppa_apps)
print('\nAverage downloads in USA:', us_filtered_df['downloads'].mean())
print('Average downloads in Europe:', europe_filtered_df['downloads'].mean())
print('\nAverage downloads in RoW:', row_filtered_df['downloads'].mean())


Number of apps sharing/collecting data but failed to address COPPA: 275
Top 5 countries:
countries
th    111
vn    110
br    110
ua    109
md    108
Name: count, dtype: int64
Number of apps available in USA: 93
Number of apps available in Europe: 106
Number of apps available in RoW: 266

Average downloads in USA: 27553224.268817205
Average downloads in Europe: 24197671.273584906

Average downloads in RoW: 11286384.29699248


In [34]:
no_coppa_apps_category = no_coppa_apps['categories'].value_counts()
no_coppa_apps_category

categories
Health & Fitness     174
Medical               57
Parenting             13
Lifestyle              9
Education              8
Business               3
Shopping               3
Books & Reference      1
Productivity           1
Food & Drink           1
Beauty                 1
Simulation             1
Communication          1
Tools                  1
Sports                 1
Name: count, dtype: int64

# Q12: How would you rate the policy disclosure?

In [35]:
print('\nRate of the policy disclosure:')
a12_disclosure = mhealth_apps_final['a12'].value_counts()
a12_disclosure


Rate of the policy disclosure:


a12
Vague      604
Clear      220
Omitted     32
Name: count, dtype: int64