# Analysing mHealth Apps compliance with privacy laws or regulations

In [1]:
import pandas as pd

merged_df = pd.read_csv('../data/merged_df.csv')
all_answers = pd.read_csv('../data/all_answers.csv')

mhealth_apps_final = pd.merge(merged_df, all_answers, on='app_id', how='left')
mhealth_apps_final['a12'] = mhealth_apps_final['a12'].fillna('Omitted') # apps lacking privacy policy

mhealth_apps_final.to_csv('../data/mhealth_apps_final.csv', index=False)

# Q1: Does the privacy policy specify the types of data collected?

In [2]:
app_data_collected = pd.read_csv('../data/app_data_collected.csv')

print('Number of Apps mentioning collecting data in Data Safety:', len(app_data_collected['app_id'].unique()))

print('\nNumber of Apps mentioning collecting data in Privacy Policy:')
a1_data_collected = mhealth_apps_final['a1'].value_counts()
print(a1_data_collected)

print('\nVariances:', len(app_data_collected['app_id'].unique()) - len(mhealth_apps_final[mhealth_apps_final['a1'] == 'Yes']))

Number of Apps mentioning collecting data in Data Safety: 695

Number of Apps mentioning collecting data in Privacy Policy:
a1
Yes    649
No      42
Name: count, dtype: int64

Variances: 46


# Q2: Does the policy explain data collection practices for all, partial, or none of the collected data?

In [3]:
print('\nNumber of Apps explaining data collection:')
a2_collection_purpose = mhealth_apps_final['a2'].value_counts()
a2_collection_purpose


Number of Apps explaining data collection:


a2
Partial       665
Not at all     25
All             1
Name: count, dtype: int64

# Q3: Does the privacy policy specify the types of data shared with third parties?

In [4]:
app_data_shared = pd.read_csv('../data/app_data_shared.csv')
print('Number of Apps mentioning sharing data in Data Safety:', len(app_data_shared['app_id'].unique()))

print('\nNumber of Apps mentioning sharing data in Privacy Policy:')
a3_data_shared = mhealth_apps_final['a3'].value_counts()
print(a3_data_shared)

print('\nVariances:', len(app_data_shared['app_id'].unique()) - len(mhealth_apps_final[mhealth_apps_final['a3'] == 'Yes']))

Number of Apps mentioning sharing data in Data Safety: 373

Number of Apps mentioning sharing data in Privacy Policy:
a3
Yes    591
No     100
Name: count, dtype: int64

Variances: -218


# Q4: Does the policy explain data sharing practices for all, partial, or none of the shared data?

In [5]:
print('\nNumber of Apps explaining data sharing:')
a4_sharing_purpose = mhealth_apps_final['a4'].value_counts()
a4_sharing_purpose


Number of Apps explaining data sharing:


a4
Partial       589
Not at all    101
All             1
Name: count, dtype: int64

# Q5: Does the policy explain who is receiving the shared data?

In [6]:
print('\nNumber of Apps explaining receipients of sharing data:')
a5_sharing_recipients = mhealth_apps_final['a5'].value_counts()
a5_sharing_recipients


Number of Apps explaining receipients of sharing data:


a5
Yes        573
No         113
Partial      5
Name: count, dtype: int64

# Q6: Does the policy explain users’ rights to access, correct, delete, or opt out of marketing communications?

In [7]:
a6_user_rights = mhealth_apps_final['a6'].value_counts()
a6_user_rights

a6
Partial       410
All           211
Not at all     70
Name: count, dtype: int64

# Q7: Does the policy mention if the data is encrypted in transit?

In [8]:
app_security_practices = pd.read_csv('../data/app_security_practices.csv')

print('Number of Apps mentioning data encrypted in Data Safety:', len(app_security_practices[app_security_practices['security_practices'] == 'Data is encrypted in transit']))

a7_encrypted = mhealth_apps_final['a7'].value_counts()
a7_encrypted

Number of Apps mentioning data encrypted in Data Safety: 698


a7
Not mentioned    470
Encrypted        219
Not encrypted      2
Name: count, dtype: int64

# Q8: Does the policy outline security measures against unauthorized access or breaches?

In [9]:
a8_security_measures = mhealth_apps_final['a8'].value_counts()
a8_security_measures

a8
Brief            564
Not mentioned     88
Detailed          39
Name: count, dtype: int64

# Q9: Is contact information provided for questions or complaints? 

In [10]:
print('\nNumber of Apps provide information for questions or complaints:')
a9_contract_info = mhealth_apps_final['a9'].value_counts()
a9_contract_info


Number of Apps provide information for questions or complaints:


a9
Yes    659
No      32
Name: count, dtype: int64

# Q10: Does the policy address children’s data and parental consent? 

In [11]:
a10_children = mhealth_apps_final['a10'].value_counts()
a10_children

a10
Brief            320
Not mentioned    285
Detailed          86
Name: count, dtype: int64

# Q11: Does the policy explain the compliance with privacy laws?

In [22]:
def filter_data(df):
    usa = ['us']
    europe = [
        'al', 'ad', 'am', 'at', 'az', 'by', 'be', 'ba', 'bg', 'hr', 'cy', 'cz', 'dk', 'ee',
        'fi', 'fr', 'ge', 'de', 'gi', 'gr', 'hu', 'is', 'ie', 'it', 'kz', 'xk', 'lv', 'li',
        'lt', 'lu', 'mt', 'md', 'mc', 'me', 'nl', 'mk', 'no', 'pl', 'pt', 'ro', 'ru', 'sm',
        'rs', 'sk', 'si', 'es', 'se', 'ch', 'ua', 'gb', 'va', 'uk'
    ]

    def country_filter(countries, country_set):
        return any(country.lower() in country_set for country in countries)

    countries_list = df['countries'].str.split(', ').explode()
    country_counts = countries_list.value_counts()
    top_5_countries = country_counts.head(5)
    print("Top 5 countries:")
    print(top_5_countries)
    
    usa_europe = set(usa + europe)
    rest_of_world = set(countries_list) - usa_europe
    
    df = df.copy()
    df['country_split'] = df['countries'].str.split(', ')

    # Apply filters
    us_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, usa))]
    europe_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, europe))]
    row_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, rest_of_world))]

    print('Number of apps available in USA:', len(us_filtered_df))
    print('Number of apps available in Europe:', len(europe_filtered_df))
    print('Number of apps available in RoW:', len(row_filtered_df))
    
    return us_filtered_df, europe_filtered_df, row_filtered_df

# Inconsistency Assessment

In [23]:
import numpy as np
import pandas as pd

relevant_data = set(
    item.strip().lower()
    for col in ['data_shared', 'data_collected', 'a1_note', 'a3_note']
    for sublist in mhealth_apps_final[col].dropna().astype(str).str.split(',')
    for item in sublist
)

# Check for inconsistencies with case-insensitive comparison
inconsistent_rows = mhealth_apps_final[
    mhealth_apps_final['PII'].apply(
        lambda pii: pd.notna(pii) and any(
            pii_item.strip().lower() not in relevant_data 
            for pii_item in pii.split(',')
        )
    )
]

inconsistent_apps = inconsistent_rows
print('PII Inconsistency:', len(inconsistent_apps))

PII Inconsistency: 52


In [24]:
using_other_ids = inconsistent_apps[
    (inconsistent_apps['data_shared'].str.contains("other IDs", case=False, na=False) |
     inconsistent_apps['data_collected'].str.contains("other IDs", case=False, na=False) |
     inconsistent_apps['a1_note'].str.contains("other IDs", case=False, na=False) |
     inconsistent_apps['a3_note'].str.contains("other IDs", case=False, na=False))
    &
    (inconsistent_apps['PII'].str.contains("aaid|aid|imsi|mac", case=False, na=False))
]
print(len(using_other_ids))

36


In [25]:
inconsistent_apps = inconsistent_apps.merge(using_other_ids, how='outer', indicator=True).query('_merge == "left_only"').drop(columns=['_merge'])
print(len(inconsistent_apps))

16


In [26]:
cdc_apps = inconsistent_apps[inconsistent_apps['app_id'].str.contains('cdc', na=False)]
print('Number of CDC apps:', len(cdc_apps))

Number of CDC apps: 0


In [27]:
category_counts = inconsistent_apps.groupby('categories').size().reset_index(name='total_count')
category_counts = category_counts.sort_values(by='total_count', ascending=False).reset_index(drop=True)

category_counts

Unnamed: 0,categories,total_count
0,Health & Fitness,10
1,Medical,3
2,Books & Reference,1
3,Education,1
4,Lifestyle,1


In [28]:
us_filtered_df, europe_filtered_df, row_filtered_df = filter_data(inconsistent_apps)
print('Average downloads in USA:', us_filtered_df['downloads'].mean())
print('Average downloads in Europe:', europe_filtered_df['downloads'].mean())
print('Average downloads in RoW:', row_filtered_df['downloads'].mean())


Top 5 countries:
countries
uk    10
gb     9
au     9
bs     8
am     8
Name: count, dtype: int64
Number of apps available in USA: 8
Number of apps available in Europe: 14
Number of apps available in RoW: 15
Average downloads in USA: 6490670.375
Average downloads in Europe: 3853653.1428571427
Average downloads in RoW: 3597793.2


# Noncompliance Assessment

In [29]:
import re

def extract_unique_item(dataframe):
    unique_items = set()
    for lists in dataframe.dropna():
        item_list = lists.split(',')
        for items in item_list:
            items = items.split(',')
        unique_items.update(item.strip() for item in items if item)
    return unique_items

unique_laws = extract_unique_item(mhealth_apps_final['a11'])
print('Total unique laws:', len(unique_laws))
unique_laws


Total unique laws: 90


{'"On Personal Data"No. ZRU-547',
 '152-FZ',
 'AO',
 'Act CVIII of 2001',
 'Act on the Protection of Personal Information',
 'BDSG',
 'CAN-SPAM',
 'CCPA',
 'CCPA/CPRA',
 'CNIL',
 'COPPA',
 'CPA',
 'CPRA',
 'CalOPPA',
 'California Civil Code 1798.83',
 'California Privacy Rights Act',
 'Consumer Protection Act 2019',
 'Consumer Protection Regulations',
 'DPA',
 'DPA IRR',
 'DPDPA',
 'DPF',
 'DSGVO',
 'Data Protection Act',
 'Data Protection Act 2018',
 'Data Protection Act 2019',
 'Data Protection Laws',
 'Digital Personal Data Protection Act 2023',
 'EU-DSGVO',
 'EU-US Privacy Shield',
 'FDPA',
 'FERPA',
 'Federal Law on the Protection of Personal Data Held by Private Parties',
 'GDPR',
 'GINA',
 'Google API Services User Data Policy',
 'HIPAA',
 'HITECH',
 'INDIAN LAWS',
 'Indian Companies Act 2013',
 'Indian Information Technology Act of 2000',
 'Information Technology (Intermediary Guidelines and Digital Media Ethics Code) Rules 2021',
 'Information Technology Act 2000',
 'JIS Q1500

In [30]:
no_compliance = mhealth_apps_final[mhealth_apps_final['a11'] == 'Not mentioned']
print("Apps do not mentioned laws or regulations:", len(no_compliance))
no_compliance_count = no_compliance.groupby('categories').size()
no_compliance_count

Apps do not mentioned laws or regulations: 211


categories
Beauty                 1
Books & Reference      1
Business               1
Education              4
Food & Drink           2
Health & Fitness     123
Lifestyle             10
Medical               52
Music & Audio          1
Parenting              9
Productivity           2
Shopping               3
Simulation             1
Tools                  1
dtype: int64

In [31]:
us_filtered_df, europe_filtered_df, row_filtered_df = filter_data(no_compliance)
filtered_no_compliance = pd.concat([us_filtered_df, europe_filtered_df], ignore_index=True)
print('Average downloads in USA:', us_filtered_df['downloads'].mean())
print('Average downloads in Europe:', europe_filtered_df['downloads'].mean())
print('Average downloads in RoW:', row_filtered_df['downloads'].mean())

Top 5 countries:
countries
sd    101
mn     99
cu     99
mm     97
cf     96
Name: count, dtype: int64
Number of apps available in USA: 67
Number of apps available in Europe: 140
Number of apps available in RoW: 201
Average downloads in USA: 3135897.268656716
Average downloads in Europe: 2840275.1642857143
Average downloads in RoW: 3854093.109452736


In [32]:
noncompliance_apps = filtered_no_compliance[
    filtered_no_compliance[['data_shared', 'data_collected', 'PII', 'PHI', 'PII&PHI']].isnull().all(axis=1)
]
noncompliance_apps

Unnamed: 0,app_name,app_id,categories,content_rating,updated_on,downloads,free,offersIAP,top_grossing,is_privacy_policy,...,a4_note,a5_note,a6_note,a7_note,a8_note,a9_note,a10_note,a11_note,a12_note,country_split


In [33]:
print("Apps with no compliance:\n")
apps_sharing = filtered_no_compliance[filtered_no_compliance['data_shared'].notnull()]
print('Sharing data:', len(apps_sharing))

apps_collecting = filtered_no_compliance[filtered_no_compliance['data_collected'].notnull()]
print('Collecting data:', len(apps_collecting))

apps_pii = filtered_no_compliance[filtered_no_compliance['PII'].notnull()]
print('Collecting/sharing PII:', len(apps_pii))

apps_phi = filtered_no_compliance[filtered_no_compliance['PHI'].notnull()]
print('Collecting/sharing PHI:', len(apps_phi))

apps_pii_phi = filtered_no_compliance[filtered_no_compliance['PII&PHI'].notnull()]
print('Collecting/sharing PII&PHI:', len(apps_pii_phi))

Apps with no compliance:

Sharing data: 87
Collecting data: 207
Collecting/sharing PII: 36
Collecting/sharing PHI: 18
Collecting/sharing PII&PHI: 137


In [34]:
law_mentioned_apps = mhealth_apps_final[mhealth_apps_final['a11'] != 'Not mentioned']
print(f'Number of apps mentioning laws in their privacy policies: {len(law_mentioned_apps)}\n')

us_filtered_df, europe_filtered_df, row_filtered_df = filter_data(law_mentioned_apps)
law_mentioned_apps = pd.concat([us_filtered_df, europe_filtered_df], ignore_index=True)
print('Average downloads in USA:', us_filtered_df['downloads'].mean())
print('Average downloads in Europe:', europe_filtered_df['downloads'].mean())
print('Average downloads in RoW:', row_filtered_df['downloads'].mean())

Number of apps mentioning laws in their privacy policies: 645

Top 5 countries:
countries
ca    323
th    322
ru    314
uk    312
vn    311
Name: count, dtype: int64
Number of apps available in USA: 293
Number of apps available in Europe: 464
Number of apps available in RoW: 610
Average downloads in USA: 21380376.515358362
Average downloads in Europe: 30397379.015086208
Average downloads in RoW: 29927035.144262295


In [35]:
unique_laws_us = extract_unique_item(us_filtered_df['a11'])
print('Total unique laws:', len(unique_laws_us))

no_HIPAA = us_filtered_df[~us_filtered_df['a11'].str.contains('HIPAA', case=False, na=False)]
print(f'Apps that do not mention HIPAA: {len(no_HIPAA)} ({len(no_HIPAA) / len(us_filtered_df) * 100:.2f}%)')


Total unique laws: 40
Apps that do not mention HIPAA: 252 (86.01%)


In [36]:
print("Apps accessible in USA:\n")

apps_sharing = no_HIPAA[no_HIPAA['data_shared'].notnull()]
print('Sharing data:', len(apps_sharing))

apps_collecting = no_HIPAA[no_HIPAA['data_collected'].notnull()]
print('Collecting data:', len(apps_collecting))

apps_pii = no_HIPAA[no_HIPAA['PII'].notnull()]
print('Collecting/sharing PII:', len(apps_pii))

apps_phi = no_HIPAA[no_HIPAA['PHI'].notnull()]
print('Collecting/sharing PHI:', len(apps_phi))

apps_pii_phi = no_HIPAA[no_HIPAA['PII&PHI'].notnull()]
print('Collecting/sharing PII&PHI:', len(apps_pii_phi))

Apps accessible in USA:

Sharing data: 115
Collecting data: 200
Collecting/sharing PII: 43
Collecting/sharing PHI: 18
Collecting/sharing PII&PHI: 179


In [37]:
unique_laws_europe = extract_unique_item(europe_filtered_df['a11'])
print('Total unique laws:', len(unique_laws_europe))
no_GDPR = europe_filtered_df[~europe_filtered_df['a11'].str.contains('GDPR', case=False, na=False)]
print(f'Apps that do not mention GDPR: {len(no_GDPR)} ({len(no_GDPR) / len(europe_filtered_df) * 100:.2f}%)')


Total unique laws: 61
Apps that do not mention GDPR: 202 (43.53%)


In [38]:
print("Apps accessible in Europe:\n")

apps_sharing = no_GDPR[no_GDPR['data_shared'].notnull()]
print('Sharing data:', len(apps_sharing))

apps_collecting = no_GDPR[no_GDPR['data_collected'].notnull()]
print('Collecting data:', len(apps_collecting))

apps_pii = no_GDPR[no_GDPR['PII'].notnull()]
print('Collecting/sharing PII:', len(apps_pii))

apps_phi = no_GDPR[no_GDPR['PHI'].notnull()]
print('Collecting/sharing PHI:', len(apps_phi))

apps_pii_phi = no_GDPR[no_GDPR['PII&PHI'].notnull()]
print('Collecting/sharing PII&PHI:', len(apps_pii_phi))

Apps accessible in Europe:

Sharing data: 66
Collecting data: 97
Collecting/sharing PII: 34
Collecting/sharing PHI: 15
Collecting/sharing PII&PHI: 126


# COPPA Compliance Assessment

In [39]:
irrelevant_data = ['Teen', 'Mature 17+', 'Rated 12+', 'Rated for 12+', 'Rated for 18+']
apps_coppa = mhealth_apps_final[
    mhealth_apps_final['content_rating'].notna() & 
    ~mhealth_apps_final['content_rating'].isin(irrelevant_data)
]
print('Number of apps under COPPA:', len(apps_coppa))

Number of apps under COPPA: 798


In [40]:
a10_children = apps_coppa['a10'].value_counts()

print('Average downloads of Detailed:', apps_coppa[apps_coppa['a10'] == 'Detailed']['downloads'].mean())
print('Average downloads of Brief:', apps_coppa[apps_coppa['a10'] == 'Brief']['downloads'].mean())
print('Average downloads of Not mentioned:', apps_coppa[apps_coppa['a10'] == 'Not mentioned']['downloads'].mean())

a10_children

Average downloads of Detailed: 21723122.026666667
Average downloads of Brief: 43772976.65
Average downloads of Not mentioned: 14078218.893536123


a10
Brief            300
Not mentioned    263
Detailed          75
Name: count, dtype: int64

In [41]:
no_coppa_apps = apps_coppa[
    (apps_coppa['a10'] == 'Not mentioned') & 
    (
        (apps_coppa['data_shared'].notnull()) | 
        (apps_coppa['data_collected'].notnull()) | 
        (apps_coppa['a1'] == 'Yes') | 
        (apps_coppa['a3'] == 'Yes') | 
        (apps_coppa['PII'].notnull()) | 
        (apps_coppa['PHI'].notnull()) | 
        (apps_coppa['PII&PHI'].notnull())
    )
]

print('Number of apps sharing/collecting data but failed to address COPPA:', len(no_coppa_apps))
us_filtered_df, europe_filtered_df, row_filtered_df= filter_data(no_coppa_apps)
print('\nAverage downloads in USA:', us_filtered_df['downloads'].mean())
print('Average downloads in Europe:', europe_filtered_df['downloads'].mean())
print('\nAverage downloads in RoW:', row_filtered_df['downloads'].mean())


Number of apps sharing/collecting data but failed to address COPPA: 263
Top 5 countries:
countries
md    109
ua    109
uk    109
br    109
ca    108
Name: count, dtype: int64
Number of apps available in USA: 96
Number of apps available in Europe: 171
Number of apps available in RoW: 243

Average downloads in USA: 34129237.239583336
Average downloads in Europe: 20131612.485380117

Average downloads in RoW: 15146302.880658437


In [42]:
no_coppa_apps_category = no_coppa_apps['categories'].value_counts()
no_coppa_apps_category

categories
Health & Fitness     175
Medical               53
Parenting             12
Lifestyle              9
Education              3
Business               3
Shopping               3
Productivity           2
Books & Reference      1
Beauty                 1
Communication          1
Name: count, dtype: int64

# Q12: How would you rate the policy disclosure?

In [43]:
print('\nRate of the policy disclosure:')
a12_disclosure = mhealth_apps_final['a12'].value_counts()
a12_disclosure


Rate of the policy disclosure:


a12
Vague      554
Omitted    176
Clear      126
Name: count, dtype: int64