In [46]:
import pandas as pd

def process_category_data(dataframe, top_n=5):
    # Step 1: Calculate raw category counts
    category_counts_raw = dataframe.groupby('categories').size().reset_index(name='total_count')
    category_counts_raw = category_counts_raw.sort_values(by='total_count', ascending=False).reset_index(drop=True)

    # Step 2: Define aggregation logic
    agg_logic = {
        'RQ1': ('a1', lambda x: round(
            ((x == 'Yes').sum() + ((x == 'No') & dataframe['data_collected'].isnull()).sum()) / len(x), 2)),
        'RQ2': ('a3', lambda x: round(
            ((x == 'Yes').sum() + ((x == 'No') & dataframe['data_shared'].isnull()).sum()) / len(x), 2)),
        'RQ3': ('a6', lambda x: round(((x != 'Not at all').sum() / len(x)), 2)),
        'RQ4': ('a7', lambda x: round(((x != 'Encrypted').sum() / len(x)), 2)),
        'RQ5': ('a8', lambda x: round(((x != 'Not mentioned').sum() / len(x)), 2)),
        'RQ6': ('a9', lambda x: round(((x == 'Yes').sum() / len(x)), 2)),
        'RQ7': ('a10', lambda x: round(((x != 'Not mentioned').sum() / len(x)), 2)),
        'RQ8': ('a12', lambda x: round(((x != 'Omitted').sum() / len(x)), 2)),
        'RQ9': ('a11', lambda x: round(((x != 'Not mentioned').sum() / len(x)), 2))
    }
    
    # Step 3: Aggregate initial statistics
    RQ_counts = dataframe.groupby('categories').agg(**agg_logic).reset_index()
    RQ_counts = RQ_counts.merge(category_counts_raw, on='categories')
    RQ_counts = RQ_counts.sort_values(by='total_count', ascending=False).reset_index(drop=True)

    # Step 4: Identify top categories
    top_categories = RQ_counts['categories'].head(top_n)
    
    # Step 5: Categorize remaining into "Others"
    dataframe.loc[:, 'categories'] = dataframe['categories'].apply(
        lambda x: x if x in top_categories.values else 'Others'
    )
    
    # Step 6: Recalculate statistics with "Others" category
    RQ_counts = dataframe.groupby('categories').agg(
        **agg_logic,
        total_count=('categories', 'size')
    ).reset_index()
    RQ_counts = RQ_counts.sort_values(by='total_count', ascending=False).reset_index(drop=True)
    
    return RQ_counts

In [47]:
mhealth_apps_final = pd.read_csv('../data/mhealth_apps_final.csv')
mhealth_processed = process_category_data(mhealth_apps_final)
mhealth_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.79,0.79,0.94,0.74,0.89,0.8,0.65,0.82,0.77,536
1,Medical,0.69,0.74,0.88,0.67,0.92,0.73,0.69,0.75,0.71,180
2,Others,0.79,0.75,0.96,0.75,0.94,0.75,0.75,0.79,0.73,48
3,Parenting,0.86,0.79,0.86,0.86,0.88,0.84,0.6,0.88,0.79,43
4,Education,0.36,0.36,0.88,0.96,0.88,0.32,0.88,0.36,0.84,25
5,Lifestyle,0.75,0.79,0.83,0.83,0.88,0.79,0.62,0.79,0.58,24


# Case Study - CDC Apps

In [48]:
cdc_apps = mhealth_apps_final[mhealth_apps_final['app_id'].str.contains('cdc', na=False)]
print('Number of CDC apps:', len(cdc_apps))
print('Average downloads:', cdc_apps['downloads'].mean())

Number of CDC apps: 2
Average downloads: 651178.5


In [49]:
import os

privacy_policies_dir = '../data/privacy_policies'

cdc_policies = []

for app_id in cdc_apps['app_id']:
    for filename in os.listdir(privacy_policies_dir):
        if app_id in filename:
            cdc_policies.append(filename)

print('Found privacy policies:', len(cdc_policies))


Found privacy policies: 2


In [50]:
cdc_apps_processed = process_category_data(cdc_apps)
cdc_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2


# Impact Factors Analysis

# Paid Apps

In [51]:
paid_apps = mhealth_apps_final[mhealth_apps_final['free'] == False]
print('Total paid apps:', len(paid_apps))

Total paid apps: 25


In [52]:
import os

traffic_data_dir = '../data/network_traffic'

paid_app_traffic = []

for app_id in paid_apps['app_id']:
    for filename in os.listdir(traffic_data_dir):
        if app_id in filename:
            app_data = paid_apps[paid_apps['app_id'] == app_id]
            if app_data['PII'].isna().all() and \
               app_data['PHI'].isna().all() and \
               app_data['PII&PHI'].isna().all() and \
               app_data['Connectivity Issues'].eq('No Content').all():
                continue
            paid_app_traffic.append(filename)

print('Total paid apps with traffic data:', len(paid_app_traffic))


Total paid apps with traffic data: 21


In [53]:
paid_apps_processed = process_category_data(paid_apps)
paid_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.27,0.33,0.93,1.0,0.87,0.33,0.87,0.33,0.87,15
1,Medical,0.29,0.43,0.86,0.86,0.86,0.43,0.86,0.29,0.71,7
2,Education,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1
3,Lifestyle,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1
4,Others,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [54]:
free_apps = mhealth_apps_final[mhealth_apps_final['free'] == True]
print('Total paid apps:', len(free_apps))

Total paid apps: 831


In [55]:
free_apps_processed = process_category_data(free_apps)
free_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.8,0.8,0.94,0.74,0.89,0.81,0.64,0.84,0.77,521
1,Medical,0.71,0.75,0.88,0.66,0.92,0.74,0.68,0.77,0.71,173
2,Others,0.79,0.77,0.91,0.77,0.91,0.77,0.7,0.8,0.67,70
3,Parenting,0.86,0.79,0.86,0.86,0.88,0.84,0.6,0.88,0.79,43
4,Education,0.38,0.38,0.88,0.96,0.88,0.33,0.88,0.38,0.83,24


# Top-grossing Apps

In [56]:
top_grossing_apps = mhealth_apps_final[mhealth_apps_final['top_grossing'] == 'Yes']
print('Total paid apps:', len(top_grossing_apps))

Total paid apps: 188


In [57]:
top_grossing_apps_processed = process_category_data(top_grossing_apps)
top_grossing_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.87,0.85,0.92,0.69,0.88,0.9,0.69,0.92,0.8,137
1,Medical,0.83,1.0,0.87,0.48,0.83,0.96,0.52,0.96,0.78,23
2,Parenting,0.73,0.67,0.87,0.87,0.93,0.67,0.67,0.73,0.93,15
3,Others,1.0,1.0,1.0,0.67,0.89,1.0,0.56,1.0,0.67,9
4,Lifestyle,1.0,1.0,0.75,0.5,1.0,1.0,0.25,1.0,1.0,4


In [58]:
non_top_grossing_apps = mhealth_apps_final[mhealth_apps_final['top_grossing'] != 'Yes']
print('Total paid apps:', len(non_top_grossing_apps))

Total paid apps: 668


In [59]:
non_top_grossing_apps_processed = process_category_data(non_top_grossing_apps)
non_top_grossing_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.76,0.76,0.94,0.76,0.89,0.77,0.63,0.79,0.76,399
1,Medical,0.67,0.7,0.89,0.7,0.93,0.69,0.71,0.72,0.7,157
2,Others,0.73,0.72,0.92,0.82,0.9,0.72,0.75,0.75,0.65,60
3,Parenting,0.93,0.86,0.86,0.86,0.86,0.93,0.57,0.96,0.71,28
4,Education,0.33,0.33,0.88,0.96,0.92,0.29,0.92,0.33,0.88,24


# Impact Factors

In [60]:
def filter_data(df):
    usa = ['us']
    europe = [
        'al', 'ad', 'am', 'at', 'az', 'by', 'be', 'ba', 'bg', 'hr', 'cy', 'cz', 'dk', 'ee',
        'fi', 'fr', 'ge', 'de', 'gi', 'gr', 'hu', 'is', 'ie', 'it', 'kz', 'xk', 'lv', 'li',
        'lt', 'lu', 'mt', 'md', 'mc', 'me', 'nl', 'mk', 'no', 'pl', 'pt', 'ro', 'ru', 'sm',
        'rs', 'sk', 'si', 'es', 'se', 'ch', 'ua', 'gb', 'va', 'uk'
    ]
    def country_filter(countries, country_set):
        return any(country.lower() in country_set for country in countries)

    countries_list = df['countries'].str.split(', ').explode()
    usa_europe = set(usa + europe)
    rest_of_world = set(countries_list) - usa_europe
    
    df['country_split'] = df['countries'].str.split(', ')
    us_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, usa))]
    europe_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, europe))]
    row_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, rest_of_world))]

    print('Number of apps available in USA:', len(us_filtered_df))
    print('Number of apps available in Europe:', len(europe_filtered_df))
    print('Number of apps available in RoW:', len(row_filtered_df))
    
    return us_filtered_df, europe_filtered_df, row_filtered_df


In [61]:
us_filtered_df, europe_filtered_df, row_filtered_df = filter_data(mhealth_apps_final)

Number of apps available in USA: 360
Number of apps available in Europe: 604
Number of apps available in RoW: 811


In [62]:
us_apps_processed = process_category_data(us_filtered_df, 8)
us_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.86,0.86,0.97,0.71,0.93,0.88,0.69,0.9,0.82,207
1,Medical,0.7,0.74,0.92,0.71,0.95,0.73,0.73,0.75,0.79,115
2,Parenting,0.85,0.7,0.85,0.85,0.85,0.85,0.55,0.9,0.8,20
3,Lifestyle,1.0,1.0,0.75,0.62,0.88,1.0,0.5,1.0,0.75,8
4,Others,0.83,0.83,1.0,0.5,1.0,0.83,0.67,0.83,1.0,6
5,Education,0.25,0.25,1.0,0.75,1.0,0.25,1.0,0.25,1.0,4


In [63]:
europe_apps_processed = process_category_data(europe_filtered_df, 8)
europe_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.81,0.81,0.94,0.72,0.92,0.83,0.68,0.85,0.79,366
1,Medical,0.71,0.75,0.89,0.66,0.93,0.74,0.71,0.75,0.71,150
2,Parenting,0.84,0.74,0.84,0.84,0.87,0.84,0.55,0.87,0.77,31
3,Others,0.84,0.8,0.96,0.8,0.92,0.8,0.76,0.84,0.76,25
4,Education,0.42,0.42,0.84,0.95,0.89,0.37,0.89,0.42,0.84,19
5,Lifestyle,0.85,1.0,0.77,0.77,0.85,0.92,0.62,0.92,0.62,13


In [64]:
row_apps_processed = process_category_data(row_filtered_df, 8)
row_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.79,0.78,0.94,0.75,0.89,0.8,0.66,0.82,0.77,508
1,Medical,0.67,0.73,0.88,0.68,0.91,0.71,0.71,0.74,0.7,171
2,Others,0.78,0.73,0.96,0.76,0.93,0.73,0.76,0.78,0.71,45
3,Parenting,0.85,0.78,0.85,0.88,0.88,0.83,0.61,0.88,0.78,41
4,Lifestyle,0.75,0.79,0.83,0.83,0.88,0.79,0.62,0.79,0.58,24
5,Education,0.36,0.36,0.91,0.95,0.86,0.32,0.86,0.36,0.86,22
