# Case Study & Impact Factors Analysis

In [12]:
import pandas as pd

def process_category_data(dataframe, top_n=5):
    # Step 1: Calculate raw category counts
    category_counts_raw = dataframe.groupby('categories').size().reset_index(name='total_count')
    category_counts_raw = category_counts_raw.sort_values(by='total_count', ascending=False).reset_index(drop=True)

    # Step 2: Define aggregation logic
    agg_logic = {
        'RQ1': ('a1', lambda x: round(
            (((x == 'Specified') & dataframe['data_collected'].notnull()).sum() + ((x == 'Not mentioned') & dataframe['data_collected'].isnull()).sum()) / len(x), 2)),
        'RQ2': ('a3', lambda x: round(
            (((x == 'Specified') & dataframe['data_shared'].notnull()).sum() + ((x == 'Not mentioned') & dataframe['data_shared'].isnull()).sum()) / len(x), 2)),
        'RQ3': ('a6', lambda x: round(
            ((x != 'Not at all') & dataframe['security_practices'].str.contains('data be deleted')).sum() / len(x), 2)),
        'RQ4': ('a7', lambda x: round(
            (((x == 'Encrypted') & dataframe['security_practices'].str.contains('is encrypted')).sum() + 
             ((x != 'Encrypted') & dataframe['data_shared'].isnull() & dataframe['data_collected'].isnull() 
              & (dataframe['a1'] == 'No') & (dataframe['a3'] == 'No')).sum()) / len(x), 2)),
        'RQ5': ('a8', lambda x: round(((x != 'Not mentioned').sum() / len(x)), 2)),
        'RQ6': ('a9', lambda x: round(((x == 'Yes').sum() / len(x)), 2)),
        'RQ7': ('a10', lambda x: round(((x != 'Not mentioned').sum() / len(x)), 2)),
        'RQ8': ('a12', lambda x: round(((x != 'Omitted').sum() / len(x)), 2)),
        'RQ9': ('a11', lambda x: round(((x != 'Not mentioned').sum() / len(x)), 2))
    }
    
    # Step 3: Aggregate initial statistics
    RQ_counts = dataframe.groupby('categories').agg(**agg_logic).reset_index()
    RQ_counts = RQ_counts.merge(category_counts_raw, on='categories')
    RQ_counts = RQ_counts.sort_values(by='total_count', ascending=False).reset_index(drop=True)

    # Step 4: Identify top categories
    top_categories = RQ_counts['categories'].head(top_n)
    
    # Step 5: Categorize remaining into "Others"
    dataframe.loc[:, 'categories'] = dataframe['categories'].apply(
        lambda x: x if x in top_categories.values else 'Others'
    )
    
    # Step 6: Recalculate statistics with "Others" category
    RQ_counts = dataframe.groupby('categories').agg(
        **agg_logic,
        total_count=('categories', 'size')
    ).reset_index()
    RQ_counts = RQ_counts.sort_values(by='total_count', ascending=False).reset_index(drop=True)
    
    return RQ_counts

In [13]:
mhealth_apps_final = pd.read_csv('../data/mhealth_apps_final.csv')
mhealth_processed = process_category_data(mhealth_apps_final)
mhealth_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.82,0.51,0.74,0.25,0.89,0.94,0.62,0.97,0.6,536
1,Medical,0.72,0.44,0.56,0.24,0.88,0.91,0.62,0.93,0.49,180
2,Others,0.85,0.62,0.65,0.23,0.9,0.9,0.65,1.0,0.56,48
3,Parenting,0.88,0.53,0.77,0.14,0.95,0.95,0.58,0.98,0.72,43
4,Education,0.52,0.44,0.32,0.16,0.84,0.96,0.64,1.0,0.36,25
5,Lifestyle,0.58,0.5,0.54,0.21,0.83,0.92,0.58,0.92,0.42,24


# Case Study - CDC Apps

In [20]:
cdc_apps = mhealth_apps_final[mhealth_apps_final['app_id'].str.contains('cdc', na=False)]
print('Number of CDC apps:', len(cdc_apps))
print('Average downloads:', cdc_apps['downloads'].mean())

Number of CDC apps: 2
Average downloads: 651178.5


In [21]:
import os

privacy_policies_dir = '../data/privacy_policies'

cdc_policies = []

for app_id in cdc_apps['app_id']:
    for filename in os.listdir(privacy_policies_dir):
        if app_id in filename:
            cdc_policies.append(filename)

print('Found privacy policies:', len(cdc_policies))


Found privacy policies: 2


In [22]:
cdc_apps_processed = process_category_data(cdc_apps)
cdc_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.0,0.5,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2


# Impact Factors Analysis

## Paid Apps

In [23]:
paid_apps = mhealth_apps_final[mhealth_apps_final['free'] == False]
print('Total paid apps:', len(paid_apps))

Total paid apps: 25


In [24]:
import os

traffic_data_dir = '../data/network_traffic'

paid_app_traffic = []

for app_id in paid_apps['app_id']:
    for filename in os.listdir(traffic_data_dir):
        if app_id in filename:
            app_data = paid_apps[paid_apps['app_id'] == app_id]
            if app_data['PII'].isna().all() and \
               app_data['PHI'].isna().all() and \
               app_data['PII&PHI'].isna().all() and \
               app_data['Connectivity Issues'].eq('No Content').all():
                continue
            paid_app_traffic.append(filename)

print('Total paid apps with traffic data:', len(paid_app_traffic))


Total paid apps with traffic data: 21


In [25]:
paid_apps_processed = process_category_data(paid_apps)
paid_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.53,0.67,0.13,0.0,0.67,0.8,0.53,0.87,0.4,15
1,Medical,0.43,0.43,0.14,0.14,0.86,0.86,0.57,0.71,0.29,7
2,Education,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1
3,Lifestyle,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1
4,Others,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1


## Free Apps

In [26]:
free_apps = mhealth_apps_final[mhealth_apps_final['free'] == True]
print('Total free apps:', len(free_apps))

Total free apps: 831


In [27]:
free_apps_processed = process_category_data(free_apps)
free_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.83,0.5,0.76,0.26,0.9,0.94,0.62,0.97,0.61,521
1,Medical,0.73,0.45,0.57,0.25,0.88,0.91,0.62,0.94,0.5,173
2,Others,0.77,0.59,0.63,0.23,0.87,0.9,0.63,0.97,0.5,70
3,Parenting,0.88,0.53,0.77,0.14,0.95,0.95,0.58,0.98,0.72,43
4,Education,0.54,0.46,0.33,0.17,0.83,0.96,0.67,1.0,0.33,24


## Top-grossing Apps

In [28]:
top_grossing_apps = mhealth_apps_final[mhealth_apps_final['top_grossing'] == 'Yes']
print('Total top-grossing apps:', len(top_grossing_apps))

Total top-grossing apps: 188


In [29]:
top_grossing_apps_processed = process_category_data(top_grossing_apps)
top_grossing_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.88,0.42,0.8,0.3,0.88,0.96,0.66,0.99,0.71,137
1,Medical,0.83,0.7,0.83,0.48,0.87,0.96,0.57,0.91,0.78,23
2,Parenting,0.8,0.67,0.73,0.13,1.0,0.93,0.67,1.0,0.73,15
3,Others,1.0,0.67,0.67,0.22,0.89,1.0,0.67,1.0,0.67,9
4,Lifestyle,0.5,0.25,0.25,0.5,1.0,0.75,0.5,0.75,0.5,4


## Non-top-grossing Apps

In [30]:
non_top_grossing_apps = mhealth_apps_final[mhealth_apps_final['top_grossing'] != 'Yes']
print('Total non-top-grossing apps:', len(non_top_grossing_apps))

Total non-top-grossing apps: 668


In [31]:
non_top_grossing_apps_processed = process_category_data(non_top_grossing_apps)
non_top_grossing_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.81,0.54,0.72,0.24,0.9,0.93,0.6,0.96,0.57,399
1,Medical,0.7,0.41,0.52,0.21,0.89,0.9,0.63,0.93,0.45,157
2,Others,0.75,0.6,0.63,0.2,0.85,0.9,0.62,0.98,0.48,60
3,Parenting,0.93,0.46,0.79,0.14,0.93,0.96,0.54,0.96,0.71,28
4,Education,0.5,0.42,0.29,0.17,0.88,0.96,0.67,1.0,0.38,24


## Geolocations: US, EU, RoW

In [32]:
import pandas as pd

usa = ['us']

# source: https://www.gdpradvisor.co.uk/gdpr-countries
GDPR_countries = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia', 'Denmark', 'Estonia', 'Finland', 'France', 
                  'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 
                  'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'United Kingdom']

country_list = pd.read_csv("../data/country_list.csv")

europe = []

for country in GDPR_countries:
    matching_countries = country_list[country_list['country_name'].str.contains(country, na=False)]
    europe.extend(matching_countries['country_code'].tolist())

print(len(europe)-1)
print(europe)

28
['at', 'be', 'bg', 'hr', 'cy', 'cz', 'dk', 'ee', 'fi', 'fr', 'de', 'gr', 'hu', 'ie', 'it', 'lv', 'lt', 'lu', 'mt', 'nl', 'pl', 'pt', 'ro', 'sk', 'si', 'es', 'se', 'uk', 'gb']


In [33]:
def filter_data(df):

    def country_filter(countries, country_set):
        return any(country.lower() in country_set for country in countries)

    countries_list = df['countries'].str.split(', ').explode()
    usa_europe = set(usa + europe)
    rest_of_world = set(countries_list) - usa_europe
    
    df['country_split'] = df['countries'].str.split(', ')
    us_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, usa))]
    europe_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, europe))]
    row_filtered_df = df[df['country_split'].apply(lambda x: country_filter(x, rest_of_world))]

    print('Number of apps available in USA:', len(us_filtered_df))
    print('Number of apps available in Europe:', len(europe_filtered_df))
    print('Number of apps available in RoW:', len(row_filtered_df))
    
    return us_filtered_df, europe_filtered_df, row_filtered_df


In [34]:
us_filtered_df, europe_filtered_df, row_filtered_df = filter_data(mhealth_apps_final)

Number of apps available in USA: 360
Number of apps available in Europe: 390
Number of apps available in RoW: 834


### 1. USA

In [35]:
us_apps_processed = process_category_data(us_filtered_df, 8)
us_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.88,0.53,0.8,0.27,0.92,0.96,0.69,0.98,0.68,207
1,Medical,0.74,0.43,0.57,0.23,0.92,0.92,0.67,0.94,0.54,115
2,Parenting,0.9,0.7,0.8,0.15,0.95,0.95,0.6,0.95,0.65,20
3,Lifestyle,0.5,0.5,0.5,0.38,0.88,0.88,0.62,0.88,0.38,8
4,Others,1.0,0.83,0.83,0.5,1.0,0.83,0.67,1.0,0.83,6
5,Education,0.25,0.75,0.25,0.5,1.0,1.0,0.75,1.0,0.5,4


### 2. Europe

In [36]:
europe_apps_processed = process_category_data(europe_filtered_df, 8)
europe_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.87,0.52,0.79,0.28,0.92,0.96,0.67,0.97,0.67,224
1,Medical,0.78,0.44,0.58,0.26,0.92,0.93,0.66,0.94,0.52,121
2,Parenting,0.9,0.67,0.76,0.19,0.95,0.95,0.57,0.95,0.67,21
3,Lifestyle,0.56,0.56,0.44,0.33,0.89,0.89,0.67,0.89,0.33,9
4,Education,0.38,0.38,0.5,0.38,1.0,1.0,0.88,1.0,0.62,8
5,Others,1.0,0.71,0.57,0.29,1.0,0.86,0.71,1.0,0.71,7


### 3. RoW (Rest of the World)

In [37]:
row_apps_processed = process_category_data(row_filtered_df, 8)
row_apps_processed

Unnamed: 0,categories,RQ1,RQ2,RQ3,RQ4,RQ5,RQ6,RQ7,RQ8,RQ9,total_count
0,Health & Fitness,0.82,0.52,0.74,0.25,0.89,0.94,0.61,0.97,0.6,523
1,Medical,0.71,0.44,0.55,0.24,0.88,0.91,0.63,0.92,0.48,173
2,Others,0.85,0.62,0.64,0.23,0.89,0.89,0.66,1.0,0.55,47
3,Parenting,0.88,0.52,0.76,0.12,0.95,0.95,0.57,0.98,0.71,42
4,Education,0.52,0.44,0.32,0.16,0.84,0.96,0.64,1.0,0.36,25
5,Lifestyle,0.58,0.5,0.54,0.21,0.83,0.92,0.58,0.92,0.42,24
