<a href="https://colab.research.google.com/github/jzheng23/colab/blob/main/ScreenTime_data_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook can perform the following operations in order:

1. Import data from Firebase and Qualtrics directly with API
2. Save the data frames as temporary csv files
3. Open the temporary csv files and save them to google drive

# Preparation

Mount Google Drive and set up file path

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load the Google Drive path, depending who is running the notebook

In [2]:
#Jian
google_drive_data_path = '/content/drive/MyDrive/Problematic smartphone usage/Ambient display/Data'
google_drive_key_path = '/content/drive/MyDrive/Problematic smartphone usage/Ambient display/Key'

In [None]:
#Subin
google_drive_data_path = '/content/drive/MyDrive/UMD_research/Problematic_Smartphone_Usage'
google_drive_key_path = '/content/drive/MyDrive/UMD_research/Problematic_Smartphone_Usage'

# Firebase database

In [5]:
import firebase_admin
from firebase_admin import credentials, db
import csv
import pandas as pd

In [7]:
# Don't re-run this cell unless the kernel has been restarted
cred = credentials.Certificate(google_drive_key_path+'/timer-42ad2-firebase-adminsdk-4r7oj-2c373565f2.json')
firebase_admin.initialize_app(cred, {
    'databaseURL': 'https://timer-42ad2-default-rtdb.firebaseio.com'
})

<firebase_admin.App at 0x7aac3c30a7a0>

In [11]:
#Convert Unix timestamp to date/datetime in specified timezone
def convert_unix_to_date(unix_timestamp, timezone='America/New_York', return_date=True):
   """

   Parameters:
       unix_timestamp: Unix timestamp in milliseconds
       timezone: String of timezone (default 'America/New_York')
       return_date: If True returns date only, if False returns datetime
   """
   dt = pd.to_datetime(unix_timestamp, unit='ms', utc=True).tz_convert(timezone)
   return dt.date() if return_date else dt

## Response data

In [22]:
def process_responses(responses, pid, host, survey_id):
    responses_data = []
    response_dict = {
        'pid': pid,
        'host': host,
        'surveyID': survey_id
    }

    if isinstance(responses, dict):
        for question_id, answer in responses.items():
            response_dict[f'q_{question_id}'] = answer
    elif isinstance(responses, list):
        for question_id, answer in enumerate(responses):
            if answer is not None:
                response_dict[f'q_{question_id}'] = answer

    responses_data.append(response_dict)
    return responses_data

def get_timestamp(data):
    if isinstance(data, dict):
        return data.get('a') or data.get('timestamp')
    return None

def get_responses(data):
    if isinstance(data, dict):
        return data.get('b') or data.get('responses')
    elif isinstance(data, list):
        return data
    return None

def firebase_to_csv_log_Screen(ref_path, output_file):
    ref = db.reference(ref_path)
    data = ref.get()
    transformed_data = []

    for pid, pid_data in data.items():
        pid = pid.lower()

        # Process each host under the pid
        for host, events in pid_data.items():
            if isinstance(events, list):
                for index, event in enumerate(events):
                    if event is not None and isinstance(event, dict):
                        row = {
                            'pid': pid,
                            'host': host,
                            'eventLabel': index,
                            'startTime': event.get('startTime'),
                            'endTime': event.get('endTime')
                        }
                        transformed_data.append(row)

    df = pd.DataFrame(transformed_data)
    if len(transformed_data) > 0:
        df = df[['pid', 'host', 'eventLabel', 'startTime', 'endTime']]
    df.to_csv(output_file, index=False)

def responses_to_csvs(meta_file, responses_file):
    ref = db.reference('/responses')
    data = ref.get()

    meta_data = []
    responses_data = []

    for pid, pid_data in data.items():
        pid = pid.lower()

        # Process each host under pid
        for host, host_data in pid_data.items():
            # If host_data is a list, enumerate through it
            if isinstance(host_data, list):
                for survey_id, survey_data in enumerate(host_data):
                    if survey_data is not None:
                        timestamp = get_timestamp(survey_data)
                        if timestamp:
                            meta_data.append({
                                'pid': pid,
                                'host': host,
                                'surveyID': str(survey_id),
                                'timestamp': timestamp
                            })

                            responses = get_responses(survey_data)
                            if responses:
                                responses_data.extend(process_responses(responses, pid, host, str(survey_id)))

    # Create and save metadata DataFrame
    meta_df = pd.DataFrame(meta_data)
    if len(meta_data) > 0:
        meta_df = meta_df[['pid', 'host', 'surveyID', 'timestamp']]
    meta_df.to_csv(meta_file, index=False)

    # Create responses DataFrame in wide format
    responses_df = pd.DataFrame(responses_data)

    if len(responses_data) > 0:
        first_cols = ['pid', 'host', 'surveyID']
        q_cols = [col for col in responses_df.columns if col.startswith('q_')]
        q_cols.sort(key=lambda x: int(x.split('_')[1]))
        responses_df = responses_df[first_cols + q_cols]

    responses_df.to_csv(responses_file, index=False)

In [25]:
responses_to_csvs('survey_meta.csv', 'survey_responses.csv')
survey_meta_data = pd.read_csv('/content/survey_meta.csv')
print("The shape of the survey_meta_data is " + str(survey_meta_data.shape))
survey_meta_data.to_csv(google_drive_data_path + '/survey_meta.csv', index=False)
survey_responses_data = pd.read_csv('/content/survey_responses.csv')
print("The shape of the survey_responses_data is " + str(survey_responses_data.shape))
survey_responses_data.to_csv(google_drive_data_path + '/survey_responses.csv', index=False)

The shape of the survey_meta_data is (17, 4)
The shape of the survey_responses_data is (17, 33)


*TODO* still cannot read the host

In the real data there should always be a host

This should work for the real data, not tested yet

Select all the lines and use Ctrl + / to uncomment

In [None]:
# def process_responses(responses, pid, host, survey_id):
#     responses_data = []
#     response_dict = {
#         'pid': pid,
#         'host': host,
#         'surveyID': survey_id
#     }

#     if isinstance(responses, dict):
#         for question_id, answer in responses.items():
#             response_dict[f'q_{question_id}'] = answer
#     elif isinstance(responses, list):
#         for question_id, answer in enumerate(responses):
#             if answer is not None:
#                 response_dict[f'q_{question_id}'] = answer

#     responses_data.append(response_dict)
#     return responses_data

# def get_timestamp(data):
#     if isinstance(data, dict):
#         return data.get('a') or data.get('timestamp')
#     return None

# def get_responses(data):
#     if isinstance(data, dict):
#         return data.get('b') or data.get('responses')
#     elif isinstance(data, list):
#         return data
#     return None

# def responses_to_csvs(meta_file, responses_file):
#     ref = db.reference('/responses')
#     data = ref.get()

#     meta_data = []
#     responses_data = []

#     for pid, pid_data in data.items():
#         if not pid_data:
#             continue

#         # Process each host under pid
#         for host, host_data in pid_data.items():
#             # Process each survey under host
#             for survey_id, survey_data in host_data.items():
#                 timestamp = get_timestamp(survey_data)

#                 meta_data.append({
#                     'pid': pid,
#                     'host': host,
#                     'surveyID': survey_id,
#                     'timestamp': timestamp
#                 })

#                 responses = get_responses(survey_data)
#                 if responses:
#                     responses_data.extend(process_responses(responses, pid, host, survey_id))

#     # Create and save metadata DataFrame
#     meta_df = pd.DataFrame(meta_data)
#     meta_df = meta_df[['pid', 'host', 'surveyID', 'timestamp']]  # ensure column order
#     meta_df.to_csv(meta_file, index=False)

#     # Create responses DataFrame in wide format
#     responses_df = pd.DataFrame(responses_data)

#     # Ensure the first columns are in the correct order
#     first_cols = ['pid', 'host', 'surveyID']

#     # Get question columns and sort them numerically
#     q_cols = [col for col in responses_df.columns if col.startswith('q_')]
#     q_cols.sort(key=lambda x: int(x.split('_')[1]))

#     # Combine columns in correct order
#     responses_df = responses_df[first_cols + q_cols]

#     responses_df.to_csv(responses_file, index=False)

## Screen events

In [3]:
# def firebase_to_csv_log_Screen (ref_path, output_file):
#     ref = db.reference(ref_path)
#     data = ref.get()
#     transformed_data = []

#     # dictionary structure
#     for pid, events in data.items():  # 'pid' = key
#         pid = pid.lower()
#         if isinstance(events, list):
#             for index, event in enumerate(events, start=0):  #event_label_start:1
#                 if event is None:
#                     continue
#                 if isinstance(event, dict):
#                     row = {'pid': pid,
#                         'eventLabel': index,
#                         'startTime': event.get('startTime'),
#                         'endTime': event.get('endTime')}
#                     transformed_data.append(row)

#     df = pd.DataFrame(transformed_data)
#     df = df[['pid', 'eventLabel', 'startTime', 'endTime']]
#     df.to_csv(output_file, index=False)

In [9]:
def firebase_to_csv_log_Screen(ref_path, output_file):
    ref = db.reference(ref_path)
    data = ref.get()
    transformed_data = []

    # dictionary structure
    for pid, pid_data in data.items():
        pid = pid.lower()

        # Skip if pid_data is not a dictionary (to handle old format entries)
        if not isinstance(pid_data, dict):
            continue

        # Process each host under the pid
        for host, events in pid_data.items():
            if not isinstance(events, list):
                continue

            for index, event in enumerate(events, start=0):
                if event is None:
                    continue
                if isinstance(event, dict):
                    row = {
                        'pid': pid,
                        'host': host,
                        'eventLabel': index,
                        'startTime': event.get('startTime'),
                        'endTime': event.get('endTime')
                    }
                    transformed_data.append(row)

    df = pd.DataFrame(transformed_data)
    if len(transformed_data) > 0:
        df = df[['pid', 'host', 'eventLabel', 'startTime', 'endTime']]
    df.to_csv(output_file, index=False)

In [10]:
firebase_to_csv_log_Screen('/screen_events','screen_events.csv')
screen_events_data = pd.read_csv('/content/screen_events.csv')
print("The shape of the screen_events_data is " + str(screen_events_data.shape))
screen_events_data.to_csv(google_drive_data_path + '/screen_events.csv', index=False)

The shape of the screen_events_data is (1340, 5)


## Setting changes

In [None]:
def firebase_to_csv_log_setting (ref_path, output_file):
    ref = db.reference(ref_path)
    data = ref.get()
    transformed_data = []

    # dictionary structure
    for pid, changes in data.items():  # 'pid' = key
        if isinstance(changes, list):
            for index, change in enumerate(changes, start=0):  #event_label_start:1
                if change is None:
                    continue
                if isinstance(change, dict):
                    row = {'pid': pid,
                        'eventLabel': index,
                        'newValue' : change.get('newValue'),
                        'setting': change.get('setting'),
                        'timestamp': change.get('timestamp')}
                    transformed_data.append(row)

    df = pd.DataFrame(transformed_data)
    df = df[['pid', 'newValue', 'setting', 'timestamp']]
    df.to_csv(output_file, index=False)

In [None]:
firebase_to_csv_log_setting('/settings_change_logs','settings_change_logs.csv')
settings_change_logs_data = pd.read_csv('/content/settings_change_logs.csv')
print("The shape of the settings_change_logs_data is " + str(settings_change_logs_data.shape))
settings_change_logs_data.to_csv(google_drive_data_path + '/settings_change_logs.csv', index=False)

The shape of the settings_change_logs_data is (21, 4)


## Ringer

In [None]:
def firebase_to_csv_log_mode(ref_path, output_file):
    ref = db.reference(ref_path)
    data = ref.get()
    transformed_data = []

    for pid, middle_levels in data.items():
        pid = pid.lower()
        for middle_key, sub_changes in middle_levels.items():
            for index, event in enumerate(sub_changes):
                if isinstance(event, dict):
                    row = {'pid': pid,'host':middle_key,'label': str(index),
                        'mode': event.get('mode'),
                        'timestamp': event.get('timestamp')}
                    transformed_data.append(row)

    df = pd.DataFrame(transformed_data)
    df = df[['pid', 'host', 'label', 'mode', 'timestamp']]
    df.to_csv(output_file, index=False)

In [None]:
firebase_to_csv_log_mode('/ringer_mode_events','ringer_mode_events.csv')
ringer_mode_events_data = pd.read_csv('/content/ringer_mode_events.csv')
print("The shape of the ringer_mode_events_data is " + str(ringer_mode_events_data.shape))
ringer_mode_events_data.to_csv(google_drive_data_path + '/ringer_mode_events.csv', index=False)

In [None]:
# ringer_mode_events_data0['label'] = ringer_mode_events_data0.groupby('pid').cumcount() + 1
# ringer_mode_events_data0

## Device info

In [None]:
# this function works for device and timezone infor
def firebase_to_csv(ref_path, output_file):
    # Get reference to device_info
    ref = db.reference(ref_path)
    data = ref.get()

    transformed_data = []

    # Transform the nested structure
    for pid, pid_data in data.items():
        pid = pid.lower()

        if isinstance(pid_data, str):
            # Case where timezone is directly under pid
            row = {
                'pid': pid,
                'host': pd.NA,  # or None
                'timezone': pid_data
            }
            transformed_data.append(row)
        else:
            # Case where pid has host-timezone pairs
            for host, timezone in pid_data.items():
                row = {
                    'pid': pid,
                    'host': host,
                    'timezone': timezone
                }
                transformed_data.append(row)

    # Convert to DataFrame
    df = pd.DataFrame(transformed_data)

    # Ensure pid and host are the first columns
    cols = ['pid', 'host'] + [col for col in df.columns if col not in ['pid', 'host']]
    df = df[cols]

    # Save to CSV
    df.to_csv(output_file, index=False)

In [None]:
firebase_to_csv('/device_info','device_info.csv')
device_data = pd.read_csv('/content/device_info.csv')
print("The shape of the device_data0 is " + str(device_data.shape))
device_data.to_csv(google_drive_data_path + '/device_info.csv', index=False)

## Timezone info

In [None]:
firebase_to_csv('/timezones','timezones.csv')
timezones_data = pd.read_csv('/content/timezones.csv')
print("The shape of the timezones_data is " + str(timezones_data.shape))
timezones_data.to_csv(google_drive_data_path + '/timezones.csv', index=False)

## Counters

We don't actually need top analyze the counter data

In [None]:
# def firebase_to_csv_counters(ref_path, output_file):
#     ref = db.reference(ref_path)
#     data = ref.get()
#     transformed_data = []
#     for pid, value in data.items():
#         pid = pid.lower()
#         # Case 1: only numeric
#         if isinstance(value, int):
#             transformed_data.append({'pid': pid, 'screen_event_count': value})

#         # Case 2: dictionary => selecting only value
#         elif isinstance(value, dict):
#             numeric_values = [v for v in value.values() if isinstance(v, int)]
#             if numeric_values:
#                 total = sum(numeric_values)
#                 transformed_data.append({'pid': pid, 'screen_event_count': total})

#     df = pd.DataFrame(transformed_data)
#     df = df[['pid', 'screen_event_count']]
#     df.to_csv(output_file, index=False)


# firebase_to_csv_counters('/ringer_event_counters','ringer_event_counters.csv')
# ringer_event_counters_data0 = pd.read_csv('/content/ringer_event_counters.csv')
# print(ringer_event_counters_data0.head())

# firebase_to_csv_counters('/screen_event_counters','screen_event_counters.csv')
# screen_event_counters_data0 = pd.read_csv('/content/screen_event_counters.csv')
# print(screen_event_counters_data0.head())

# firebase_to_csv_counters('/settings_change_counters','settings_change_counters.csv')
# settings_change_counters_data0 = pd.read_csv('/content/settings_change_counters.csv')
# print(settings_change_counters_data0.head())

# Qualtrics data

## Sign-up survey

### Read and save the raw data

In [None]:
import requests
import zipfile
import json
import time
import io

def get_qualtrics_data(api_token, survey_id):
    # API configurations
    base_url = f"https://pdx1.qualtrics.com/API/v3/surveys/{survey_id}/export-responses"
    headers = {
        "X-API-TOKEN": api_token,
        "Content-Type": "application/json"
    }

    # Start export
    export_payload = '{"format":"csv"}'
    export_response = requests.post(base_url, data=export_payload, headers=headers)
    export_progress_id = export_response.json()["result"]["progressId"]

    # Check export progress
    while True:
        progress_response = requests.get(
            f"{base_url}/{export_progress_id}",
            headers=headers
        )
        progress_status = progress_response.json()["result"]["status"]

        if progress_status == "complete":
            file_id = progress_response.json()["result"]["fileId"]
            break
        time.sleep(2)

    # Download file
    download_response = requests.get(
        f"{base_url}/{file_id}/file",
        headers=headers
    )

    # Extract zip file
    with zipfile.ZipFile(io.BytesIO(download_response.content)) as zip_file:
        return zip_file.read(zip_file.namelist()[0]).decode('utf-8')


In [None]:
# sign_up_survey_pilot = "SV_dgN8IwiCIfglbAq"

sign_up_survey = "SV_3RiDob4GtY8kCSG"
api_token = "U5xGlZmJv76LsjIXvfwB7FS9RqrqwmMb3vva3pbD"

In [None]:
# 10S
import pandas as pd

signup_survey_data = get_qualtrics_data(api_token, sign_up_survey) # remove _pilot if running on the real data
# Then save to CSV
with open('signup_survey_data.csv', 'w') as f:
    f.write(signup_survey_data)
    f.close()

signup_survey_data = pd.read_csv('signup_survey_data.csv')
print("The shape of the signup_survey_data is " + str(signup_survey_data.shape))
signup_survey_data.to_csv(google_drive_data_path + '/signup_survey_data_raw.csv', index=False)

The shape of the signup_survey_data is (526, 55)


### Process the data

Get city from location

In [None]:
signup_data0 = pd.read_csv(google_drive_data_path + '/signup_survey_data_raw.csv')
signup_data = signup_data0.copy()
#drop row 2 & 3
signup_data = signup_data.drop([0, 1])

In [None]:
# install the package if missing
!pip install reverse_geocoder

Collecting reverse_geocoder
  Downloading reverse_geocoder-1.5.1.tar.gz (2.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m1.9/2.2 MB[0m [31m27.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: reverse_geocoder
  Building wheel for reverse_geocoder (setup.py) ... [?25l[?25hdone
  Created wheel for reverse_geocoder: filename=reverse_geocoder-1.5.1-py3-none-any.whl size=2268068 sha256=4c59ab0f953297f138c7078a93fd37221708695939f9a598b768980b7cfffd9d
  Stored in directory: /root/.cache/pip/wheels/bd/e5/88/

In [None]:
import reverse_geocoder as rg

def get_country(lat, lon):
    try:
        result = rg.search((lat, lon))
        if result:
            return result[0]['cc']  # returns country code
        return 'Unknown'
    except:
        return 'Unknown'

In [None]:
# get country
signup_data['Country'] = signup_data.apply(
    lambda row: get_country(row['LocationLatitude'], row['LocationLongitude']),
    axis=1
)

Loading formatted geocoded file...


In [None]:
signup_data.shape

(524, 56)

In [None]:
# prompt: in signup_data, drop rows where Q19 is empty

# Drop rows where 'Q19' is empty
signup_data1 = signup_data.dropna(subset=['Q19'])
signup_data1.shape

(462, 56)

In [None]:
# prompt: in signup_data, create a new column "valid" with default value 1, use .loc[]
signup_data1.loc[:, 'valid'] = 1

# prompt: in signup_data, let "valid" be 2 if Country is not US

signup_data1.loc[signup_data1['Country'] != 'US', 'valid'] = 2

# Create a new column 'location_count' that shows how many times each lat/long pair appears
signup_data1['location_count'] = signup_data1.groupby(['LocationLatitude', 'LocationLongitude'])['LocationLatitude'].transform('count')

# prompt: in signup_data1, let "valid" be 3 if location_count is not 1

signup_data1.loc[signup_data1['location_count'] != 1, 'valid'] = 3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  signup_data1.loc[:, 'valid'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  signup_data1['location_count'] = signup_data1.groupby(['LocationLatitude', 'LocationLongitude'])['LocationLatitude'].transform('count')


In [None]:
# prompt: in signup_data1, if there are rows with the same value in Q19, keep only the first one, delete the duplicated.

# Remove duplicate rows based on 'Q19', keeping the first occurrence
signup_data1 = signup_data1.drop_duplicates(subset=['Q19'], keep='first')


In [None]:

# prompt: in signup_data1, let "valid" be 5 where Q_RecaptchaScore < 0.8 or Q_RelevantIDDuplicate = TRUE

signup_data1.loc[(signup_data1['Q_RecaptchaScore'].astype(float) < 0.8) | (signup_data1['Q_RelevantIDDuplicate'] == "TRUE"), 'valid'] = 5

# prompt: in signup_data1, let "valid" be 5 where Q_RelevantIDFraudScore is float and greater than 0

signup_data1.loc[(signup_data1['Q_RelevantIDFraudScore'].astype(float) > 0), 'valid'] = 5

In [None]:
# prompt: in signup_data1, let "valid" be 1 if the value in the column Q23 contains "jan", "rana", or "wechat", either upper or lower case. I am sure Q23 exits

signup_data1.loc[
    signup_data1['Q23'].str.contains('jan|rana|wechat', case=False, na=False), 'valid'] = 1

    # prompt: in signup_data1, let "valid" be 4 if the value in the column Q23 contains "Facebook", "reddit", or "flyer", either upper or lower case. I am sure Q23 exits

signup_data1.loc[
    signup_data1['Q23'].str.contains('Facebook|reddit|flyer', case=False, na=False), 'valid'] = 4

In [None]:
signup_data1.to_csv(google_drive_data_path + '/signup_survey_data_labeled.csv', index=False)

In [None]:
# First filter for valid rows
signup_data2 = signup_data1[signup_data1['valid'] == 1]

# Create a dictionary from the CSV mapping
column_mapping = {
    'Q3': 'Age',
    'Q5': 'Brand',
    'Q5_5_TEXT': 'Brand_text',
    'Q6': 'Android_version',
    'Q11': 'Screen_time',
    'Q13': 'Gender',
    'Q13_4_TEXT': 'Gender_text',
    'Q14': 'Education',
    'Q15_1': 'Race_white',
    'Q15_2': 'Race_black',
    'Q15_3': 'Race_native',
    'Q15_4': 'Race_Asian',
    'Q15_5': 'Race_Pacific',
    'Q15_6': 'Race_others',
    'Q15_7': 'Race_text',
    'Q16_1': 'Employment_retired',
    'Q16_2': 'Employment_self-employed',
    'Q16_3': 'Employment_employment full-time',
    'Q16_4': 'Employment_employment part-time',
    'Q16_5': 'Employment_voluntary worker',
    'Q16_6': 'Employment_homemaker',
    'Q16_7': 'Employment_student',
    'Q16_8': 'Employment_others',
    'Q16_4_TEXT': 'Employment_hours_text',
    'Q16_8_TEXT': 'Employment_text',
    'Q18': 'Name',
    'Q19': 'Email',
    'Q23': 'Source'
}

# Rename and keep only the specified columns
signup_data2 = signup_data2[column_mapping.keys()].rename(columns=column_mapping)

In [None]:
signup_data2.to_csv(google_drive_data_path + '/signup_survey_data_valid.csv', index=False)

In [None]:
def process_signup_data(df):
   # Create a copy
   df = df.copy()

   # Define mappings
   mappings = {
       'Brand': {'1': 'Samsung', '3': 'Xiaomi', '4': 'Google'},
       'Gender': {'1': 'female', '2': 'male', '3': 'non-binary'},
       'Education': {
           '1': 'some high school or less',
           '2': 'high school or GED',
           '3': 'some college no degree',
           '4': 'associate or technical degree',
           '5': 'bachelor',
           '6': 'graduate or professional',
           '7': 'prefer not to say'
       }
   }

   # Apply mappings with fallback to text columns
   df['Brand'] = df.apply(lambda x: mappings['Brand'].get(x['Brand'], x['Brand_text']), axis=1)
   df['Gender'] = df.apply(lambda x: mappings['Gender'].get(x['Gender'], x['Gender_text']), axis=1)
   df['Education'] = df['Education'].map(mappings['Education'])

   # Combine race columns
   race_cols = ['white', 'black', 'native', 'Asian', 'Pacific', 'others']
   df['Race'] = df.apply(lambda x: ', '.join(
       [race for race, col in zip(race_cols, ['Race_' + r for r in race_cols])
        if x[col] == '1'] +
       ([x['Race_text']] if pd.notna(x['Race_text']) and x['Race_text'] else [])), axis=1)

   # Combine employment columns
   emp_mappings = {
       'Employment_retired': 'retired',
       'Employment_self-employed': 'self-employed',
       'Employment_employment full-time': 'full-time',
       'Employment_employment part-time': 'part-time',
       'Employment_voluntary worker': 'voluntary worker',
       'Employment_homemaker': 'homemaker',
       'Employment_student': 'student',
       'Employment_others': 'others'
   }

   df['Employment'] = df.apply(lambda x: ', '.join(
       [val for col, val in emp_mappings.items() if x[col] == '1'] +
       ([x['Employment_text']] if pd.notna(x['Employment_text']) and x['Employment_text'] else [])), axis=1)

   # Drop original columns and rename
   cols_to_drop = (['Brand_text', 'Gender_text', 'Race_text', 'Employment_text'] +
                   ['Race_' + r for r in race_cols] +
                   list(emp_mappings.keys()))
   df = df.drop(cols_to_drop, axis=1)
   df = df.rename(columns={'Employment_hours_text': 'Part_time_hours'})

   # Reorder columns
   columns_order = [
       'Age', 'Gender', 'Education', 'Race', 'Employment', 'Part_time_hours',
       'Brand', 'Android_version', 'Screen_time',
       'Name', 'Email', 'Source'
   ]
   return df[columns_order]

signup_data3 = process_signup_data(signup_data2)

In [None]:
signup_data3.to_csv(google_drive_data_path + '/signup_survey_data_processed.csv', index=False)

## Consent form

In [None]:
consert_form_survey = "SV_1Y79vGshtWh9FPM"
consent_form_data = get_qualtrics_data(api_token, consert_form_survey)

with open('consent_form_data.csv', 'w') as f:
    f.write(consent_form_data)
    f.close()

consent_form_data = pd.read_csv('consent_form_data.csv')
print("The shape of the consent_form_data is " + str(consent_form_data.shape))
consent_form_data.to_csv(google_drive_data_path + '/consent_form_data.csv', index=False)

# Backup

Unused code. Don't run this cell unless you know what you are doing.

In [None]:
# Ringer mode changes, with mixed data structure. Hopefully we don't need this

# def firebase_to_csv_log_mode(ref_path, output_file):
#     ref = db.reference(ref_path)
#     data = ref.get()
#     transformed_data = []

#     for pid, middle_levels in data.items():
#         pid = pid.lower()
#         for middle_key, sub_changes in middle_levels.items():
#             # process: checking the structure of sub_data (dic or list)
#             if isinstance(sub_changes, dict):
#                 for sub_key, event in sub_changes.items():
#                     if isinstance(event, dict):
#                         row = {'pid': pid, 'host':middle_levels, 'label': sub_key,
#                             'mode': event.get('mode'),
#                             'timestamp': event.get('timestamp')}
#                         transformed_data.append(row)
#             elif isinstance(sub_changes, list):  # sub_changes -> list
#                 for index, event in enumerate(sub_changes):
#                     if isinstance(event, dict):
#                         row = {'pid': pid,'label': str(index),
#                             'mode': event.get('mode'),
#                             'timestamp': event.get('timestamp')}
#                         transformed_data.append(row)

#     df = pd.DataFrame(transformed_data)
#     df = df[['pid', 'label', 'mode', 'timestamp']]
#     df.to_csv(output_file, index=False)

In [None]:
# def get_city_from_coords(lat, lon):
#     result = rg.search((lat, lon))
#     if result:
#         return f"{result[0]['name']}, {result[0]['admin1']}"
#     return "Unknown location"

In [None]:
# from geopy.geocoders import Nominatim
# import time

# def get_location_info(lat, lon):
#     try:
#         geolocator = Nominatim(user_agent="my_app")
#         location = geolocator.reverse((lat, lon))
#         if location:
#             address = location.raw['address']
#             state = address.get('state', 'Unknown')
#             country = address.get('country', 'Unknown')
#             return pd.Series([state, country])
#         return pd.Series(['Unknown', 'Unknown'])
#     except:
#         return pd.Series(['Unknown', 'Unknown'])
#     finally:
#         # Add a small delay to respect rate limits
#         time.sleep(1)

In [None]:
# #calculate city
# signup_data['City'] = signup_data.apply(lambda row: get_city_from_coords(row['LocationLatitude'], row['LocationLongitude']), axis=1)