<a href="https://colab.research.google.com/github/jzheng23/colab/blob/main/ScreenTime_data_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook can perform the following operations in order:

1. Import data from Firebase and Qualtrics directly through their APIs
2. Save the data frames as temporary CSV files
3. Open the temporary CSV files and save them to Google Drive

# Preparation

Mount Google Drive and set up file path

In [66]:
# Mount Google Drive at /content/drive; the path variables defined in the
# following cells point below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load the Google Drive path, depending on who is running the notebook (run only the cell for the current user)

In [67]:
# Jian: Drive folders for the exported data and for the Firebase key file.
# NOTE: the "#Subin" cell assigns the same two variables, so run only the cell
# for the current user (whichever cell runs last wins).
google_drive_data_path = '/content/drive/MyDrive/Problematic smartphone usage/Ambient display/Data'
google_drive_key_path = '/content/drive/MyDrive/Problematic smartphone usage/Ambient display/Key'

In [None]:
# Subin: data and key file share one Drive folder.
# NOTE: this cell overwrites the paths set in the "#Jian" cell, so run only the
# cell for the current user (whichever cell runs last wins).
google_drive_data_path = '/content/drive/MyDrive/UMD_research/Problematic_Smartphone_Usage'
google_drive_key_path = '/content/drive/MyDrive/UMD_research/Problematic_Smartphone_Usage'

# Firebase database

In [68]:
import firebase_admin
from firebase_admin import credentials, db
import csv
import pandas as pd

In [None]:
# Initialise the Firebase Admin SDK once per kernel.
# initialize_app() raises ValueError when the default app already exists, so
# look the app up first; this makes the cell safe to re-run.
try:
    firebase_admin.get_app()
except ValueError:
    # No default app yet: authenticate with the service-account key stored in
    # the Drive key folder and bind to the project's Realtime Database.
    cred = credentials.Certificate(google_drive_key_path+'/timer-42ad2-firebase-adminsdk-4r7oj-2c373565f2.json')
    firebase_admin.initialize_app(cred, {
        'databaseURL': 'https://timer-42ad2-default-rtdb.firebaseio.com'
    })

In [70]:
#Convert Unix timestamp to date/datetime in specified timezone
def convert_unix_to_date(unix_timestamp, timezone='America/New_York', return_date=True):
    """Convert a Unix timestamp in milliseconds to local time.

    The timestamp is parsed as UTC and then converted to ``timezone``.

    Parameters:
        unix_timestamp: Unix timestamp in milliseconds
        timezone: Name of the target timezone (default 'America/New_York')
        return_date: If True return only the calendar date, otherwise the
            full timezone-aware datetime

    Returns:
        For a scalar input, the local ``date()`` of the timestamp when
        ``return_date`` is True, else the timezone-aware pandas timestamp.
    """
    utc_moment = pd.to_datetime(unix_timestamp, unit='ms', utc=True)
    local_moment = utc_moment.tz_convert(timezone)
    if return_date:
        return local_moment.date()
    return local_moment

## Response data

In [71]:
def process_responses(responses, pid, host, survey_id):
    """Flatten one survey's answers into a single wide-format record.

    Parameters:
        responses: Answers as a dict (question id -> answer) or as a list
            (position -> answer). Any other type contributes no answers.
        pid: Participant id stored under 'pid'.
        host: Host identifier stored under 'host'.
        survey_id: Survey identifier stored under 'surveyID'.

    Returns:
        A one-element list holding a dict with the three identifier keys
        followed by one 'q_<question id>' key per answer. ``None`` entries
        are skipped for list input but kept for dict input.
    """
    record = {'pid': pid, 'host': host, 'surveyID': survey_id}

    if isinstance(responses, dict):
        record.update({f'q_{qid}': value for qid, value in responses.items()})
    elif isinstance(responses, list):
        # List positions double as question ids; empty slots are dropped.
        record.update(
            {f'q_{position}': value
             for position, value in enumerate(responses)
             if value is not None}
        )

    return [record]

def get_timestamp(data):
    """Return the survey timestamp stored under 'a' or, failing that, 'timestamp'.

    A missing or falsy 'a' value falls back to 'timestamp'. Anything that is
    not a dict yields None.
    """
    if not isinstance(data, dict):
        return None
    short_form = data.get('a')
    return short_form if short_form else data.get('timestamp')

def get_responses(data):
    """Return the answers held by a survey node.

    A list is returned unchanged. For a dict, the value under 'b' is used,
    falling back to 'responses' when 'b' is missing or falsy. Any other type
    yields None.
    """
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        short_form = data.get('b')
        return short_form if short_form else data.get('responses')
    return None

def is_host_level(data):
    """Check if this dictionary represents a host level.

    A host level is a dict with at least one child dict that looks like a
    survey node, i.e. one carrying a timestamp or responses key. Both key
    spellings read by get_timestamp/get_responses are recognised: the long
    form ('timestamp', 'responses') and the short form ('a', 'b').
    Previously only the long form was checked, so hosts whose surveys use
    the short keys were mistaken for surveys.

    Parameters:
        data: Candidate node taken from under a participant id.

    Returns:
        True if any child dict contains one of the survey keys, else False
        (also for non-dict input).
    """
    if not isinstance(data, dict):
        return False

    survey_keys = ('a', 'b', 'timestamp', 'responses')
    return any(
        isinstance(child, dict) and any(key in child for key in survey_keys)
        for child in data.values()
    )

In [72]:
def responses_to_csvs(meta_file, responses_file):
    """Export the '/responses' Firebase node to two CSV files.

    Three layouts are handled under each participant id (pid):
      * a list of surveys (list index becomes the surveyID, host is NA),
      * a dict of hosts, each holding a dict of surveys,
      * a dict of surveys without a host level (host is NA).

    Parameters:
        meta_file: Path of the CSV receiving one row per survey with the
            columns pid, host, surveyID, timestamp.
        responses_file: Path of the CSV receiving one wide-format row per
            survey that has answers: pid, host, surveyID, then the q_<n>
            columns sorted numerically.

    Both files are always written; with no data they contain only the
    header row instead of raising.
    """
    # Treat a missing/empty node (None) as "no participants".
    data = db.reference('/responses').get() or {}

    meta_data = []
    responses_data = []

    def _record_survey(pid, host, survey_id, survey_data):
        """Append the metadata row and, if present, the answers of one survey."""
        meta_data.append({
            'pid': pid,
            'host': host,
            'surveyID': survey_id,
            'timestamp': get_timestamp(survey_data)
        })
        responses = get_responses(survey_data)
        if responses:
            responses_data.extend(process_responses(responses, pid, host, survey_id))

    for pid, pid_data in data.items():
        pid = pid.lower()
        if not pid_data:
            continue

        if isinstance(pid_data, list):
            # Surveys stored as a list: the index is the survey id.
            for survey_id, survey_data in enumerate(pid_data):
                if survey_data:
                    _record_survey(pid, pd.NA, str(survey_id), survey_data)
        elif isinstance(pid_data, dict):
            for key1, value1 in pid_data.items():
                if not isinstance(value1, dict):
                    continue
                if is_host_level(value1):
                    # key1 is a host; its children are the surveys.
                    for survey_id, survey_data in value1.items():
                        if isinstance(survey_data, dict):
                            _record_survey(pid, key1, survey_id, survey_data)
                else:
                    # key1 is itself a survey (no host level).
                    _record_survey(pid, pd.NA, key1, value1)
        # Any other type under a pid has no known layout and is skipped.

    # Passing the columns explicitly fixes their order and keeps the header
    # when there are no rows.
    meta_df = pd.DataFrame(meta_data, columns=['pid', 'host', 'surveyID', 'timestamp'])
    meta_df.to_csv(meta_file, index=False)

    # Create responses DataFrame in wide format
    responses_df = pd.DataFrame(responses_data)

    first_cols = ['pid', 'host', 'surveyID']

    # Sort question columns by the number after 'q_' (q_2 before q_10).
    q_cols = sorted(
        (col for col in responses_df.columns if col.startswith('q_')),
        key=lambda col: int(col.split('_')[1])
    )

    # reindex (rather than [] selection) also works for an empty frame.
    responses_df = responses_df.reindex(columns=first_cols + q_cols)
    responses_df.to_csv(responses_file, index=False)

In [89]:
# Export the survey tables to temporary CSV files, read them back as a sanity
# check, and copy them to Google Drive.
# The same path is used for writing and reading, so the cell does not depend
# on the working directory being /content.
survey_meta_file = 'survey_meta.csv'
survey_responses_file = 'survey_responses.csv'
responses_to_csvs(survey_meta_file, survey_responses_file)

survey_meta_data = pd.read_csv(survey_meta_file)
print("The shape of the survey_meta_data is " + str(survey_meta_data.shape))
survey_meta_data.to_csv(google_drive_data_path + '/survey_meta.csv', index=False)
survey_responses_data = pd.read_csv(survey_responses_file)
print("The shape of the survey_responses_data is " + str(survey_responses_data.shape))
survey_responses_data.to_csv(google_drive_data_path + '/survey_responses.csv', index=False)

The shape of the survey_meta_data is (18, 4)
The shape of the survey_responses_data is (18, 31)


*TODO* still cannot read the host

In the real data there should always be a host

The commented-out version below assumes that every participant has a host level. It should work for the real data, but it has not been tested yet.

Select all the lines and use Ctrl + / to uncomment

In [74]:
# def process_responses(responses, pid, host, survey_id):
#     responses_data = []
#     response_dict = {
#         'pid': pid,
#         'host': host,
#         'surveyID': survey_id
#     }

#     if isinstance(responses, dict):
#         for question_id, answer in responses.items():
#             response_dict[f'q_{question_id}'] = answer
#     elif isinstance(responses, list):
#         for question_id, answer in enumerate(responses):
#             if answer is not None:
#                 response_dict[f'q_{question_id}'] = answer

#     responses_data.append(response_dict)
#     return responses_data

# def get_timestamp(data):
#     if isinstance(data, dict):
#         return data.get('a') or data.get('timestamp')
#     return None

# def get_responses(data):
#     if isinstance(data, dict):
#         return data.get('b') or data.get('responses')
#     elif isinstance(data, list):
#         return data
#     return None

# def responses_to_csvs(meta_file, responses_file):
#     ref = db.reference('/responses')
#     data = ref.get()

#     meta_data = []
#     responses_data = []

#     for pid, pid_data in data.items():
#         if not pid_data:
#             continue

#         # Process each host under pid
#         for host, host_data in pid_data.items():
#             # Process each survey under host
#             for survey_id, survey_data in host_data.items():
#                 timestamp = get_timestamp(survey_data)

#                 meta_data.append({
#                     'pid': pid,
#                     'host': host,
#                     'surveyID': survey_id,
#                     'timestamp': timestamp
#                 })

#                 responses = get_responses(survey_data)
#                 if responses:
#                     responses_data.extend(process_responses(responses, pid, host, survey_id))

#     # Create and save metadata DataFrame
#     meta_df = pd.DataFrame(meta_data)
#     meta_df = meta_df[['pid', 'host', 'surveyID', 'timestamp']]  # ensure column order
#     meta_df.to_csv(meta_file, index=False)

#     # Create responses DataFrame in wide format
#     responses_df = pd.DataFrame(responses_data)

#     # Ensure the first columns are in the correct order
#     first_cols = ['pid', 'host', 'surveyID']

#     # Get question columns and sort them numerically
#     q_cols = [col for col in responses_df.columns if col.startswith('q_')]
#     q_cols.sort(key=lambda x: int(x.split('_')[1]))

#     # Combine columns in correct order
#     responses_df = responses_df[first_cols + q_cols]

#     responses_df.to_csv(responses_file, index=False)

## Screen events

In [75]:
def firebase_to_csv_log_Screen (ref_path, output_file):
    """Export per-participant screen events from Firebase to a CSV file.

    Parameters:
        ref_path: Database path whose children are participant ids, each
            holding a list of event dicts with 'startTime' and 'endTime'.
        output_file: Path of the CSV to write. Columns: pid, eventLabel
            (index of the event in the participant's list), startTime,
            endTime.

    Participants whose events are not a list, and list entries that are not
    dicts (e.g. None gaps), are skipped. With no data the CSV contains only
    the header row instead of raising.
    """
    columns = ['pid', 'eventLabel', 'startTime', 'endTime']

    # Treat a missing/empty node (None) as "no participants".
    data = db.reference(ref_path).get() or {}

    transformed_data = []
    for pid, events in data.items():
        pid = pid.lower()
        if not isinstance(events, list):
            continue
        for index, event in enumerate(events):
            if isinstance(event, dict):
                transformed_data.append({
                    'pid': pid,
                    'eventLabel': index,
                    'startTime': event.get('startTime'),
                    'endTime': event.get('endTime')
                })

    # Explicit columns keep the header (and its order) even with zero rows.
    df = pd.DataFrame(transformed_data, columns=columns)
    df.to_csv(output_file, index=False)

In [90]:
# Export screen events to a temporary CSV, read it back as a sanity check, and
# copy it to Google Drive. One path is used for both writing and reading so the
# cell does not depend on the working directory being /content.
screen_events_file = 'screen_events.csv'
firebase_to_csv_log_Screen('/screen_events', screen_events_file)
screen_events_data = pd.read_csv(screen_events_file)
print("The shape of the screen_events_data is " + str(screen_events_data.shape))
screen_events_data.to_csv(google_drive_data_path + '/screen_events.csv', index=False)

The shape of the screen_events_data is(1304, 4)


## Setting changes

In [77]:
def firebase_to_csv_log_setting (ref_path, output_file):
    """Export per-participant settings changes from Firebase to a CSV file.

    Parameters:
        ref_path: Database path whose children are participant ids, each
            holding a list of change dicts with 'newValue', 'setting' and
            'timestamp'.
        output_file: Path of the CSV to write. Columns: pid, newValue,
            setting, timestamp.

    The pid is lower-cased, as in the other exporters in this notebook, so
    the tables can be joined on pid. Participants whose changes are not a
    list, and entries that are not dicts, are skipped. With no data the CSV
    contains only the header row instead of raising.
    """
    # NOTE(review): the list index was previously stored as 'eventLabel' but
    # never exported; the four-column output is kept unchanged here.
    columns = ['pid', 'newValue', 'setting', 'timestamp']

    # Treat a missing/empty node (None) as "no participants".
    data = db.reference(ref_path).get() or {}

    transformed_data = []
    for pid, changes in data.items():
        pid = pid.lower()
        if not isinstance(changes, list):
            continue
        for change in changes:
            if isinstance(change, dict):
                transformed_data.append({
                    'pid': pid,
                    'newValue': change.get('newValue'),
                    'setting': change.get('setting'),
                    'timestamp': change.get('timestamp')
                })

    # Explicit columns keep the header (and its order) even with zero rows.
    df = pd.DataFrame(transformed_data, columns=columns)
    df.to_csv(output_file, index=False)

In [91]:
# Export settings changes to a temporary CSV, read it back as a sanity check,
# and copy it to Google Drive. One path is used for both writing and reading so
# the cell does not depend on the working directory being /content.
settings_change_logs_file = 'settings_change_logs.csv'
firebase_to_csv_log_setting('/settings_change_logs', settings_change_logs_file)
settings_change_logs_data = pd.read_csv(settings_change_logs_file)
print("The shape of the settings_change_logs_data is " + str(settings_change_logs_data.shape))
settings_change_logs_data.to_csv(google_drive_data_path + '/settings_change_logs.csv', index=False)

The shape of the settings_change_logs_data is (21, 4)


## Ringer

In [79]:
def firebase_to_csv_log_mode(ref_path, output_file):
    """Export ringer-mode events from Firebase to a CSV file.

    Expected layout: participant id -> host key -> list of event dicts with
    'mode' and 'timestamp'.

    Parameters:
        ref_path: Database path of the ringer-mode events.
        output_file: Path of the CSV to write. Columns: pid, host, label
            (list index of the event, as a string), mode, timestamp.

    Nodes that do not match the expected layout are skipped instead of
    raising, and with no data the CSV contains only the header row.
    """
    columns = ['pid', 'host', 'label', 'mode', 'timestamp']

    # Treat a missing/empty node (None) as "no participants".
    data = db.reference(ref_path).get() or {}

    transformed_data = []
    for pid, middle_levels in data.items():
        pid = pid.lower()
        if not isinstance(middle_levels, dict):
            continue
        for middle_key, sub_changes in middle_levels.items():
            # Only list containers are exported; a dict container never
            # produced rows in the previous implementation either.
            if not isinstance(sub_changes, list):
                continue
            for index, event in enumerate(sub_changes):
                if isinstance(event, dict):
                    transformed_data.append({
                        'pid': pid,
                        'host': middle_key,
                        'label': str(index),
                        'mode': event.get('mode'),
                        'timestamp': event.get('timestamp')
                    })

    # Explicit columns keep the header (and its order) even with zero rows.
    df = pd.DataFrame(transformed_data, columns=columns)
    df.to_csv(output_file, index=False)

In [80]:
# Export ringer-mode events to a temporary CSV, read it back as a sanity check,
# and copy it to Google Drive. One path is used for both writing and reading so
# the cell does not depend on the working directory being /content.
ringer_mode_events_file = 'ringer_mode_events.csv'
firebase_to_csv_log_mode('/ringer_mode_events', ringer_mode_events_file)
ringer_mode_events_data = pd.read_csv(ringer_mode_events_file)
print("The shape of the ringer_mode_events_data is " + str(ringer_mode_events_data.shape))
ringer_mode_events_data.to_csv(google_drive_data_path + '/ringer_mode_events.csv', index=False)

In [81]:
# ringer_mode_events_data0['label'] = ringer_mode_events_data0.groupby('pid').cumcount() + 1
# ringer_mode_events_data0

## Device info

In [82]:
# This function is used for both the device info and the timezone info nodes.
def firebase_to_csv(ref_path, output_file):
    """Export a pid -> value or pid -> {host: value} Firebase node to CSV.

    Parameters:
        ref_path: Database path to read (the notebook calls this with
            '/device_info' and '/timezones').
        output_file: Path of the CSV to write. Columns: pid, host, timezone.
            The value column is named 'timezone' for every node.

    For each participant id (lower-cased):
      * a string value becomes one row with host set to NA;
      * a dict becomes one row per (host, value) pair.
    Any other value type is skipped instead of raising, and with no data the
    CSV contains only the header row.
    """
    columns = ['pid', 'host', 'timezone']

    # Treat a missing/empty node (None) as "no participants".
    data = db.reference(ref_path).get() or {}

    transformed_data = []
    for pid, pid_data in data.items():
        pid = pid.lower()

        if isinstance(pid_data, str):
            # Value stored directly under the pid (no host level).
            transformed_data.append({
                'pid': pid,
                'host': pd.NA,
                'timezone': pid_data
            })
        elif isinstance(pid_data, dict):
            # One value per host.
            for host, timezone in pid_data.items():
                transformed_data.append({
                    'pid': pid,
                    'host': host,
                    'timezone': timezone
                })

    # Every row has exactly these three keys, so explicit columns give the
    # same order as before and keep the header with zero rows.
    df = pd.DataFrame(transformed_data, columns=columns)
    df.to_csv(output_file, index=False)

In [83]:
# Export device info to a temporary CSV, read it back as a sanity check, and
# copy it to Google Drive. One path is used for both writing and reading so the
# cell does not depend on the working directory being /content.
device_info_file = 'device_info.csv'
firebase_to_csv('/device_info', device_info_file)
device_data = pd.read_csv(device_info_file)
# The message now names the variable that actually exists (device_data).
print("The shape of the device_data is " + str(device_data.shape))
device_data.to_csv(google_drive_data_path + '/device_info.csv', index=False)

## Timezone info

In [84]:
# Export timezone info to a temporary CSV, read it back as a sanity check, and
# copy it to Google Drive. One path is used for both writing and reading so the
# cell does not depend on the working directory being /content.
timezones_file = 'timezones.csv'
firebase_to_csv('/timezones', timezones_file)
timezones_data = pd.read_csv(timezones_file)
print("The shape of the timezones_data is " + str(timezones_data.shape))
# The Drive copy keeps its existing name (timezones_data.csv), which differs
# from the temporary file name.
timezones_data.to_csv(google_drive_data_path + '/timezones_data.csv', index=False)

## Counters

We don't actually need to analyze the counter data

In [None]:
# def firebase_to_csv_counters(ref_path, output_file):
#     ref = db.reference(ref_path)
#     data = ref.get()
#     transformed_data = []
#     for pid, value in data.items():
#         pid = pid.lower()
#         # Case 1: only numeric
#         if isinstance(value, int):
#             transformed_data.append({'pid': pid, 'screen_event_count': value})

#         # Case 2: dictionary => selecting only value
#         elif isinstance(value, dict):
#             numeric_values = [v for v in value.values() if isinstance(v, int)]
#             if numeric_values:
#                 total = sum(numeric_values)
#                 transformed_data.append({'pid': pid, 'screen_event_count': total})

#     df = pd.DataFrame(transformed_data)
#     df = df[['pid', 'screen_event_count']]
#     df.to_csv(output_file, index=False)


# firebase_to_csv_counters('/ringer_event_counters','ringer_event_counters.csv')
# ringer_event_counters_data0 = pd.read_csv('/content/ringer_event_counters.csv')
# print(ringer_event_counters_data0.head())

# firebase_to_csv_counters('/screen_event_counters','screen_event_counters.csv')
# screen_event_counters_data0 = pd.read_csv('/content/screen_event_counters.csv')
# print(screen_event_counters_data0.head())

# firebase_to_csv_counters('/settings_change_counters','settings_change_counters.csv')
# settings_change_counters_data0 = pd.read_csv('/content/settings_change_counters.csv')
# print(settings_change_counters_data0.head())

# Qualtrics data

## Sign-up survey

In [85]:
import requests
import zipfile
import json
import time
import io

def get_qualtrics_data(api_token, survey_id, data_center="pdx1", poll_interval=2, max_wait=600):
    """Download all responses of a Qualtrics survey as CSV text.

    Starts a response export, polls its progress until it finishes, then
    downloads the resulting zip archive and returns its first member.

    Parameters:
        api_token: Qualtrics API token sent in the X-API-TOKEN header.
        survey_id: Id of the survey to export (e.g. "SV_...").
        data_center: Qualtrics data-center id used as the host prefix
            (default "pdx1", the value that was previously hard-coded).
        poll_interval: Seconds to sleep between progress checks.
        max_wait: Maximum number of seconds to wait for the export.

    Returns:
        The CSV content decoded as UTF-8.

    Raises:
        requests.HTTPError: If any API call returns an error status.
        RuntimeError: If Qualtrics reports the export as failed.
        TimeoutError: If the export does not complete within ``max_wait``.
    """
    request_timeout = 60  # seconds per HTTP call, so a stalled connection cannot hang the cell

    # API configurations
    base_url = f"https://{data_center}.qualtrics.com/API/v3/surveys/{survey_id}/export-responses"
    headers = {
        "X-API-TOKEN": api_token,
        "Content-Type": "application/json"
    }

    # Start export
    export_payload = '{"format":"csv"}'
    export_response = requests.post(
        base_url, data=export_payload, headers=headers, timeout=request_timeout
    )
    export_response.raise_for_status()
    export_progress_id = export_response.json()["result"]["progressId"]

    # Poll until the export completes, fails, or the deadline passes.
    # Previously only "complete" ended the loop, so a failed export made it
    # spin forever.
    deadline = time.monotonic() + max_wait
    while True:
        progress_response = requests.get(
            f"{base_url}/{export_progress_id}",
            headers=headers,
            timeout=request_timeout
        )
        progress_response.raise_for_status()
        progress_result = progress_response.json()["result"]
        progress_status = progress_result["status"]

        if progress_status == "complete":
            file_id = progress_result["fileId"]
            break
        if progress_status == "failed":
            raise RuntimeError(f"Qualtrics export failed for survey {survey_id}")
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Qualtrics export for survey {survey_id} did not complete within {max_wait} seconds"
            )
        time.sleep(poll_interval)

    # Download file
    download_response = requests.get(
        f"{base_url}/{file_id}/file",
        headers=headers,
        timeout=request_timeout
    )
    download_response.raise_for_status()

    # The download is a zip archive; return its first member as text.
    with zipfile.ZipFile(io.BytesIO(download_response.content)) as zip_file:
        return zip_file.read(zip_file.namelist()[0]).decode('utf-8')


In [86]:
import getpass
import os

sign_up_survey = "SV_dgN8IwiCIfglbAq"

# SECURITY: never store the Qualtrics API token in the notebook. It is read
# from the QUALTRICS_API_TOKEN environment variable, with an interactive
# prompt (input hidden) as fallback.
# The token that used to be hard-coded here has been exposed through the
# notebook history and should be revoked and regenerated in Qualtrics.
api_token = os.environ.get("QUALTRICS_API_TOKEN") or getpass.getpass("Qualtrics API token: ")

In [87]:
# Download the sign-up survey export as CSV text. It gets its own name so that
# signup_survey_data only ever refers to the DataFrame.
signup_survey_csv_text = get_qualtrics_data(api_token, sign_up_survey)

# Save the raw export to a temporary CSV. The `with` block closes the file, so
# no explicit close() is needed. The encoding is explicit (the text was decoded
# as UTF-8), and newline='' writes the export's line endings unchanged.
with open('signup_survey_data.csv', 'w', encoding='utf-8', newline='') as f:
    f.write(signup_survey_csv_text)

signup_survey_data = pd.read_csv('signup_survey_data.csv')
print("The shape of the signup_survey_data is " + str(signup_survey_data.shape))
signup_survey_data.to_csv(google_drive_data_path + '/signup_survey_data.csv', index=False)

## Consent form

In [88]:
# NOTE(review): "consert" looks like a typo for "consent"; the variable name is
# kept in case other cells refer to it.
consert_form_survey = "SV_1Y79vGshtWh9FPM"

# Download the consent form export as CSV text. It gets its own name so that
# consent_form_data only ever refers to the DataFrame.
consent_form_csv_text = get_qualtrics_data(api_token, consert_form_survey)

# Save the raw export to a temporary CSV. The `with` block closes the file, so
# no explicit close() is needed. The encoding is explicit (the text was decoded
# as UTF-8), and newline='' writes the export's line endings unchanged.
with open('consent_form_data.csv', 'w', encoding='utf-8', newline='') as f:
    f.write(consent_form_csv_text)

consent_form_data = pd.read_csv('consent_form_data.csv')
print("The shape of the consent_form_data is " + str(consent_form_data.shape))
consent_form_data.to_csv(google_drive_data_path + '/consent_form_data.csv', index=False)

# Backup

Unused code. Don't run this cell unless you know what you are doing.

In [61]:
# Ringer mode changes, with mixed data structure. Hopefully we don't need this

# def firebase_to_csv_log_mode(ref_path, output_file):
#     ref = db.reference(ref_path)
#     data = ref.get()
#     transformed_data = []

#     for pid, middle_levels in data.items():
#         pid = pid.lower()
#         for middle_key, sub_changes in middle_levels.items():
#             # process: checking the structure of sub_data (dic or list)
#             if isinstance(sub_changes, dict):
#                 for sub_key, event in sub_changes.items():
#                     if isinstance(event, dict):
#                         row = {'pid': pid, 'host':middle_levels, 'label': sub_key,
#                             'mode': event.get('mode'),
#                             'timestamp': event.get('timestamp')}
#                         transformed_data.append(row)
#             elif isinstance(sub_changes, list):  # sub_changes -> list
#                 for index, event in enumerate(sub_changes):
#                     if isinstance(event, dict):
#                         row = {'pid': pid,'label': str(index),
#                             'mode': event.get('mode'),
#                             'timestamp': event.get('timestamp')}
#                         transformed_data.append(row)

#     df = pd.DataFrame(transformed_data)
#     df = df[['pid', 'label', 'mode', 'timestamp']]
#     df.to_csv(output_file, index=False)