In [5]:
# import packages
import os
import glob
import numpy as np
import pandas as pd
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from firebase_admin import auth

### Add user input
Fill in the variables in the cell below with the following information

- `experiment` = name of experiment (outermost collection) in Firebase
- `local_folder` = name of folder you want created locally and populated with data
- `credential_path` = path to .json Firebase SDK service account key (explained [here](https://firebase.google.com/docs/admin/setup))
- `firebase_project_name` = name of project in Firebase (click Project Overview in the Firebase console to see name)
- `groups` = array of group names (as set by userGroup in src/utils.js and found as collections in the 'subject' document in your Firebase), each enclosed in single quotes and separated by commas
- `rating_types` = array of rating types, each enclosed in single quotes and separated by commas (same as ratingTypes from src/utils.js)
- `movie_names` = array of movie names, as written in Firebase stimuli table (no spaces), each enclosed in single quotes and separated by commas
- `vid_lens` = dictionary of video durations, where each entry contains the movie name as key and the movie duration (in seconds) as value (e.g. {'movie1': 100, 'movie2': 150, 'movie3': 120}) 


In [6]:
# USER VARIABLES (FILL IN HERE)
experiment = 'wasabi-online'
local_folder = '/Users/jasondavis/Documents/MATLAB/wasabi-online'
credential_path = '/Users/jasondavis/Documents/Downloads/continuous-rater-jad-firebase-adminsdk-2ie9g-0080d92b8a.json'
firebase_project_name = 'continuous-rater-jad'
groups = ['mTurk Group'] # possible to only have on group, or can use multiple for organizational purposes

rating_types = ['pleasant', 'unpleasant', 'calm', 'aroused', 'funny', 'happy', 'angry', 'sad', 'disgusted', 'afraid', 'suprised']
movie_names = ['KungFuryPart1', 'KungFuryPart2']
vid_lens = {'KungFuryPart1': 931, 'KungFuryPart2': 931} # example values, fill in with your own

### Set up communication with Firebase

In [7]:
cred = credentials.Certificate(credential_path)
firebase_admin.initialize_app(cred, {
  'projectId': firebase_project_name,
})

db = firestore.client()

### Initialize data structures and directories

In [8]:
# initialize data structures
subject_rating_dict = {}
movie_q_count_dict = {}
sub_to_mov = {}
mov_to_sub = {}

# initialize directory structure
directories = []
directories.append(f'{local_folder}/')
directories.append(f'{local_folder}/Long/')
directories.append(f'{local_folder}/Blanks/')
directories.append(f'{local_folder}/Subjects/')
directories.append(f'{local_folder}/Subjects/Incomplete/')
directories.append(f'{local_folder}/Subjects/Groups/')
directories.append(f'{local_folder}/Summary/')
directories.append(f'{local_folder}/Ratings/')
for movie in movie_names:
    for rating in rating_types:
        combo = movie + "-" + rating
        movie_q_count_dict[combo] = 0
        mov_to_sub[combo] = []
        directories.append(f'{local_folder}/Ratings/{combo}/')
        
for directory in directories:
    if not os.path.isdir(directory):
        os.mkdir(directory) 

### Create blank pandas dataframes of proper shape to fill in with rating data

In [9]:
# use provided movie durations to create blank dataframe of proper length for each movie
blanks_path = f'{local_folder}/Blanks/'
for movie in movie_names:
    curr_vid_len = vid_lens[movie] + 3 # add buffer time
    init_ratings = np.full(curr_vid_len, -1)
    curr_df = pd.DataFrame({'rating': init_ratings})
    curr_df.to_csv(os.path.join(blanks_path, f'{movie}.csv')) # write out to file

In [11]:
# read blank dataframes back in and store in dictionary
blanks_path = f'{local_folder}/Blanks/'
blank_pd_dict = {}
directory_list = glob.glob(os.path.join(blanks_path, '*.csv'))
for file in directory_list:
    curr_df = pd.read_csv(file)
    drop_df = curr_df.drop(labels='Unnamed: 0', axis=1)
    movie = os.path.basename(file).split('.')[0]
    blank_pd_dict[movie] = drop_df

### Determine which subjects completed task, and save subject info to .csv files

In [12]:
# create good and bad subject lists (based on completion)
good_id_master_list = []
bad_id_master_list = []

for group in groups:
    good_id_list = [] # stores participants who completed HIT
    bad_id_list = [] # stores participants who started but didn't complete HIT
    sub_list = []

    group_path = f'{experiment}/subjects/{group}'
    group_collection = db.collection(group_path)
    group_subs = group_collection.stream()

    for sub in group_subs:
        sub_dict = sub.to_dict()
        if 'currentState' in sub_dict: # check to see if subject even started HIT
            if sub_dict['currentState'] == 'debrief' or 'HIT_complete' in sub_dict: # only keep subs who finished
                good_id_list.append(sub.id)
                good_id_master_list.append(sub.id)
                curr = pd.Series(sub_dict)
                sub_list.append(curr)
                file_path = f'{local_folder}/Subjects/{sub.id}.csv'
                curr.to_csv(file_path)
        else:
            bad_id_list.append(sub.id)
            bad_id_master_list.append(sub.id)
            curr = pd.Series(sub_dict)
            file_path = f'{local_folder}/Subjects/Incomplete/{sub.id}.csv'
            curr.to_csv(file_path)

    group_df = pd.DataFrame(sub_list)
    group_df.to_csv(f'{local_folder}/Subjects/Groups/{group}.csv')

I0000 00:00:1730915693.768898  483025 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
I0000 00:00:1730915693.793506  483025 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


### Get all ratings from subjects who completed task and store locally

In [13]:
# loops over subjects and gets ratings from database to store in local dictionary
# also counts number of subjects that have rated each question
# links subjects to movies and movies to subjects
good_id_set = set(good_id_master_list) # removes repeats

for good_id in good_id_set: 
    sub_to_mov[good_id] = []
    movie_list = []
    collection_path = f'{experiment}/ratings/{good_id}'
    curr_sub_ratings = db.collection(collection_path)
    HITs = curr_sub_ratings.stream()

    curr_movie_dict = {}
    for HIT in HITs:
        sub_to_mov[good_id].append(HIT.id)
        mov_to_sub[HIT.id].append(good_id)
        curr_movie_dict[HIT.id] = HIT.to_dict()
        movie_q_count_dict[HIT.id] += 1

    if good_id in subject_rating_dict:
        updated = subject_rating_dict[good_id].append(curr_movie_dict)
        subject_rating_dict[good_id] = updated
    else:
        movie_list.append(curr_movie_dict)
        subject_rating_dict[good_id] = movie_list

movie_counts = pd.Series(movie_q_count_dict)
movie_counts.to_csv(f'{local_folder}/Summary/Movie_Counts.csv')

### Write out rating dictionary to .csv rating files

In [14]:
# writes file containing rating-timestamp pairs for each subject to folder for each movie-rating pairing
for good_id in good_id_set:
    curr_sub = subject_rating_dict[good_id]
    for dictionary in curr_sub:
        for movie_rating in dictionary:
            words = movie_rating.split('-')
            blank_mov_pd = blank_pd_dict[words[0]].copy()
            rating_dict = dictionary[movie_rating]
            for timestamp in rating_dict:
                blank_mov_pd.iloc[int(timestamp)] = rating_dict[timestamp]  
            file_path = f'{local_folder}/Ratings/{movie_rating}/{good_id}.csv'
            blank_mov_pd.to_csv(file_path)      

### Combine subject and rating info together to create long format file

In [15]:
# make the base frame for appending
cols = ['workerId', 'movie', 'ratingType', 'HIT_complete', 'age', 'assignmentId', 'birth', 'consentStatus', 'currentState', \
'ethnicity', 'feedback', 'handed', 'hitId', 'nativeLang', 'race', 'sex', 'startTime', \
'userId', 'mostRecentTime', 'timeStamp', 'ratingScore']

base_frame = pd.DataFrame(np.nan, index=[0], columns=cols)

In [16]:
# this take individual local csvs and turns them into long format per movie per rating per subject
dir_list = glob.glob(f'{local_folder}/Ratings/*')
master_long = base_frame.copy()

for directory in dir_list:
    path = directory + '/*.csv'
    rating_list = glob.glob(path)
    for file in rating_list:
        sub_id = os.path.basename(file).split('.')[0]
        movie_rating = file.split('/')[2]
        movie = movie_rating.split('-')[0]
        rating = movie_rating.split('-')[0]       

        # this should check and not rewrite files that already exist (speeds up process)
        if not os.path.isfile(f'{local_folder}/Long/{movie}-{rating}-{sub_id}.csv'):
            subject_long = base_frame.copy()
            rating_pd = pd.read_csv(file)
            sub_path = f'{local_folder}/Subjects/{sub_id}.csv'
            if os.path.isfile(sub_path):
                sub_pd = pd.read_csv(sub_path)
                title_list = sub_pd['Unnamed: 0'].values
                rename_dict = {}
                counter = 0
                for title in title_list:
                    rename_dict[counter] = title
                    counter += 1   
                new_sub_pd = sub_pd.transpose().rename(columns=rename_dict).drop(['Unnamed: 0'])

                new_pd = base_frame.copy()
                new_pd['movie'] = movie
                new_pd['ratingType'] = rating

                for category in base_frame:
                    if category in new_sub_pd:
                        new_pd[category] = new_sub_pd[category].values

                timestamp_dict = rating_pd.transpose().drop(['Unnamed: 0'])
                copy_pd = new_pd.copy()
                prevScore = -1
                rating_counter = 0
                for timestamp in timestamp_dict:
                    ratingScore = timestamp_dict[timestamp].values
                    if ratingScore != -1:
                        prevScore = ratingScore
                    else:
                        ratingScore = prevScore

                    copy_pd['timeStamp'] = timestamp
                    copy_pd['ratingScore'] = ratingScore
                    subject_long = pd.concat([subject_long, copy_pd], ignore_index=True)

                subject_long = subject_long.drop([0])
                subject_long.to_csv(f'{local_folder}/Long/{movie}-{rating}-{sub_id}.csv')                   

In [17]:
# this appends all individual long format files into one giant long format panda
master_long = base_frame.copy()
long_list = glob.glob(f'{local_folder}/Long/*.csv')

for file in long_list:
    curr_pd = pd.read_csv(file)
    master_long = pd.concat([master_long, curr_pd])

master_long.drop([0])
master_long.to_csv(f'{local_folder}/master_long.csv')

In [18]:
# read in saved copy
master_long = pd.read_csv(f'{local_folder}/master_long.csv')
master_long.drop(labels=[0], inplace=True)
master_long.drop(labels=['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)
sub_list = master_long['workerId'].unique()
movie_list = master_long['movie'].unique()
rating_list = master_long['ratingType'].unique()
movie = movie_list[0]
rating = rating_list[0]

In [19]:
master_long

Unnamed: 0,workerId,movie,ratingType,HIT_complete,age,assignmentId,birth,consentStatus,currentState,ethnicity,...,handed,hitId,nativeLang,race,sex,startTime,userId,mostRecentTime,timeStamp,ratingScore
1,936000037.0,jasondavis,jasondavis,2024-10-28 17:08:08.215000+00:00,33.0,936000037.0,greece,signed,complete,not_hispanic,...,right,936000037.0,greek,['White / Caucasian'],male,2024-10-28 17:08:08.215000+00:00,HBDOOqo5RWe5pUSBW9h7e69EboI3,,0.0,50.0
2,936000037.0,jasondavis,jasondavis,2024-10-28 17:08:08.215000+00:00,33.0,936000037.0,greece,signed,complete,not_hispanic,...,right,936000037.0,greek,['White / Caucasian'],male,2024-10-28 17:08:08.215000+00:00,HBDOOqo5RWe5pUSBW9h7e69EboI3,,1.0,50.0
3,936000037.0,jasondavis,jasondavis,2024-10-28 17:08:08.215000+00:00,33.0,936000037.0,greece,signed,complete,not_hispanic,...,right,936000037.0,greek,['White / Caucasian'],male,2024-10-28 17:08:08.215000+00:00,HBDOOqo5RWe5pUSBW9h7e69EboI3,,2.0,50.0
4,936000037.0,jasondavis,jasondavis,2024-10-28 17:08:08.215000+00:00,33.0,936000037.0,greece,signed,complete,not_hispanic,...,right,936000037.0,greek,['White / Caucasian'],male,2024-10-28 17:08:08.215000+00:00,HBDOOqo5RWe5pUSBW9h7e69EboI3,,3.0,50.0
5,936000037.0,jasondavis,jasondavis,2024-10-28 17:08:08.215000+00:00,33.0,936000037.0,greece,signed,complete,not_hispanic,...,right,936000037.0,greek,['White / Caucasian'],male,2024-10-28 17:08:08.215000+00:00,HBDOOqo5RWe5pUSBW9h7e69EboI3,,4.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18676,477182049.0,jasondavis,jasondavis,2024-10-26 16:37:31.553000+00:00,42.0,477182049.0,Italy,signed,complete,not_hispanic,...,right,477182049.0,Italian,['White / Caucasian'],male,2024-10-26 16:37:31.553000+00:00,JmTcZqgMJse6BwwYmY8P03yOh853,,929.0,74.0
18677,477182049.0,jasondavis,jasondavis,2024-10-26 16:37:31.553000+00:00,42.0,477182049.0,Italy,signed,complete,not_hispanic,...,right,477182049.0,Italian,['White / Caucasian'],male,2024-10-26 16:37:31.553000+00:00,JmTcZqgMJse6BwwYmY8P03yOh853,,930.0,74.0
18678,477182049.0,jasondavis,jasondavis,2024-10-26 16:37:31.553000+00:00,42.0,477182049.0,Italy,signed,complete,not_hispanic,...,right,477182049.0,Italian,['White / Caucasian'],male,2024-10-26 16:37:31.553000+00:00,JmTcZqgMJse6BwwYmY8P03yOh853,,931.0,74.0
18679,477182049.0,jasondavis,jasondavis,2024-10-26 16:37:31.553000+00:00,42.0,477182049.0,Italy,signed,complete,not_hispanic,...,right,477182049.0,Italian,['White / Caucasian'],male,2024-10-26 16:37:31.553000+00:00,JmTcZqgMJse6BwwYmY8P03yOh853,,932.0,74.0
