In [None]:
## If using Google colab
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

In [None]:
import pandas as pd
import io
import json
import numpy as np

In [None]:
# Project-local settings module; only its project_path attribute is used in this notebook.
import scrape_project_definitions as spd
# Root directory that the input and output paths below are built from.
project_path = spd.project_path

In [None]:
# Input workbook: the batch results file that pd.read_excel loads below.
input_filename='1_Batch_4329804_batch_results_V2.xlsx'
read_path = f"{project_path}/2_hits__club_lists/{input_filename}"
# Presumably the number of worker assignments collected per HIT -- TODO confirm;
# the copy_id checks further down expect exactly three copies per group.
hit_redundancy = 3

# Output workbook: where the labelled frame is written by to_excel at the end of the notebook.
output_filename = 'blast2_uni_directories_mod.xlsx'
write_path = f"{project_path}/3_hits__club_lists_labeled/{output_filename}"

## Read in the MTurk results dataframe and do preliminary reformatting and unstacking of the data

In [None]:
# Load the batch results workbook found at read_path into a dataframe.
df1=pd.read_excel(read_path)

# Present basic profiling information on df1
def inspect_dataframe_nulls(df1: pd.DataFrame) -> dict:
    """Summarise the missing values of a dataframe, column by column.

    Args:
        df1: The dataframe to profile.

    Returns:
        A dict keyed by column name, containing only the columns that hold at
        least one null. Each value is a dict with ``'null_count'`` (number of
        null cells in the column) and ``'null_percent'`` (that count as a
        percentage of the number of rows). An empty dict means no nulls at all.
    """
    row_total = len(df1)
    null_tally = df1.isnull().sum()

    # Keeping only the positive counts means the division below is never
    # reached for a frame with zero rows.
    return {
        column: {
            'null_count': n_missing,
            'null_percent': 100 * n_missing / row_total,
        }
        for column, n_missing in null_tally[null_tally > 0].items()
    }

# Profile the nulls of the freshly loaded frame; the bare name on the last line
# makes the notebook display the resulting dict.
df1_nulls = inspect_dataframe_nulls(df1)
df1_nulls

In [None]:
# AssignmentId is the row granularity: every remaining row must carry its own id.
# Rows without an AssignmentId are dropped *before* the uniqueness check, so that
# missing ids cannot influence the check (a null would otherwise be counted as
# one more "unique" value, or trip the assertion before the drop could run).
df1 = df1.dropna(subset=['AssignmentId'])
assert df1['AssignmentId'].is_unique, "AssignmentId must identify exactly one row"

In [None]:
# Swap every remaining null for the 'N/A' sentinel so that later comparisons and
# string operations never meet a missing value, then verify nothing was left behind.
df1 = df1.fillna(value='N/A')
df1_nulls = inspect_dataframe_nulls(df1)
assert not df1_nulls

In [None]:
# Keep only the columns needed for the coming analysis.
general_project_columns = ['Title', 'Description', 'MaxAssignments', 'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds']
hit_columns = [
    'HITId',
    'AssignmentId',
    # 'NumberOfSimilarHITs', # I actually don't know what this is
    'WorkerId',
]
# Columns are picked up by prefix: 'Input.*' and 'Answer.*'.
input_columns = [col for col in df1.columns if col.startswith('Input.')]
answer_columns = [col for col in df1.columns if col.startswith('Answer.')]

# Take an explicit copy: df2 gets new columns assigned later on, and assigning
# into a bare column subset of df1 triggers pandas' SettingWithCopyWarning.
df2 = df1[hit_columns + input_columns + answer_columns].copy()

assert inspect_dataframe_nulls(df2) == {}

In [None]:
# Label the redundant submissions of each HIT with copy_id = 1, 2, ..., hit_redundancy.
# Rows that share the same values in every Input.* column belong to the same HIT, so the
# resulting "primary key" is (the Input.* columns, e.g. Input.university_name and
# Input.city, plus copy_id).
#
# cumcount() numbers the rows of each group 0, 1, 2, ... in their existing order and
# returns a Series aligned with df2's index. That replaces the earlier
# groupby(...).apply(...) approach, which depended on the grouping columns being passed
# into each group (deprecated in pandas 2.2) and needed group_keys=False only to stop
# apply from prepending the group keys to the result's index.
# assign() returns a new frame, so this cell can be re-run safely.
df2 = df2.assign(copy_id=df2.groupby(input_columns).cumcount() + 1)

# The largest label must be exactly the configured redundancy.
assert df2['copy_id'].max() == hit_redundancy, (
    f"Max copy_id is {df2['copy_id'].max()}, expected {hit_redundancy}"
)

# Report (without raising) every group that did not receive all of its copies.
# Labels are contiguous from 1, so a group is complete exactly when its size matches.
for group_name, group in df2.groupby(input_columns):
    if len(group) != hit_redundancy:
        print(
            f"Incomplete group {group_name}: found copy_ids "
            f"{group['copy_id'].tolist()}, expected 1..{hit_redundancy}"
        )
        print(f"Group Indexes: {group.index.tolist()}")  # Index labels of the offending rows
        print(group)  # Show the problematic group for debugging


In [None]:
# Display the (rows, columns) shape of df2 after the copy_id labelling.
df2.shape

## Selecting the winning data and making flags for you to manually check

In [None]:
# Idea: confirm that the domain of each directory link is what you expect. How? University websites are all formatted differently, and some use external domains for their club directories.
# You must be able to quickly and automatically reject bad HITs. Someone took advantage of you, and it was the same worker each time. They could do this because you are basically approving everyone.
## Make a worker dataframe.
# For each worker:
# (1) Count how many HITs they did.
# (2) Confirm they did not use the same link on more than one submission. Flag them if so: you will reject all of their HITs and block them.
# (3) For each HIT done by the worker, check the initial rating of the HIT (using my simple if/else method). Sum the ratings of their HITs, compare the sum to the maximum possible rating, and assign the result to the worker as a trustworthiness rating. Use this rating to re-weight the redundant HITs and see whether it forces a different automatic outcome. Is this a recursive thing? It is reminiscent of network eigencentrality ratings.
# (4) Assemble a global MTurk trustworthiness database based on all my projects: blacklisted Turks, starred Turks.

# You should include this information in the output that you write and manually check later. It makes it easier to trace back and find out what happened; tracing through everything is possible, but slow and difficult.
# In addition to what you already have, write the worker ID, the cumulative input info you gave them from the beginning of the process until now, and the link from the original spreadsheet where the HITs were stored (? -- not sure what this last item is).

# Also, you should make this work for an arbitrary level of redundancy.

In [None]:
redundancy = 3  # the consensus rules below are written for exactly three answers per HIT
item = 'Answer.student_org_url'
anchor = '.'  # an answer only counts as a usable value when it contains this substring


def pick_consensus_winner(first, second, third, anchor):
    """Choose the winning value among three redundant answers to the same question.

    An answer is "usable" when it contains ``anchor``. The rules are:

    * all three equal: the shared value if usable, otherwise ``'N/A'``;
      flagged ``'All good, triple consensus'`` in both cases.
    * all three different: no usable answer gives ``'N/A'`` / ``'All good, zero value'``;
      exactly one gives that answer / ``'Checkit, minority'``; two or three give the
      usable answers joined by ``','`` in worker order / ``'Checkit, multiple emails'``.
    * two equal and one odd: both usable gives ``"pair,odd"`` /
      ``'Checkit, multiple emails'``; only the pair usable gives the pair value /
      ``'All good, double consensus'``; only the odd one usable gives the odd value /
      ``'Checkit, minority'``; neither gives ``'N/A'`` / ``'All good, zero value'``.

    Args:
        first, second, third: The three answers, as strings, in worker (copy_id) order.
        anchor: Substring that marks an answer as usable.

    Returns:
        A ``(winner, flag)`` tuple of strings.
    """
    answers = [first, second, third]
    distinct = list(dict.fromkeys(answers))  # de-duplicate while keeping worker order
    usable = [value for value in distinct if anchor in value]

    # Case 1: all three are the same.
    if len(distinct) == 1:
        return (first if usable else 'N/A'), 'All good, triple consensus'

    # Case 2: all three are different.
    if len(distinct) == 3:
        if not usable:
            return 'N/A', 'All good, zero value'
        if len(usable) == 1:
            return usable[0], 'Checkit, minority'
        return ",".join(usable), 'Checkit, multiple emails'

    # Case 3: two agree and one is the odd one out.
    majority = max(distinct, key=answers.count)
    odd_one_out = next(value for value in distinct if value != majority)
    majority_usable = anchor in majority
    odd_usable = anchor in odd_one_out
    if majority_usable and odd_usable:
        return ",".join([majority, odd_one_out]), 'Checkit, multiple emails'
    if majority_usable:
        return majority, 'All good, double consensus'
    if odd_usable:
        return odd_one_out, 'Checkit, minority'
    return 'N/A', 'All good, zero value'


# df2 arrives here in long format: one row per assignment, labelled with copy_id.
# Comparing the three answers of a HIT requires them side by side, so pivot to one
# row per HIT with MultiIndex columns (original column, copy_id). Indexing a long-format
# row with row[item][1] would pick the second *character* of the answer string, not the
# answer of worker 1.
# NOTE: df2 is replaced by the wide frame at the end of this cell, so re-run the
# copy_id cell before re-running this one.
answers_wide = (
    df2.set_index(input_columns + ['copy_id'])
    .unstack('copy_id')
    .fillna('N/A')  # HITs with fewer than `redundancy` submissions get the sentinel
)

# One column per copy_id for the question being judged; values are coerced to str so
# that the `anchor in value` test never meets a non-string.
item_answers = (
    answers_wide[item]
    .reindex(columns=range(1, redundancy + 1), fill_value='N/A')
    .astype(str)
)

verdicts = [
    pick_consensus_winner(first, second, third, anchor)
    for first, second, third in item_answers.itertuples(index=False, name=None)
]

# Results live under their own top-level column group, next to the unstacked answers.
answers_wide[('results', 'winner_' + item)] = [winner for winner, _ in verdicts]
answers_wide[('results', 'flag_' + item)] = [flag for _, flag in verdicts]
answers_wide[('results', 'list_' + item)] = ''

df2 = answers_wide.fillna('N/A')


In [None]:
# Preview the first rows of df2 before exporting.
df2.head()

## Export the dataframe

In [None]:
# Write df2 (including its index, the to_excel default) to the workbook at write_path.
df2.to_excel(write_path)