# MTurk annotations

In [1]:
import os
from random import randint
from random import sample

import numpy as np
import pandas as pd

In [2]:
def read_results_file(dir_file,city):
    """Load one MTurk results file and return its per-label answers.

    The file is parsed as whitespace-delimited text. Columns ``Answer.A1`` to
    ``Answer.A12`` are renamed to the twelve labels (in the order of
    ``labels`` below) and the original ``annotation`` column, which is
    treated as a "/"-separated URL, is kept as ``annotation_url``.

    Parameters
    ----------
    dir_file : str
        Path of the results file to read.
    city : str
        Value written to the ``city`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``annotation`` (the seventh "/"-separated piece of the URL),
        the twelve label columns, and ``city``, in that order.
    """
    #Defining labels
    labels=['Dangerous', 'Dirty', 'Pretty', 'Preserved', 'Accessible', 'Interesting','Picturesque', 'Wealthy', 'Quiet', 'Polluted', 'Pleasant', 'Happy']
    #Reading original file; sep=r"\s+" is the documented replacement for the
    #deprecated delim_whitespace=True and parses the file the same way
    df_mturk=pd.read_csv(dir_file,sep=r"\s+")
    #Renaming columns: Answer.Ax with actual label and the annotation to annotation_url
    columns_to_rename={"Answer.A"+str(i):x for i,x in enumerate(labels,start=1)}
    columns_to_rename["annotation"]="annotation_url"
    df_mturk=df_mturk.rename(columns=columns_to_rename)
    #Getting the image name: piece number 6 (0-based) of the URL split on "/"
    #NOTE(review): this assumes every URL has the same depth - confirm
    df_mturk["annotation"]=df_mturk["annotation_url"].str.split("/",expand=True)[6]
    #Returning the ordered dataframe
    df_mturk["city"]=city
    return df_mturk[["annotation"]+labels+["city"]]
    

In [3]:
# Folder holding the raw MTurk result files (absolute, machine-specific path).
_mturk_results_dir="/Users/lemr/Documents/EPFL/Tercer_Semestre/COM-405_Optional_project_in_communication_systems/datasets/MTurk_annotations/mturk-results/"
# One results file per city code: gc, sc and lc.
dir_mturk_gc=_mturk_results_dir+"hit-gc.results"
dir_mturk_sc=_mturk_results_dir+"hit-sc.results"
dir_mturk_lc=_mturk_results_dir+"hit-lc.results"

In [4]:
# Load each city's results file; every row is tagged with its city code.
df_mturk_gc,df_mturk_sc,df_mturk_lc=[
    read_results_file(results_path,city_code)
    for results_path,city_code in (
        (dir_mturk_gc,"gc"),
        (dir_mturk_sc,"sc"),
        (dir_mturk_lc,"lc"),
    )
]

## Getting individual annotations by label

In [6]:
def get_DataFrame_label(df,label,image_column,num_raters):
    """Build the per-image rating table for one label and save it as CSV.

    Rows of ``df`` are grouped by ``image_column``; the ``label`` values of
    each group are spread over columns ``rater_1``, ``rater_2``, ... in row
    order, keeping at most the first ``num_raters`` values per image.

    The table (without the median) is written to
    ``datasets/MTurk_annotations/mturk_gto_hits_<label>.csv``; the folder is
    created when it does not exist. The mean and standard deviation of the
    per-image medians are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per individual rating; must contain ``image_column`` and
        ``label``.
    label : str
        Name of the column holding the ratings to extract.
    image_column : str
        Name of the column identifying the rated image.
    num_raters : int
        Maximum number of rating columns to keep per image.

    Returns
    -------
    pandas.DataFrame
        ``image_column``, the ``rater_*`` columns and a ``median`` column with
        the row-wise median of the rater columns.
    """
    out_dir="datasets/MTurk_annotations"
    out_file=out_dir+"/mturk_gto_hits_"+str(label)+".csv"
    #Grouping by the images; selecting the label column before apply avoids
    #the deprecated groupby.apply over the grouping column itself
    rate_lists=df.groupby(image_column)[label].apply(list)
    #Getting each rating in a different column, keeping the first num_raters
    ratings=pd.DataFrame(rate_lists.tolist(),index=rate_lists.index).iloc[:, :num_raters]
    #Renaming the columns
    ratings.columns=["rater_"+str(i+1) for i in range(ratings.shape[1])]
    df_label=ratings.reset_index()
    #The csv is written before the median is added, so it only holds raw ratings
    os.makedirs(out_dir,exist_ok=True)
    df_label.to_csv(out_file,index=False)
    print("csv file generated in: "+out_file)
    df_label["median"]=df_label.drop(image_column,axis=1).median(axis=1)
    print("\n")
    print("Value counts for median:")
    print("mean: "+str(round(df_label["median"].mean(),2)))
    print("std: "+str(round(df_label["median"].std(),2)))
    print("\n")

    return df_label

In [7]:
# Stack the three per-city frames into one, renumbering rows from 0.
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
df_all_cities=pd.concat([df_mturk_gc,df_mturk_sc,df_mturk_lc],ignore_index=True)

In [7]:
def _ratings_for(label_name):
    """Run get_DataFrame_label on df_all_cities for one label, with num_raters=10."""
    return get_DataFrame_label(df=df_all_cities,label=label_name,image_column="annotation",num_raters=10)

# One rating table per label, generated in the same order as the label list.
df_dangerous=_ratings_for("Dangerous")
df_dirty=_ratings_for("Dirty")
df_pretty=_ratings_for("Pretty")
df_preserved=_ratings_for("Preserved")
df_accessible=_ratings_for("Accessible")
df_interesting=_ratings_for("Interesting")
df_picturesque=_ratings_for("Picturesque")
df_wealthy=_ratings_for("Wealthy")
df_quiet=_ratings_for("Quiet")
df_polluted=_ratings_for("Polluted")
df_pleasant=_ratings_for("Pleasant")
df_happy=_ratings_for("Happy")

csv file generated in: datasets/MTurk_annotations/mturk_gto_hits_Dangerous.csv


Value counts for median:
mean: 2.98
std: 0.99


csv file generated in: datasets/MTurk_annotations/mturk_gto_hits_Dirty.csv


Value counts for median:
mean: 3.16
std: 1.06


csv file generated in: datasets/MTurk_annotations/mturk_gto_hits_Pretty.csv


Value counts for median:
mean: 3.11
std: 1.02


csv file generated in: datasets/MTurk_annotations/mturk_gto_hits_Preserved.csv


Value counts for median:
mean: 3.84
std: 1.05


csv file generated in: datasets/MTurk_annotations/mturk_gto_hits_Accessible.csv


Value counts for median:
mean: 4.69
std: 0.94


csv file generated in: datasets/MTurk_annotations/mturk_gto_hits_Interesting.csv


Value counts for median:
mean: 3.84
std: 0.94


csv file generated in: datasets/MTurk_annotations/mturk_gto_hits_Picturesque.csv


Value counts for median:
mean: 3.09
std: 1.06


csv file generated in: datasets/MTurk_annotations/mturk_gto_hits_Wealthy.csv


Value counts for med

### Getting random (5) annotations by label

We will only consider the labels that were used in the CECYTE's crowdsourcing experiment:
* Dangerous
* Dirty
* Pretty
* Interesting
* Polluted
* Pleasant

To do this, we take 5 random annotations for each image and repeat the draw 10 times, in order to get an average, so that the MTurk results can be compared with CECYTE's results (which only have 5 raters).

In [178]:
def get_random_raters(df,num_raters,num_annotations,num_samples,image_column,label):
    """Write ``num_samples`` CSV files, each with a random subset of rater columns.

    For every sample, ``num_raters`` distinct numbers are drawn from
    ``1..num_annotations`` and the matching ``rater_<k>`` columns of ``df``
    are saved together with ``image_column`` to
    ``datasets/MTurk_annotations/<label>/mturk_gto_hits_<label>_<i>.csv``
    (``i`` starting at 1). The folder is created when it does not exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``image_column`` and columns ``rater_1`` to
        ``rater_<num_annotations>``.
    num_raters : int
        Number of distinct rater columns to keep in each file.
    num_annotations : int
        Number of rater columns available to draw from.
    num_samples : int
        Number of CSV files to generate.
    image_column : str
        Name of the column identifying the image.
    label : str
        Label name, used in the output folder and file names.

    Raises
    ------
    ValueError
        If ``num_raters`` is negative or larger than ``num_annotations``.
    """
    if not 0<=num_raters<=num_annotations:
        raise ValueError("num_raters must be between 0 and num_annotations")
    out_dir="datasets/MTurk_annotations/"+str(label)+"/"
    os.makedirs(out_dir,exist_ok=True)
    for i in range(num_samples):
        ########################
        #Getting random raters:#
        ########################
        #sample() draws without replacement, so each file always gets exactly
        #num_raters distinct raters (a bounded retry loop could fall short)
        picked=sample(range(1,num_annotations+1),num_raters)
        raters=["rater_"+str(rater_id) for rater_id in picked]
        ##########################################################
        #Getting a new csv file for each set of the random raters#
        ##########################################################
        df[[image_column]+raters].to_csv(out_dir+"mturk_gto_hits_"+str(label)+"_"+str(i+1)+".csv"
                                        ,index=False)
    #Every sample now reaches the requested number of raters by construction
    print("Number of raters reached correctly\n")
    print("csv files generated at: "+out_dir)
    print("Done")

In [179]:
# Dangerous: 10 files, each with 5 of the 10 rater columns
get_random_raters(df=df_dangerous,num_raters=5,num_annotations=10,num_samples=10,
                  image_column="annotation",label="Dangerous")

Number of raters reached correctly

csv files generated at: datasets/MTurk_annotations/Dangerous/
Done


In [180]:
# Dirty: 10 files, each with 5 of the 10 rater columns
get_random_raters(df=df_dirty,num_raters=5,num_annotations=10,num_samples=10,
                  image_column="annotation",label="Dirty")

Number of raters reached correctly

csv files generated at: datasets/MTurk_annotations/Dirty/
Done


In [183]:
# Pretty: 10 files, each with 5 of the 10 rater columns
get_random_raters(df=df_pretty,num_raters=5,num_annotations=10,num_samples=10,
                  image_column="annotation",label="Pretty")

Number of raters reached correctly

csv files generated at: datasets/MTurk_annotations/Pretty/
Done


In [186]:
# Interesting: 10 files, each with 5 of the 10 rater columns
get_random_raters(df=df_interesting,num_raters=5,num_annotations=10,num_samples=10,
                  image_column="annotation",label="Interesting")

Number of raters reached correctly

csv files generated at: datasets/MTurk_annotations/Interesting/
Done


In [188]:
# Polluted: 10 files, each with 5 of the 10 rater columns
get_random_raters(df=df_polluted,num_raters=5,num_annotations=10,num_samples=10,
                  image_column="annotation",label="Polluted")

Number of raters reached correctly

csv files generated at: datasets/MTurk_annotations/Polluted/
Done


In [189]:
# Pleasant: 10 files, each with 5 of the 10 rater columns
get_random_raters(df=df_pleasant,num_raters=5,num_annotations=10,num_samples=10,
                  image_column="annotation",label="Pleasant")

Number of raters reached correctly

csv files generated at: datasets/MTurk_annotations/Pleasant/
Done


### Getting a consolidated DataFrame with 5 annotations per image

In [16]:
# The six labels to consolidate, and the first random-rater csv of each one.
labels_6=["Dangerous","Dirty","Pretty","Interesting","Polluted","Pleasant"]

def _first_sample_csv(label_name):
    """Return the path of sample number 1 for the given label."""
    return "datasets/MTurk_annotations/"+str(label_name)+"/mturk_gto_hits_"+str(label_name)+"_1.csv"

files=list(map(_first_sample_csv,labels_6))

In [24]:
# Read the six sample files; files[0] to files[5] map to the names below in order.
(df_dangerous_5,df_dirty_5,df_pretty_5,
 df_interesting_5,df_polluted_5,df_pleasant_5)=[pd.read_csv(csv_path) for csv_path in files[:6]]

In [30]:
# Row-wise median over the rater columns of each 5-rater table.
# The text "annotation" column is dropped first: since pandas 2.0 median() no
# longer skips non-numeric columns by default and would raise on it.
df_dangerous_5["median"]=df_dangerous_5.drop("annotation",axis=1).median(axis=1)
df_dirty_5["median"]=df_dirty_5.drop("annotation",axis=1).median(axis=1)
df_pretty_5["median"]=df_pretty_5.drop("annotation",axis=1).median(axis=1)
df_interesting_5["median"]=df_interesting_5.drop("annotation",axis=1).median(axis=1)
df_polluted_5["median"]=df_polluted_5.drop("annotation",axis=1).median(axis=1)
df_pleasant_5["median"]=df_pleasant_5.drop("annotation",axis=1).median(axis=1)

In [33]:
# Start from the image names of the Dangerous table, then attach one median
# column per label through successive inner joins on "annotation".
df_mturk_all_labels=df_dangerous_5[["annotation"]].copy()

for _label_name,_label_frame in (
    ("Dangerous",df_dangerous_5),
    ("Dirty",df_dirty_5),
    ("Pretty",df_pretty_5),
    ("Interesting",df_interesting_5),
    ("Polluted",df_polluted_5),
    ("Pleasant",df_pleasant_5),
):
    _label_median=_label_frame[["annotation","median"]]
    df_mturk_all_labels=pd.merge(df_mturk_all_labels,_label_median,how="inner",on="annotation")
    # The generic "median" column takes the name of its label
    df_mturk_all_labels=df_mturk_all_labels.rename(columns={"median":_label_name})

In [35]:
# Save the consolidated six-label table without the row index.
df_mturk_all_labels.to_csv(
    "datasets/MTurk_annotations/mturk_6_labels_5_annotators.csv",
    index=False,
)