In [3]:
import pandas as pd
import numpy as np

In [4]:
owner_df = pd.read_stata("slaveowner_pages_expanded.dta")

In [5]:
owner_df.head()

Unnamed: 0,year,owner_id,state,county,lastname,firstname,image_url
0,1850,470,Alabama,Autauga,Wyatte,Robt,http://textlab.econ.columbia.edu/~jjacobs/mtur...
1,1850,470,Alabama,Autauga,Wyatte,Robt,http://textlab.econ.columbia.edu/~jjacobs/mtur...
2,1850,126,Alabama,Autauga,Wyatte,Ann T R,http://textlab.econ.columbia.edu/~jjacobs/mtur...
3,1850,126,Alabama,Autauga,Wyatte,Ann T R,http://textlab.econ.columbia.edu/~jjacobs/mtur...
4,1850,32,Alabama,Autauga,Taylor,,http://textlab.econ.columbia.edu/~jjacobs/mtur...


In [6]:
ysci_groups = owner_df.groupby(["year","state","county","image_url"], as_index=False)

In [7]:
all_images = []
for cur_group, group_data in ysci_groups:
    all_images.append(list(cur_group))

In [8]:
image_df = pd.DataFrame(all_images, columns=['year','state','county','image_url'])

In [9]:
image_df

Unnamed: 0,year,state,county,image_url
0,1850,Alabama,Autauga,http://textlab.econ.columbia.edu/~jjacobs/mtur...
1,1850,Alabama,Autauga,http://textlab.econ.columbia.edu/~jjacobs/mtur...
2,1850,Alabama,Autauga,http://textlab.econ.columbia.edu/~jjacobs/mtur...
3,1850,Alabama,Autauga,http://textlab.econ.columbia.edu/~jjacobs/mtur...
4,1850,Alabama,Autauga,http://textlab.econ.columbia.edu/~jjacobs/mtur...
...,...,...,...,...
24819,1860,Virginia,Westmoreland,http://textlab.econ.columbia.edu/~jjacobs/mtur...
24820,1860,Virginia,Westmoreland,http://textlab.econ.columbia.edu/~jjacobs/mtur...
24821,1860,Virginia,Westmoreland,http://textlab.econ.columbia.edu/~jjacobs/mtur...
24822,1860,Virginia,Westmoreland,http://textlab.econ.columbia.edu/~jjacobs/mtur...


In [10]:
image_df.to_pickle("unique_images.pkl")

Now merge in the mturk results for each image

In [11]:
hit_df = pd.read_stata("image_results.dta")

In [12]:
hit_df.head()

Unnamed: 0,image_id,image_tags,num_yes,num_no,num_dontknow,image_url,origin_file
0,1.0,2.0,0.0,2.0,0.0,http://textlab.econ.columbia.edu/~jjacobs/mtur...,./Results_2016-07-06/Batch_results_FINAL_known...
1,2.0,4.0,0.0,4.0,0.0,http://textlab.econ.columbia.edu/~jjacobs/mtur...,./Results_2016-07-06/Batch_results_FINAL_known...
2,3.0,2.0,0.0,2.0,0.0,http://textlab.econ.columbia.edu/~jjacobs/mtur...,./Results_2016-07-06/Batch_results_FINAL_known...
3,4.0,2.0,0.0,2.0,0.0,http://textlab.econ.columbia.edu/~jjacobs/mtur...,./Results_2016-07-06/Batch_results_FINAL_known...
4,5.0,2.0,0.0,2.0,0.0,http://textlab.econ.columbia.edu/~jjacobs/mtur...,./Results_2016-06-07/Batch_Results_Final_2016-...


In [13]:
merged_df = image_df.merge(hit_df, on=["image_url"], how="left", indicator=True)

In [14]:
merged_df["_merge"].value_counts()

both          23678
left_only      1146
right_only        0
Name: _merge, dtype: int64

In [15]:
finished_df = merged_df[merged_df["_merge"] == "both"].copy()

In [16]:
finished_df.drop(columns=['image_id','origin_file','_merge'], inplace=True)

In [17]:
finished_df.rename(columns={'image_tags':'num_tags'}, inplace=True)

In [18]:
finished_df.to_csv("page_tags_finished.csv", index=False)

In [19]:
full_df = merged_df.drop(columns=['image_id','origin_file','_merge'])

In [20]:
full_df.rename(columns={'image_tags':'num_tags'}, inplace=True)

In [21]:
full_df.to_csv("page_tags.csv", index=False)

In [22]:
full_df['num_tags'] = full_df['num_tags'].fillna(0)
full_df['num_yes'] = full_df['num_yes'].fillna(0)
full_df['num_no'] = full_df['num_no'].fillna(0)
full_df['num_dontknow'] = full_df['num_dontknow'].fillna(0)

In [23]:
np.where(np.isnan(full_df['num_yes']))

(array([], dtype=int64),)

In [24]:
# Plurality voting
def get_vote_full(img_row):
    #print(img_row)
    num_yes = img_row['num_yes']
    num_no = img_row['num_no']
    num_dk = img_row['num_dontknow']
    # Absolute winners
    if (num_yes > num_no) and (num_yes > num_dk):
        # yes beats both
        return "yes"
    elif (num_no > num_yes) and (num_no > num_dk):
        # no beats both
        return "no"
    elif (num_dk > num_yes) and (num_dk > num_no):
        # don't know beats both
        return "don't know"
    elif num_yes == num_no and num_yes == num_dk:
        # all tie
        return "don't know"
    elif num_yes == num_no and num_yes > num_dk:
        # yes-no tie for win
        return "don't know"
    elif num_yes == num_dk and num_yes > num_no:
        # yes-dk tie for win
        return "yes"
    elif num_no == num_dk and num_no > num_yes:
        # no-dk tie for win
        return "no"
    else:
        print(num_yes, num_no, num_dk)
def get_vote_simple(img_row):
    # Simpler version: ignore the don't knows, then just do majority yes v no
    num_yes = img_row['num_yes']
    num_no = img_row['num_no']
    if num_yes > num_no:
        return "yes"
    elif num_no > num_yes:
        return "no"
    else:
        return "don't know"
page_counts = finished_df.groupby(['year','state','county'])['image_url'].agg('count')
votes_full = full_df.apply(get_vote_full, axis=1)
votes_simple = full_df.apply(get_vote_simple, axis=1)

In [25]:
votes_full.value_counts()

no            14554
yes            6375
don't know     3895
dtype: int64

In [26]:
votes_simple.value_counts()

no            14559
yes            6376
don't know     3889
dtype: int64

In [27]:
full_df['simple_vote'] = full_df.apply(get_vote_simple, axis=1)

In [28]:
num_sheets = finished_df.groupby(['year','state','county']).image_url.count()

In [29]:
no_votes = full_df.groupby(['year','state','county']).apply(lambda x: sum(x['simple_vote'] == 'no'))
yes_votes = full_df.groupby(['year','state','county']).apply(lambda x: sum(x['simple_vote'] == 'yes'))
dk_votes = full_df.groupby(['year','state','county']).apply(lambda x: sum(x['simple_vote'] == "don't know"))

In [30]:
full_df['page_count'] = 1

In [31]:
plz_df = full_df.groupby(['year','state','county']).agg('sum')

In [32]:
plz_df['no_pages'] = no_votes
plz_df['yes_pages'] = yes_votes
plz_df['dontknow_pages'] = dk_votes

In [33]:
plz_df = plz_df.drop(columns=['num_tags','num_yes','num_no','num_dontknow'])

In [34]:
plz_df.rename(columns={'simple_vote':'total_pages'},inplace=True)

In [38]:
plz_df.reset_index(inplace=True)
plz_df

Unnamed: 0,year,state,county,page_count,no_pages,yes_pages,dontknow_pages
0,1850,Alabama,Autauga,17,13,1,3
1,1850,Alabama,Baldwin,2,0,0,2
2,1850,Alabama,Barbour,6,5,0,1
3,1850,Alabama,Butler,4,3,0,1
4,1850,Alabama,Chambers,7,2,5,0
...,...,...,...,...,...,...,...
792,1860,Virginia,Surry,21,15,0,6
793,1860,Virginia,Sussex,12,12,0,0
794,1860,Virginia,Warwick,11,6,0,5
795,1860,Virginia,Washington,2,0,0,2


In [39]:
plz_df.to_csv("year_county_tags.csv", index=False)

In [37]:
plz_df.describe()

Unnamed: 0,page_count,no_pages,yes_pages,dontknow_pages
count,797.0,797.0,797.0,797.0
mean,31.146801,18.267252,8.0,4.879548
std,49.187824,32.438192,19.847091,8.786727
min,1.0,0.0,0.0,0.0
25%,6.0,2.0,0.0,1.0
50%,16.0,8.0,1.0,2.0
75%,39.0,22.0,7.0,6.0
max,697.0,508.0,237.0,90.0


In [117]:
sheets_df['no_votes'] = no_votes

In [118]:
sheets_df

year      state     county      
1850      Alabama   Autauga          15
                    Baldwin           2
                    Barbour           6
                    Butler            3
                    Chambers          7
                                   ... 
1860      Virginia  Sussex           12
                    Warwick          10
                    Washington        2
                    Westmoreland     29
no_votes                            NaN
Name: image_url, Length: 798, dtype: object