In [1]:
%matplotlib inline
import os
import glob
import json
from collections import Counter, defaultdict

import urllib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Get the data from the PPLM github repo (https://github.com/uber-research/PPLM)

files = ['ctrl_legal.csv',
         'ctrl_negative.csv',
         'ctrl_politics.csv',
         'ctrl_positive.csv',
         'ctrl_religion.csv',
         'ctrl_science.csv',
         'ctrl_technologies.csv',
         'greedy_legal.csv',
         'greedy_military.csv',
         'greedy_negative.csv',
        'greedy_politics.csv',
        'greedy_positive.csv',
        'greedy_religion.csv',
        'greedy_science.csv',
        'greedy_space.csv',
        'greedy_technologies.csv'
        ]

col_lists = []
dfs = {}
base_url = 'https://raw.githubusercontent.com/uber-research/PPLM/master/human_annotation/ctrl_wd_openai_csvs/'
for infile in files:
    print(infile)
    df = pd.read_csv(base_url + infile, header=0, index_col=None)   
    col_lists.append(list(df.columns))
    dfs[infile] = df



ctrl_legal.csv
ctrl_negative.csv
ctrl_politics.csv
ctrl_positive.csv
ctrl_religion.csv
ctrl_science.csv
ctrl_technologies.csv
greedy_legal.csv
greedy_military.csv
greedy_negative.csv
greedy_politics.csv
greedy_positive.csv
greedy_religion.csv
greedy_science.csv
greedy_space.csv
greedy_technologies.csv


In [3]:
len(col_lists[0])

26

In [4]:
col_lists[0]

['From files: pickles/legal_words_pickles/legal_words_A,B.p, uber_topics_80.jsonl',
 'Unnamed: 1',
 'Which passage is more topic relevant?',
 'How fluent is the passage of A?',
 'How fluent is the passage of B?',
 'Shannon',
 'Which passage is positive?',
 'How fluent is the passage of A?.1',
 'How fluent is the passage of B?.1',
 'Raymis',
 'Which passage is positive?.1',
 'How fluent is the passage of A?.2',
 'How fluent is the passage of B?.2',
 'Hayley',
 'Unnamed: 14',
 'Unnamed: 15',
 'Unnamed: 16',
 'Unnamed: 17',
 'Unnamed: 18',
 'Unnamed: 19',
 'Unnamed: 20',
 'Unnamed: 21',
 'Unnamed: 22',
 'Unnamed: 23',
 'Unnamed: 24',
 'Unnamed: 25']

In [5]:
df.columns

Index(['From files: pickles/technology_pickles/technology_A,B.p, greedy_bow_results.json',
       'Unnamed: 1', 'Which passage is more topic relevant?',
       'How fluent is the passage of A?', 'How fluent is the passage of B?',
       'Raymis', 'Which passage is positive?',
       'How fluent is the passage of A?.1',
       'How fluent is the passage of B?.1', 'Shannon',
       'Which passage is positive?.1', 'How fluent is the passage of A?.2',
       'How fluent is the passage of B?.2', 'Hayley', 'Unnamed: 14',
       'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22',
       'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25'],
      dtype='object')

In [6]:
df.head(3)

Unnamed: 0,"From files: pickles/technology_pickles/technology_A,B.p, greedy_bow_results.json",Unnamed: 1,Which passage is more topic relevant?,How fluent is the passage of A?,How fluent is the passage of B?,Raymis,Which passage is positive?,How fluent is the passage of A?.1,How fluent is the passage of B?.1,Shannon,...,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25
0,Emphasised are the three main principles: that...,Emphasised are the effects of the drug and the...,Neither,4,4,,Neither,4,4,,...,,,,,,,,,,41_153_306
1,"Furthermore, he said, the government's plan to...","Furthermore, in the case of the data, this cou...",Both,4,5,,Both,4,3,,...,,,,,,,,,,24_174_29
2,The central theme of the current discussion ab...,The central theme of our work with digital med...,A,4,3,,A,4,3,,...,,,,,,,,,,27_710_233


In [7]:
# Look at the number of times each annotator appears
name_counter = Counter()
for infile in files:
    df = dfs[infile]
    columns = list(df.columns)        
    annotators = [columns[i].title() for i in [5, 9, 13]]
    name_counter.update(annotators)
for n, c in name_counter.most_common():
    print(n, c)

Shannon 16
Raymis 16
Hayley 16


In [8]:
dfs.keys()

dict_keys(['ctrl_legal.csv', 'ctrl_negative.csv', 'ctrl_politics.csv', 'ctrl_positive.csv', 'ctrl_religion.csv', 'ctrl_science.csv', 'ctrl_technologies.csv', 'greedy_legal.csv', 'greedy_military.csv', 'greedy_negative.csv', 'greedy_politics.csv', 'greedy_positive.csv', 'greedy_religion.csv', 'greedy_science.csv', 'greedy_space.csv', 'greedy_technologies.csv'])

In [94]:
#del(df_out)

In [9]:
# assign model IDs based on information in the dataset code/documentation
models1 = {0: 'CTRL', 1: 'BCR'}
models2 = {0: 'WD', 1: 'BCR'}
#pair_model1 = {0: 'B-BC', 1: 'B-BR', 2: 'B-BCR', 3: 'BC-BR', 4: 'BC-BCR', 5: 'BR-BCR'}
#pair_model_ids1 = dict([(v,k) for k,v in pair_model.items()])
rows = []
text_counter = Counter()
for infile in files:
    df = dfs[infile]
    #df['pairid'] = (np.arange(len(df))+1).tolist()
    columns = list(df.columns)
    annotators = []
    base = os.path.basename(infile)
    basename = os.path.splitext(base)[0]
    print("basename:", basename)
    dom = basename.split('_')[0]
    print("dom:", dom)
    if str(dom) == 'ctrl':
        models = models1
    else:
        models = models2
    for i in [5, 9, 13]:
        if columns[i].startswith('Unnamed'):
            name = 'Unnamed_' + basename
        else:
            name = columns[i].title()
        annotators.append(name)

    keys = df[columns[-1]].values # unique id of pairwise samples
    ratings = df[[columns[i] for i in[3, 4, 7, 8, 11, 12]]].values
    texts_a = df[columns[0]].values
    texts_b = df[columns[1]].values
    relevance = df[[columns[i] for i in[2,6,10]]].values
    #pairs = df[columns[-1]].values
    
    #print(keys)
    
    for line_i, key in enumerate(keys):
        if type(key) == str:
            parts = key.split('_')
            #print(int(parts[1]) % 2)
            #print(int(parts[2]) % 2)
            model_a = models[int(parts[1]) % 2]
            model_b = models[int(parts[2]) % 2]
            text_a = texts_a[line_i]
            text_b = texts_b[line_i]
            text_counter.update([text_a])
            text_counter.update([text_b]) 
            #print("model_a:", model_a)
            #print("model_b:", model_b)
            #print()
            pairtxt=str(model_a)+'-'+str(model_b)
            
            
            if not np.isnan(ratings[line_i, 0]):
                rows.append([text_a, basename, model_a, annotators[0], ratings[line_i, 0], relevance[line_i,0], key, pairtxt])
            if not np.isnan(ratings[line_i, 2]):
                rows.append([text_a, basename, model_a, annotators[1], ratings[line_i, 2] , relevance[line_i,1], key, pairtxt])
            if not np.isnan(ratings[line_i, 4]):                
                rows.append([text_a, basename, model_a, annotators[2], ratings[line_i, 4], relevance[line_i,2], key, pairtxt])
            if not np.isnan(ratings[line_i, 1]):
                rows.append([text_b, basename, model_b, annotators[0], ratings[line_i, 1], relevance[line_i,0], key, pairtxt])
            if not np.isnan(ratings[line_i, 3]): 
                rows.append([text_b, basename, model_b, annotators[1], ratings[line_i, 3], relevance[line_i,1], key, pairtxt])
            if not np.isnan(ratings[line_i, 5]):    
                rows.append([text_b, basename, model_b, annotators[2], ratings[line_i, 5], relevance[line_i,2], key, pairtxt])
            
df_out = pd.DataFrame(rows, columns=['text', 'domain', 'model', 'annotator', 'rating', 'relevance','key', 'pairtxt'])
df_out.head(20)

basename: ctrl_legal
dom: ctrl
basename: ctrl_negative
dom: ctrl
basename: ctrl_politics
dom: ctrl
basename: ctrl_positive
dom: ctrl
basename: ctrl_religion
dom: ctrl
basename: ctrl_science
dom: ctrl
basename: ctrl_technologies
dom: ctrl
basename: greedy_legal
dom: greedy
basename: greedy_military
dom: greedy
basename: greedy_negative
dom: greedy
basename: greedy_politics
dom: greedy
basename: greedy_positive
dom: greedy
basename: greedy_religion
dom: greedy
basename: greedy_science
dom: greedy
basename: greedy_space
dom: greedy
basename: greedy_technologies
dom: greedy


Unnamed: 0,text,domain,model,annotator,rating,relevance,key,pairtxt
0,The relationship between the two cities is alr...,ctrl_legal,CTRL,Shannon,3,B,16_882_839,CTRL-BCR
1,The relationship between the two cities is alr...,ctrl_legal,CTRL,Raymis,3,B,16_882_839,CTRL-BCR
2,The relationship between the two cities is alr...,ctrl_legal,CTRL,Hayley,2,B,16_882_839,CTRL-BCR
3,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Shannon,3,B,16_882_839,CTRL-BCR
4,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Raymis,2,B,16_882_839,CTRL-BCR
5,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Hayley,3,B,16_882_839,CTRL-BCR
6,"Emphasised are the words ""maybe"" and ""possibly...",ctrl_legal,BCR,Shannon,2,B,14_849_744,BCR-CTRL
7,"Emphasised are the words ""maybe"" and ""possibly...",ctrl_legal,BCR,Raymis,4,B,14_849_744,BCR-CTRL
8,"Emphasised are the words ""maybe"" and ""possibly...",ctrl_legal,BCR,Hayley,4,B,14_849_744,BCR-CTRL
9,Emphasised are: the importance of the individu...,ctrl_legal,CTRL,Shannon,3,B,14_849_744,BCR-CTRL


In [10]:
set(df_out.pairtxt.values)

{'BCR-CTRL', 'BCR-WD', 'CTRL-BCR', 'WD-BCR'}

In [11]:
len(df_out)

3840

In [12]:
ratings_per_text = defaultdict(list)
for row in rows:
    text = row[0]
    rating = row[-4]
    ratings_per_text[text].append(rating)
print(len(ratings_per_text))

1141


In [13]:
ratings_per_text

defaultdict(list,
            {'The relationship between the two cities is already well defined in the way they interact with the rest of the country. But with an estimated $1.1 billion in proposed infrastructure, and an eye on the future, a new study by Boston Consulting Group and the Commonwealth of Massachusetts suggests that if Boston is to maintain its lead in the country as the most livable city in America, it needs to be': [3,
              3,
              2,
              4,
              4,
              4],
             'The relationship between my girlfriend and I is pretty good. We have been together for about a year now but we are both in college. She has had her share of problems with alcohol abuse which she admits to having. \n \n Recently she was arrested on charges that were later dropped. Her parents told me they would pay the court fees if I paid them back some money from what I made off of her. They': [3,
              2,
              3],
             'Emphasised 

In [14]:
# convert everything to indices
text_counter = Counter(df_out['text'].values)
text_index = dict(zip(sorted(text_counter), range( len(text_counter))))
text_indices = ['I' + str(text_index[t]) for t in df_out['text'].values]
df_out['item_id'] = text_indices

model_counter = Counter(df_out['model'].values)
model_index = dict(zip(sorted(model_counter), range(len(model_counter))))
model_indices = [model_index[t] for t in df_out['model'].values]

annotator_counter = Counter(df_out['annotator'].values)
annotator_index = dict(zip(sorted(annotator_counter), range(len(annotator_counter))))
annotator_indices = ['W' + str(annotator_index[t]) for t in df_out['annotator'].values]
df_out['worker_id'] = annotator_indices

ratings = [float(r) for r in df_out['rating'].values]
print(len(text_indices), len(model_indices), len(annotator_indices), len(ratings))

rating_counter = Counter(ratings)
print(len(rating_counter))
df_out.head()

3840 3840 3840 3840
5


Unnamed: 0,text,domain,model,annotator,rating,relevance,key,pairtxt,item_id,worker_id
0,The relationship between the two cities is alr...,ctrl_legal,CTRL,Shannon,3,B,16_882_839,CTRL-BCR,I861,W2
1,The relationship between the two cities is alr...,ctrl_legal,CTRL,Raymis,3,B,16_882_839,CTRL-BCR,I861,W1
2,The relationship between the two cities is alr...,ctrl_legal,CTRL,Hayley,2,B,16_882_839,CTRL-BCR,I861,W0
3,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Shannon,3,B,16_882_839,CTRL-BCR,I844,W2
4,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Raymis,2,B,16_882_839,CTRL-BCR,I844,W1


In [15]:
len(df_out)

3840

In [16]:
print(set(df_out['model'].values))

{'WD', 'BCR', 'CTRL'}


In [17]:
print(set(df_out['rating']))

{1, 2, 3, 4, 5}


### Mapping Fluency rating score to category
x < 3 : Poor

x ==3 : Moderate

x > 3 : High

In [18]:
fluency_cat = []
for r in df_out.rating.values:
    cat = ""
    if int(r)==3:
        cat = "moderate"
    elif int(r) < 3:
        cat="poor"
    elif int(r) > 3:
        cat="high"
    fluency_cat.append(cat)
        
df_out['fluency_cat'] = np.array(fluency_cat)
df_out.head()

Unnamed: 0,text,domain,model,annotator,rating,relevance,key,pairtxt,item_id,worker_id,fluency_cat
0,The relationship between the two cities is alr...,ctrl_legal,CTRL,Shannon,3,B,16_882_839,CTRL-BCR,I861,W2,moderate
1,The relationship between the two cities is alr...,ctrl_legal,CTRL,Raymis,3,B,16_882_839,CTRL-BCR,I861,W1,moderate
2,The relationship between the two cities is alr...,ctrl_legal,CTRL,Hayley,2,B,16_882_839,CTRL-BCR,I861,W0,poor
3,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Shannon,3,B,16_882_839,CTRL-BCR,I844,W2,moderate
4,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Raymis,2,B,16_882_839,CTRL-BCR,I844,W1,poor


### Mapping relevance to its binary value
x == 'neither' --> No (0)

x == A & model == pairtxt[0] --> Yes (1); else --> No (0)

x == B & model == pairtxt[1] --> Yes (1) ; else --> No (0)


In [19]:
relev_bin = []
for i, r in enumerate(df_out.relevance.values):
    model_i = str(df_out.model.iloc[i])
    pair_i = df_out.pairtxt.iloc[i].split('-')
    
    rbin = 0
    if str(r)=='neither':
        rbin = 0
    elif str(r)=='both':
        rbin = 1
    elif str(r)=='A':
        if (str(r)=='A') and (pair_i[0] == model_i):
            rbin = 1
        else:
            rbin = 0
    elif str(r)=='B':
        if (str(r)=='B') and (pair_i[1] == model_i):
            #print(model_i)
            #print(pair_i[1])
            #print()
            rbin = 1
        else:
            rbin = 0
    relev_bin.append(rbin)
        
df_out['relevance_bin'] = np.array(relev_bin)
df_out.head()

Unnamed: 0,text,domain,model,annotator,rating,relevance,key,pairtxt,item_id,worker_id,fluency_cat,relevance_bin
0,The relationship between the two cities is alr...,ctrl_legal,CTRL,Shannon,3,B,16_882_839,CTRL-BCR,I861,W2,moderate,0
1,The relationship between the two cities is alr...,ctrl_legal,CTRL,Raymis,3,B,16_882_839,CTRL-BCR,I861,W1,moderate,0
2,The relationship between the two cities is alr...,ctrl_legal,CTRL,Hayley,2,B,16_882_839,CTRL-BCR,I861,W0,poor,0
3,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Shannon,3,B,16_882_839,CTRL-BCR,I844,W2,moderate,1
4,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Raymis,2,B,16_882_839,CTRL-BCR,I844,W1,poor,1


In [20]:
model_code_map = {'CTRL': 1, 'BCR':2 , 'WD': 3}
model_codes = [model_code_map[m] for m in df_out['model'].values]
df_out['model_code'] = model_codes
df_out.head()

Unnamed: 0,text,domain,model,annotator,rating,relevance,key,pairtxt,item_id,worker_id,fluency_cat,relevance_bin,model_code
0,The relationship between the two cities is alr...,ctrl_legal,CTRL,Shannon,3,B,16_882_839,CTRL-BCR,I861,W2,moderate,0,1
1,The relationship between the two cities is alr...,ctrl_legal,CTRL,Raymis,3,B,16_882_839,CTRL-BCR,I861,W1,moderate,0,1
2,The relationship between the two cities is alr...,ctrl_legal,CTRL,Hayley,2,B,16_882_839,CTRL-BCR,I861,W0,poor,0,1
3,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Shannon,3,B,16_882_839,CTRL-BCR,I844,W2,moderate,1,2
4,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Raymis,2,B,16_882_839,CTRL-BCR,I844,W1,poor,1,2


In [21]:
df_out['text'].iloc[2]

'The relationship between the two cities is already well defined in the way they interact with the rest of the country. But with an estimated $1.1 billion in proposed infrastructure, and an eye on the future, a new study by Boston Consulting Group and the Commonwealth of Massachusetts suggests that if Boston is to maintain its lead in the country as the most livable city in America, it needs to be'

In [22]:
text_tok = [t.split() for t in df_out.text.values]

In [23]:
lens=[]
for tok in text_tok:
    lens.append(len(tok))
df_out['text_len'] = np.array(lens)
df_out.head()

Unnamed: 0,text,domain,model,annotator,rating,relevance,key,pairtxt,item_id,worker_id,fluency_cat,relevance_bin,model_code,text_len
0,The relationship between the two cities is alr...,ctrl_legal,CTRL,Shannon,3,B,16_882_839,CTRL-BCR,I861,W2,moderate,0,1,71
1,The relationship between the two cities is alr...,ctrl_legal,CTRL,Raymis,3,B,16_882_839,CTRL-BCR,I861,W1,moderate,0,1,71
2,The relationship between the two cities is alr...,ctrl_legal,CTRL,Hayley,2,B,16_882_839,CTRL-BCR,I861,W0,poor,0,1,71
3,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Shannon,3,B,16_882_839,CTRL-BCR,I844,W2,moderate,1,2,75
4,The relationship between my girlfriend and I i...,ctrl_legal,BCR,Raymis,2,B,16_882_839,CTRL-BCR,I844,W1,poor,1,2,75


In [24]:
np.mean(lens)

69.84765625

In [25]:
np.std(lens)

15.159343575228972

In [26]:
df_out.columns

Index(['text', 'domain', 'model', 'annotator', 'rating', 'relevance', 'key',
       'pairtxt', 'item_id', 'worker_id', 'fluency_cat', 'relevance_bin',
       'model_code', 'text_len'],
      dtype='object')

In [29]:
len(df_out.text)

3840

In [30]:
prefix = [' '.join(t.split()[:2]) for t in df_out.text]
df_out['prefix'] = np.array(prefix)

In [31]:
df_out.head(3)

Unnamed: 0,text,domain,model,annotator,rating,relevance,key,pairtxt,item_id,worker_id,fluency_cat,relevance_bin,model_code,text_len,prefix
0,The relationship between the two cities is alr...,ctrl_legal,CTRL,Shannon,3,B,16_882_839,CTRL-BCR,I861,W2,moderate,0,1,71,The relationship
1,The relationship between the two cities is alr...,ctrl_legal,CTRL,Raymis,3,B,16_882_839,CTRL-BCR,I861,W1,moderate,0,1,71,The relationship
2,The relationship between the two cities is alr...,ctrl_legal,CTRL,Hayley,2,B,16_882_839,CTRL-BCR,I861,W0,poor,0,1,71,The relationship


In [51]:
outdir = os.path.join('../data', 'CTRL')
if not os.path.exists(outdir):
    os.makedirs(outdir)

df_out.to_csv(os.path.join(outdir, 'ctrl_all.csv'))