# Investigate ParaRel data

Use this notebook to investigate subject and subject-object duplicates per relation in ParaRel. 

In [1]:
import os
# Step up one directory before the project imports below (presumably so the `pararel` package resolves — confirm).
# NOTE(review): the move is relative to the current directory, so re-running this cell moves up again;
# run it exactly once per kernel.
os.chdir(os.pardir)
os.getcwd()

'/cephyr/users/lovhag/Alvis/projects/pararel'

In [2]:
import pandas as pd
import os 
import pickle
import numpy as np

from pararel.consistency import utils
from pararel.consistency.lm_pipeline import parse_prompt

  from .autonotebook import tqdm as notebook_tqdm


For the analysis, we can switch between the original data file `trex_lms_vocab_old` and our corresponding deduplicated data file `trex_lms_vocab`.

In [5]:
# Root directory of the ParaRel data. The default is the location used when this notebook was written;
# set the PARAREL_DATA_PATH environment variable to run the notebook against a different checkout.
data_path = os.environ.get("PARAREL_DATA_PATH", "/cephyr/users/lovhag/Alvis/projects/pararel/data")
# Directory holding one `<relation>.jsonl` file per relation.
lama_path = os.path.join(data_path, "trex_lms_vocab_old")
graph_path = os.path.join(data_path, "pattern_data", "graphs")

In [6]:
# Readable name for each relation id.
# The names are not unique: P17, P131 and P276 all map to "located-in",
# and P36 and P1376 both map to "capital-of".
relation_names = {
    "P17": "located-in",
    "P19": "born-in",
    "P20": "died-in",
    "P27": "citizen-of",
    "P30": "located-in-continent",
    "P36": "capital-of",
    "P37": "official-language",
    "P39": "has-position",
    "P47": "shares-border-with",
    "P101": "specializes-in",
    "P103": "native-language",
    "P106": "is-a-by-profession",
    "P108": "works-for",
    "P127": "owned-by",
    "P131": "located-in",
    "P136": "plays-music",
    "P138": "named-after",
    "P140": "affiliated-with-religion",
    "P159": "headquarter-in",
    "P176": "produced-by",
    "P178": "developed-by",
    "P190": "twin-city-of",
    "P264": "represented-by-music-label",
    "P276": "located-in",
    "P279": "subclass-of",
    "P361": "part-of",
    "P364": "original-language",
    "P407": "written-in-language",
    "P413": "plays-in-position",
    "P449": "originally-aired-on",
    "P463": "member-of",
    "P495": "created-in",
    "P530": "has-diplomatic-relations-with",
    "P740": "founded-in",
    "P937": "worked-in",
    "P1001": "legal-term-in",
    "P1303": "is-a-player",
    "P1376": "capital-of",
    "P1412": "communicated-in",
}

In [7]:
# Relation ids covered by the analysis, in the order they are processed.
investigated_relations = [
    "P937", "P1412", "P127", "P103", "P276", "P159", "P140", "P136",
    "P495", "P17", "P361", "P36", "P740", "P264", "P407", "P138",
    "P30", "P131", "P176", "P449", "P279", "P19", "P101", "P364",
    "P106", "P1376", "P178", "P37", "P413", "P27", "P20",
]
print(investigated_relations)

['P937', 'P1412', 'P127', 'P103', 'P276', 'P159', 'P140', 'P136', 'P495', 'P17', 'P361', 'P36', 'P740', 'P264', 'P407', 'P138', 'P30', 'P131', 'P176', 'P449', 'P279', 'P19', 'P101', 'P364', 'P106', 'P1376', 'P178', 'P37', 'P413', 'P27', 'P20']


## Systematic analysis

In [9]:
# For every investigated relation, count
#   * nbr:                   the number of entries in the relation file,
#   * nbr_duplicates:        the entries whose subject label occurs more than once,
#   * nbr_stupid_duplicates: the entries that repeat an earlier (subject, object) pair.
# `stat_data` holds these counts per relation; `duplicate_subj_data` collects every entry
# with a repeated subject label so the duplicates can be inspected.
stat_records = []
duplicate_frames = []
for relation in investigated_relations:
    relation_file = os.path.join(lama_path, relation + ".jsonl")
    try:
        data = utils.read_jsonl_file(relation_file)
    except Exception:
        # Best effort: a relation whose file cannot be read is reported and left out.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt/SystemExit through.
        print(f"There was an error reading the data for {relation_file}. Skipping.")
        continue

    # Build the frame in one go instead of appending row by row (`DataFrame.append` is gone in
    # pandas 2.0 and was quadratic). Columns are sorted alphabetically to keep the layout that
    # the row-wise construction produced.
    tmp_data = (
        pd.DataFrame(list(data))
        .assign(relation=relation, relation_name=relation_names[relation])
        .sort_index(axis=1)
    )

    nbr_matching = 0
    nbr_stupid_matching = 0
    if not tmp_data.empty:
        # Every row whose subject label is shared with at least one other row.
        duplicated_mask = tmp_data["sub_label"].duplicated(keep=False)
        nbr_matching = int(duplicated_mask.sum())
        # Pure duplicates: each repetition of an already seen (subject, object) pair counts once.
        nbr_stupid_matching = int(tmp_data.duplicated(subset=["sub_label", "obj_label"], keep="first").sum())
        # A stable sort keeps the file order within each subject and makes the result reproducible.
        duplicate_frames.append(tmp_data[duplicated_mask].sort_values(by=["sub_label"], kind="mergesort"))

    stat_records.append({"relation": relation,
                         "relation_name": relation_names[relation],
                         "nbr": len(tmp_data),
                         "nbr_duplicates": nbr_matching,
                         "nbr_stupid_duplicates": nbr_stupid_matching})

duplicate_subj_data = pd.concat(duplicate_frames, ignore_index=True) if duplicate_frames else pd.DataFrame()

# Sort by the numeric part of the relation id (P17 < P101 < P1376) rather than alphabetically.
stat_data = (
    pd.DataFrame(stat_records, columns=["relation", "relation_name", "nbr", "nbr_duplicates", "nbr_stupid_duplicates"])
    .set_index("relation")
    .sort_index(key=lambda index: index.str.replace("P", "", regex=False).astype(int))
)
stat_data

Unnamed: 0_level_0,relation_name,nbr,nbr_duplicates,nbr_stupid_duplicates
relation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P17,located-in,912,2,0
P19,born-in,779,0,0
P20,died-in,817,0,0
P27,citizen-of,958,0,0
P30,located-in-continent,959,4,0
P36,capital-of,471,14,1
P37,official-language,900,280,0
P101,specializes-in,571,52,0
P103,native-language,919,2,0
P106,is-a-by-profession,821,0,0


In [12]:
# Total number of entries over all analysed relations.
stat_data["nbr"].sum()

23097

In [11]:
# Total number of entries that share their subject label with another entry of the same relation.
stat_data["nbr_duplicates"].sum()

647

In [13]:
# Total number of pure duplicates (subject label and object label both repeated).
stat_data["nbr_stupid_duplicates"].sum()

15

In [41]:
# Render the per-relation statistics as a LaTeX table and print the markup.
latex_table = stat_data.to_latex()
print(latex_table)

\begin{tabular}{llrrr}
\toprule
{} &               relation\_name &  nbr &  nbr\_duplicates &  nbr\_stupid\_duplicates \\
relation &                             &      &                 &                        \\
\midrule
P17      &                  located-in &  912 &               2 &                      0 \\
P19      &                     born-in &  779 &               0 &                      0 \\
P20      &                     died-in &  817 &               0 &                      0 \\
P27      &                  citizen-of &  958 &               0 &                      0 \\
P30      &        located-in-continent &  959 &               4 &                      0 \\
P36      &                  capital-of &  471 &              14 &                      1 \\
P37      &           official-language &  900 &             280 &                      0 \\
P101     &              specializes-in &  571 &              52 &                      0 \\
P103     &             native-language & 

In [12]:
# NOTE(review): this repeats the total from the earlier In [12] cell, but the recorded value differs
# (21830 here vs 23097 there) — presumably the outputs come from runs against different data files;
# confirm and keep only one of the two cells.
stat_data["nbr"].sum()

21830

In [25]:
# Show the entries with a repeated subject label for relation P449.
duplicate_subj_data.loc[duplicate_subj_data["relation"] == "P449"]

Unnamed: 0,obj_label,relation,relation_name,sub_label,uuid
276,NBC,P449,originally-aired-on,Alfred Hitchcock Presents,4f9db489-5667-4c43-9810-e57f61d8ab30
277,CBS,P449,originally-aired-on,Alfred Hitchcock Presents,64b9c063-dbab-4fb0-9e43-5c1ff67caf45
278,NBC,P449,originally-aired-on,Alfred Hitchcock Presents,1e510088-f748-4972-b326-b0d4169402d0
279,CBS,P449,originally-aired-on,Down You Go,044e086f-26a6-4421-a6c7-a65710748ba1
280,NBC,P449,originally-aired-on,Down You Go,e1b4880e-7cc2-464e-9879-2982085f4ee9
281,NBC,P449,originally-aired-on,Gambit,f937db7c-018a-45e8-8cf0-b62bc82c4544
282,CBS,P449,originally-aired-on,Gambit,cf421677-f1c7-4014-be63-65c538b608fd
283,NBC,P449,originally-aired-on,Underdog,0e40459c-47ac-4842-9d51-1bf248d75479
284,CBS,P449,originally-aired-on,Underdog,ab9fd026-c25b-4a4b-a7f0-502bd7cdb361


In [34]:
# Save the entries with repeated subject labels; the file name is relative, so it is written
# to the current working directory.
duplicate_subj_data.to_csv(path_or_buf="duplicated_subjects_data.csv", index=False)

In [67]:
# Relations that contain at least one repeated subject label.
duplicate_subj_data["relation"].unique()

array(['P937', 'P1412', 'P103', 'P276', 'P159', 'P140', 'P136', 'P495',
       'P17', 'P361', 'P36', 'P264', 'P407', 'P138', 'P30', 'P176',
       'P449', 'P279', 'P101', 'P364', 'P1376', 'P178', 'P37'],
      dtype=object)

## Check number of answer alternatives for original and deduplicated data

In [7]:
# Collect the distinct answer alternatives (object labels) per relation for both versions of the data.
# NOTE(review): the introduction of this notebook calls `trex_lms_vocab_old` the original file and
# `trex_lms_vocab` the deduplicated one, whereas this cell reads `trex_lms_vocab` and
# `trex_lms_vocab_deduplicated` — confirm which directories are intended.
data_paths = {"original": os.path.join(data_path, "trex_lms_vocab"), "deduplicated": os.path.join(data_path, "trex_lms_vocab_deduplicated")}
all_objects = {}
for key, path in data_paths.items():
    all_objects[key] = {}
    for relation in investigated_relations:
        relation_file = os.path.join(path, relation + '.jsonl')
        try:
            data = utils.read_jsonl_file(relation_file)
        except Exception:
            # Best effort: report the unreadable file and leave the relation out of `all_objects[key]`.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt/SystemExit through.
            print(f"There was an error reading the data for {relation_file}. Skipping.")
            continue

        # Sorted so the option lists (and anything printed from them) are identical on every run;
        # the iteration order of a set of strings is not.
        all_objects[key][relation] = sorted({x['obj_label'] for x in data})
        

There was an error reading the data for /cephyr/users/lovhag/Alvis/projects/pararel/data/trex_lms_vocab_deduplicated/P37.jsonl. Skipping.


In [12]:
# Answer alternatives of P101 that are present in the original data but not in the deduplicated data.
p101_deduplicated = all_objects["deduplicated"]["P101"]
[label for label in all_objects["original"]["P101"] if label not in p101_deduplicated]

['asteroid', 'comet', 'folklore']

In [17]:
# Per relation, print the number of answer alternatives in both versions of the data
# and the alternatives that only occur in the original data.
for relation in investigated_relations:
    # Skip every relation that is missing from either version (e.g. P37, which has been dropped
    # for the deduplicated data) instead of failing with a KeyError.
    if any(relation not in all_objects[version] for version in ("original", "deduplicated")):
        continue
    original_options = all_objects['original'][relation]
    deduplicated_options = all_objects['deduplicated'][relation]
    print(f"Relation {relation} number of options")
    print(f"'original': {len(original_options)}")
    print(f"'deduplicated': {len(deduplicated_options)}")
    print()
    print("Removed options:")
    # Set membership avoids rescanning the deduplicated list for every original option.
    kept_options = set(deduplicated_options)
    print([obj for obj in original_options if obj not in kept_options])
    print("-----------------")

Relation P937 number of options
'original': 93
'deduplicated': 92

Removed options:
['Cincinnati']
-----------------
Relation P1412 number of options
'original': 38
'deduplicated': 38

Removed options:
[]
-----------------
Relation P127 number of options
'original': 194
'deduplicated': 194

Removed options:
[]
-----------------
Relation P103 number of options
'original': 30
'deduplicated': 30

Removed options:
[]
-----------------
Relation P276 number of options
'original': 262
'deduplicated': 242

Removed options:
['Michigan', 'Austria', 'Luxembourg', 'Gujarat', 'Albania', 'Mongolia', 'Latvia', 'Azerbaijan', 'Wisconsin', 'Ghana', 'California', 'Armenia', 'Canada', 'Egypt', 'Bahrain', 'Kazakhstan', 'Texas', 'Switzerland', 'Iran', 'Levant']
-----------------
Relation P159 number of options
'original': 190
'deduplicated': 189

Removed options:
['Golden']
-----------------
Relation P140 number of options
'original': 10
'deduplicated': 10

Removed options:
[]
-----------------
Relation P13