# Prepare the environment

In [None]:
# Mount Google Drive at /content/gdrive so that the Drive paths used further
# down (HEAD_DIR and the concreteness ratings file) can be read and written.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# Drive folder that holds the input feature JSON and receives the
# "*_concrete_specific.json" output written by the processing cell.
HEAD_DIR = "/content/gdrive/MyDrive/MS_postprocess_code/specificity"

# ADJECTIVE SPECIFICITY

## Prep

### Before executing, create a virtual environment (venv) with either Python 3.9 or 3.10 and install nltk and numpy, as well as the WordNet corpus for nltk

In [None]:
!pip install nltk
!pip install numpy

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 578, in _determine_conflicts
    return check_install_conflicts(to_install)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/operations/check.py", line 101, in check_install_conflicts
    package_set, _ = create_package_set_from_installed()
              

In [None]:
# Fetch the WordNet corpus; the `nltk.corpus.wordnet` reader (imported as `wn`
# in the cells below) needs it for every synset lookup.
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Calculate the adjective specificity

In [None]:
from nltk.corpus import wordnet as wn
import numpy as np
import json

In [None]:
def calculate_adjective_specificity(adjective):
    """
    Score an adjective by the size of its WordNet neighbourhood.

    Three counts are taken over all adjective synsets of the word:
    distinct lemma names (synonyms), distinct antonym lemmas, and the total
    number of lemmas in the "similar to" synsets. The score is
    ``1 / ln(1 + synonyms + antonyms + similar + number_of_synsets)``.

    Returns the score as a float, or an explanatory string when WordNet has
    no adjective synset for the word (callers receive that string as-is).
    """
    adj_synsets = wn.synsets(adjective, pos=wn.ADJ)
    if not adj_synsets:
        return f"There are no synsets for the adjective: {adjective}"

    # Distinct lemma names across every synset of the adjective.
    synonym_names = {name for syn in adj_synsets for name in syn.lemma_names()}

    # Distinct antonym lemmas reachable from any lemma of any synset.
    antonym_lemmas = {
        opposite
        for syn in adj_synsets
        for lemma in syn.lemmas()
        for opposite in lemma.antonyms()
    }

    # Lemma count over all semantically similar synsets (not de-duplicated).
    similar_lemma_total = sum(
        len(neighbour.lemmas())
        for syn in adj_synsets
        for neighbour in syn.similar_tos()
    )

    cluster_size = similar_lemma_total + len(antonym_lemmas) + len(synonym_names)
    if cluster_size == 0:
        # All three counts are zero: no neighbourhood to measure against.
        return float("inf")

    return 1/np.log(1+cluster_size+len(adj_synsets))

# NOUN SPECIFICITY

In [None]:
import sys, os
from nltk.corpus import wordnet as wn
import csv

In [None]:
def create_folder(filepath):
    """
    Make sure the directory that will contain ``filepath`` exists.

    Args:
        filepath: Path of a file that is about to be written. Only its
            directory part is created; the file itself is not touched.

    A bare file name has no directory part (it refers to the current working
    directory), so nothing is created in that case instead of failing on
    ``os.makedirs("")``.
    """
    directory = os.path.dirname(filepath)
    if directory:
        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(directory, exist_ok=True)


def read_ratings(inputf):
    """
    Load the noun ratings from a tab-separated ratings file.

    The first line is treated as a header and skipped. A row is kept when its
    ninth column equals "Noun"; its first column becomes the key and its third
    column, with a decimal comma converted to a decimal point, the float value.
    (Presumably these are the word, mean concreteness and dominant
    part-of-speech columns of the Brysbaert et al. file; confirm against the data.)

    Rows with fewer than nine columns, such as blank lines, are skipped rather
    than raising IndexError, and an empty file yields an empty dict.

    Args:
        inputf: Path of the ratings file.

    Returns:
        dict: word -> rating (float), containing nouns only.

    Raises:
        ValueError: If the rating of a noun row cannot be parsed as a float.
    """
    ratings_nouns = {}
    # newline='' lets the csv module do its own line-ending handling.
    with open(inputf, 'r', encoding='utf-8', newline='') as ratings_file:
        reader = csv.reader(ratings_file, delimiter='\t')
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if len(row) > 8 and row[8] == "Noun":
                ratings_nouns[row[0]] = float(row[2].replace(',', '.'))

    return ratings_nouns

def wn_closure(noun, score, verbose=True):
    """
    Measure how many hypernyms lie above the first noun sense of ``noun``.

    The depth is the number of synsets in the transitive hypernym closure of
    the first noun synset. When that closure is empty, the closure of the
    synset's first instance hypernym is used instead, if it has one.

    Args:
        noun: Word looked up with ``wn.synsets(noun, pos='n')``.
        score: Value passed through unchanged as the first element of the
            result tuple (the callers in this notebook pass a concreteness
            rating).
        verbose: When true (the default) print ``{noun: depth}``, which is
            what this function has always done. Pass False to keep large runs
            from flooding the cell output.

    Returns:
        dict: ``{noun: (score, depth, ladder)}`` where ``depth`` is a float
        and ``ladder`` is ``depth * 5 / 20``. An empty dict is returned when
        WordNet has no noun synset for ``noun``.
    """
    noun_senses = wn.synsets(noun, pos='n')
    if not noun_senses:
        # Nothing to measure; callers merge the result with dict.update().
        return {}
    first_sense = noun_senses[0]

    def _hypernyms(synset):
        return synset.hypernyms()

    # Hypernym closure of the first sense, computed once.
    inherited_hypernyms = list(first_sense.closure(_hypernyms))

    # Synsets without hypernyms may still be linked through an
    # "instance of" relation; fall back to the first such hypernym.
    if not inherited_hypernyms:
        instance_of = first_sense.instance_hypernyms()
        if instance_of:
            inherited_hypernyms = list(instance_of[0].closure(_hypernyms))

    depth = float(len(inherited_hypernyms))
    if verbose:
        print({noun: depth})

    # Rescale the depth by 5/20; per the original author's note, 20 is the
    # maximum depth (distance from the entity node) in WordNet 3.0.
    ladder = float(depth * 5) / float(20)

    return {noun: (score, depth, ladder)}

In [None]:
# Base names (without the ".json" extension) of the feature files under HEAD_DIR.
# Only input_files[0] is read by the processing cell below; the commented-out
# entries are presumably alternative inputs to swap in by hand.
input_files = ["both_154_7879_160721_20_05_2025_features_concrete",
               #"both_154_7879_120801_20_05_2025_features",
               #"both_154_7879_121629_20_05_2025_plo_features"
               ]

In [None]:
def _iter_concreteness_words(results):
    """Yield each "concreteness_words" list found under a zero-shot or few-shot answer dict."""
    for entry in results.values():
        for setting in ("zero-shot", "few-shot"):
            if setting not in entry:
                continue
            for answers in entry[setting].values():
                # Answers that are not dicts carry no word list and are skipped.
                if isinstance(answers, dict):
                    yield answers["concreteness_words"]


# Load the input first so the file is closed before any processing starts.
with open(f"{HEAD_DIR}/{input_files[0]}.json", encoding="utf-8") as f:
    data_dict = json.load(f)

# Pass 1: write the adjective scores into slot 7 of each word entry and collect
# the WordNet depth data of every noun that WordNet knows.
wn_top_down_print = {}
for words in _iter_concreteness_words(data_dict):
    for w in words:
        if w[8] == "Adjective":
            adj_score = calculate_adjective_specificity(w[0])
            print(f"Adjective '{w[0]}' has specificity score ",adj_score)
            # Store the specificity score in the concreteness list to keep iteration simple.
            w[7] = adj_score

        if w[8] == "Noun":
            elem = w[0]
            score = w[2]
            wn_synsets = wn.synsets(elem, pos='n')
            if len(wn_synsets) >=1:
                data_correlation = wn_closure(elem, score)
                wn_top_down_print.update(data_correlation)

# Pass 2: min-max normalise the noun specificity onto the range 1..5.
# The bounds depend only on the collected nouns, so they are computed once
# rather than once per noun occurrence.
spec_val = [float(specificity) for _, _, specificity in wn_top_down_print.values()]
if spec_val:
    min_score = min(spec_val)
    max_score = max(spec_val)
    score_range = max_score - min_score

    for words in _iter_concreteness_words(data_dict):
        for w in words:
            # Nouns unknown to WordNet were never collected; leave them untouched.
            if w[8] != "Noun" or w[0] not in wn_top_down_print:
                continue
            rating_human, depth, specificity = wn_top_down_print[w[0]]
            if score_range:
                normalized_score = ((specificity - min_score) / score_range) * (5.0 - 1.0) + 1.0
            else:
                # Every noun has the same specificity: map it to the lower
                # bound of the scale instead of dividing by zero.
                normalized_score = 1.0
            w[5] = rating_human
            w[6] = depth
            w[7] = normalized_score

with open(f"{HEAD_DIR}/{input_files[0]}_concrete_specific.json", "w") as f:
    json.dump(data_dict, f, indent=2)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
{'apartment': 7.0}
Adjective 'generic' has specificity score  0.5138983423697507
{'representation': 5.0}
{'suspect': 8.0}
Adjective 'real' has specificity score  0.28126641406272834
{'news': 5.0}
{'report': 6.0}
Adjective 'different' has specificity score  0.2790553132756236
{'person': 7.0}
{'image': 6.0}
{'caption': 8.0}
{'ai': 8.0}
Adjective 'contraband' has specificity score  0.48089834696298783
{'good': 6.0}
{'bread': 10.0}
{'delivery': 9.0}
Adjective 'fictional' has specificity score  0.45511961331341866
{'scenario': 7.0}
{'man': 10.0}
Adjective 'consistent' has specificity score  0.3001016285004131
Adjective 'real' has specificity score  0.28126641406272834
{'world': 5.0}
{'reporting': 6.0}
{'image': 6.0}
{'caption': 8.0}
{'ai': 8.0}
Adjective 'harmful' has specificity score  0.2835784920513334
Adjective 'misleading' has specificity score  0.5138983423697507
{'context': 6.0}
Adjective 'fake' has spe

# Noun specificity for the Brysbaert et al. concreteness ratings

In [None]:
"""
Usage: python3 specificty3.py [./materials/Concreteness_ratings_Brysbaert_et_al_BRM.txt]
"""

"""
read Breysbart ratings - only nouns
"""

# Read the concreteness ratings, keeping only the rows tagged as nouns.
ratings_cogsci = "/content/gdrive/MyDrive/MS_postprocess_code/concreteness/data/Concreteness_ratings_Brysbaert_et_al_BRM.txt"
keywords = read_ratings(ratings_cogsci)

# Apply the WordNet depth measure to every rated noun that WordNet knows.
wn_top_down_print = {}

for elem, score in keywords.items():
    wn_synsets = wn.synsets(elem, pos='n')
    if len(wn_synsets) >=1:
        data_correlation = wn_closure(elem, score)
        wn_top_down_print.update(data_correlation)

print("Printing output...")

# No cell defined `outfile` (its assignment was commented out), so writing the
# CSV failed with NameError on a fresh kernel.
# NOTE(review): the target is assumed to be specificity3.csv inside HEAD_DIR,
# following the commented-out assignment; adjust if another location is meant.
outfile = os.path.join(HEAD_DIR, "specificity3.csv")
create_folder(outfile)

spec_val = [float(specificity) for _, _, specificity in wn_top_down_print.values()]

# Min-max normalise the specificity onto the range 1..5.
final_scores = {}
if spec_val:
    min_score = min(spec_val)
    max_score = max(spec_val)
    score_range = max_score - min_score
    for k, (rating_human, depth, specificity) in wn_top_down_print.items():
        if score_range:
            normalized_score = ((specificity - min_score) / score_range) * (5.0 - 1.0) + 1.0
        else:
            # All nouns share one specificity value: map it to the lower bound
            # of the scale instead of dividing by zero.
            normalized_score = 1.0
        final_scores[k] = (rating_human, depth, normalized_score,)

# Mode 'w' rather than 'a': re-running the cell must not append a second
# header and a duplicate set of rows to an existing file.
with open(outfile, 'w', encoding='utf-8', newline='') as outcsv:
    # configure writer to write standard csv file
    writer = csv.writer(outcsv, delimiter=',', lineterminator='\n')
    writer.writerow(['token', 'rating-human', 'depth', 'normalized_specificity'])
    for elem, values in final_scores.items():
        # Write item to outcsv
        writer.writerow([elem, values[0], values[1], values[2]])

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
{'paleontology': 10.0}
{'premises': 8.0}
{'preview': 6.0}
{'rehearsal': 8.0}
{'riffle': 7.0}
{'shortcut': 7.0}
{'tenet': 8.0}
{'thud': 6.0}
{'abortionist': 13.0}
{'aerospace': 5.0}
{'alumni': 10.0}
{'aphrodisiac': 8.0}
{'bandwidth': 5.0}
{'blip': 8.0}
{'cameo': 17.0}
{'chump': 9.0}
{'clatter': 7.0}
{'concussion': 10.0}
{'cosmos': 5.0}
{'decade': 5.0}
{'defender': 9.0}
{'discoloration': 5.0}
{'duelist': 9.0}
{'facade': 8.0}
{'facsimile': 8.0}
{'fermentation': 6.0}
{'fool': 9.0}
{'genetics': 11.0}
{'ghost': 9.0}
{'grammar': 10.0}
{'gridlock': 8.0}
{'job': 6.0}
{'liberator': 10.0}
{'luminescence': 11.0}
{'menopause': 5.0}
{'murmur': 6.0}
{'nappy': 10.0}
{'osteoporosis': 9.0}
{'ovulation': 4.0}
{'particulate': 8.0}
{'prettiness': 6.0}
{'professorship': 8.0}
{'region': 4.0}
{'rewriter': 11.0}
{'scheduler': 6.0}
{'screener': 11.0}
{'slut': 12.0}
{'storminess': 9.0}
{'supremacist': 9.0}
{'symptom': 6.0}
{'tidbit