In [1]:
from transformers import BertTokenizer
import pandas as pd
import numpy as np

In [2]:
# Name of the pretrained checkpoint whose tokenizer is loaded on the next line.
bert_model = "bert-large-uncased"
# Tokenizer for that checkpoint; get_number_of_tokens() calls it to count the
# tokens of each compound.
tokenizer = BertTokenizer.from_pretrained(bert_model)

In [3]:
# Column layout of the statistics table. The frame starts without rows;
# the cells below fill the columns in one group at a time.
stat_columns = [
    "Compound",
    "Modifier",
    "Head",
    "Number of tokens",
    "Freq Compound",
    "Freq Head",
    "Freq Modifier",
    "Concreteness Compound",
    "Concreteness Head",
    "Concreteness Modifier",
    "Predicted LMD",
    "Predicted ST",
]
df = pd.DataFrame(columns=stat_columns)
df

Unnamed: 0,Compound,Modifier,Head,Number of tokens,Freq Compound,Freq Head,Freq Modifier,Concreteness Compound,Concreteness Head,Concreteness Modifier,Predicted LMD,Predicted ST


# Set compound

In [4]:
# Read the compound list and copy its three text columns, lower-cased, into
# the statistics table: Compound -> Compound, left -> Modifier, right -> Head.
cdf = pd.read_csv("data/compounds.csv", index_col=0)
for target_col, source_col in (("Compound", "Compound"),
                               ("Modifier", "left"),
                               ("Head", "right")):
    # Bracket access is used for both frames so the column names are explicit.
    df[target_col] = cdf[source_col].str.lower()
df

Unnamed: 0,Compound,Modifier,Head,Number of tokens,Freq Compound,Freq Head,Freq Modifier,Concreteness Compound,Concreteness Head,Concreteness Modifier,Predicted LMD,Predicted ST
0,aboveground,above,ground,,,,,,,,,
1,airbase,air,base,,,,,,,,,
2,airborne,air,borne,,,,,,,,,
3,aircraft,air,craft,,,,,,,,,
4,airfield,air,field,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
624,wordplay,word,play,,,,,,,,,
625,worldwide,world,wide,,,,,,,,,
626,wristwatch,wrist,watch,,,,,,,,,
627,yardstick,yards,tick,,,,,,,,,


# Set number of tokens

In [5]:
def get_number_of_tokens(word):
    """Return how many tokens the module-level ``tokenizer`` produces for ``word``.

    Args:
        word: The string to tokenize.

    Returns:
        The number of input ids produced for ``word``, excluding any special
        tokens the tokenizer would otherwise add around the sequence.
    """
    # Ask the tokenizer not to add special tokens instead of subtracting a
    # hard-coded 2 afterwards, so the count stays correct if ``bert_model``
    # is switched to a tokenizer that adds a different number of them.
    return len(tokenizer(word, add_special_tokens=False)["input_ids"])

df["Number of tokens"] = df["Compound"].apply(get_number_of_tokens)
df

Unnamed: 0,Compound,Modifier,Head,Number of tokens,Freq Compound,Freq Head,Freq Modifier,Concreteness Compound,Concreteness Head,Concreteness Modifier,Predicted LMD,Predicted ST
0,aboveground,above,ground,2,,,,,,,,
1,airbase,air,base,2,,,,,,,,
2,airborne,air,borne,1,,,,,,,,
3,aircraft,air,craft,1,,,,,,,,
4,airfield,air,field,1,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
624,wordplay,word,play,2,,,,,,,,
625,worldwide,world,wide,1,,,,,,,,
626,wristwatch,wrist,watch,2,,,,,,,,
627,yardstick,yards,tick,2,,,,,,,,


# Set frequencies

In [6]:
# Build a lookup from lower-cased word to integer frequency. Each non-blank
# line of data/freq.txt is expected to hold exactly two whitespace-separated
# fields: the word and its count.
word_to_freq = {}
with open("data/freq.txt") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # A blank line (e.g. the one produced by a trailing newline at the
            # end of the file) has no entry; skipping it avoids a ValueError
            # when unpacking below.
            continue
        word, freq = fields
        word_to_freq[word.lower()] = int(freq)


def get_word_frequency(word):
    """Return the frequency stored for ``word`` in ``word_to_freq``.

    Args:
        word: Lookup key; the keys of ``word_to_freq`` are lower-cased.

    Returns:
        The integer frequency read from the frequency file.

    Raises:
        KeyError: If ``word`` has no entry in ``word_to_freq``.
    """
    return word_to_freq[word]

df["Freq Compound"] = df["Compound"].apply(get_word_frequency)
df["Freq Head"] = df["Head"].apply(get_word_frequency)
df["Freq Modifier"] = df["Modifier"].apply(get_word_frequency)
df.head()

Unnamed: 0,Compound,Modifier,Head,Number of tokens,Freq Compound,Freq Head,Freq Modifier,Concreteness Compound,Concreteness Head,Concreteness Modifier,Predicted LMD,Predicted ST
0,aboveground,above,ground,2,80,100,100,,,,,
1,airbase,air,base,2,100,100,100,,,,,
2,airborne,air,borne,1,100,100,100,,,,,
3,aircraft,air,craft,1,100,100,100,,,,,
4,airfield,air,field,1,100,100,100,,,,,


# Set concreteness ratings

In [7]:
# Build a lookup from lower-cased word to its "Conc.M" value in the
# concreteness ratings file. The ``str`` converter keeps every entry of the
# "Word" column as text, so ``.str.lower()`` applies to all rows.
conc_ratings = pd.read_csv("data/Concreteness_ratings_Brysbaert_et_al_BRM.txt",
                           delimiter="\t",
                           converters={'Word' : str})
# Pairing the two columns directly replaces the row-by-row loop. That loop
# printed and carried on when lower-casing failed, which would have stored the
# current rating under the previous row's word. If a word occurs more than
# once, the last occurrence wins, as before.
word_to_conc = dict(zip(conc_ratings["Word"].str.lower(),
                        conc_ratings["Conc.M"].astype(float).tolist()))


def get_concreteness(word):
    """Return the concreteness value stored for ``word``.

    Args:
        word: Lookup key; the keys of ``word_to_conc`` are lower-cased.

    Returns:
        The float stored in ``word_to_conc``, or ``np.nan`` when the word has
        no entry (a "Could not find ..." line is printed in that case).
    """
    if word in word_to_conc:
        return word_to_conc[word]
    print(f"Could not find {word}")
    return np.nan

df["Concreteness Compound"] = df["Compound"].apply(get_concreteness)
df["Concreteness Head"] = df["Head"].apply(get_concreteness)
df["Concreteness Modifier"] = df["Modifier"].apply(get_concreteness)
df.head()

Could not find beetroot
Could not find carryout
Could not find chamberlain
Could not find claptrap
Could not find crosswise
Could not find dovetail
Could not find edgewise
Could not find gangplank
Could not find goatherd
Could not find greasepaint
Could not find hatpin
Could not find hitherto
Could not find humdrum
Could not find masthead
Could not find onetime
Could not find poppycock
Could not find sickbay
Could not find threadbare
Could not find upshot
Could not find wanderlust
Could not find borne
Could not find lain
Could not find whet
Could not find yards


Unnamed: 0,Compound,Modifier,Head,Number of tokens,Freq Compound,Freq Head,Freq Modifier,Concreteness Compound,Concreteness Head,Concreteness Modifier,Predicted LMD,Predicted ST
0,aboveground,above,ground,2,80,100,100,4.03,4.77,3.33,,
1,airbase,air,base,2,100,100,100,4.29,3.86,4.11,,
2,airborne,air,borne,1,100,100,100,3.45,,4.11,,
3,aircraft,air,craft,1,100,100,100,4.4,3.48,4.11,,
4,airfield,air,field,1,100,100,100,4.7,4.26,4.11,,


# Set predictions

In [10]:
####################
# LMD - prediction #
####################
def string_to_list(string):
    """Parse a stringified list of numbers such as ``"[1.0, 2.5]"`` into floats.

    Args:
        string: Text made of an opening bracket, comma-separated numbers and
            a closing bracket. Whitespace around the brackets or the numbers
            is ignored.

    Returns:
        A list of floats in the order they appear; ``[]`` when there is
        nothing between the brackets.

    Raises:
        ValueError: If an element cannot be converted by ``float``.
    """
    # Drop surrounding whitespace, then the first and last character
    # (the brackets), then any padding left inside them.
    inner = string.strip()[1:-1].strip()
    if not inner:
        # "[]" would otherwise reach float("") and raise.
        return []
    # Split on the bare comma: float() tolerates surrounding spaces, so both
    # "1.0, 2.0" and "1.0,2.0" are accepted.
    return [float(item) for item in inner.split(",")]

def _read_prediction_column(csv_path, column, position):
    """Load ``Compound`` and ``column`` from a CSV and keep one element per row.

    Each cell of ``column`` holds a stringified list of floats; it is parsed
    with ``string_to_list`` and replaced by the element at ``position``.

    Args:
        csv_path: Path of the experiment CSV; its first column is the index.
        column: Name of the column holding the stringified lists.
        position: Index of the list element to keep
            (presumably a layer index -- TODO confirm).

    Returns:
        A two-column frame (``Compound`` and ``column``) in which ``column``
        now holds the selected float.
    """
    frame = pd.read_csv(csv_path, index_col=0)[["Compound", column]]
    parsed = frame[column].apply(string_to_list)
    frame[column] = parsed.apply(lambda values: values[position])
    return frame


# NOTE(review): both assignments below align on the row index, not on the
# "Compound" column -- this assumes the experiment files list the compounds
# in the same order as ``df``; confirm.
df_exp1 = _read_prediction_column("data/Experiment1_LMD.csv",
                                  "bertlarge_contextual_lmd",
                                  20)
df["Predicted LMD"] = df_exp1["bertlarge_contextual_lmd"]


####################
# ST  - prediction #
####################
df_exp2 = _read_prediction_column("data/Experiment2_TRAN.csv",
                                  "bertlarge_contextual_w=0.5_tran",
                                  18)
df["Predicted ST"] = df_exp2["bertlarge_contextual_w=0.5_tran"]

df

Unnamed: 0,Compound,Modifier,Head,Number of tokens,Freq Compound,Freq Head,Freq Modifier,Concreteness Compound,Concreteness Head,Concreteness Modifier,Predicted LMD,Predicted ST
0,aboveground,above,ground,2,80,100,100,4.03,4.77,3.33,5.350659,5.083935
1,airbase,air,base,2,100,100,100,4.29,3.86,4.11,4.995303,5.210649
2,airborne,air,borne,1,100,100,100,3.45,,4.11,4.162646,4.637691
3,aircraft,air,craft,1,100,100,100,4.40,3.48,4.11,4.567049,4.992505
4,airfield,air,field,1,100,100,100,4.70,4.26,4.11,4.749762,4.629141
...,...,...,...,...,...,...,...,...,...,...,...,...
624,wordplay,word,play,2,100,100,100,1.87,3.24,3.56,4.375283,4.783306
625,worldwide,world,wide,1,100,100,100,2.52,3.06,4.36,4.184853,5.041236
626,wristwatch,wrist,watch,2,100,100,100,4.85,4.61,4.93,4.384178,5.010115
627,yardstick,yards,tick,2,97,100,100,4.69,4.57,,5.174858,3.762036


# Save

In [21]:
# Write the assembled statistics table to disk; the row index is not written.
df.to_csv("./data/statistic_exp.csv", index=False)