# Setup

In [1]:
import os
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer
from transformers import RobertaModel, RobertaTokenizer

import utils
import vsm

In [2]:
VSM_HOME = os.path.join('data', 'vsmdata')
DATA_HOME = os.path.join('data', 'wordrelatedness')

In [3]:
utils.fix_random_seeds()

In [4]:
dev_df = pd.read_csv(
    os.path.join(DATA_HOME, "cs224u-wordrelatedness-dev.csv"))

There are many different options for pre-trained weights. See the following: https://huggingface.co/models. It might be worth exploring other weights.

# Decontextualized Approach Model

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

def run_knn_score_model(vsm_df, dev_df, test_size=0.20):
    
    # Complete `knn_feature_matrix` for this step.
    X = knn_feature_matrix(vsm_df, dev_df)
    
    # Get the values of the 'score' column in `dev_df`
    # and store them in a list or array `y`.
    y = dev_df['score']

    
    # Use `train_test_split` to split (X, y) into train and
    # test protions, with `test_size` as the test size.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)


    # Instantiate a `KNeighborsRegressor` with default arguments:
    neigh = KNeighborsRegressor()

    # Fit the model on the training data:
    neigh.fit(X_train, y_train)

    # Return the value of `score` for your model on the test split
    # you created above:
    return -(neigh.score(X_test, y_test))
    
    
def knn_feature_matrix(vsm_df, rel_df):
    # Complete `knn_represent` and use it to create a feature
    # matrix `np.array`:
    matrix = np.zeros((len(rel_df), len(vsm_df.columns)*2))
    for ind in rel_df.index:
        matrix[ind] = knn_represent(rel_df['word1'][ind], rel_df['word2'][ind], vsm_df)
    return matrix

def knn_represent(word1, word2, vsm_df):
    # Use `vsm_df` to get vectors for `word1` and `word2`
    # and concatenate them into a single vector:
    return np.concatenate((vsm_df.loc[word1], vsm_df.loc[word2]), axis=None)


In [5]:
bert_weights_name = 'bert-base-uncased'

bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
bert_model = BertModel.from_pretrained(bert_weights_name)

In [41]:
def apply_bert(rel_df, layer, pool_func): 
    vocab = list(vsm_index.index)
    pooled_df = vsm.create_subword_pooling_vsm(vocab, bert_tokenizer, bert_model, layer, pool_func)
    return pooled_df

def evaluate_pooled_bert(rel_df, layer, pool_func):
    pooled_df = apply_bert(rel_df, layer, pool_func)
    return vsm.word_relatedness_evaluation(rel_df, pooled_df, -(vsm.cosine))


In [42]:
pool_func = vsm.mean_pooling
for val in range(1,4):
    layer = val
    pred_df, rho = evaluate_pooled_bert(dev_df, layer, pool_func)
    print(layer, rho)

TypeError: bad operand type for unary -: 'function'

Record of pooling func, number of layers, and resulting rho

| pooling func| layer | rho |
| --- | --- | --- |
| max | 1 | 0.2707496460162731 |
| max | 2 | 0.20702414483988724 |
| max | 3 | 0.17744729074571614 |
| mean | 1 | 0.2757425333620801 |
| mean | 2 | 0.217700456830832 |
| mean | 3 | 0.18617500500667575 |
| min | 1 | 0.28747309266119614 |
| min | 2 | 0.2211592952130484 | 
| min | 3 | 0.19272403506986122 |
| last | 1 | 0.26255946375943245 |
| last | 2 | 0.20210332109799414 | 
| last | 3 | 0.1720367373470963 |

# Applying Other Models with BERT

In [25]:
count_df = pd.read_csv(os.path.join(VSM_HOME, 'giga_window20-flat.csv.gz'), index_col=0)
yelp_df = pd.read_csv(os.path.join(VSM_HOME, 'yelp_window20-flat.csv.gz'), index_col=0)
combined_df = count_df + yelp_df
print(combined_df.shape) # sanity check this matrix addition

(6000, 6000)


In [35]:
def apply_ppmi(df):
    return vsm.pmi(df)

def evaluate(df):
    pred_df, rho = vsm.word_relatedness_evaluation(dev_df, df)
    print(rho)
    return rho

def ppmi_model():
    pred_df, rho = evaluate_pooled_bert(dev_df, 1, vsm.mean_pooling)
    pipeline = apply_ppmi(pred_df)
    return pipeline

In [36]:
ppmi = ppmi_model()
evaluate(ppmi)

             0         1         2         3         4         5         6    \
):     -0.576097  0.310341 -0.532733 -0.833050 -0.626199 -0.241390 -0.259917   
);     -0.056739  0.058793 -0.243109 -0.800296 -0.119222  0.232650  0.136607   
..     -0.271509 -0.009211 -0.190293 -0.275234 -0.276218 -0.113867 -0.033713   
...    -0.380597 -0.054661 -0.161327 -0.299695 -0.299188 -0.104158  0.040436   
:(     -0.425129  0.215213 -1.130576 -1.066704 -0.371664 -0.021424 -0.539762   
...          ...       ...       ...       ...       ...       ...       ...   
zero    0.368413  0.714821 -0.532399 -0.153238 -0.184203  0.009934  0.028351   
zinc    0.703263  0.839075 -0.582964 -0.137509  0.656363 -0.004265 -0.014998   
zombie  0.238683  0.051997 -0.199597 -0.463003  0.261023 -0.319047 -0.123983   
zone    0.226440  0.033647 -0.452572 -0.330314  0.337524 -0.364947 -0.073664   
zoo     0.658474  0.109369  0.496630 -0.400574  0.463277 -0.611583 -0.014404   

             7         8         9    .

TypeError: can only concatenate str (not "numpy.float64") to str

# Aggregated Approach

In [21]:
vsm_index = pd.read_csv(
    os.path.join(VSM_HOME, 'yelp_window5-scaled.csv.gz'),
    usecols=[0], index_col=0)
vocab = list(vsm_index.index)
vocab_ids = {w: vsm.hf_encode(w, bert_tokenizer)[0] for w in vocab}