<a href="https://colab.research.google.com/github/kayWHYdee/Computational-Psycholinguistics/blob/main/psychh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import gensim.downloader as api

# Fetch the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
w2v_model = api.load('word2vec-google-news-300')

# Save the model locally (optional, but recommended) so later cells can reload it
# from disk instead of downloading again.
# NOTE: this writes the file with the object's own `.save()` method; the analysis
# cell below reads the same path back with `KeyedVectors.load(...)` rather than
# `load_word2vec_format(...)`, even though the file name ends in `.bin`.
w2v_model.save('/GoogleNews-vectors-negative300.bin')



In [None]:
import os
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from scipy.stats import spearmanr, rankdata
from gensim.models import KeyedVectors
from sentence_transformers import SentenceTransformer

# -------------------------------------------------------------------
# 1. LOAD & PREPROCESS SPP DATA (prime, target, relatedness, rt)
# -------------------------------------------------------------------
def load_spp_data(csv_path):
    """Load the SPP trial table and keep only the columns the analysis needs.

    Parameters
    ----------
    csv_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``. The table must contain the
        columns ``prime``, ``target``, ``coding.RESP`` and ``target.RT``.

    Returns
    -------
    pandas.DataFrame
        A new frame (original row labels preserved) with the columns
        ``prime``, ``target``, ``relatedness``, ``rt`` and ``logrt``:

        * rows missing any of the four source values are dropped;
        * ``relatedness`` is ``coding.RESP`` recoded as 1 -> 1, 2 -> 0
          (any other code becomes NaN and the row is kept);
        * ``logrt`` is ``-log(rt)`` for ``rt > 0`` and NaN otherwise.

    Raises
    ------
    KeyError
        If one of the required columns is absent from the CSV.
    """
    wanted = ['prime', 'target', 'relatedness', 'rt']

    # Build the result as one chain of non-mutating steps. The previous version
    # called dropna(inplace=True) and assigned columns on a column slice, which
    # is the chained-assignment pattern pandas warns about.
    trials = (
        pd.read_csv(csv_path)
        .rename(columns={'coding.RESP': 'relatedness', 'target.RT': 'rt'})
        .loc[:, wanted]
        .dropna(subset=wanted)
    )

    # Mask non-positive RTs to NaN *before* taking the log so no warning is
    # raised and the result matches the original "NaN when rt <= 0" rule.
    positive_rt = trials['rt'].where(trials['rt'] > 0)

    return trials.assign(
        # map relatedness to 0/1; codes other than 1/2 turn into NaN
        relatedness=trials['relatedness'].map({1: 1, 2: 0}),
        # negated log-RT
        logrt=-np.log(positive_rt),
    )

# -------------------------------------------------------------------
# 2. LOAD STATIC & CONTEXTUAL EMBEDDING MODELS
# -------------------------------------------------------------------
def load_word2vec(path):
    """Open saved ``KeyedVectors`` from ``path`` and compute their mean vector.

    The file is opened with ``mmap='r'``. Returns a ``(vectors, mean_vector)``
    tuple, where the mean is taken over axis 0 of ``vectors.vectors``; callers
    in this notebook use it as the fallback for words missing from the model.
    """
    keyed_vectors = KeyedVectors.load(path, mmap='r')
    centroid = np.mean(keyed_vectors.vectors, axis=0)
    return keyed_vectors, centroid

def load_sbert(device='cpu'):
    """Instantiate the 'all-mpnet-base-v2' ``SentenceTransformer`` on ``device``."""
    model_name = 'all-mpnet-base-v2'
    return SentenceTransformer(model_name, device=device)

# -------------------------------------------------------------------
# 3. EMBEDDING & SIMILARITY UTILS
# -------------------------------------------------------------------
def get_w2v_embedding(word, w2v, mean_vec):
    """Look up ``word`` in ``w2v``; return ``mean_vec`` when the word is absent."""
    if word not in w2v:
        return mean_vec
    return w2v[word]

def cosine_similarity(a, b):
    """Cosine of the angle between vectors ``a`` and ``b``.

    Returns ``None`` when either argument is ``None`` or has a Euclidean norm
    of zero, since the cosine is undefined in those cases.
    """
    if a is None:
        return None
    if b is None:
        return None

    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return None

    return np.dot(a, b) / (norm_a * norm_b)

# -------------------------------------------------------------------
# 4. COMPUTE SIMILARITIES & SPEARMAN CORRELATIONS
# -------------------------------------------------------------------
def compute_correlations(df, w2v, mean_vec, sbert, batch_size=64):
    """Correlate prime-target similarity with reaction time (Spearman).

    For every row of ``df`` the cosine similarity between prime and target is
    computed twice: once from static Word2Vec vectors (lower-cased lookup,
    ``mean_vec`` for missing words) and once from SBERT embeddings of the
    words as written. Rows where either similarity is undefined are skipped.
    The two Spearman results against ``rt`` are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide string columns ``prime`` and ``target`` and a numeric
        column ``rt``.
    w2v : mapping-like
        Word -> vector lookup supporting ``in`` and ``[]``.
    mean_vec : array-like
        Fallback vector for words absent from ``w2v``.
    sbert : object with ``encode(sentences, batch_size=..., convert_to_numpy=True)``
        Sentence-embedding model.
    batch_size : int, optional
        Batch size handed to ``sbert.encode`` (new, defaulted parameter).

    Returns
    -------
    (list, list)
        The Word2Vec similarities and the SBERT similarities, row-aligned.
    """
    primes = list(df['prime'])
    targets = list(df['target'])

    # Encode each distinct word exactly once, in batches. The previous version
    # called sbert.encode() twice per row on single strings, so the recorded
    # run over 427,471 rows took more than three hours. Batched encoding may
    # differ from one-at-a-time encoding only at floating-point noise level.
    vocab = list(dict.fromkeys(primes + targets))
    sbert_vecs = {}
    if vocab:
        encoded = sbert.encode(vocab, batch_size=batch_size, convert_to_numpy=True)
        sbert_vecs = dict(zip(vocab, encoded))

    sims_w2v, sims_sbert, rts = [], [], []
    rows = zip(primes, targets, df['rt'])
    for p, t, rt in tqdm(rows, total=len(df), desc="Embedding & collecting"):
        # static
        sim_w = cosine_similarity(
            get_w2v_embedding(p.lower(), w2v, mean_vec),
            get_w2v_embedding(t.lower(), w2v, mean_vec),
        )
        # SBERT (cached lookups, keyed by the word exactly as it appears)
        sim_s = cosine_similarity(sbert_vecs[p], sbert_vecs[t])
        if sim_w is None or sim_s is None:
            continue
        sims_w2v.append(sim_w)
        sims_sbert.append(sim_s)
        rts.append(rt)

    # spearmanr ranks its inputs itself (average ranks for ties), so the former
    # explicit rankdata() pass was redundant; the printed values are unchanged.
    print("Spearman (Word2Vec vs RT):", *spearmanr(sims_w2v, rts))
    print("Spearman (SBERT vs RT):   ", *spearmanr(sims_sbert, rts))
    return sims_w2v, sims_sbert

# -------------------------------------------------------------------
# 5. AMBIGUOUS-PRIMES ANALYSIS
# -------------------------------------------------------------------
def analyze_ambiguous_primes(df, ambiguous_words, w2v, mean_vec, sbert):
    """Tabulate prime-target similarities for a chosen list of primes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``prime`` and ``target`` columns.
    ambiguous_words : iterable of str
        Primes to look up in ``df``.
    w2v, mean_vec
        Static embedding lookup and its fallback vector (see
        ``get_w2v_embedding``); words are lower-cased before lookup.
    sbert : object with ``encode(..., convert_to_numpy=True)``
        Sentence-embedding model; words are encoded as written.

    Returns
    -------
    pandas.DataFrame
        One row per matching trial with columns ``prime`` (as given in
        ``ambiguous_words``), ``target``, ``sim_w2v`` and ``sim_sbert``.
        The columns are present even when nothing matches.

    Notes
    -----
    Primes are matched ignoring case and surrounding whitespace. With the
    previous exact ``==`` comparison the recorded run found zero rows for
    every prime and returned a column-less frame; a case mismatch between the
    CSV and the word list is the presumed cause (confirm against the data).
    """
    columns = ['prime', 'target', 'sim_w2v', 'sim_sbert']

    # Normalise the prime column once instead of once per requested word.
    prime_keys = df['prime'].astype(str).str.strip().str.lower()

    records = []
    for prime in tqdm(ambiguous_words, desc="Primes"):
        subset = df[prime_keys == prime.strip().lower()]
        if subset.empty:
            continue

        # The prime's embeddings do not depend on the row: compute them once.
        v_p = get_w2v_embedding(prime.lower(), w2v, mean_vec)
        emb_p = sbert.encode(prime, convert_to_numpy=True)

        # Encode each distinct target once; trials repeat the same pairs.
        targets = subset['target'].tolist()
        distinct_targets = list(dict.fromkeys(targets))
        target_embs = dict(zip(
            distinct_targets,
            sbert.encode(distinct_targets, convert_to_numpy=True),
        ))

        for t in targets:
            # static
            v_t = get_w2v_embedding(t.lower(), w2v, mean_vec)
            records.append({
                'prime': prime,
                'target': t,
                'sim_w2v': cosine_similarity(v_p, v_t),
                # SBERT
                'sim_sbert': cosine_similarity(emb_p, target_embs[t]),
            })

    return pd.DataFrame(records, columns=columns)

# -------------------------------------------------------------------
# 6. MAIN
# -------------------------------------------------------------------
# Driver for the similarity-vs-RT analysis: loads the SPP table and both
# embedding models, prints the Spearman correlations, then inspects a short
# list of hand-picked primes.
if __name__ == "__main__":
    # Use the GPU for SBERT when one is available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Input CSV, read from the filesystem root.
    spp_csv = "/SPP.csv"

    # Same path the first cell writes with `w2v_model.save(...)`.
    w2v_bin = "/GoogleNews-vectors-negative300.bin"  # Update with the correct path

    # load data & models
    df_spp       = load_spp_data(spp_csv)
    w2v_model, mv = load_word2vec(w2v_bin)  # rebinds `w2v_model` from the first cell; `mv` is the mean vector
    sbert_model  = load_sbert(device=device)

    # correlations (results are printed; the returned similarity lists are discarded here)
    compute_correlations(df_spp, w2v_model, mv, sbert_model)

    # ambiguous primes
    # NOTE(review): in the recorded output below, every one of these primes
    # matched zero rows and the printed frame was empty — check how primes are
    # spelled/cased in the CSV.
    ambigs = ["bank","bat","match","spinach","chance"]
    ambig_df = analyze_ambiguous_primes(df_spp, ambigs, w2v_model, mv, sbert_model)
    print(ambig_df.head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding & collecting: 100%|██████████| 427471/427471 [3:15:54<00:00, 36.37it/s]


Spearman (Word2Vec vs RT): -0.023152080235850397 8.94182452417982e-52
Spearman (SBERT vs RT):    -0.018697974551064547 2.2577062921065268e-34


Primes:   0%|          | 0/5 [00:00<?, ?it/s]
Targets for 'bank': 0it [00:00, ?it/s][A
                                      [A
Targets for 'bat': 0it [00:00, ?it/s][A
                                     [A
Targets for 'match': 0it [00:00, ?it/s][A
Primes:  60%|██████    | 3/5 [00:00<00:00, 25.29it/s]
Targets for 'spinach': 0it [00:00, ?it/s][A
                                         [A
Targets for 'chance': 0it [00:00, ?it/s][A
Primes: 100%|██████████| 5/5 [00:00<00:00, 25.89it/s]


Empty DataFrame
Columns: []
Index: []


In [None]:
# Dependency pin for this Colab session. In the recorded output this replaced
# numpy 1.26.4 with 1.25.2, and pip reported version conflicts with blosc2,
# thinc and tensorflow.
# !pip install gensim
!pip install numpy==1.25.2
# !pip install --force-reinstall gensim

Collecting numpy==1.25.2
  Downloading numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blosc2 3.3.1 requires numpy>=1.26, but you have numpy 1.25.2 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.25.2 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.25.2 which is incompatible.[0m[31m
[

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.optim import AdamW

from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, Dataset
import numpy as np
import torch

# Pick the GPU when available. In the visible part of this cell `device` is
# only applied to the fine-tuning model further down; `bert_model` below is
# never moved to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load datasets
spp_data = pd.read_csv("/content/spp (1).csv")
raw_stimuli_data = pd.read_csv("/content/raw_stimuli.csv")
constraints_base = pd.read_csv("/content/constraints_bert-base-uncased.csv")
# NOTE(review): `constraints_large` is not referenced again in the visible part of the notebook.
constraints_large = pd.read_csv("/content/constraints_bert-large-uncased.csv")

# Prepare word-level and sentence-level priming datasets
# (column subsets of the frames loaded above)
word_priming_data = spp_data[['target', 'prime', 'unrelated', 'relation']]
sentence_priming_data = raw_stimuli_data[['target', 'related', 'unrelated', 'relation', 'context']]

# Load BERT model and tokenizer (used by get_bert_embedding for feature extraction)
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def get_bert_embedding(text):
    """Return the mean-pooled last-layer BERT embedding of ``text`` as a NumPy array.

    ``text`` is tokenized with the module-level ``bert_tokenizer`` (truncation
    and padding enabled) and run through the module-level ``bert_model``
    without gradient tracking. The last hidden state is averaged over the
    token axis and squeezed, so a single string yields a 1-D vector.

    The inputs are moved to whatever device ``bert_model`` currently lives on
    and the result is brought back to the CPU before conversion. On a CPU
    model this is exactly the previous behaviour; with the model on a GPU the
    previous version failed because the inputs stayed on the CPU and
    ``.numpy()`` was called on a CUDA tensor.
    """
    model_device = next(bert_model.parameters()).device
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    inputs = inputs.to(model_device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    pooled = outputs.last_hidden_state.mean(dim=1).squeeze()
    return pooled.cpu().numpy()

# Load pre-trained Skip-gram model (Google News Word2Vec)
# Read from the mounted Drive path in word2vec binary format
# (`load_word2vec_format(..., binary=True)`). This is a different path and
# loader from the root-level file the first cell writes with `.save()`.
# NOTE(review): in the visible notebook `skipgram_model` is only used by
# get_skipgram_embedding, which is itself never called.
skipgram_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/GoogleNews-vectors-negative300.bin', binary=True)

def get_skipgram_embedding(word):
    """Return the skip-gram vector for ``word`` from ``skipgram_model``.

    Out-of-vocabulary words yield an all-zero vector of length
    ``skipgram_model.vector_size``.
    """
    if word not in skipgram_model:
        # Return zero vector for OOV words
        return np.zeros(skipgram_model.vector_size)
    return skipgram_model[word]

# Word-Level Priming with BERT
# Each row yields two samples that share the same target embedding:
#   [target ; prime]     -> label 1 (related)
#   [target ; unrelated] -> label 0 (unrelated)
# NOTE(review): the recorded run scored ~0.32 accuracy on this roughly balanced
# two-class task, i.e. below chance. Worth checking the feature construction
# and the random row-level split before interpreting the number — confirm.
word_X, word_y = [], []

for _, row in word_priming_data.iterrows():
    target_emb = get_bert_embedding(row['target'])
    prime_emb = get_bert_embedding(row['prime'])
    unrelated_emb = get_bert_embedding(row['unrelated'])

    related_pair = np.concatenate((target_emb, prime_emb))
    unrelated_pair = np.concatenate((target_emb, unrelated_emb))
    word_X.extend([related_pair, unrelated_pair])
    word_y.extend([1, 0])  # Related, then Unrelated

word_X, word_y = np.array(word_X), np.array(word_y)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    word_X, word_y, test_size=0.2, random_state=42
)

clf_word_bert = LogisticRegression(max_iter=1000)
clf_word_bert.fit(X_train, y_train)

y_pred = clf_word_bert.predict(X_test)
word_bert_accuracy = accuracy_score(y_test, y_pred)
print("Word-Level Priming Classifier Accuracy (BERT):", word_bert_accuracy)
print(classification_report(y_test, y_pred))

# Word-Level Priming with Context (BERT)
# Same pairing scheme as the plain word-level classifier, except that every
# word is embedded as "<context_word> <word>".
# NOTE(review): the recorded run scored ~0.31 accuracy here, again below
# chance for a roughly balanced two-class task — confirm the setup.
word_X_context, word_y_context = [], []

for _, row in spp_data.iterrows():
    context_word = row['context_word']
    target_emb = get_bert_embedding(f"{context_word} {row['target']}")
    prime_emb = get_bert_embedding(f"{context_word} {row['prime']}")
    unrelated_emb = get_bert_embedding(f"{context_word} {row['unrelated']}")

    related_pair = np.concatenate((target_emb, prime_emb))
    unrelated_pair = np.concatenate((target_emb, unrelated_emb))
    word_X_context.extend([related_pair, unrelated_pair])
    word_y_context.extend([1, 0])  # Related, then Unrelated

word_X_context, word_y_context = np.array(word_X_context), np.array(word_y_context)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    word_X_context, word_y_context, test_size=0.2, random_state=42
)

clf_word_context = LogisticRegression(max_iter=1000)
clf_word_context.fit(X_train, y_train)

y_pred = clf_word_context.predict(X_test)
word_context_accuracy = accuracy_score(y_test, y_pred)
print("Word-Level Priming Classifier Accuracy with Context (BERT):", word_context_accuracy)
print(classification_report(y_test, y_pred))

# Sentence-Level Priming with BERT
# The "[MASK]" placeholder in each context sentence is filled with the related
# word (label 1) or the unrelated word (label 0); the filled sentence embedding
# is concatenated after the target-word embedding.
sentence_X, sentence_y = [], []

for _, row in sentence_priming_data.iterrows():
    context_template = row['context']
    target_emb = get_bert_embedding(row['target'])
    related_context_emb = get_bert_embedding(context_template.replace("[MASK]", row['related']))
    unrelated_context_emb = get_bert_embedding(context_template.replace("[MASK]", row['unrelated']))

    related_pair = np.concatenate((target_emb, related_context_emb))
    unrelated_pair = np.concatenate((target_emb, unrelated_context_emb))
    sentence_X.extend([related_pair, unrelated_pair])
    sentence_y.extend([1, 0])  # Related, then Unrelated

sentence_X, sentence_y = np.array(sentence_X), np.array(sentence_y)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    sentence_X, sentence_y, test_size=0.2, random_state=42
)

clf_sentence_bert = LogisticRegression(max_iter=1000)
clf_sentence_bert.fit(X_train, y_train)

y_pred = clf_sentence_bert.predict(X_test)
sentence_bert_accuracy = accuracy_score(y_test, y_pred)
print("Sentence-Level Priming Classifier Accuracy (BERT):", sentence_bert_accuracy)
print(classification_report(y_test, y_pred))

# Incorporate constraints for additional features
# Feature vector = [target embedding ; related-or-unrelated embedding ;
# constraint ; entropy]. The two scalar features are identical for both
# samples generated from a row.
constraints_X, constraints_y = [], []

for _, row in constraints_base.iterrows():
    target_emb = get_bert_embedding(row['target'])
    related_emb = get_bert_embedding(row['related'])
    unrelated_emb = get_bert_embedding(row['unrelated'])

    # Add constraint and entropy as additional features
    scalar_features = [row['constraint'], row['entropy']]
    related_sample = np.concatenate((target_emb, related_emb, scalar_features))
    unrelated_sample = np.concatenate((target_emb, unrelated_emb, scalar_features))

    constraints_X.extend([related_sample, unrelated_sample])
    constraints_y.extend([1, 0])  # Related, then Unrelated

constraints_X, constraints_y = np.array(constraints_X), np.array(constraints_y)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    constraints_X, constraints_y, test_size=0.2, random_state=42
)

clf_constraints = LogisticRegression(max_iter=1000)
clf_constraints.fit(X_train, y_train)

y_pred = clf_constraints.predict(X_test)
constraints_accuracy = accuracy_score(y_test, y_pred)
print("Constraints-Based Classifier Accuracy (BERT):", constraints_accuracy)
print(classification_report(y_test, y_pred))

# Define a custom dataset for fine-tuning
class PrimingDataset(Dataset):
    """(target, prime) text pairs with integer labels, tokenized on access.

    ``targets``, ``primes`` and ``labels`` are parallel sequences indexed by
    sample position; ``tokenizer`` is called as ``tokenizer(target, prime, ...)``
    and must return a mapping holding ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, targets, primes, labels, tokenizer):
        self.tokenizer = tokenizer
        self.targets, self.primes, self.labels = targets, primes, labels

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Encode the pair as one sequence, padded/truncated to 128 positions.
        tokenizer_options = dict(
            truncation=True, padding='max_length', max_length=128, return_tensors="pt"
        )
        encoded = self.tokenizer(self.targets[idx], self.primes[idx], **tokenizer_options)

        # Squeeze away the singleton batch axis the tokenizer adds.
        sample = {
            field: encoded[field].squeeze()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Prepare the dataset
# Every target is paired twice: first with its related prime (label 1), then
# with its unrelated word (label 0). `targets` must therefore be repeated so
# that all three lists have length 2 * len(word_priming_data). Previously
# `targets` had only half that length, so indexing any sample in the second
# half of the dataset raised IndexError.
targets = word_priming_data['target'].tolist() * 2
primes = word_priming_data['prime'].tolist() + word_priming_data['unrelated'].tolist()
labels = [1] * len(word_priming_data) + [0] * len(word_priming_data)
assert len(targets) == len(primes) == len(labels), "dataset columns must be parallel"

dataset = PrimingDataset(targets, primes, labels, bert_tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Fine-tune BERT
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.to(device)

for epoch in range(3):  # Train for 3 epochs
    model.train()
    epoch_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()

        # Move data to the training device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Use a distinct name so the module-level `labels` list is not
        # overwritten by a tensor on the first batch.
        batch_labels = batch['label'].to(device)

        # Forward pass (the model computes the loss when labels are supplied)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report progress so a stalled or diverging run is visible.
    print(f"Epoch {epoch + 1}: mean loss {epoch_loss / max(len(dataloader), 1):.4f}")

print("Fine-tuning complete!")



cuda
Word-Level Priming Classifier Accuracy (BERT): 0.3195385001254076
              precision    recall  f1-score   support

           0       0.32      0.31      0.32      2003
           1       0.32      0.32      0.32      1984

    accuracy                           0.32      3987
   macro avg       0.32      0.32      0.32      3987
weighted avg       0.32      0.32      0.32      3987

Word-Level Priming Classifier Accuracy with Context (BERT): 0.3072485578128919
              precision    recall  f1-score   support

           0       0.31      0.30      0.30      2003
           1       0.31      0.31      0.31      1984

    accuracy                           0.31      3987
   macro avg       0.31      0.31      0.31      3987
weighted avg       0.31      0.31      0.31      3987



KeyboardInterrupt: 