# Preprocessing of claim-sample pairs using a spaCy pipeline.

Ref: 
- [spaCy linguistic features](https://spacy.io/usage/linguistic-features)
- [spaCy processing pipelines](https://spacy.io/usage/processing-pipelines)
- [spaCy custom components](https://spacy.io/usage/processing-pipelines#custom-components)

In [1]:
# Change the working directory to project root
import pathlib
import os
# Locate the project root: the nearest directory, starting at the current
# working directory and moving upwards, that contains a "src" entry.
_start_dir = pathlib.Path.cwd()
for ROOT_DIR in (_start_dir, *_start_dir.parents):
    if ROOT_DIR.joinpath("src").exists():
        break
else:
    # Without this guard the search would never end: the parent of the
    # filesystem root is the root itself.
    raise FileNotFoundError(
        f"Could not locate the project root: no 'src' directory in "
        f"{_start_dir} or any of its parents."
    )
# Make project-relative imports (e.g. `src.*`) and paths resolve from the root.
os.chdir(ROOT_DIR)

In [2]:
# Imports and dependencies
import pandas as pd
import numpy as np
import spacy
import torch
from sentence_transformers import SentenceTransformer, util
from src.torch_utils import get_torch_device
from src.spacy_utils import repl_special_token
from typing import Callable, Tuple
import copy
import re

# Seed every random number generator this notebook imports so that reruns
# are reproducible (NumPy and PyTorch keep separate global generators).
random_seed = 42
np.random.seed(random_seed)
torch.manual_seed(random_seed)
# Device selected by the project helper (it reported 'mps' in the recorded run).
torch_device = get_torch_device()

Torch device is 'mps'


  from .autonotebook import tqdm as notebook_tqdm


## Select models

In [3]:
# Base spaCy pipeline; later cells copy it before adding custom components.
nlp = spacy.load(name="en_core_web_trf")
# Display the loaded pipeline object as the cell output.
nlp

<spacy.lang.en.English at 0x1054cd610>

In [4]:
# Sentence encoder used to embed claims and evidence; the embeddings are
# compared with a dot product further down in this notebook.
embedder = SentenceTransformer(
    model_name_or_path="sentence-transformers/msmarco-bert-base-dot-v5",
    device=torch_device,
)
# Display the module layout of the loaded model as the cell output.
embedder

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

## Load pair samples

The input data must have shape $(n, 5)$, with the following columns:
- `claim`: claim id
- `claim_text`: claim text string
- `evidence`: evidence id
- `evidence_text`: evidence text string
- `related`: relation labels as `1/0`

In [5]:
# Claim-evidence training pairs, resolved relative to the project root.
train_data_file_path = ROOT_DIR.joinpath(
    "./result/train_data/train_claim_evidence_pair_rns.json"
)
# Decode explicitly as UTF-8: the texts contain non-ASCII characters
# (e.g. "El Niño", curly quotes) and the platform's default encoding is not
# guaranteed to be UTF-8.
with open(train_data_file_path, mode="r", encoding="utf-8") as f:
    train_data = (
        pd.read_json(f, orient="records")
        .set_index(["claim", "evidence"])
    )
# Fail early if the file does not follow the documented schema.
_missing_cols = {"claim_text", "evidence_text", "related"} - set(train_data.columns)
assert not _missing_cols, f"train data is missing columns: {sorted(_missing_cols)}"
print(train_data.shape)
train_data.head(60)

(12366, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,claim_text,evidence_text,related
claim,evidence,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claim-1937,evidence-442946,Not only is there no scientific evidence that ...,At very high concentrations (100 times atmosph...,1
claim-1937,evidence-1194317,Not only is there no scientific evidence that ...,Plants can grow as much as 50 percent faster i...,1
claim-1937,evidence-12171,Not only is there no scientific evidence that ...,Higher carbon dioxide concentrations will favo...,1
claim-126,evidence-338219,El Niño drove record highs in global temperatu...,While ‘climate change’ can be due to natural f...,1
claim-126,evidence-1127398,El Niño drove record highs in global temperatu...,This acceleration is due mostly to human-cause...,1
claim-2510,evidence-530063,"In 1946, PDO switched to a cool phase.",There is evidence of reversals in the prevaili...,1
claim-2510,evidence-984887,"In 1946, PDO switched to a cool phase.","1945/1946: The PDO changed to a ""cool"" phase, ...",1
claim-2021,evidence-1177431,Weather Channel co-founder John Coleman provid...,There is no convincing scientific evidence tha...,1
claim-2021,evidence-782448,Weather Channel co-founder John Coleman provid...,"He has called global warming the ""greatest sca...",1
claim-2021,evidence-540069,Weather Channel co-founder John Coleman provid...,International Council of Academies of Engineer...,1


## Basic preprocessing exploration

In [6]:
def get_emb_similarity(claim_texts: list, evidence_texts: list) -> torch.Tensor:
    """Score every claim against every evidence text with the sentence embedder.

    Both lists are encoded with the notebook-level ``embedder`` on
    ``torch_device`` and the embeddings are compared with ``util.dot_score``.

    Args:
        claim_texts: Claim strings to encode.
        evidence_texts: Evidence strings to encode.

    Returns:
        The tensor produced by ``util.dot_score``, with one row per claim and
        one column per evidence text (e.g. ``tensor([[183.0319]])`` for a
        single claim-evidence pair).
    """
    # Same encoding options for both sides so the embeddings are comparable
    # and live on the same device.
    emb_kwargs = {"convert_to_tensor": True, "device": torch_device}
    claim_emb = embedder.encode(sentences=claim_texts, **emb_kwargs)
    evidence_emb = embedder.encode(sentences=evidence_texts, **emb_kwargs)
    return util.dot_score(a=claim_emb, b=evidence_emb)

### Lower case

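Impact of casting the texts to lower case on the similarity score. In the example below the score is unchanged (183.0319 in both cases), which suggests that the embedder's tokenizer already lower-cases its input.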

In [7]:
# Select one claim-evidence pair. The key is wrapped in a list so that `.loc`
# always returns a DataFrame: with a bare tuple the result is a Series when
# the pair is unique in the index, and the cells below call `.tolist()` /
# `.str` on the selected columns. The list form also avoids the lexsort-depth
# warning that the bare tuple lookup emitted.
test_A = train_data.loc[[("claim-2510", "evidence-984887")]]
test_A

  test_A = train_data.loc[("claim-2510", "evidence-984887")]


Unnamed: 0_level_0,Unnamed: 1_level_0,claim_text,evidence_text,related
claim,evidence,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claim-2510,evidence-984887,"In 1946, PDO switched to a cool phase.","1945/1946: The PDO changed to a ""cool"" phase, ...",1


In [8]:
# Baseline: dot-product score of the selected pair with its original casing.
get_emb_similarity(
    test_A["claim_text"].to_list(),
    test_A["evidence_text"].to_list(),
)

tensor([[183.0319]], device='mps:0')

In [9]:
# Same pair as above, but with both texts cast to lower case before encoding.
get_emb_similarity(
    test_A["claim_text"].str.lower().to_list(),
    test_A["evidence_text"].str.lower().to_list(),
)

tensor([[183.0319]], device='mps:0')

### Replace special tokens

In [11]:
# Experiment on a private copy so the base `nlp` pipeline stays unmodified.
test_nlp = copy.deepcopy(nlp)
# Put the custom replacement component in front of all built-in components.
test_nlp.add_pipe(factory_name="repl_special_token", first=True)
# Show the resulting component order as the cell output.
test_nlp.pipe_names

['repl_special_token',
 'transformer',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

#### Test case: CO2, CO 2, carbon dioxide

In [20]:
# Select the pair used for the special-token test case. The key is wrapped in
# a list so that `.loc` always returns a DataFrame: with a bare tuple the
# result is a Series when the pair is unique in the index, and the cells below
# call `.tolist()` on the selected columns. The list form also avoids the
# lexsort-depth warning that the bare tuple lookup emitted.
test_A = train_data.loc[[("claim-1937", "evidence-12171")]]
test_A

  test_A = train_data.loc[("claim-1937", "evidence-12171")]


Unnamed: 0_level_0,Unnamed: 1_level_0,claim_text,evidence_text,related
claim,evidence,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claim-1937,evidence-12171,Not only is there no scientific evidence that ...,Higher carbon dioxide concentrations will favo...,1


In [21]:
# Reference score for the pair before any special-token replacement.
get_emb_similarity(
    test_A["claim_text"].to_list(),
    test_A["evidence_text"].to_list(),
)

tensor([[171.6822]], device='mps:0')

In [29]:
# Build a processed copy of the pair: both text columns are passed through the
# pipeline containing the replacement component and the document text is kept.
test_B = test_A.copy()
cols = ["claim_text", "evidence_text"]
# Map column by column with `Series.map`: `DataFrame.applymap` is deprecated
# since pandas 2.1, whereas this form behaves the same on old and new pandas.
test_B[cols] = test_A[cols].apply(
    lambda col: col.map(lambda t: test_nlp(t).text)
)
test_B

Unnamed: 0_level_0,Unnamed: 1_level_0,claim_text,evidence_text,related
claim,evidence,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
claim-1937,evidence-12171,Not only is there no scientific evidence that ...,Higher carbon dioxide concentrations will favo...,1


In [30]:
# Score for the same pair after the texts went through the replacement pipeline.
get_emb_similarity(
    test_B["claim_text"].to_list(),
    test_B["evidence_text"].to_list(),
)

tensor([[175.0602]], device='mps:0')