## Importing Libraries

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch
import logging
from tqdm import tqdm
logging.basicConfig(level=logging.ERROR)
# Choose the torch device once: CUDA when a GPU is visible, CPU otherwise.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Route tensors to the GPU and report what was found.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [2]:
# Mount Google Drive at /content/drive/ so the project files under MyDrive
# (claims, corpus, checkpoints) can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Setting up data

In [3]:
import json
# Load the filtered, decomposed claims from Drive into `train_data`.
with open("/content/drive/MyDrive/TUDelft/NLP Group Project/data/filtered_decomposed_claims.json") as f:
  train_data = json.load(f)
# Bare last expression: the notebook displays how many records were loaded.
len(train_data)

2576

## BM25

In [4]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [5]:
import re
from rank_bm25 import BM25Okapi

In [6]:
# Load the unified evidence corpus from Drive; its values are the passages
# that get tokenized and indexed with BM25 further down.
with open("/content/drive/MyDrive/TUDelft/NLP Group Project/bm25/corpus_evidence_unified.json") as f:
  document_corpus = json.load(f)

In [7]:
def preprocess_text(text):
    """Turn ``text`` into the token list used for BM25 indexing and querying.

    The text is lowercased, every whitespace-preceded ellipsis (" ...") is
    dropped, all remaining periods are removed, and the result is split on
    single space characters. Consecutive spaces therefore yield empty-string
    tokens, and an empty input yields ``[""]``.
    """
    lowered = text.lower()
    # Remove " ..." first so the whitespace in front of it disappears as well.
    without_ellipsis = re.sub(r'\s\.\.\.', '', lowered)
    without_periods = without_ellipsis.replace('.', '')
    return without_periods.split(" ")

In [8]:
# Build the BM25 index: `corpus` keeps the raw passages (returned as retrieval
# results), `tokenized_corpus` holds the matching token lists that get indexed.
corpus = list(document_corpus.values())
tokenized_corpus = [preprocess_text(passage) for passage in corpus]

bm25 = BM25Okapi(tokenized_corpus)

## Abhishek's code

In [47]:
def run_bm25(query, top_k):
  """Return the ``top_k`` passages from ``corpus`` that BM25 ranks highest for ``query``.

  The query goes through ``preprocess_text`` so it is tokenized the same way
  as the indexed documents.
  """
  query_tokens = preprocess_text(query)
  return bm25.get_top_n(query_tokens, corpus, n=top_k)

In [48]:
def do_bm25(data, save_path, pick_up_from=None):
  """Retrieve BM25 evidence for each claim in ``data`` and checkpoint the features as JSON.

  For every record the sub-questions ``q1``, ``q2`` and ``q3`` are passed to
  ``run_bm25``; the retrieved passages are joined with single spaces and stored
  next to the claim text and its labels.

  Args:
    data: Indexable sequence of dicts providing the keys ``claim``, ``q1``,
      ``q2``, ``q3``, ``idx``, ``taxonomy_label`` and ``label``.
    save_path: Path of the JSON file that receives the accumulated features.
    pick_up_from: Optional path of a JSON checkpoint from an earlier run. When
      the file exists, its entries are kept and processing resumes at
      ``data[len(entries)]``, i.e. the checkpoint is assumed to cover the
      first ``len(entries)`` records of ``data`` in order. A missing file is
      treated as "nothing done yet" instead of raising.

  Returns:
    The list of feature dicts (checkpointed entries followed by new ones).
  """
  features = []
  start_index = 0
  if pick_up_from:
    if os.path.exists(pick_up_from):
      with open(pick_up_from, 'r') as file:
        features = json.load(file)
      start_index = len(features)  # Determine where to start processing
      print(f"Picking up from {start_index} claims")
    else:
      # First run: there is no checkpoint to resume from yet.
      print(f"No checkpoint found at {pick_up_from}, starting from scratch")

  def save_progress(features):
    # Write to a sibling temp file and swap it in, so an interrupt in the
    # middle of the dump cannot truncate the existing checkpoint.
    tmp_path = f"{save_path}.tmp"
    with open(tmp_path, 'w') as file:
      json.dump(features, file, indent=2)
    os.replace(tmp_path, save_path)
    print("saved progress to ", save_path)

  try:
    for i in tqdm(range(start_index, len(data)), desc="Processing Claims"):
      example = data[i]
      claim = example['claim']
      q1 = example['q1']
      q2 = example['q2']
      q3 = example['q3']
      results = []

      # Two sub-questions -> 3 passages each; three sub-questions -> 2 each,
      # so every claim ends up with 6 retrieved passages.
      top_k = 3
      if q2 != "":
        top_k = 2
        # NOTE(review): q2's passages are placed before q1's; order kept as-is
        # so new features stay consistent with already checkpointed ones.
        results.extend(run_bm25(q2, top_k))

      results.extend(run_bm25(q1, top_k))
      results.extend(run_bm25(q3, top_k))

      evidences = ' '.join(results)
      features.append({
          "claim_evidence_string": f"[Claim]:{claim} [Evidences]:{evidences}",
          "idx": example['idx'],
          "taxonomy_label": example['taxonomy_label'],
          "label": example['label'],
      })

      # Checkpoint after every 10 newly processed claims.
      if (len(features) - start_index) % 10 == 0:
        save_progress(features)
  finally:
    # Runs on normal completion and on interruption/error alike, so finished
    # claims since the last periodic checkpoint are not lost.
    save_progress(features)

  return features


In [49]:
# Run BM25 retrieval over the training claims. `save_path` and `pick_up_from`
# point at the same file, so the run resumes from its own checkpoint and keeps
# overwriting it as new claims are processed.
train_features = do_bm25(data=train_data,
                          save_path = '/content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json',
                          pick_up_from='/content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json'
                        )

Picking up from 146 claims


Processing Claims:   0%|          | 10/2430 [01:43<6:47:21, 10.10s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   1%|          | 20/2430 [03:36<7:21:08, 10.98s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   1%|          | 30/2430 [05:05<5:33:13,  8.33s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   2%|▏         | 40/2430 [06:42<5:50:54,  8.81s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   2%|▏         | 50/2430 [08:28<7:00:49, 10.61s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   2%|▏         | 60/2430 [09:57<5:49:01,  8.84s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   3%|▎         | 70/2430 [11:28<6:20:40,  9.68s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   3%|▎         | 80/2430 [13:07<6:04:58,  9.32s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   4%|▎         | 90/2430 [14:47<6:42:42, 10.33s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   4%|▍         | 100/2430 [16:31<6:59:02, 10.79s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   5%|▍         | 110/2430 [18:10<7:18:58, 11.35s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   5%|▍         | 120/2430 [19:42<5:19:54,  8.31s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   5%|▌         | 130/2430 [21:18<5:51:07,  9.16s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   6%|▌         | 140/2430 [22:56<6:32:56, 10.30s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   6%|▌         | 150/2430 [24:32<5:49:13,  9.19s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   7%|▋         | 160/2430 [26:05<5:38:07,  8.94s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   7%|▋         | 170/2430 [27:36<5:57:31,  9.49s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   7%|▋         | 180/2430 [29:11<6:01:32,  9.64s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   8%|▊         | 190/2430 [30:52<6:03:04,  9.73s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   8%|▊         | 200/2430 [32:38<6:08:19,  9.91s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   9%|▊         | 210/2430 [34:14<5:46:04,  9.35s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   9%|▉         | 220/2430 [36:00<6:04:59,  9.91s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:   9%|▉         | 230/2430 [37:31<5:14:06,  8.57s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  10%|▉         | 240/2430 [39:04<5:53:14,  9.68s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  10%|█         | 250/2430 [40:49<6:29:13, 10.71s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  11%|█         | 260/2430 [42:27<5:57:36,  9.89s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  11%|█         | 270/2430 [44:06<6:03:27, 10.10s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  12%|█▏        | 280/2430 [45:54<7:34:57, 12.70s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  12%|█▏        | 290/2430 [47:30<5:42:21,  9.60s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  12%|█▏        | 300/2430 [49:03<5:30:52,  9.32s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  13%|█▎        | 310/2430 [50:37<5:09:16,  8.75s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  13%|█▎        | 320/2430 [52:06<5:04:24,  8.66s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  14%|█▎        | 330/2430 [53:39<5:36:10,  9.61s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  14%|█▍        | 340/2430 [55:18<5:28:31,  9.43s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  14%|█▍        | 350/2430 [56:59<5:48:31, 10.05s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  15%|█▍        | 360/2430 [58:41<6:05:57, 10.61s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  15%|█▌        | 370/2430 [1:00:15<5:38:53,  9.87s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  16%|█▌        | 380/2430 [1:01:47<5:25:57,  9.54s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  16%|█▌        | 390/2430 [1:03:25<5:12:22,  9.19s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  16%|█▋        | 400/2430 [1:04:57<5:57:30, 10.57s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  17%|█▋        | 410/2430 [1:06:30<4:58:03,  8.85s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  17%|█▋        | 420/2430 [1:08:11<6:22:22, 11.41s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  18%|█▊        | 430/2430 [1:09:44<4:45:36,  8.57s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  18%|█▊        | 440/2430 [1:11:06<4:35:18,  8.30s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  19%|█▊        | 450/2430 [1:12:38<5:09:46,  9.39s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  19%|█▉        | 460/2430 [1:14:15<5:52:59, 10.75s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  19%|█▉        | 470/2430 [1:15:45<4:25:54,  8.14s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  20%|█▉        | 480/2430 [1:17:13<4:47:53,  8.86s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  20%|██        | 490/2430 [1:18:56<4:56:18,  9.16s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  21%|██        | 500/2430 [1:20:48<5:32:12, 10.33s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  21%|██        | 510/2430 [1:22:33<5:51:08, 10.97s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  21%|██▏       | 520/2430 [1:24:07<5:00:36,  9.44s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  22%|██▏       | 530/2430 [1:25:37<4:51:44,  9.21s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  22%|██▏       | 540/2430 [1:27:14<5:20:56, 10.19s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  23%|██▎       | 550/2430 [1:28:48<4:32:05,  8.68s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  23%|██▎       | 560/2430 [1:30:17<4:42:01,  9.05s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  23%|██▎       | 570/2430 [1:31:56<5:23:45, 10.44s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_subclaims/bm25_features_train_set.json


Processing Claims:  23%|██▎       | 571/2430 [1:32:10<5:00:07,  9.69s/it]


KeyboardInterrupt: 