## Importing Libraries

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch
import logging
from tqdm import tqdm
logging.basicConfig(level=logging.ERROR)
# Pick the compute device: fall back to the CPU unless PyTorch reports a usable GPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is available: use it and report how many GPUs are visible and which one is device 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [3]:
# Colab only: mount Google Drive at /content/drive/ — the data files opened
# in the following cells live under this path.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Setting up data

In [4]:
import json
# Load the preprocessed claims from Drive. The entries are later indexed by
# position and read through the keys 'claim', 'q1', 'q2', 'q3', 'idx',
# 'taxonomy_label' and 'label' (see do_bm25).
with open("/content/drive/MyDrive/TUDelft/NLP Group Project/data/programfc/programfc_preprocessed.json") as f:
  train_data = json.load(f)
# Show how many entries were loaded (cell output).
len(train_data)

2605

# BM25

In [5]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [6]:
import re
from rank_bm25 import BM25Okapi

In [7]:
# Load the evidence corpus from Drive. Only its values() are used afterwards,
# as the documents that BM25 indexes and returns.
with open("/content/drive/MyDrive/TUDelft/NLP Group Project/bm25/corpus_evidence_unified.json") as f:
  document_corpus = json.load(f)

In [8]:
def preprocess_text(text):
    """Normalise a string and split it into BM25 tokens.

    The text is lower-cased, every " ..." (one whitespace character followed
    by three dots) is removed, all remaining periods are dropped, and the
    result is split on single spaces. Consecutive spaces therefore yield
    empty-string tokens.

    Args:
        text: The raw document or query string.

    Returns:
        A list of string tokens.
    """
    lowered = text.lower()
    # Strip a whitespace-prefixed ellipsis first so the whitespace goes with it.
    without_ellipsis = re.sub(r'\s\.\.\.', '', lowered)
    without_periods = without_ellipsis.replace('.', '')
    return without_periods.split(" ")

In [9]:
# Documents to index: the values of the evidence mapping, in its iteration order.
corpus = list(document_corpus.values())
# Tokenise each document with preprocess_text, the same routine run_bm25 applies to queries.
tokenized_corpus = [preprocess_text(document) for document in corpus]

# BM25 index over the tokenised documents.
bm25 = BM25Okapi(tokenized_corpus)

## Abhishek's Code

In [10]:
def run_bm25(query, top_k):
  """Return the ``top_k`` corpus documents that BM25 ranks highest for ``query``.

  The query is tokenised with ``preprocess_text`` and scored against the
  module-level ``bm25`` index; the returned items are taken from ``corpus``.
  """
  query_tokens = preprocess_text(query)
  return bm25.get_top_n(query_tokens, corpus, n=top_k)

In [11]:
def do_bm25(data, save_path, pick_up_from=None, start_index=2000, save_every=10):
  """Retrieve BM25 evidence for each claim and build the feature list.

  For every entry of ``data`` from ``start_index`` onwards, the entry's
  sub-questions are sent to ``run_bm25`` and the returned passages are joined
  into a single evidence string, stored together with the claim and its
  labels. The accumulated list is checkpointed to ``save_path`` as JSON.

  Args:
    data: Indexable sequence of dicts providing the keys ``claim``, ``q1``,
      ``q2``, ``q3``, ``idx``, ``taxonomy_label`` and ``label``.
    save_path: JSON file that receives the feature list; it is rewritten in
      full on every checkpoint and once more at the end.
    pick_up_from: Optional path to a previously saved feature list. Its items
      are kept, and processing resumes ``len(items)`` entries after
      ``start_index``.
    start_index: Index of the first entry of ``data`` to process. Defaults to
      2000, the value that was previously hard-coded.
    save_every: Write a checkpoint after this many newly processed claims
      (default 10). ``0`` or ``None`` disables intermediate checkpoints.

  Returns:
    The list of feature dicts, i.e. the content written to ``save_path``.
  """
  features = []
  if pick_up_from:
    with open(pick_up_from, 'r') as file:
      features = json.load(file)
    # Skip the claims that are already covered by the loaded checkpoint.
    start_index = start_index + len(features)
    print(f"Picking up from {start_index} claims")

  def save_progress(features):
    """Overwrite ``save_path`` with the features collected so far."""
    with open(save_path, 'w') as file:
      json.dump(features, file, indent=2)
    print("saved progress to ", save_path)

  processed = 0  # claims handled during this call, drives the checkpoint cadence
  for i in tqdm(range(start_index, len(data)), desc="Processing Claims"):
    example = data[i]
    claim = example['claim']
    q1 = example['q1']
    q2 = example['q2']
    q3 = example['q3']

    # Three passages per question when only q1/q3 are used; two per question
    # when q2 is present as well. q2's passages come first in the evidence.
    results = []
    top_k = 3
    if q2 != "":
      top_k = 2
      results.extend(run_bm25(q2, top_k))

    # NOTE(review): q1 and q3 are queried even if they are empty strings —
    # confirm that the preprocessed data always provides both.
    results.extend(run_bm25(q1, top_k))
    results.extend(run_bm25(q3, top_k))

    evidences = ' '.join(results)
    features.append({
      "claim_evidence_string": f"[Claim]:{claim} [Evidences]:{evidences}",
      "idx": example['idx'],
      "taxonomy_label": example['taxonomy_label'],
      "label": example['label'],
    })

    processed += 1
    if save_every and processed % save_every == 0:
      save_progress(features)

  save_progress(features)
  # Hand the features back so callers such as `train_features = do_bm25(...)`
  # receive the list instead of None.
  return features


In [13]:
# Run BM25 evidence retrieval over the loaded claims; do_bm25 writes the
# resulting features to save_path, checkpointing as it goes.
train_features = do_bm25(data=train_data,
                          save_path = '/content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json'
                          # To resume an interrupted run, pass pick_up_from=<path of the saved features file>.
                        )

Processing Claims:   2%|▏         | 10/605 [01:11<1:09:13,  6.98s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:   3%|▎         | 20/605 [02:24<1:08:25,  7.02s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:   5%|▍         | 30/605 [03:21<1:02:59,  6.57s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:   7%|▋         | 40/605 [04:37<1:04:59,  6.90s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:   8%|▊         | 50/605 [05:47<1:08:53,  7.45s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  10%|▉         | 60/605 [06:51<1:06:00,  7.27s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  12%|█▏        | 70/605 [08:12<1:07:19,  7.55s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  13%|█▎        | 80/605 [09:17<52:46,  6.03s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  15%|█▍        | 90/605 [10:23<59:07,  6.89s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  17%|█▋        | 100/605 [11:40<1:12:20,  8.59s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  18%|█▊        | 110/605 [12:55<1:08:31,  8.31s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  20%|█▉        | 120/605 [14:08<1:02:59,  7.79s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  21%|██▏       | 130/605 [15:01<42:13,  5.33s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  23%|██▎       | 140/605 [16:15<55:14,  7.13s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  25%|██▍       | 150/605 [17:27<50:01,  6.60s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  26%|██▋       | 160/605 [18:47<54:40,  7.37s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  28%|██▊       | 170/605 [20:09<1:03:10,  8.71s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  30%|██▉       | 180/605 [21:16<51:32,  7.28s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  31%|███▏      | 190/605 [22:35<54:20,  7.86s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  33%|███▎      | 200/605 [23:41<43:28,  6.44s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  35%|███▍      | 210/605 [24:51<55:36,  8.45s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  36%|███▋      | 220/605 [25:58<45:20,  7.06s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  38%|███▊      | 230/605 [27:13<50:37,  8.10s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  40%|███▉      | 240/605 [28:20<39:20,  6.47s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  41%|████▏     | 250/605 [29:31<41:55,  7.08s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  43%|████▎     | 260/605 [30:50<42:30,  7.39s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  45%|████▍     | 270/605 [32:03<39:57,  7.16s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  46%|████▋     | 280/605 [33:08<35:09,  6.49s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  48%|████▊     | 290/605 [34:34<40:19,  7.68s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  50%|████▉     | 300/605 [35:51<38:04,  7.49s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  51%|█████     | 310/605 [37:10<38:42,  7.87s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  53%|█████▎    | 320/605 [38:10<30:42,  6.46s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  55%|█████▍    | 330/605 [39:20<29:35,  6.45s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  56%|█████▌    | 340/605 [40:34<38:18,  8.67s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  58%|█████▊    | 350/605 [41:44<31:46,  7.48s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  60%|█████▉    | 360/605 [43:08<35:43,  8.75s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  61%|██████    | 370/605 [44:15<23:11,  5.92s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  63%|██████▎   | 380/605 [45:24<28:54,  7.71s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  64%|██████▍   | 390/605 [46:50<31:16,  8.73s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  66%|██████▌   | 400/605 [48:05<24:21,  7.13s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  68%|██████▊   | 410/605 [49:14<20:48,  6.40s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  69%|██████▉   | 420/605 [50:32<22:12,  7.20s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  71%|███████   | 430/605 [51:54<22:14,  7.62s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  73%|███████▎  | 440/605 [53:11<18:49,  6.85s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  74%|███████▍  | 450/605 [54:18<16:27,  6.37s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  76%|███████▌  | 460/605 [55:26<16:55,  7.01s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  78%|███████▊  | 470/605 [56:45<18:12,  8.09s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  79%|███████▉  | 480/605 [57:56<14:11,  6.81s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  81%|████████  | 490/605 [59:21<19:03,  9.94s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  83%|████████▎ | 500/605 [1:00:29<13:33,  7.75s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  84%|████████▍ | 510/605 [1:01:44<12:08,  7.67s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  86%|████████▌ | 520/605 [1:02:54<10:00,  7.07s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  88%|████████▊ | 530/605 [1:04:09<10:05,  8.08s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  89%|████████▉ | 540/605 [1:05:42<09:48,  9.06s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  91%|█████████ | 550/605 [1:06:46<05:16,  5.75s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  93%|█████████▎| 560/605 [1:08:05<06:23,  8.53s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  94%|█████████▍| 570/605 [1:09:18<04:05,  7.02s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  96%|█████████▌| 580/605 [1:10:25<02:52,  6.91s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  98%|█████████▊| 590/605 [1:11:37<01:51,  7.40s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims:  99%|█████████▉| 600/605 [1:13:02<00:41,  8.25s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json


Processing Claims: 100%|██████████| 605/605 [1:13:40<00:00,  7.31s/it]

saved progress to  /content/drive/MyDrive/TUDelft/NLP Group Project/bm25_of_programfc/bm25_features_train_set.json



