This notebook extracts impact sentences from the processed data using the BART-large-MNLI model (https://huggingface.co/facebook/bart-large-mnli). The results are the entailment score of each sentence and its corresponding label, which are saved in both .json and .xlsx formats.

Please set input_path at the start of this notebook to the location of the 0_input.ipynb file. This is the only step required to load all the information needed to run the code.

Recommended Google Colab Runtime Type: A100 GPU (preferred) or V100 GPU, as this notebook involves running machine learning models.

In [None]:
# Directory the notebook switches into after mounting Drive; 0_input.json is
# read from here. Set this once at the top of every notebook in the series.
input_path = "/content/drive/My Drive/ImpactDataMining/Hurricane_Ian/Result"

All the below sections automatically retrieve data from the 0_input.ipynb file, as well as results from previous notebooks in this series. The code is designed to run using this information, so no further edits are required beyond this point.

In [None]:
!pip install transformers

import os
import json
import numpy as np
import pandas as pd
import torch

from google.colab import drive
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer



In [None]:
import time

# Wall-clock start; the final cell subtracts this to report total execution time.
start_time = time.time()

In [None]:
def current_path():
  """Print a header line, the current working directory, and a blank line."""
  # sep="\n" puts each item on its own line; the trailing "" plus print's own
  # newline yields the blank separator line.
  print("Current working directory", os.getcwd(), "", sep="\n")

# Show the working directory before anything is mounted.
current_path()
# Mount Google Drive so the Drive location in input_path is reachable, then
# switch into it; the following cells open files by relative name.
drive.mount('/content/drive')
os.chdir(input_path)
# Confirm the switch.
current_path()

Current working directory
/content

Mounted at /content/drive
Current working directory
/content/drive/My Drive/ResilienceDataMining/Hurricane_Ian/Result



In [None]:
# Read the run settings stored in 0_input.json (in the current directory).
with open('0_input.json', 'r') as config_file:
    config = json.load(config_file)

result_path = config['result_path']  # directory used for reading/writing results
labels = config['keywords']          # candidate labels scored by the NLI model
batch_size = config['batch_size']    # number of sentences per DataLoader batch

In [None]:
# Create the results directory if it does not exist yet, then make it the
# working directory: the cells below read 1_results.json and write the
# 2a_results.* files using relative paths.
os.makedirs(result_path, exist_ok=True)
os.chdir(result_path)
current_path()

Current working directory
/content/drive/My Drive/ResilienceDataMining/Hurricane_Ian/Result



In [None]:
# Load the text stored in 1_results.json (in the current directory).
with open('1_results.json', 'r') as results_file:
    previous_results = json.load(results_file)

text_body = previous_results['text_body']
text_table = previous_results['text_table']

In [None]:
# Combine body-text and table-text entries into one sequence to classify;
# body entries come first.
# NOTE(review): assumes both values are lists of sentence strings — confirm
# against the notebook that writes 1_results.json.
data = text_body + text_table

In [None]:
# Checkpoint shared by the NLI model and its tokenizer.
MODEL_NAME = 'facebook/bart-large-mnli'

nli_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Run on the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
nli_model = nli_model.to(device)

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Batch the sentences for inference. shuffle=False keeps the original order,
# so row i of the scores computed from these batches corresponds to data[i].
dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)

In [None]:
# Zero-shot scoring: every sentence is paired with one hypothesis per label
# ("This example is <label>.") and the model's entailment probability is used
# as the score of that label for that sentence.
#
# Result: concat_probs, a float tensor of shape (len(data), len(labels)) on the
# CPU, with rows in the same order as the dataloader yields sentences.
use_empty_cache = True
batch_probs = []  # one (len(batch), len(labels)) CPU tensor per batch
for batch in dataloader:
  with torch.no_grad():
    # Sentence-major ordering (all labels for sentence 0, then sentence 1, ...);
    # the reshape below relies on it.
    encoded_inputs = tokenizer.batch_encode_plus([(s, f'This example is {l}.') for s in batch for l in labels],
                                                 return_tensors='pt', padding=True, truncation=True)
    logits = nli_model(**encoded_inputs.to(device))[0]
    # Per the facebook/bart-large-mnli model card: index 0 = contradiction,
    # 1 = neutral, 2 = entailment. Neutral is dropped and the softmax is taken
    # over contradiction vs. entailment only.
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:, 1]
    prob_label_is_true = prob_label_is_true.reshape(len(batch), len(labels))

    # Move the small per-batch result to the CPU instead of concatenating onto
    # a tensor pinned to 'cuda:0': this works on whichever device was selected
    # (including the CPU fallback) and avoids re-copying the growing result
    # tensor on every iteration.
    batch_probs.append(prob_label_is_true.cpu())

    # Free memory
    if use_empty_cache:
      del encoded_inputs, logits, entail_contradiction_logits, probs, prob_label_is_true
      if device.type == 'cuda':
        torch.cuda.empty_cache()

# Single concatenation at the end; an empty input still yields a 2-D tensor so
# the per-row argmax in the next cell keeps working.
if batch_probs:
  concat_probs = torch.cat(batch_probs, dim=0)
else:
  concat_probs = torch.empty((0, len(labels)))

In [None]:
# For every sentence (row) find the best-scoring label: its column index,
# its score, and its name.
idx = concat_probs.argmax(dim=1)
result_probs = concat_probs.gather(1, idx.unsqueeze(1)).squeeze(1)
result_labels = [labels[label_pos] for label_pos in idx.tolist()]

In [None]:
# Saving results to an excel file: one sheet with the full score matrix
# (one column per label) and two header-less sheets with the best score and
# the best label per sentence.
all_scores_df = pd.DataFrame(concat_probs.cpu().numpy(), columns=labels)
top_score_df = pd.DataFrame(result_probs.cpu().numpy())
top_label_df = pd.DataFrame(result_labels)

headerless_sheets = (
    (top_score_df, '1D_prob_tensor'),
    (top_label_df, '1D_label_tensor'),
)

with pd.ExcelWriter('2a_results.xlsx', engine='openpyxl') as writer:
    all_scores_df.to_excel(writer, sheet_name='2D_prob_tensor', index=False)
    for frame, sheet in headerless_sheets:
        frame.to_excel(writer, sheet_name=sheet, index=False, header=False)

In [None]:
# Saving results to a JSON file: the classified sentences, the candidate
# labels, and the best score / best label for each sentence.
results_payload = {
    'sent_all': data,
    'labels': labels,
    'result_probs': result_probs.cpu().numpy().tolist(),
    'result_labels': result_labels,
}

with open('2a_results.json', 'w') as out_file:
    json.dump(results_payload, out_file)

In [None]:
# Report the wall-clock time elapsed since start_time was recorded.
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Execution time: 120.692312002182 seconds
