In [None]:
# Install notebook dependencies via shell commands.
# NOTE(review): the recorded output of this cell shows that './transformers' does not
# exist, so the first install failed -- confirm whether a local checkout or the PyPI
# package was intended, and consider pinning versions.
!pip install ./transformers
!pip install tensorboardX

[31mERROR: Invalid requirement: './transformers'
Hint: It looks like a path. File './transformers' does not exist.[0m[31m
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorboardX
  Downloading tensorboardX-2.6-py2.py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6


In [None]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import re
# from bs4 import BeautifulSoup

from datasets import DatasetDict, Dataset, load_dataset
from accelerate import Accelerator
from pydrive.auth import GoogleAuth
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Display settings for df.head() / df.tail(): no cap on the number of columns,
# and up to 1000 rows and 1000 characters per cell.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 1000

In [None]:
# Authenticate the Colab user and build a PyDrive client from the default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, shadowing the `google.colab.drive` module imported above.
drive = GoogleDrive(gauth)

# Google Drive file ids of the raw train / val / test JSON files.
train_dataid = '1c206RpN1YCecrL6Hvjl52a3R16Lf3cO1'
val_dataid = '1X0RXXyTaaSUqJgbiuNzTJkB0ryZhkdwo'
test_dataid = '1Xp7zTJOVV3ZeEaqEcfl1MziJY7JcxoJh'

# Fetch each file into the working directory under the name the loaders expect.
for file_id, local_name in (
    (train_dataid, 'train_og.json'),
    (val_dataid, 'val_og.json'),
    (test_dataid, 'test_og.json'),
):
  download = drive.CreateFile({'id': file_id})
  download.GetContentFile(local_name)

# Convert original data to desired format for QA

In [None]:
#create two dataframes, textual and visual df
def create_text_visual_df(dataset_type):
  """Load one split's raw JSON and split it into textual and visual frames.

  Reads '<dataset_type>_og.json' from the current working directory and looks
  at the records under its top-level 'data' key.

  Args:
    dataset_type: split name used to build the file name (e.g. 'train').

  Returns:
    (textual_df, visual_df):
      textual_df -- records whose 'task' is 'textual_cloze', restricted to the
        columns recipe_id, context, choice_list, answer, question.
      visual_df -- records whose 'task' is 'visual_coherence', restricted to
        the columns recipe_id, context.

  Raises:
    KeyError: if a frame lacks one of the expected columns (this includes the
      case where no record of that task exists).
  """
  file_name = '{dataset}_og.json'.format(dataset=dataset_type)
  # The context manager closes the handle; previously the file was left open.
  with open(file_name, 'r') as f:
    records = json.load(f)['data']

  textual_json = [x for x in records if x['task'] == 'textual_cloze']
  visual_json = [x for x in records if x['task'] == 'visual_coherence']

  textual_keep_col = ['recipe_id', 'context', 'choice_list', 'answer', 'question']
  textual_df = pd.DataFrame(textual_json)[textual_keep_col]

  visual_keep_col = ['recipe_id', 'context']
  visual_df = pd.DataFrame(visual_json)[visual_keep_col]

  return textual_df, visual_df

#combine all steps into a list
def combine_all_steps(row):
  """Return the 'title' of every entry in ``row.context``, in order.

  Only the titles are collected; step numbers are not included.
  """
  return [step_info['title'] for step_info in row.context]

#combine textual and visual data to create combined data for full information
def combine_text_visual_df(dataset_type):
  """Join the textual and visual frames of one split on ``recipe_id``.

  The visual frame contributes ``all_steps`` (the list of step titles built by
  ``combine_all_steps``); the textual frame contributes everything else.
  Question entries equal to '@placeholder' are replaced with '_'.

  Args:
    dataset_type: split name forwarded to ``create_text_visual_df``.

  Returns:
    DataFrame with columns recipe_id, context, choice_list, answer, question,
    all_steps.
  """
  textual_df, visual_df = create_text_visual_df(dataset_type)
  visual_df['all_steps'] = visual_df.apply(combine_all_steps, axis=1)

  merged = textual_df.merge(visual_df, how='inner', on=['recipe_id'])
  # Both inputs have a 'context' column; the left (textual) one gets the '_x' suffix.
  merged = merged.rename(columns={'context_x': 'context'})
  merged = merged[['recipe_id', 'context', 'choice_list', 'answer', 'question', 'all_steps']]

  def blank_out_placeholder(tokens):
    # Swap the literal placeholder token for '_' and leave every other entry untouched.
    return ['_' if token == '@placeholder' else token for token in tokens]

  merged.question = merged.question.apply(blank_out_placeholder)
  return merged

#generate questions
def generate_questions(row):
  """Turn a cloze-style step list into a natural-language question.

  ``row['question']`` is a list of step titles with exactly one '_' marking the
  missing step; ``row['all_steps']`` is the full ordered list of step titles.

  Returns one of:
    "What is the first step?"        -- blank is at position 0 and the title
                                        after it sits at index 1 of all_steps
    "What is the last step?"         -- blank is at position 3 and the title
                                        before it is the second-to-last step
    "What is the step after X ?"     -- every other case
  """
  question_steps = row['question']
  recipe_steps = row['all_steps']
  blank_pos = question_steps.index('_')  # where the missing step sits in the question

  if blank_pos == 0:
    # Nothing precedes the blank in the question, so locate it through the
    # title that follows it.
    following_pos = recipe_steps.index(question_steps[blank_pos + 1])
    if following_pos == 1:
      return "What is the first step?"
    # The blank is at following_pos - 1; its predecessor is at following_pos - 2.
    return "What is the step after " + recipe_steps[following_pos - 2] + " ?"

  preceding_title = question_steps[blank_pos - 1]
  if blank_pos == 3 and recipe_steps.index(preceding_title) == (len(recipe_steps) - 2):
    return "What is the last step?"
  return "What is the step after " + preceding_title + " ?"

#generate context
def generate_full_instruction(row):
  """Build the single-string QA context for one recipe.

  Each step becomes a sentence "<lead><title>: <body>. " where the lead is
  "The first step is " for the first step, "The last step is " for the final
  step and "After the previous step is " for every step in between. All
  whitespace runs (newlines included) are collapsed to single spaces, the
  result is stripped, and its final character is dropped.

  Args:
    row: mapping with "all_steps" (step titles) and "context" (a sequence whose
      i-th entry has a 'body' string for the i-th step).

  Returns:
    The assembled context string ("" when there are no steps).
  """
  context = row["context"]
  steps = row["all_steps"]
  last_idx = len(steps) - 1

  sentences = []
  for idx, title in enumerate(steps):
    if idx == 0:
      lead = "The first step is "
    elif idx == last_idx:
      lead = "The last step is "
    else:
      lead = "After the previous step is "
    sentences.append(lead + str(title) + ": " + context[idx]['body'] + ". ")

  #clean instruction
  # Raw string: '\s' inside a plain literal is an invalid escape sequence
  # (DeprecationWarning, SyntaxWarning on newer Pythons). \s already covers
  # '\n', so the former second pass that replaced newlines was a no-op.
  full_instruction = re.sub(r'\s+', ' ', "".join(sentences)).strip()
  # Drop the final character (the period left over from the last ". ").
  return full_instruction[0:-1]

#generate answers
def generate_answer_and_index(row):
  """Build the SQuAD-style answer dict for one row.

  The answer text is ``row["choice_list"][row.answer]``. Its start offset is
  the first occurrence of that text in ``row.full_instruction``; ``str.find``
  yields -1 when the text does not occur. Only that first occurrence is
  recorded, even if the text appears more than once.

  Returns:
    {"text": [answer_text], "answer_start": [offset]}
  """
  answer_text = row["choice_list"][row.answer]
  first_offset = row.full_instruction.find(answer_text)
  return {"text": [answer_text], "answer_start": [first_offset]}

#combine all functions to make final data
#parameter: dataset_type: this function applies to all train/val/test set
def make_final_data(dataset_type):
  """Run the whole conversion pipeline for one split.

  Builds the combined frame, derives the context string, the question and the
  answer dict for every row, removes rows that repeat an earlier
  (recipe_id, full_instruction, new_question) triple, and renames the columns
  to id / title / context / question / answers.

  Args:
    dataset_type: split name forwarded to ``combine_text_visual_df``.

  Returns:
    DataFrame with columns id, title, context, question, answers, where ``id``
    is the row's position after de-duplication.
  """
  frame = combine_text_visual_df(dataset_type)
  # Order matters: the answer offset is computed against full_instruction.
  frame['full_instruction'] = frame.apply(generate_full_instruction, axis=1)
  frame['new_question'] = frame.apply(generate_questions, axis=1)
  frame['actual_answer'] = frame.apply(generate_answer_and_index, axis=1)

  # Keep only the first row of every (recipe, context, question) combination.
  is_repeat = frame[['recipe_id', 'full_instruction', 'new_question']].duplicated()
  frame = frame[~is_repeat].reset_index(drop=True)

  selected = frame[['recipe_id', 'full_instruction', 'new_question', 'actual_answer']]
  # reset_index() materialises the fresh 0..n-1 index as a column, renamed to 'id' below.
  final_data = selected.reset_index()
  return final_data.rename(columns={
      'index': 'id',
      'recipe_id': 'title',
      'full_instruction': 'context',
      'new_question': 'question',
      'actual_answer': 'answers',
  })

In [None]:
# Convert each raw split into its QA-formatted frame.
train_df = make_final_data(dataset_type='train')
val_df = make_final_data(dataset_type='val')
test_df = make_final_data(dataset_type='test')

# TODO: consider wrapping the conversion steps in a class shared by the train/val/test splits

In [None]:
# Scratch check: find the character offset of a step title inside one example context string.
context = "The first step is Veggie Prep: Tip #1 Remove seeds from jalapeños to keep it more on a medium spicy side. First prepare your veggies. Dice up tomatoes, carrots, cilantro, and romaine lettuce, set aside. Next slice up red onion and a jalapeño in half, place onto greased heated skillet and grill until softened.. After the previous step is Sauce Prep: Tip #2 Sharpen your chef’s knife before each use. Once grilled through, remove veggies from skillet and place into food processor. While you begin to make the sauce, place veggie burger onto already heated skillet, mince it up using a spatula, and grill for about five minutes until hot. Now back to the sauce…add in about a tablespoon of cilantro, 2 tablespoons of ranch, and a squeeze of lemon into the processor and blend until well incorporated. For added spice throw in a dash of Sriracha!. After the previous step is Assemble..: Tip #3 With a good sauce taco dinners go from boring to wow! Once veggie burger is cooked remove from skillet and set aside. Now place a tortilla on the skillet and grill until lightly brown on each side. Lets assemble! Place tortilla onto plate, next add on minced burger, lettuce, diced tomato, carrots, and garnish with the Kickin’ Ranch Sauce.. The last step is Suggestions!: Roll it up and serve!Suggestions: Serve with a side of quinoa for a filling side. Use the Kickin’ Ranch Sauce on any Mexican entree! Serve with a skinny margarita to complete your healthy Mexican meal ;) ***For this recipe and more like this please visit my food blog at Everythingbutfish.tumblr.com"
answer = 'Suggestions!'

# str.find reports the first occurrence only (or -1 when the text is absent).
print(context.find(answer))

1252


In [None]:
# Print every field of the training example stored under index label 5250.
for column_name in train_df:
  print(train_df[column_name][5250])

5250
chili-bean-rollup-with-a-kickin-ranch-sauce
The first step is Veggie Prep: Tip #1 Remove seeds from jalapeños to keep it more on a medium spicy side. First prepare your veggies. Dice up tomatoes, carrots, cilantro, and romaine lettuce, set aside. Next slice up red onion and a jalapeño in half, place onto greased heated skillet and grill until softened.. After the previous step is Sauce Prep: Tip #2 Sharpen your chef’s knife before each use. Once grilled through, remove veggies from skillet and place into food processor. While you begin to make the sauce, place veggie burger onto already heated skillet, mince it up using a spatula, and grill for about five minutes until hot. Now back to the sauce…add in about a tablespoon of cilantro, 2 tablespoons of ranch, and a squeeze of lemon into the processor and blend until well incorporated. For added spice throw in a dash of Sriracha!. After the previous step is Assemble..: Tip #3 With a good sauce taco dinners go from boring to wow! Once

In [None]:
# Wrap each pandas frame as a `datasets.Dataset`, then bundle the three splits.
train, val, test = [
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
]

full_dataset = DatasetDict({'train': train, 'val': val, 'test': test})

#Preprocessing

In [None]:
import os
import torch
import time
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import (
    AlbertConfig,
    AlbertForQuestionAnswering,
    AlbertTokenizer,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample

from transformers.data.metrics.squad_metrics import compute_predictions_logits

# READER NOTE: Set this flag to use own model, or use pretrained model in the Hugging Face repository
use_own_model = True

if use_own_model:
  # NOTE(review): the load log recorded below this cell reports that this checkpoint's
  # classifier weights are unused and the QA head (qa_outputs) is newly initialized,
  # i.e. it needs fine-tuning before its predictions mean anything.
  model_name_or_path = "textattack/albert-base-v2-imdb"
else:
  # Previously there was no else branch, so turning the flag off left
  # model_name_or_path undefined and the from_pretrained calls failed with NameError.
  # Any question-answering checkpoint from the Hugging Face hub can be substituted here.
  model_name_or_path = "twmkn9/albert-base-v2-squad2"

output_dir = ""

# Config
n_best_size = 1                  # passed to compute_predictions_logits
max_answer_length = 30           # passed to compute_predictions_logits
do_lower_case = True             # passed to compute_predictions_logits
null_score_diff_threshold = 0.0  # passed to compute_predictions_logits

def to_list(tensor):
    """Return ``tensor`` as plain Python data, detached from autograd and copied to the CPU."""
    host_tensor = tensor.detach().cpu()
    return host_tensor.tolist()

# Setup model: ALBERT config, tokenizer and QA model, all loaded from `model_name_or_path`.
config_class = AlbertConfig
model_class = AlbertForQuestionAnswering
tokenizer_class = AlbertTokenizer

config = config_class.from_pretrained(model_name_or_path)
tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True)
model = model_class.from_pretrained(model_name_or_path, config=config)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)

processor = SquadV2Processor()

def run_prediction(question_texts, context_text):
    """Run extractive QA for several questions against one context passage.

    Uses the module-level ``model``, ``tokenizer``, ``device``, ``to_list`` and the
    config values ``n_best_size``, ``max_answer_length``, ``do_lower_case`` and
    ``null_score_diff_threshold``.

    Args:
        question_texts: iterable of question strings; the i-th question gets
            ``qas_id == str(i)``.
        context_text: the passage every question is asked against.

    Returns:
        The value returned by ``compute_predictions_logits`` (presumably a
        mapping from qas_id to predicted answer text -- verify against the
        transformers documentation).

    Note:
        ``compute_predictions_logits`` is handed the output paths
        ``predictions.json``, ``nbest_predictions.json`` and
        ``null_predictions.json`` in the current working directory.
    """
    examples = [
        SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            is_impossible=False,
            answers=None,
        )
        for i, question_text in enumerate(question_texts)
    ]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    # Bug fix: batch over the feature tensors built just above. The previous code
    # passed the global `full_dataset` (the recipe DatasetDict) to the sampler and
    # DataLoader, leaving `dataset` unused and the batches unrelated to `examples`.
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []

    # Evaluation mode only has to be switched on once, not per batch.
    model.eval()

    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # batch[3] is used below to look up the matching entry in `features`.
            example_indices = batch[3]

            outputs = model(**inputs)

            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)

                # Index by position instead of iterating `outputs`: this works both
                # for plain tuples and for dict-like ModelOutput objects, whose
                # iteration would yield field names rather than tensors.
                start_logits = to_list(outputs[0][i])
                end_logits = to_list(outputs[1][i])

                all_results.append(SquadResult(unique_id, start_logits, end_logits))

    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_predictions.json"

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,  # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    return predictions

Some weights of the model checkpoint at textattack/albert-base-v2-imdb were not used when initializing AlbertForQuestionAnswering: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at textattack/albert-base-v2-imdb and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# a) Get predictions
# The same checkpoint name supplies both the model and the tokenizer of the pipeline.
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
QA_input = dict(
    question='Why is model conversion important?',
    context='The option to convert models between FARM and transformers gives freedom to the user and let people easily switch between frameworks.',
)
res = nlp(QA_input)

# b) Load model & tokenizer
# NOTE(review): these assignments rebind the global `model` and `tokenizer` that
# run_prediction reads, replacing the ALBERT objects created earlier -- confirm this is intended.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
# Display the pipeline's prediction for QA_input (score, start, end, answer).
res 

{'score': 0.21171392500400543,
 'start': 59,
 'end': 84,
 'answer': 'gives freedom to the user'}