# Install dependencies
We install pytorch with built-in CUDA support. If you don't have CUDA, you can install pytorch without CUDA support. You can find more information [here](https://pytorch.org/get-started/locally/).
Also we install transformers, pandas, mwparserfromhell, nltk, accelerate and nvidia-ml-py3.
We use mwparserfromhell to parse the raw text of the Wikipedia articles, nltk for tokenization, accelerate for multi-GPU training and nvidia-ml-py3 for GPU monitoring.

In [1]:
!pip install torch torchvision torchaudio --index-url https: // download.pytorch.org/whl/cu117
!pip install transformers pandas mwparserfromhell nltk accelerate nvidia-ml-py3 datasets

ERROR: Directory '//' is not installable. Neither 'setup.py' nor 'pyproject.toml' found.






# Download data from Wikipedia
We download each article about some specific category from Wikipedia. We use the category "Science fiction films" as an example. You can change the category to any other category you want. We also remove the references and external links sections from the articles and wiki markup and save the result to a CSV file.

In [2]:
import numpy as np
import os
from typing import List, Tuple, Dict
import mwparserfromhell
import pandas as pd
import requests
from tqdm.auto import tqdm

# Set the category you want to download
csv_filename = "../data/articles.csv"
articles_category = 'Science_fiction_films'

# Get the list of subcategories and articles
def get_category_members(category: str, member_type: str) -> List[str]:
    base_url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': category,
        'cmtype': member_type,
        'format': 'json',
        'cmlimit': 500
    }
    response = requests.get(base_url, params=params)
    data = response.json()
    for item in data['query']['categorymembers']:
        yield item['title']

# Get the raw text of the articles
def get_article_texts(articles: List[str]) -> Dict[str, str]:
    base_url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'titles': '|'.join(articles)
    }
    response = requests.get(base_url, params=params)
    data = response.json()
    for page in data['query']['pages'].values():
        yield page['revisions'][0]['*']

# Download the articles by chunks of 50 articles
def download_articles(category: str):
    # Get the list of subcategories and articles
    subcategories = list(get_category_members(f'Category:{category}', 'subcat'))
    all_articles = []

    # Download the articles from the subcategories
    for subcategory in tqdm(subcategories, desc="Downloading subcategories"):
        articles = list(get_category_members(subcategory, 'page'))
        all_articles.extend(articles)

    # Download the articles by chunks of 50 articles
    for i in tqdm(range(0, len(all_articles), 50), desc="Downloading articles"):
        batch = all_articles[i:i + 50]
        texts = dict(zip(batch, get_article_texts(batch)))
        for title, raw_text in texts.items():
            wikicode = mwparserfromhell.parse(raw_text)
            # Remove the references and external links sections
            for section in wikicode.get_sections(levels=[2]):
                if section.filter_headings()[0].title.strip().lower() in ["references", "external links"]:
                    wikicode.remove(section)
            text = wikicode.strip_code().strip()
            yield {'title': title, 'raw_text': raw_text, 'text': text}

# Download the articles and save them to a CSV file
if not os.path.exists(csv_filename):
    articles_df = pd.DataFrame(download_articles(articles_category))
    articles_df = articles_df.dropna(subset=['text'])
    articles_df.to_csv(csv_filename, index=False)
else:
    # If the CSV file already exists, we just load it
    articles_df = pd.read_csv(csv_filename)
    articles_df.to_csv(csv_filename, index=False)

# Generate question-answer pairs
We use T5 to generate question-answer pairs from the Wikipedia articles. We use the T5-small model and the T5 tokenizer.


In [1]:
from accelerate import Accelerator
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset
from tqdm.auto import tqdm
import mwparserfromhell
import pandas as pd

# Load the model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small", device_map="auto")

# Prepare the model for distributed training
accelerator = Accelerator()
model, tokenizer = accelerator.prepare(model, tokenizer)
device = accelerator.device

# Load the dataset
csv_filename = "../data/articles.csv"
articles_df = pd.read_csv(csv_filename)
articles_dataset = Dataset.from_pandas(articles_df)
csv_questions_filename = "../data/questions.csv"

# Make a query to the model and return the answer
def make_query_to_model(query) -> str:
    input_ids = tokenizer(query, return_tensors="pt").input_ids.to(device)

    outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Generate and evaluate questions and answers
def generate_and_evaluate_questions_and_answers(row):
    text_df = pd.DataFrame(columns=["question", "answer"])
    parsed_text = mwparserfromhell.parse(row["raw_text"])
    for section in parsed_text.get_sections(levels=[2]):
        if section.filter_headings()[0].title.strip().lower() in ["references", "external links"]:
            continue
        question = make_query_to_model(f"Ask the large question about following: {row['title']} - {section.strip_code().strip()}")
        answer = make_query_to_model(f"Answer the question and provide some details. Question: {question}. Context: Article about {row['title']} - {section.strip_code().strip()}")
        if (len(question) < 50) or (len(answer) < 50):
            continue
        text_df = pd.concat([text_df, pd.DataFrame({"question": [question], "answer": [answer]})])
        print(f"Question: {question}\nAnswer: {answer}\n\n")
    return text_df

# Create a new dataframe to store the generated questions and answers
question_answers_df = pd.DataFrame(columns=["question", "answer", "score"])

if not os.path.exists(csv_questions_filename):
    # Generate the questions and answers and save them to a CSV file
    rows = tqdm(
        articles_df.iterrows(),
        desc="Generating questions and answers",
        total=len(articles_df)
    )
    for index, row in rows:
        created_question_answers = generate_and_evaluate_questions_and_answers(row)
        question_answers_df = pd.concat([question_answers_df, created_question_answers])
    # Save the generated questions and answers to a CSV file
    question_answers_df.to_csv(csv_questions_filename, index=False)
else:
    # If the CSV file already exists, we just load it
    question_answers_df = pd.read_csv(csv_questions_filename)

Generating questions and answers:   0%|          | 0/1366 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1594 > 512). Running this sequence through the model will result in indexing errors


Question: Bird began writing its screenplay in earnest the next year; he attempted to distinguish the script from the breadth of superhero-related content released since the first film, focusing on the family dynamics rather than the superhero genre.
Answer: Bird began writing its screenplay in earnest the next year; he attempted to distinguish the script from the breadth of superhero-related content released since the first film, focusing on the family dynamics rather than the superhero genre.


Question: Highlights: Avatar (2009) and Alita: Battle Angel (2019) were both films to be shot using 3D technology. The film received strong critical acclaim and became the highest-grossing film of all time in the United States and Canada, becoming the highest-grossing film of all time in the United States and Canada, surpassing Titanic. The film earned a fair $108 million at the worldwide box office.
Answer: Avatar (2009) and Alita: Battle Angel (2019) were both films to be shot using 3D techn

OutOfMemoryError: CUDA out of memory. Tried to allocate 8.74 GiB (GPU 0; 6.00 GiB total capacity; 466.23 MiB already allocated; 2.99 GiB free; 1.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF