In [1]:
import os
import subprocess
import json

**Q&A from insights**

In [5]:
import os
import json
import subprocess
import re

# Anchor every location at the parent of the current working directory and
# make it absolute, so later file access does not rely on relative lookups.
base_dir = os.path.abspath("..")
output_dir = os.path.join(base_dir, "outputs", "insights_qa")
data_dir = os.path.join(base_dir, "data")

# Make sure the destination folder exists; an already existing one is fine.
os.makedirs(output_dir, exist_ok=True)

# Stop right away when the input folder is missing.
if not os.path.exists(data_dir):
    raise FileNotFoundError(f"Data folder not found at: {data_dir}")

# Prompt sent to the model; the {report_text} placeholder is filled in with
# str.format for each report.
prompt_template = """
You are an expert in football analysis. Generate 5 structured question-answer pairs from the following match report.
Use concise and clear answers. Output the result as a JSON list with keys 'question' and 'answer'.

Match Report:
{report_text}
"""

def parse_qa_pairs(text):
    """Extract question/answer pairs from raw model output.

    Two response layouts are recognised:

    1. A JSON list of objects carrying ``question`` and ``answer`` keys,
       which is the format the prompt asks for.
    2. Plain text where a ``Question N: ...`` line is immediately followed
       by an ``Answer: ...`` line.

    Args:
        text: Raw text returned by the model. ``None`` or an empty string is
            treated as "nothing to parse".

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts with surrounding
        whitespace stripped; empty when no pair could be recognised.
    """
    if not text:
        return []

    # JSON first: slice from the first "[" to the last "]" so any chatter
    # around the list is ignored. A failed decode simply falls through.
    start, end = text.find("["), text.rfind("]")
    if 0 <= start < end:
        try:
            decoded = json.loads(text[start:end + 1])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            decoded = None
        if isinstance(decoded, list):
            json_pairs = [
                {
                    "question": str(item["question"]).strip(),
                    "answer": str(item["answer"]).strip(),
                }
                for item in decoded
                if isinstance(item, dict) and "question" in item and "answer" in item
            ]
            if json_pairs:
                return json_pairs

    # Fallback: "Question N: ..." directly followed by "Answer: ...".
    # "." does not cross newlines, so each capture is a single line.
    qa_pattern = re.compile(r"Question \d+: (.+)\nAnswer: (.+)")
    return [
        {"question": question.strip(), "answer": answer.strip()}
        for question, answer in qa_pattern.findall(text)
    ]

def generate_qa_from_text(text):
    """Ask the ``llama2`` model (through the ``ollama`` CLI) for Q&A pairs.

    The report is inserted into ``prompt_template`` and piped to
    ``ollama run llama2`` on stdin; the command's stdout is handed to
    ``parse_qa_pairs``.

    Args:
        text: Match report text to embed in the prompt.

    Returns:
        The list returned by ``parse_qa_pairs`` on success, or ``None`` when
        the command times out (120 s), exits with a non-zero status, or any
        other error occurs. Errors are printed, not raised.
    """
    prompt = prompt_template.format(report_text=text)

    try:
        result = subprocess.run(
            ["ollama", "run", "llama2"],
            input=prompt.encode('utf-8'),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=120
        )

        # A failed run would otherwise look like an empty answer, because
        # stderr is captured but never shown. Surface it and bail out.
        if result.returncode != 0:
            error_text = result.stderr.decode("utf-8", errors="replace").strip()
            print(f"ollama exited with code {result.returncode}: {error_text}")
            return None

        output = result.stdout.decode("utf-8")
        print("Subprocess output:", output)  # Debugging statement
        parsed_pairs = parse_qa_pairs(output)
        print("Parsed Q&A pairs:", parsed_pairs)  # Debugging statement
        return parsed_pairs

    except subprocess.TimeoutExpired:
        print("Timeout reached while generating response")
    except Exception as e:
        # Best effort by design: report the problem and let the caller skip.
        print(f"An error occurred: {e}")
    return None

# Process one sub-folder of data_dir per match. Both listings are sorted
# because os.listdir returns entries in arbitrary order; sorting makes the
# processing order and the chosen report file reproducible between runs.
for folder in sorted(os.listdir(data_dir)):
    match_path = os.path.join(data_dir, folder)
    if not os.path.isdir(match_path):
        continue

    txt_files = sorted(f for f in os.listdir(match_path) if f.endswith(".txt"))
    if not txt_files:
        print(f"No report.txt found in {folder}")
        continue

    # Only the first .txt file (in sorted order) is used as the report.
    with open(os.path.join(match_path, txt_files[0]), "r", encoding="utf-8") as f:
        report_text = f.read()

    print(f"Generating Q&A for: {folder}")
    qa_pairs = generate_qa_from_text(report_text)

    # None (failure) and an empty list (nothing parsed) are both skipped.
    if qa_pairs:
        out_path = os.path.join(output_dir, f"{folder}.json")
        with open(out_path, "w", encoding="utf-8") as out_f:
            json.dump(qa_pairs, out_f, indent=2, ensure_ascii=False)
        print(f"Saved: {out_path}")
    else:
        print(f"Skipped: {folder}")


Generating Q&A for: 1_Arg_vs_Saudi_G1
Subprocess output: Here are 5 structured question-answer pairs based on the provided match report:

Question 1: What was the final score in the match between Saudi Arabia and Argentina?
Answer: The final score was 2-1 in favor of Saudi Arabia.

Question 2: How did Lionel Messi perform in the match for Argentina?
Answer: Lionel Messi scored a penalty kick for Argentina in the first half, but Saudi Arabia was able to come back and defeat them.

Question 3: What was the key to Saudi Arabia's victory?
Answer: Saudi Arabia's ability to defend against Argentina's attacking runs and convert their chances into goals was the key to their victory.

Question 4: How did Argentina react to falling behind?
Answer: Argentina pushed forward in search of a way back into the game, but Saudi Arabia's defense held strong and they were unable to find an equalizer.

Question 5: What is the significance of Saudi Arabia's victory over Argentina?
Answer: Saudi Arabia's vic

**For missed files**

In [6]:
import os
import json
import subprocess
import re

# Anchor every location at the parent of the current working directory and
# make it absolute, so later file access does not rely on relative lookups.
# In this cell the input folder is "temp".
base_dir = os.path.abspath("..")
output_dir = os.path.join(base_dir, "outputs", "insights_qa")
data_dir = os.path.join(base_dir, "temp")

# Make sure the destination folder exists; an already existing one is fine.
os.makedirs(output_dir, exist_ok=True)

# Stop right away when the input folder is missing.
if not os.path.exists(data_dir):
    raise FileNotFoundError(f"Data folder not found at: {data_dir}")

# Prompt sent to the model; the {report_text} placeholder is filled in with
# str.format for each report.
prompt_template = """
You are an expert in football analysis. Generate 5 structured question-answer pairs from the following match report.
Use concise and clear answers. Output the result as a JSON list with keys 'question' and 'answer'.

Match Report:
{report_text}
"""

def parse_qa_pairs(text):
    """Extract question/answer pairs from raw model output.

    Two response layouts are recognised:

    1. A JSON list of objects carrying ``question`` and ``answer`` keys,
       which is the format the prompt asks for.
    2. Plain text where a ``Question N: ...`` line is immediately followed
       by an ``Answer: ...`` line.

    Args:
        text: Raw text returned by the model. ``None`` or an empty string is
            treated as "nothing to parse".

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts with surrounding
        whitespace stripped; empty when no pair could be recognised.
    """
    if not text:
        return []

    # JSON first: slice from the first "[" to the last "]" so any chatter
    # around the list is ignored. A failed decode simply falls through.
    start, end = text.find("["), text.rfind("]")
    if 0 <= start < end:
        try:
            decoded = json.loads(text[start:end + 1])
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            decoded = None
        if isinstance(decoded, list):
            json_pairs = [
                {
                    "question": str(item["question"]).strip(),
                    "answer": str(item["answer"]).strip(),
                }
                for item in decoded
                if isinstance(item, dict) and "question" in item and "answer" in item
            ]
            if json_pairs:
                return json_pairs

    # Fallback: "Question N: ..." directly followed by "Answer: ...".
    # "." does not cross newlines, so each capture is a single line.
    qa_pattern = re.compile(r"Question \d+: (.+)\nAnswer: (.+)")
    return [
        {"question": question.strip(), "answer": answer.strip()}
        for question, answer in qa_pattern.findall(text)
    ]

def generate_qa_from_text(text):
    """Ask the ``llama2`` model (through the ``ollama`` CLI) for Q&A pairs.

    The report is inserted into ``prompt_template`` and piped to
    ``ollama run llama2`` on stdin; the command's stdout is handed to
    ``parse_qa_pairs``.

    Args:
        text: Match report text to embed in the prompt.

    Returns:
        The list returned by ``parse_qa_pairs`` on success, or ``None`` when
        the command times out (120 s), exits with a non-zero status, or any
        other error occurs. Errors are printed, not raised.
    """
    prompt = prompt_template.format(report_text=text)

    try:
        result = subprocess.run(
            ["ollama", "run", "llama2"],
            input=prompt.encode('utf-8'),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=120
        )

        # A failed run would otherwise look like an empty answer, because
        # stderr is captured but never shown. Surface it and bail out.
        if result.returncode != 0:
            error_text = result.stderr.decode("utf-8", errors="replace").strip()
            print(f"ollama exited with code {result.returncode}: {error_text}")
            return None

        output = result.stdout.decode("utf-8")
        print("Subprocess output:", output)  # Debugging statement
        parsed_pairs = parse_qa_pairs(output)
        print("Parsed Q&A pairs:", parsed_pairs)  # Debugging statement
        return parsed_pairs

    except subprocess.TimeoutExpired:
        print("Timeout reached while generating response")
    except Exception as e:
        # Best effort by design: report the problem and let the caller skip.
        print(f"An error occurred: {e}")
    return None

# Process one sub-folder of data_dir per match. Both listings are sorted
# because os.listdir returns entries in arbitrary order; sorting makes the
# processing order and the chosen report file reproducible between runs.
for folder in sorted(os.listdir(data_dir)):
    match_path = os.path.join(data_dir, folder)
    if not os.path.isdir(match_path):
        continue

    txt_files = sorted(f for f in os.listdir(match_path) if f.endswith(".txt"))
    if not txt_files:
        print(f"No report.txt found in {folder}")
        continue

    # Only the first .txt file (in sorted order) is used as the report.
    with open(os.path.join(match_path, txt_files[0]), "r", encoding="utf-8") as f:
        report_text = f.read()

    print(f"Generating Q&A for: {folder}")
    qa_pairs = generate_qa_from_text(report_text)

    # None (failure) and an empty list (nothing parsed) are both skipped.
    if qa_pairs:
        out_path = os.path.join(output_dir, f"{folder}.json")
        with open(out_path, "w", encoding="utf-8") as out_f:
            json.dump(qa_pairs, out_f, indent=2, ensure_ascii=False)
        print(f"Saved: {out_path}")
    else:
        print(f"Skipped: {folder}")


Generating Q&A for: 3_Arg_vs_Pol_G3
Subprocess output: Here are 5 structured question-answer pairs based on the provided match report:

Question 1: Who won the match between Poland and Argentina?
Answer: Argentina won the match with a score of 2-0.

Question 2: What was the catalyst for Argentina's win against Mexico that got their campaign back on track?
Answer: Lionel Messi was the catalyst for the crucial win against Mexico, illustrating why they were regarded as one of the tournament favourites.

Question 3: How did Poland qualify for the last 16 in the World Cup?
Answer: Poland qualified for the last 16 by virtue of the Fifa Fair Play rule, having fewer yellow cards than Mexico.

Question 4: What was the score when Saudi Arabia pulled back a goal deep into stoppage time against Mexico?
Answer: The score was 2-1 in favour of Saudi Arabia when they pulled back a goal deep into stoppage time against Mexico, which had a significant impact on Poland's qualification.

Question 5: How di