In [1]:
import os
import subprocess
import json

**Q&A from match reports**

In [None]:
import os
import csv
import subprocess
import re

# Project layout: everything is resolved relative to the parent of the
# current working directory.
base_dir = os.path.abspath(os.pardir)
data_dir, output_dir = (
    os.path.join(base_dir, "data"),
    os.path.join(base_dir, "outputs", "insights_qa"),
)

# Make sure the output folder (and any missing parents) exists.
os.makedirs(output_dir, exist_ok=True)

# Stop right away when there is no input folder to read from.
if not os.path.exists(data_dir):
    raise FileNotFoundError(f"Data folder not found at: {data_dir}")

# Prompt text for the model. `{report_text}` is the only placeholder in the
# template; generate_qa_from_text fills it in with str.format, so any literal
# braces added to this text later would have to be doubled ("{{" / "}}").
# The prompt asks for five Q&A pairs about Argentina, returned as a JSON list
# of objects with 'question' and 'answer' keys.
prompt_template = """
You are a professional football analyst. Read the following match reports. Your goal is to generate 5 concise and high-quality question-answer pairs based on Argentina's performance in the match.
You may refer to the opponent's strategy or events only when it directly relates to Argentina’s tactics, decisions, or key moments.
Use concise and clear answers. Output the result as a JSON list with keys 'question' and 'answer'.

Instructions:
1. Read the match report carefully.
2. Identify key moments, strategies, and decisions made by Argentina.
3. Formulate questions that are relevant to Argentina's performance.
4. Provide clear and concise answers based on the report.
5. Ensure that the output is in JSON format with 'question' and 'answer' keys.

Match Report:
{report_text}
"""

def parse_qa_pairs(output):
    """Extract question/answer pairs from raw model output.

    Strategies are tried from most to least reliable; the first one that
    yields at least one pair wins:

    1. A JSON list of objects embedded anywhere in the text.
    2. Stand-alone ``{"question": ..., "answer": ...}`` objects.
    3. Numbered lines of the form ``1. Question - Answer``.

    Args:
        output: Text produced by the model; prose around the data is fine.

    Returns:
        A list of dicts, each holding exactly the keys ``"question"`` and
        ``"answer"`` (any extra keys the model emitted are dropped so the
        result can be written with a fixed two-column schema).  An empty
        list is returned when nothing could be parsed.
    """
    import json

    if not output:
        return []

    def _clean(items):
        """Return items reduced to the two expected keys, or None if malformed."""
        if not isinstance(items, list) or not items:
            return None
        cleaned = []
        for item in items:
            # Require real dicts: `"question" in "some string"` would be a
            # substring test and let plain strings through.
            if not (isinstance(item, dict) and "question" in item and "answer" in item):
                return None
            cleaned.append({"question": item["question"], "answer": item["answer"]})
        return cleaned

    # 1. Try a JSON list. Every '[' is tried as a possible start so that
    #    brackets in the surrounding prose do not hide the real list.
    decoder = json.JSONDecoder()
    last_error = "no JSON list found"
    start = output.find('[')
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(output[start:])
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            last_error = str(e)
        else:
            qa_list = _clean(candidate)
            if qa_list is not None:
                return qa_list
            last_error = "JSON list does not contain question/answer objects"
        start = output.find('[', start + 1)
    print("JSON parsing failed. Trying fallback:", last_error)

    # 2. Fallback: many individual JSON objects (not in a list). A block that
    #    fails to decode is skipped instead of discarding the good ones.
    qa_json_pattern = re.compile(r'\{\s*"question":\s*".+?",\s*"answer":\s*".+?"\s*\}', re.DOTALL)
    qa_pairs = []
    for block in qa_json_pattern.findall(output):
        try:
            parsed = json.loads(block)
        except ValueError as e:
            print("Could not parse individual QA JSON blocks:", str(e))
            continue
        cleaned = _clean([parsed])
        if cleaned:
            qa_pairs.extend(cleaned)
    if qa_pairs:
        return qa_pairs

    # 3. Fallback: numbered Q&A like "1. What...? - Answer". This is the
    #    loosest heuristic, so it runs last.
    qa_pattern = re.compile(r"\d+\.\s*(.+?)\s*[-–]\s*(.+)")
    matches = qa_pattern.findall(output)
    if matches:
        return [{"question": q.strip(), "answer": a.strip()} for q, a in matches]

    return []  # Still nothing matched

def generate_qa_from_text(text, model="llama2", timeout=120):
    """Ask a local Ollama model for Q&A pairs about one match report.

    The report is inserted into ``prompt_template`` and piped to
    ``ollama run <model>`` on stdin.

    Args:
        text: Match report text.
        model: Name of the Ollama model to run (defaults to ``"llama2"``).
        timeout: Seconds to wait for the subprocess before giving up.

    Returns:
        A ``(qa_pairs, raw_output)`` tuple: ``qa_pairs`` is whatever
        ``parse_qa_pairs`` extracted and ``raw_output`` is the decoded,
        stripped stdout of the model.  ``(None, None)`` is returned when the
        subprocess times out or any other error occurs.
    """
    prompt = prompt_template.format(report_text=text)

    try:
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode('utf-8'),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout
        )
        if result.returncode != 0:
            # Surface the failure instead of silently parsing empty stdout.
            stderr_text = result.stderr.decode("utf-8", errors="replace").strip()
            print(f"ollama exited with code {result.returncode}: {stderr_text[:300]}")

        # errors="replace": a stray invalid byte should not cost us the
        # whole response.
        output = result.stdout.decode("utf-8", errors="replace").strip()
        print("Subprocess output:", output[:300], "...\n")  # Truncated for brevity
        return parse_qa_pairs(output), output

    except subprocess.TimeoutExpired:
        print("Timeout reached while generating response")
    except Exception as e:
        print(f"Error during subprocess: {e}")
    return None, None

def save_qa_to_csv(qa_pairs, file_path):
    """Write Q&A pairs to a UTF-8 CSV file with a ``question,answer`` header.

    Args:
        qa_pairs: Iterable of dicts holding ``"question"`` and ``"answer"``.
            Any other keys are ignored rather than raising ``ValueError``,
            because model-generated objects may carry extra fields.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(
            file,
            fieldnames=["question", "answer"],
            extrasaction="ignore",
        )
        writer.writeheader()
        writer.writerows(qa_pairs)

# Main loop: every sub-folder of data_dir is one match; every .txt file in it
# is one report. Parsed Q&A goes to a CSV; if nothing could be parsed, the raw
# model output is saved as "<name>_raw.txt" instead.
# Folders and files are visited in sorted order so runs are reproducible.
for folder in sorted(os.listdir(data_dir)):
    match_path = os.path.join(data_dir, folder)
    if not os.path.isdir(match_path):
        continue

    txt_files = sorted(f for f in os.listdir(match_path) if f.endswith(".txt"))
    if not txt_files:
        print(f"No text files found in {folder}")
        continue

    print(f"\nProcessing match: {folder}")
    for txt_file in txt_files:
        txt_file_path = os.path.join(match_path, txt_file)

        with open(txt_file_path, "r", encoding="utf-8") as f:
            report_text = f.read()

        print(f"Generating Q&A for file: {txt_file}")
        qa_pairs, raw_output = generate_qa_from_text(report_text)

        if qa_pairs:
            out_file_name = f"{folder}_{os.path.splitext(txt_file)[0]}.csv"
            out_path = os.path.join(output_dir, out_file_name)
            save_qa_to_csv(qa_pairs, out_path)
            print(f"Saved: {out_path}")
        else:
            print(f"Skipped (no Q&A parsed): {txt_file}")
            if raw_output is None:
                # generate_qa_from_text returns (None, None) on timeout or
                # error; writing None would raise TypeError and abort the run.
                print(f"No raw output to save for: {txt_file}")
                continue
            # Save raw output to a text file
            raw_out_file_name = f"{folder}_{os.path.splitext(txt_file)[0]}_raw.txt"
            raw_out_path = os.path.join(output_dir, raw_out_file_name)
            with open(raw_out_path, "w", encoding="utf-8") as raw_out_f:
                raw_out_f.write(raw_output)
            print(f"Saved raw output: {raw_out_path}")

print("\nAll processing complete.")


In [1]:
import os
import json
import re

# Input is the folder of raw model dumps written earlier; output is a sibling
# folder for the JSON conversions. Both hang off the parent of the current
# working directory.
base_dir = os.path.abspath(os.pardir)
data_dir, output_dir = (
    os.path.join(base_dir, "outputs", "insights_qa"),
    os.path.join(base_dir, "outputs", "insights_qa_json"),
)

# Create the destination folder if it is not there yet.
os.makedirs(output_dir, exist_ok=True)

def parse_qa_pairs(text):
    """Extract question/answer pairs from loosely formatted plain text.

    A pair is a question line followed by an answer line, e.g.::

        Question 1: What formation did Argentina use?
        Answer 1: 4-3-3

    Labels are case-insensitive and optional (``Q``/``Question``,
    ``A``/``Answer``), may carry a number, and are separated from the text by
    ``:`` or ``-``.  Blank lines or indentation before the answer label are
    tolerated.

    Note: this definition shares its name with the JSON-oriented
    ``parse_qa_pairs`` defined earlier in the notebook and replaces it once
    this cell has run.

    Args:
        text: Raw model output to scan.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts in order of
        appearance; empty when nothing matches.
    """
    # The answer label accepts an optional number only when the label itself
    # is present, so an unlabeled line like "2-1" is not mistaken for an
    # answer.
    qa_pattern = re.compile(
        r"(?:Q(?:uestion)?)?\s*(\d*)\s*[:-]\s*(.+)\n"
        r"\s*(?:A(?:nswer)?\s*\d*)?\s*[:-]\s*(.+)",
        re.IGNORECASE,
    )
    # A match can begin at a ':' ending an earlier line (e.g. an intro
    # sentence); the captured question then still carries its own
    # "1. Question:" prefix, which is removed here.
    leading_label = re.compile(
        r"^(?:\d+[.)]\s*)?Q(?:uestion)?\s*\d*\s*[:-]\s*",
        re.IGNORECASE,
    )

    qa_pairs = []
    for _number, question, answer in qa_pattern.findall(text or ""):
        question = leading_label.sub("", question.strip()).strip()
        answer = answer.strip()
        if question and answer:
            qa_pairs.append({"question": question, "answer": answer})
    return qa_pairs

def save_qa_to_json(qa_pairs, file_path):
    """Serialise the Q&A pairs to ``file_path`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as ``\\u`` escapes, and
    an existing file at ``file_path`` is overwritten.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(file_path, "w", encoding="utf-8") as handle:
        json.dump(qa_pairs, handle, **dump_options)

# Convert every "*_raw.txt" dump found in data_dir into a JSON file of Q&A
# pairs in output_dir; files of any other kind are ignored.
for file_name in os.listdir(data_dir):
    if not file_name.endswith("_raw.txt"):
        continue

    file_path = os.path.join(data_dir, file_name)
    print(f"Processing file: {file_path}")

    with open(file_path, "r", encoding="utf-8") as file:
        raw_output = file.read()
    qa_pairs = parse_qa_pairs(raw_output)

    # Nothing recognisable in this dump: report it and move on.
    if not qa_pairs:
        print(f"Skipped (no Q&A parsed): {file_name}")
        continue

    json_file_name = file_name.replace("_raw.txt", ".json")
    json_file_path = os.path.join(output_dir, json_file_name)
    save_qa_to_json(qa_pairs, json_file_path)
    print(f"Saved: {json_file_path}")

print("Processing complete.")


Processing file: d:\Masters\hcnlp_project\outputs\insights_qa\1_Arg_vs_Saudi_G1_Text_report_BBC_raw.txt
Saved: d:\Masters\hcnlp_project\outputs\insights_qa_json\1_Arg_vs_Saudi_G1_Text_report_BBC.json
Processing file: d:\Masters\hcnlp_project\outputs\insights_qa\1_Arg_vs_Saudi_G1_Text_Report_The_Guardian_raw.txt
Saved: d:\Masters\hcnlp_project\outputs\insights_qa_json\1_Arg_vs_Saudi_G1_Text_Report_The_Guardian.json
Processing file: d:\Masters\hcnlp_project\outputs\insights_qa\2_Arg_vs_Mex_G2_Text_Report_BBC_raw.txt
Saved: d:\Masters\hcnlp_project\outputs\insights_qa_json\2_Arg_vs_Mex_G2_Text_Report_BBC.json
Processing file: d:\Masters\hcnlp_project\outputs\insights_qa\2_Arg_vs_Mex_G2_Text_Report_The_Guardian_raw.txt
Saved: d:\Masters\hcnlp_project\outputs\insights_qa_json\2_Arg_vs_Mex_G2_Text_Report_The_Guardian.json
Processing file: d:\Masters\hcnlp_project\outputs\insights_qa\3_Arg_vs_Pol_G3_Text_Report_BBC_raw.txt
Saved: d:\Masters\hcnlp_project\outputs\insights_qa_json\3_Arg_vs_Pol_