In [1]:
#!pip install firebase-admin
#!pip install pymupdf
#!pip install google-generativeai

In [18]:
# Import nessary libraries
import pandas as pd
import numpy as np
import firebase_admin
import fitz              # pymupdf
import re
import json
import google.generativeai as genai

from firebase_admin import credentials, firestore

In [3]:
# Initialize firebase
cred = credentials.Certificate("setting/serviceAccountKey.json")
firebase_admin.initialize_app(cred)
db = firestore.client()

In [4]:
# Initialize genai
genai.configure(api_key="AIzaSyBS-2pbdjYouOkcqHaX4ZI5HHPpSSmq3iw")
model = genai.GenerativeModel("gemini-2.5-flash")

### Import Question Types

In [5]:
# Read csv file
questionType_df = pd.read_csv("data/questionType.csv")

print(f"Example: \n{questionType_df.head()}\n\n")
print(f"Data Types: \n{questionType_df.dtypes}\n\n")

question_collection_ref = db.collection("QuestionType")
for _, row in questionType_df.iterrows():
    doc_id = row["typeID"]
    question_collection_ref.document(doc_id).set(row.to_dict())

print("QuestionTypes successfully uploaded to Firebase Firestore.")

Example: 
  typeID              type  part                instruction  \
0   T001  Table Completion     1  Complete the table below.   
1   T002   Form Completion     1   Complete the form below.   
2   T003    Plan Labelling     1      Label the plan below.   
3   T004     Map Labelling     2       Label the map below.   
4   T005   Multiple Choice     2                   Question   

                                       answer_format  \
0  Write ONE WORD AND / OR A NUMBER for each answer.   
1  Write ONE WORD AND / OR A NUMBER for each answer.   
2                     Write the correct letter, A-I.   
3                     Write the correct letter, A-I.   
4                           Choose TWO letters, A-E.   

                                         format  \
0  conversation or interview between 2 speakers   
1  conversation or interview between 2 speakers   
2  conversation or interview between 2 speakers   
3      talk or announcement given by one person   
4      talk or anno

### Training Set Extraction

In [6]:
def convert_references(df):
    rows = []
    for _, row in df.iterrows():
        entry = (
            f"TypeID: {row['typeID']}\n"
            f"Type: {row['type']}\n"
            f"Instruction: {row['instruction']}\n"
            f"Answer Format: {row['answer_format']}\n"
            f"Format: {row['format']}\n"
            "---------------------------"
        )
        rows.append(entry)
    return "\n".join(rows)

question_reference_text = convert_references(questionType_df)
print(question_reference_text)

TypeID: T001
Type: Table Completion
Instruction: Complete the table below.
Answer Format: Write ONE WORD AND / OR A NUMBER for each answer.
Format: conversation or interview between 2 speakers
---------------------------
TypeID: T002
Type: Form Completion
Instruction: Complete the form below.
Answer Format: Write ONE WORD AND / OR A NUMBER for each answer.
Format: conversation or interview between 2 speakers
---------------------------
TypeID: T003
Type: Plan Labelling
Instruction: Label the plan below.
Answer Format: Write the correct letter, A-I.
Format: conversation or interview between 2 speakers
---------------------------
TypeID: T004
Type: Map Labelling
Instruction: Label the map below.
Answer Format: Write the correct letter, A-I.
Format: talk or announcement given by one person
---------------------------
TypeID: T005
Type: Multiple Choice
Instruction: Question
Answer Format: Choose TWO letters, A-E.
Format: talk or announcement given by one person
---------------------------


In [7]:
PROMPT_TEXT = """
You are an expert IELTS Listening data extractor.
I will give you raw text extracted from an IELTS Listening paper. Your task is to analyse the text and separate it into sections and question types.

You are also given a list of official IELTS question types and their details:
---
{question_reference_text}
---

Your task is:
1. Each Section may contain multiple question types.
2. If a section has more than one question type, split them into separate sub-parts.
3. For every sub-part, extract:
   - Questions
   - Answers
   - Transcript
   - TypeID [Question Type]
4. Return **only** valid JSON format.
5. Your output should be like this format:
{{
  "Part 1": {{
    "Subparts": [
      {{
        "Questions": [],
        "Answers": [],
        "Transcript": "",
        "Question_Type": ""
      }},
      {{
        "Questions": [],
        "Answers": [],
        "Transcript": "",
        "Question_Type": ""
      }}
    ]
  }},
  "Part 2": {{
    "Subparts": [
      {{
        "Questions": [],
        "Answers": [],
        "Transcript": "",
        "Question_Type": ""
      }}
    ]
  }}
}}

Rules:
1. Do not invent missing text.
2. Keep all JSON arrays and strings valid.
3. If "Answers" or "Transcript" sections appear at the end, match answers and transcript to the correct part based on question numbers.
4. If any information is missing, return empty strings.

Here is the extracted PDF text:
---
{text}
---
"""


In [8]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text

def safe_json_parse(response_text):
    cleaned = re.sub(r"```(?:json)?", "", response_text)
    cleaned = cleaned.replace("```", "").strip()
    match = re.search(r'\{[\s\S]*\}', cleaned)
    if match:
        cleaned = match.group(0)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("JSON parsing failed. Raw model output:\n", response_text)
        return None
    
def extract_ielts_data(text):
    prompt = PROMPT_TEXT.format(text=text, question_reference_text=question_reference_text)
    response = model.generate_content(prompt)
    safe_response = safe_json_parse(response.text)
    print(safe_response)
    return safe_response

def process_pdf(pdf_path, start_id=1):
    print(f"\nProcessing {pdf_path} ...")
    text = extract_text_from_pdf(pdf_path)
    json_data = extract_ielts_data(text)

    if json_data is None:
        print("No valid JSON returned, skipping.")
        return pd.DataFrame(columns=["questionID", "question", "answer", "transcript", "typeID"])

    rows = []
    question_id = start_id

    for part_name, part_content in json_data.items():

        subparts = part_content.get("Subparts", [])

        for sub in subparts:
            q_type = sub.get("Question_Type", "")
            questions = sub.get("Questions", [])
            answers = sub.get("Answers", [])
            transcript = sub.get("Transcript", "")

            if not isinstance(questions, list):
                questions = [questions]
            if not isinstance(answers, list):
                answers = [answers]

            rows.append({
                "questionID": question_id, 
                "question": questions,
                "answer": answers,
                "transcript": transcript,
                "typeID": q_type
            })

            question_id += 1

    df = pd.DataFrame(rows)
    return df


In [None]:
output_csv = "data/training_set.csv"
training_set_df = pd.DataFrame(columns=["questionID", "question", "answer", "transcript", "typeID"])

pdf_folder = "training_set"
pdf_files = [f"training_set/sample_{i}.pdf" for i in range(1, 10)]

next_id = 1
for pdf_path in pdf_files:
    df = process_pdf(pdf_path, start_id=next_id)
    print(df)
    next_id += len(df)
    training_set_df = pd.concat([training_set_df, df], ignore_index=True)


Processing training_set/sample_1.pdf ...
{'Part 1': {'Subparts': [{'Questions': ['Account number: 1 _________', 'Company name: 2 _________', 'Envelopes Size: A4 Colour: 3 _________', 'Quantity: 4 _________', 'Photocopy paper Colour: 5 _________', 'Quantity: 6 _________'], 'Answers': ['692411', 'Rainbow Communications', 'white', 'two/2 boxes', 'light blue', '10 packs/10 packets'], 'Transcript': "Man: Sure! The account number is 6-9-2-4 double 1 [1]\nWoman: Six nine two four one one. Right, and you're from 'Rainbow Computers?'\nMan: No. The company is Rainbow Communications [2].\nWoman: Oh, OK, I'll just fix that on the system communications. And what would you like to order, John?\nMan: Envelopes. We need a box of A4 - that is, normal size envelopes.\nWoman: White, yellow or vanilla?\nMan: We'll have the plain white please [3] - but the ones with the little windows.\nWoman: OK. One box - A4 - white - just the one box, was it?\nMan: Uhm, on second thoughts make that two boxes [4]. We go

### Clean the Training Set

In [12]:
training_set_df.head()

Unnamed: 0,questionID,question,answer,transcript,typeID
0,1,"[Account number: 1 _________, Company name: 2 ...","[692411, Rainbow Communications, white, two/2 ...",Man: Sure! The account number is 6-9-2-4 doubl...,T002
1,2,"[7 _________, 8 _________, 9 _________]","[(coloured) floppy disks, (a/one) wall calenda...","Woman: Pens, paper clips, fax paper, computer ...",T011
2,3,[Special instructions: Deliver goods 10 ______...,[before 11.30 (AM)],"Man: Yes, good idea. And when can you deliver ...",T011
3,4,"[Venue: 11 _________, No. 1 12 _________, Date...","[Royal Museum, Queen's Park Road/Rd, 10th Dece...",The exhibition will include many different typ...,T011
4,5,"[According to Alison Sharp, bear ancestors dat...","[40 million, dogs/the dog, only ate plants]","Alison: Well, the bears we know today actually...",T009


In [13]:
print(f"Data Types: \n{training_set_df.dtypes}")
print(f"Number of Entries: {len(training_set_df)}")

Data Types: 
questionID    object
question      object
answer        object
transcript    object
typeID        object
dtype: object
Number of Entries: 79


In [19]:
training_set_df_clean = training_set_df.replace(r'^\s*$', np.nan, regex=True)
empty_rows = training_set_df_clean[training_set_df_clean.isna().any(axis=1)]
print(empty_rows)

    questionID                                           question  \
40          41  [Questions 4-7, Complete the form below., Writ...   
43          44  [Question 10, Write NO MORE THAN THREE WORDS f...   
44          45  [Questions 11–20, Complete the notes below., W...   
45          46  [Questions 21, 22, Complete the notes below., ...   
47          48  [Questions 27-30, Complete the chart about the...   
48          49  [Questions 31-32, Complete the notes using NO ...   
49          50  [Questions 33-36, Complete the table using NO ...   
50          51  [Questions 37-40, Complete the flow chart usin...   

                                               answer  \
40  [McDonald/Macdonald/MacDonald, Post Office Box...   
43                                [after (the) exams]   
44  [473, (open) 2/two(-)seater, smooth, 180, fram...   
45           [Out and About, (the) university/campus]   
47                   [Poor, Excellent, OK, Excellent]   
48  [human activity/activities, farm

  training_set_df_clean = training_set_df.replace(r'^\s*$', np.nan, regex=True)


In [20]:
training_set_df = training_set_df_clean.dropna()
print(f"Number of Entries after dropping NAs: {len(training_set_df)}")

Number of Entries after dropping NAs: 71


In [21]:
training_set_df.to_csv(output_csv, index=False, encoding="utf-8")
print(f"\n All PDFs processed and saved to {output_csv}")


 All PDFs processed and saved to data/training_set.csv
