Done

In [1]:
#!pip install firebase-admin
#!pip install pymupdf
#!pip install google-generativeai

In [2]:
# Import nessary libraries
import pandas as pd
import numpy as np
import firebase_admin
import fitz              # pymupdf
import re
import json
import os
import google.generativeai as genai

from firebase_admin import credentials, firestore

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Initialize firebase
cred = credentials.Certificate("backend/setting/serviceAccountKey.json")
firebase_admin.initialize_app(cred)
db = firestore.client()

In [4]:
# Initialize genai
genai.configure(api_key="AIzaSyCSC0LPUznCj0USGxAVXjXT_4vgVqp-ah4")
model = genai.GenerativeModel("gemini-2.5-flash")

### Import Question Types to Database

In [5]:
# Read csv file
questionType_df = pd.read_csv("processed_data/questionType.csv")

print(f"Example: \n{questionType_df.head()}\n\n")
print(f"Data Types: \n{questionType_df.dtypes}\n\n")

question_collection_ref = db.collection("QuestionType")
for _, row in questionType_df.iterrows():
    doc_id = row["typeID"]
    question_collection_ref.document(doc_id).set(row.to_dict())

print("QuestionTypes successfully uploaded to Firebase Firestore.")

Example: 
  typeID              type  part                instruction  \
0   T001  Table Completion     1  Complete the table below.   
1   T002   Form Completion     1   Complete the form below.   
2   T003    Plan Labelling     1      Label the plan below.   
3   T004     Map Labelling     2       Label the map below.   
4   T005   Multiple Choice     2                   Question   

                                       answer_format  \
0  Write ONE WORD AND / OR A NUMBER for each answer.   
1  Write ONE WORD AND / OR A NUMBER for each answer.   
2                     Write the correct letter, A-I.   
3                     Write the correct letter, A-I.   
4                           Choose TWO letters, A-E.   

                                         format  \
0  conversation or interview between 2 speakers   
1  conversation or interview between 2 speakers   
2  conversation or interview between 2 speakers   
3      talk or announcement given by one person   
4      talk or anno

### Training Set Extraction

In [6]:
def convert_references(df):
    rows = []
    for _, row in df.iterrows():
        entry = (
            f"TypeID: {row['typeID']}\n"
            f"Type: {row['type']}\n"
            f"Instruction: {row['instruction']}\n"
            f"Answer Format: {row['answer_format']}\n"
            f"Format: {row['format']}\n"
            "---------------------------"
        )
        rows.append(entry)
    return "\n".join(rows)

question_reference_text = convert_references(questionType_df)
print(question_reference_text)

TypeID: T001
Type: Table Completion
Instruction: Complete the table below.
Answer Format: Write ONE WORD AND / OR A NUMBER for each answer.
Format: conversation or interview between 2 speakers
---------------------------
TypeID: T002
Type: Form Completion
Instruction: Complete the form below.
Answer Format: Write ONE WORD AND / OR A NUMBER for each answer.
Format: conversation or interview between 2 speakers
---------------------------
TypeID: T003
Type: Plan Labelling
Instruction: Label the plan below.
Answer Format: Write the correct letter, A-I.
Format: conversation or interview between 2 speakers
---------------------------
TypeID: T004
Type: Map Labelling
Instruction: Label the map below.
Answer Format: Write the correct letter, A-I.
Format: talk or announcement given by one person
---------------------------
TypeID: T005
Type: Multiple Choice
Instruction: Question
Answer Format: Choose TWO letters, A-E.
Format: talk or announcement given by one person
---------------------------


In [7]:
PROMPT_TEXT = """
You are an expert IELTS Listening data extractor.
I will give you raw text extracted from an IELTS Listening paper. Your task is to analyse the text and separate it into sections and question types.

You are also given a list of official IELTS question types and their details:
---
{question_reference_text}
---

Your task is:
1. Each Section may contain multiple question types.
2. If a section has more than one question type, split them into separate sub-parts.
3. For every sub-part, extract:
   - Questions
   - Answers
   - Transcript
   - TypeID [Question Type]
4. Return **only** valid JSON format.
5. Your output should be like this format:
{{
  "Part 1": {{
    "Subparts": [
      {{
        "Questions": [],
        "Answers": [],
        "Transcript": "",
        "Question_Type": ""
      }},
      {{
        "Questions": [],
        "Answers": [],
        "Transcript": "",
        "Question_Type": ""
      }}
    ]
  }},
  "Part 2": {{
    "Subparts": [
      {{
        "Questions": [],
        "Answers": [],
        "Transcript": "",
        "Question_Type": ""
      }}
    ]
  }}
}}

Rules:
1. Do not invent missing text.
2. Keep all JSON arrays and strings valid.
3. If "Answers" or "Transcript" sections appear at the end, match answers and transcript to the correct part based on question numbers.
4. If any information is missing, return empty strings.

Here is the extracted PDF text:
---
{text}
---
"""


In [8]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text

def safe_json_parse(response_text):
    cleaned = re.sub(r"```(?:json)?", "", response_text)
    cleaned = cleaned.replace("```", "").strip()
    match = re.search(r'\{[\s\S]*\}', cleaned)
    if match:
        cleaned = match.group(0)
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("JSON parsing failed. Raw model output:\n", response_text)
        return None
    
def extract_ielts_data(text):
    prompt = PROMPT_TEXT.format(text=text, question_reference_text=question_reference_text)
    response = model.generate_content(prompt)
    safe_response = safe_json_parse(response.text)
    print(safe_response)
    return safe_response

def process_pdf(pdf_path, start_id=1):
    print(f"\nProcessing {pdf_path} ...")
    text = extract_text_from_pdf(pdf_path)
    json_data = extract_ielts_data(text)

    if json_data is None:
        print("No valid JSON returned, skipping.")
        return pd.DataFrame(columns=["questionID", "question", "answer", "transcript", "typeID"])

    rows = []
    question_id = start_id

    for part_name, part_content in json_data.items():

        subparts = part_content.get("Subparts", [])

        for sub in subparts:
            q_type = sub.get("Question_Type", "")
            questions = sub.get("Questions", [])
            answers = sub.get("Answers", [])
            transcript = sub.get("Transcript", "")

            if not isinstance(questions, list):
                questions = [questions]
            if not isinstance(answers, list):
                answers = [answers]

            questionID = 'Q' + str(question_id)

            rows.append({
                "questionID": questionID, 
                "question": questions,
                "answer": answers,
                "transcript": transcript,
                "typeID": q_type
            })

            question_id += 1

    df = pd.DataFrame(rows)
    return df


In [9]:
output_csv = "processed_data/training_set.csv"
training_set_df = pd.DataFrame(columns=["questionID", "question", "answer", "transcript", "typeID"])

pdf_folder = "data/training_set"
pdf_files = [f"data/training_set/sample_{i}.pdf" for i in range(1, 10)]

next_id = 1
for pdf_path in pdf_files:
    df = process_pdf(pdf_path, start_id=next_id)
    print(df)
    next_id += len(df)
    training_set_df = pd.concat([training_set_df, df], ignore_index=True)


Processing data/training_set/sample_1.pdf ...
{'Part 1': {'Subparts': [{'Questions': ['Complete the form below', 'Customer Order Form', 'Example', 'Order placed by: John Carter', 'Account number: 1 _________', 'Company name: 2 _________', 'Envelopes', 'Size: A4', 'Colour: 3 _________', 'Quantity: 4 _________', 'Photocopy paper', 'Colour: 5 _________', 'Quantity: 6 _________'], 'Answers': ['692411', 'Rainbow Communications', 'white', 'two/2 boxes', 'light blue', '10 packs/10 packets'], 'Transcript': "Recorded message: Thank you for calling Millennium Office Supplies. If you would like to place an order, please press \none. Your call has been placed in a queue. A customer service operator will be with you shortly.\nWoman: Gina speaking. How can I help you?\nMan: Oh, hello - I'd like to order some stationery, please.\nWoman: And who am I speaking to?\nMan: John Carter.\nWoman: Right - can I just confirm your account number and the name of your company, John?\nMan: Sure! The account numbe

### Clean the Training Set

In [10]:
training_set_df.head()

Unnamed: 0,questionID,question,answer,transcript,typeID
0,Q1,"[Complete the form below, Customer Order Form,...","[692411, Rainbow Communications, white, two/2 ...",Recorded message: Thank you for calling Millen...,T002
1,Q2,[Name THREE additional items the man requests....,"[(coloured) floppy disks, (a/one) wall calenda...",Woman: Ten packs of the light blue. Anything e...,T002
2,Q3,"[Complete the note., Special instructions: Del...",[before 11.30 (AM)],Woman: Yes! I'll pop one in with the order. Yo...,T011
3,Q4,"[Complete the notes below, General details:, V...","[Royal Museum, Queen's Park Road/Rd, 10th Dece...",IELTS Listening Section 2\nAnnouncer: And now ...,T011
4,Q5,"[Complete the sentences below, According to Al...","[40 million, dogs/the dog, only ate plants]",IELTS Listening Section 3\nInterviewer: Alison...,T009


In [11]:
print(f"Data Types: \n{training_set_df.dtypes}")
print(f"Number of Entries: {len(training_set_df)}")

Data Types: 
questionID    object
question      object
answer        object
transcript    object
typeID        object
dtype: object
Number of Entries: 83


In [12]:
training_set_df_clean = training_set_df.replace(r'^\s*$', np.nan, regex=True)
empty_rows = training_set_df_clean[training_set_df_clean.isna().any(axis=1)]
print(empty_rows)

   questionID                                           question  \
9         Q10  [Name: Andrea Brown, Address: 24 1 ________ Ro...   
10        Q11  [Number of days, Total distance, Price (per pe...   
11        Q12  [Which TWO facilities at the leisure club have...   
12        Q13  [Joining the leisure club, Personal Assessment...   
13        Q14  [Global Design Competition, 21 Students enteri...   
14        Q15  [22 John chose a dishwasher because he wanted ...   
15        Q16  [23 The stone in John’s ‘Rockpool’ design is u...   
16        Q17  [24 In the holding chamber, the carbon dioxide...   
17        Q18  [25 At the end of the cleaning process, the ca...   
18        Q19  [• John needs help preparing for his 26 ______...   
19        Q20  [THE SPIRIT BEAR, General facts, • It is a whi...   

                                               answer  \
9   [Ardleigh, newspaper, theme, tent, castle, bea...   
10                        [2020, flight, 429, dinner]   
11          

In [13]:
training_set_df = training_set_df_clean.dropna()
print(f"Number of Entries after dropping NAs: {len(training_set_df)}")

Number of Entries after dropping NAs: 72


In [14]:
training_set_df.to_csv(output_csv, index=False, encoding="utf-8")
print(f"\n All PDFs processed and saved to {output_csv}")


 All PDFs processed and saved to processed_data/training_set.csv


### Update database

In [15]:
# Read csv file
training_set_df = pd.read_csv("processed_data/training_set.csv")

trainign_collection_ref = db.collection("TrainingSet")
for _, row in training_set_df.iterrows():
    doc_id = row["questionID"]
    trainign_collection_ref.document(doc_id).set(row.to_dict())

print("Training Sets successfully uploaded to Firebase Firestore.")

Training Sets successfully uploaded to Firebase Firestore.


### Common Words

In [16]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text

def extract_words_from_text(text):
    words = re.findall(r"[A-Za-z]+", text)

    words = [w.lower() for w in words]
    
    return words

def pdf_to_word_list(pdf_path):
    raw_text = extract_text_from_pdf(pdf_path)
    words = extract_words_from_text(raw_text)
    return words


def load_all_vocab_pdfs(folder="words_validation"):
    all_words = []

    for filename in os.listdir(folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(folder, filename)
            print(f"Processing: {pdf_path}")

            words = pdf_to_word_list(pdf_path)
            all_words.extend(words)

    # Remove duplicates
    all_words = sorted(list(set(all_words)))

    # Convert to DataFrame
    df_vocab = pd.DataFrame({"Words": all_words})
    return df_vocab

In [17]:
df_vocab = load_all_vocab_pdfs("data/words_validation")
df_vocab.head()

Processing: data/words_validation\academic_words.pdf
Processing: data/words_validation\band9_words.pdf
Processing: data/words_validation\common_words.pdf


Unnamed: 0,Words
0,a
1,abandoned
2,abduction
3,abiding
4,ability


In [18]:
df_vocab.to_csv("processed_data/ielts_vocab.csv", index=False)

In [19]:
df_vocab.nunique

<bound method DataFrame.nunique of           Words
0             a
1     abandoned
2     abduction
3       abiding
4       ability
...         ...
4975   yourself
4976      youth
4977    zealand
4978       zinc
4979       zone

[4980 rows x 1 columns]>