# Simplification

In [1]:
from openai import OpenAI
# Shared OpenAI client used by every simplify_* function in this file.
# NOTE(review): the api_key below is a placeholder literal; supply a real key
# before running, and avoid committing it with the notebook.
client = OpenAI(api_key="your-api-key-here")

In [2]:
def clean_llm_output(raw_output: str) -> str:
    """
    Clean the raw output from the language model by removing unwanted formatting.

    Processing steps, in order:

    1. Surrounding whitespace is trimmed, so the "first line" below is always
       the first line that actually carries text.
    2. If the text contains a ``---`` separator, only the first *non-empty*
       segment is kept. A reply that opens with ``---`` therefore keeps the
       text that follows it instead of collapsing to an empty string.
    3. ``*`` characters are removed from the first line only (later lines
       keep theirs).
    4. Blank lines are dropped and runs of whitespace inside each remaining
       line are collapsed to a single space.

    Args:
        raw_output: Text returned by the model.

    Returns:
        The cleaned text with lines joined by ``\\n``; an empty string when
        nothing but whitespace and separators was supplied.
    """
    text = raw_output.strip()

    if "---" in text:
        segments = (segment.strip() for segment in text.split("---"))
        # Skip empty leading segments (e.g. output that starts with "---").
        text = next((segment for segment in segments if segment), "")

    lines = text.splitlines()

    if lines:
        lines[0] = lines[0].replace("*", "").strip()

    return "\n".join(
        " ".join(line.split())
        for line in lines
        if line.strip()
    )

In [3]:
# Generation settings shared by the simplify_* functions in this file; each
# key is read individually when building the chat completion request.
conversion_config = {
    "model": "gpt-4o-2024-11-20",
    "temperature": 0.5,
    "top_p": 0.95,
    "max_tokens": 8192,
}

## Syntactic

In [4]:
import json

# Grammar rules for syntactic simplification, read once from JSON.
# load_grammar_rules() looks entries up in this object by level.
with open("grammar/grammar_rules.json", encoding="utf-8") as file:
    grammar_rules = json.loads(file.read())

# Level labels in ascending order.
level_order = ["A1", "A2", "B1"]

def load_grammar_rules(level):
    """Return the grammar rules stored under *level*.

    Args:
        level: Key to look up in the module-level ``grammar_rules`` mapping.

    Returns:
        The value stored for *level*, or a new empty list when the level has
        no entry.
    """
    no_rules = []
    rules_for_level = grammar_rules.get(level, no_rules)
    return rules_for_level

In [5]:
# Simplification rule texts, read once from JSON.
# load_simplification_rules() indexes this object by level, then by rule type.
with open("rules/simplification_rules_en.json", encoding="utf-8") as file:
    simplification_rules = json.loads(file.read())

def load_simplification_rules(simplification_type, level):
    """Return the rule text for one simplification type at one level.

    Args:
        simplification_type: Second-level key in ``simplification_rules``
            (the callers in this file pass "syntactic", "lexical" or
            "elaboration").
        level: First-level key in ``simplification_rules``.

    Returns:
        The stored rule text with surrounding whitespace stripped.

    Raises:
        KeyError: If *level* or *simplification_type* has no entry.
    """
    rules_for_level = simplification_rules[level]
    rule_text = rules_for_level[simplification_type]
    return rule_text.strip()

In [None]:
def simplify_syntactic(original_text, level, type="syntactic"):
    """Ask the model to syntactically simplify *original_text* to *level*.

    The prompt lists the grammar rules loaded for *level*, appends the
    simplification rule text for (*type*, *level*), and ends with the text to
    rewrite. Prompt sections are separated by a ``---`` divider.

    Args:
        original_text: Text to simplify.
        level: Target level; used as a lookup key for both rule sets and
            interpolated into the prompt.
        type: Rule category passed to ``load_simplification_rules``. The name
            shadows the builtin but is kept because it is part of the
            signature.

    Returns:
        The first choice of the model's reply, passed through
        ``clean_llm_output``.
    """
    section_break = "\n\n---\n\n"

    rules_text = load_simplification_rules(type, level)
    # One "- " bullet per grammar rule.
    allowed_structures = "- " + "\n- ".join(load_grammar_rules(level))

    task = (
        f"A text at the {level} level should not contain any structure other than the grammatical rules and morphemes given to you.\n"
        f"If the given text contains any structure other than these grammatical rules and morphemes, simplify the given text to the {level} level using syntactic simplification rules."
    )
    inventory = (
        f"The grammatical rules and morphemes for the {level} level are as follows:\n"
        + allowed_structures
    )
    constraints = "Do not perform any action other than syntactic simplification. Preserve the content and details of the text. Do not delete sentences from the text. In the output, provide only the simplified text, do not write any explanation."
    request = f"Original Text:\n{original_text}\nSyntactically Simplified Text:\n"

    prompt = section_break.join([task, inventory, rules_text, constraints, request])

    messages = [
        {
            "role": "system",
            "content": "You are an expert in syntactically simplifying sentences. Your task is to identify grammatical rules and morphemes in the given text that fall outside the specified language level and to simplify the text to this level using syntactic simplification rules.",
        },
        {"role": "user", "content": prompt},
    ]

    completion = client.chat.completions.create(
        messages=messages,
        model=conversion_config["model"],
        temperature=conversion_config["temperature"],
        top_p=conversion_config["top_p"],
        max_tokens=conversion_config["max_tokens"],
    )

    return clean_llm_output(completion.choices[0].message.content)

## Lexical

In [7]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline once at module level; it is used by
# get_sentence_levels for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")


def load_word_level_data(json_path="labeled_words/all_words.json"):
    """Build a lowercase word -> CEFR level lookup from a JSON file.

    The file is read as a JSON object whose keys are level labels and whose
    values are collections of words.

    Args:
        json_path: Path of the JSON file to read (UTF-8).

    Returns:
        A dict mapping each lowercased word to its level. When a word appears
        under several levels, the level that comes last in the file wins.
    """
    with open(json_path, encoding="utf-8") as handle:
        words_by_level = json.load(handle)

    return {
        word.lower(): level
        for level, words in words_by_level.items()
        for word in words
    }


def get_sentence_levels(sentence, word_map):
    """Map each non-punctuation token of *sentence* to its level in *word_map*.

    Args:
        sentence: Text to tokenize with the module-level ``nlp`` pipeline.
        word_map: Lookup from lowercase word to level.

    Returns:
        A dict keyed by the token text exactly as it appears in the sentence.
        The value is the level found for the lowercased token, else for its
        lowercased lemma, else ``None``. A token text that occurs more than
        once keeps the result of its last occurrence. An empty dict is
        returned (after printing a notice) when *word_map* is empty.
    """
    if not word_map:
        print("Word map is empty. Cannot determine levels.")
        return {}

    levels_by_word = {}

    for token in nlp(sentence):
        if token.is_punct:
            continue

        surface = token.text.lower()
        lemma = token.lemma_.lower()

        # The surface form takes priority; the lemma is only a fallback.
        if surface in word_map:
            level = word_map[surface]
        elif lemma in word_map:
            level = word_map[lemma]
        else:
            level = None

        levels_by_word[token.text] = level

    return levels_by_word


# Load word level data for lexical analysis.
# The lookup is built once at module level; simplify_lexical passes it to
# get_sentence_levels on every call.
JSON_FILE_PATH = 'labeled_words/all_words.json'
word_level_map = load_word_level_data(JSON_FILE_PATH)

In [8]:
def find_above_words(level: str, word_levels: dict) -> list:
    """Find words that are above the specified CEFR level.

    The scale covers all six CEFR levels, so words labelled ``C2`` are
    reported as above any lower target, and ``C2`` itself is a valid target
    (nothing is above it).

    Args:
        level: Target level, one of "A1", "A2", "B1", "B2", "C1", "C2".
        word_levels: Mapping of word -> level label. Words whose level is
            ``None`` or not on the scale are never reported.

    Returns:
        The words whose level is strictly above *level*, in the iteration
        order of *word_levels*.

    Raises:
        ValueError: If *level* is not one of the known levels.
    """
    levels = ["A1", "A2", "B1", "B2", "C1", "C2"]

    try:
        min_index = levels.index(level) + 1
    except ValueError:
        raise ValueError(
            f"Unknown CEFR level {level!r}; expected one of {', '.join(levels)}"
        ) from None

    above_levels = set(levels[min_index:])

    return [word for word, lvl in word_levels.items() if lvl in above_levels]

In [9]:
def _quote_word_list(words):
    """Render *words* as an English list with every word individually quoted."""
    quoted = [f"'{word}'" for word in words]
    if len(quoted) == 1:
        return quoted[0]
    if len(quoted) == 2:
        return " and ".join(quoted)
    return ", ".join(quoted[:-1]) + f", and {quoted[-1]}"


def simplify_lexical(original_text, level, type="lexical"):
    """Ask the model to lexically simplify *original_text* to *level*.

    Words of the text whose level in ``word_level_map`` is above *level* are
    named explicitly in the prompt. Each flagged word is quoted on its own
    ('a' and 'b'; 'a', 'b', and 'c'), so the model sees the same delimiting
    whether one, two or many words are listed.

    Args:
        original_text: Text to simplify.
        level: Target level; used as a lookup key for the rule text, as the
            threshold for flagging words, and interpolated into the prompt.
        type: Rule category passed to ``load_simplification_rules``. The name
            shadows the builtin but is kept because it is part of the
            signature.

    Returns:
        The first choice of the model's reply, passed through
        ``clean_llm_output``.
    """
    system_instruction = "You are an expert in lexically simplifying sentences. Your task is to simplify the given text to the specified level using lexical simplification rules."
    chat_history = [{"role": "system", "content": system_instruction}]

    simplification_rules = load_simplification_rules(type, level)  # Lexical rule text for this level

    word_levels = get_sentence_levels(original_text, word_level_map)  # Level of each word in the text
    words_above = find_above_words(level, word_levels)  # Words above the target level

    # Stays empty when no word is above the target level.
    words_above_prompt = ""
    if len(words_above) == 1:
        words_above_prompt = f"In the following text, the word {_quote_word_list(words_above)} is above the {level} level. Without changing the meaning of the text, replace this word with a simpler word appropriate for the {level} level. If there is no simpler equivalent for this word or if it is important for the meaning of the text, do not change the word."
    elif words_above:
        words_above_prompt = f"In the following text, the words {_quote_word_list(words_above)} are above the {level} level. Without changing the meaning of the text, replace these words with simpler words appropriate for the {level} level. Do not change any words that have no simpler equivalent or are important for the meaning of the text."

    prompt = (
            f"Simplify the given text to the {level} level using lexical simplification rules.\n\n---\n\n"
            f"{simplification_rules}\n\n---\n\n"
            f"{words_above_prompt}\n"
            f"Do not perform any action other than lexical simplification. Preserve the content and details of the text. Do not delete sentences from the text. In the output, provide only the simplified text, do not write any explanation.\n\n---\n\n"
            f"Original Text:\n{original_text}\n"
            "Lexically Simplified Text:\n")

    chat_history.append({"role": "user", "content": prompt})

    response = client.chat.completions.create(
        model=conversion_config["model"],
        messages=chat_history,
        temperature=conversion_config["temperature"],
        top_p=conversion_config["top_p"],
        max_tokens=conversion_config["max_tokens"],
    )

    output = response.choices[0].message.content
    cleaned_output = clean_llm_output(output)
    return cleaned_output

## Elaboration

In [10]:
def simplify_elaboration(original_text, level, type="elaboration"):
    """Ask the model to elaborate *original_text* for a reader at *level*.

    The prompt states the task, appends the rule text for (*type*, *level*),
    lists the output constraints, and ends with the text to rewrite. Prompt
    sections are separated by a ``---`` divider.

    Args:
        original_text: Text to elaborate.
        level: Target level; used as a lookup key for the rule text and
            interpolated into the prompt.
        type: Rule category passed to ``load_simplification_rules``. The name
            shadows the builtin but is kept because it is part of the
            signature.

    Returns:
        The first choice of the model's reply, passed through
        ``clean_llm_output``.
    """
    section_break = "\n\n---\n\n"

    rules_text = load_simplification_rules(type, level)

    task = f"Using elaboration rules, make the given text more understandable for a student at the {level} level."
    constraints = "Do not perform any action other than elaboration. Preserve the content and details of the text. Do not delete sentences from the text. In the output, provide only the elaborated text, do not write any explanation."
    request = f"Original Text:\n{original_text}\nElaborated Text:\n"

    prompt = section_break.join([task, rules_text, constraints, request])

    messages = [
        {
            "role": "system",
            "content": "You are an expert in making sentences more understandable through the method of elaboration. Your task is to use elaboration rules to make the given text more understandable for a student at the specified level.",
        },
        {"role": "user", "content": prompt},
    ]

    completion = client.chat.completions.create(
        messages=messages,
        model=conversion_config["model"],
        temperature=conversion_config["temperature"],
        top_p=conversion_config["top_p"],
        max_tokens=conversion_config["max_tokens"],
    )

    return clean_llm_output(completion.choices[0].message.content)

In [11]:
import os

def simplify_and_save(input_file_path, output_file_path):
    """Run the three-stage simplification over a JSONL file and save the results.

    Each non-blank input line is parsed as a JSON object with the keys
    "original", "target_cefr" and "text_id". The text goes through syntactic
    simplification, then lexical simplification, then elaboration, and one
    JSON object ``{"text_id": ..., "simplified": ...}`` is written per record.

    Blank lines are ignored. Records with an empty or missing "original" or
    "target_cefr", or without a "text_id", are skipped with a printed notice;
    a "text_id" of 0 is treated as valid.

    Args:
        input_file_path: Path of the UTF-8 JSONL file to read.
        output_file_path: Path of the JSONL file to write. Its parent
            directory is created if needed and any existing file is
            overwritten.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    output_dir = os.path.dirname(output_file_path)

    # dirname is "" for a bare filename; makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(input_file_path, 'r', encoding='utf-8') as infile, \
            open(output_file_path, 'w', encoding='utf-8') as outfile:

        for line_number, line in enumerate(infile, start=1):
            # A blank line (commonly the last one) is not JSON and would make
            # json.loads raise.
            if not line.strip():
                continue

            data = json.loads(line)

            original_text = data.get("original")
            target_level = data.get("target_cefr")
            text_id = data.get("text_id")

            # text_id is only checked for absence so that an id of 0 is kept.
            if not original_text or not target_level or text_id in (None, ""):
                print(f"--- Skipping line {line_number}: missing 'original', 'target_cefr' or 'text_id' ---")
                continue

            print(f"--- Processing text_id: {text_id} ---")

            # Each stage consumes the previous stage's output. The simplify_*
            # functions already clean their result; the extra cleaning pass
            # is kept from the original pipeline.

            # Syntactic Simplification
            syntactic = clean_llm_output(simplify_syntactic(original_text, target_level))

            # Lexical Simplification
            lexical = clean_llm_output(simplify_lexical(syntactic, target_level))

            # Elaboration
            final_simplification = clean_llm_output(simplify_elaboration(lexical, target_level))

            output_data = {
                "text_id": text_id,
                "simplified": final_simplification
            }

            outfile.write(json.dumps(output_data) + '\n')

    print(f"\nProcessing complete. Output saved to '{output_file_path}'")

In [None]:
# Driver cell: run the full pipeline on a JSONL input file and write the
# simplified texts to a JSONL output file (both paths relative to the
# working directory).
input_file = 'input_file.jsonl'
output_file = 'output_file.jsonl'

simplify_and_save(input_file, output_file)