In [None]:
import pandas as pd
import os
import csv
import random

## Replacing special characters before converting into XML


In [None]:
# Source folder: raw learner texts stored as .txt files
folder_path = r"PATH"
# Destination folder: the same texts after special characters are converted
output_folder = r"PATH"

# Create each folder when it is missing, then report that it exists
# (raw folder first, output folder second)
for directory in (folder_path, output_folder):
    os.makedirs(directory, exist_ok=True)
    print(os.path.exists(directory))

In [None]:
# XML escape table, written as ordered (character, replacement) pairs
replacements = dict([
    ("&", "&amp;"),     # Ampersand goes first so the entities added below stay intact
    ("<", "&lt;"),
    (">", "&gt;"),
    ('"', "&quot;"),
    ("'", "&apos;"),
    ("_x000D_", ""),    # Drop carriage-return markers; the .txt already has newlines
])

In [None]:
# Convert special characters of one .txt text
def escape_xml_chars(text, file_path, replacement_map=None):
    """Replace XML special characters in ``text`` and log each replacement.

    Args:
        text: Content of one text file.
        file_path: Path of the file the text came from; used only in the
            log message.
        replacement_map: Optional mapping of ``substring -> replacement``.
            Defaults to the notebook-level ``replacements`` dictionary.

    Returns:
        The text with every mapped substring replaced.
    """
    if replacement_map is None:
        replacement_map = replacements

    # "&" has to be handled before anything else: the other replacements
    # introduce "&" themselves (e.g. "&lt;") and would otherwise be escaped
    # a second time. The stable sort keeps the remaining pairs in map order.
    ordered_pairs = sorted(replacement_map.items(), key=lambda pair: pair[0] != "&")

    for char, replacement in ordered_pairs:
        # Only replace (and log) when the substring actually occurs
        if char in text:
            text = text.replace(char, replacement)
            print(f"Replaced '{char}' → '{replacement}' for {file_path}")
    # Return transformed text
    return text

In [None]:
# Iterate through the raw .txt files in folder_path
# (sorted so the processing/log order is the same on every run)
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)

    # os.listdir also returns sub-folders and stray files (e.g. notebook
    # checkpoints); only regular .txt files are texts to convert
    if not (os.path.isfile(file_path) and filename.lower().endswith(".txt")):
        continue

    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    # Transform and save text into escaped_content
    escaped_content = escape_xml_chars(text, file_path)

    # Save transformed text to output folder (keeping same filename)
    output_path = os.path.join(output_folder, filename)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(escaped_content)

## Creating a basic dataframe for XML tag and attribute

Note: language_score and content_score are added in the next section.


1. &lt;entry&gt;: a single text by one learner at one time period

- id: a unique identifier for each text (i.e., filename)
- time_period: year and term when the data was collected (i.e., "24-2")
- task: identifier for the specific writing prompt or task (e.g., "T1")
- language_score: score learner received on language
- content_score: score learner received on content

2. &lt;learner&gt;: information about the learner who wrote the text

- id: A unique identifier for the learner (i.e., 학번)
- grade: the grade of the learner when the text was written

3. Text-related tags

- &lt;text_original&gt;: the original text written by the learner
- &lt;text_error&gt;: the error-tagged version of the original text
- &lt;text_corrected&gt;: the corrected version of the text


In [None]:
# One independent empty list per dataframe column; they are filled
# while the transformed texts are read
(
    entry_id_list,
    time_period_list,
    task_list,
    learner_id_list,
    grade_list,
    text_original_list,
    text_error_list,
    text_corrected_list,
) = ([] for _ in range(8))

In [None]:
# Read in each transformed text
# Empty the lists first so re-running this cell does not duplicate rows
for column_values in (
    entry_id_list,
    time_period_list,
    task_list,
    learner_id_list,
    grade_list,
    text_original_list,
    text_error_list,
    text_corrected_list,
):
    column_values.clear()

# Sorted so the row order does not depend on the file system's listing order
for filename in sorted(os.listdir(output_folder)):
    file_path = os.path.join(output_folder, filename)

    # Skip sub-folders and anything that is not a .txt text
    if not (os.path.isfile(file_path) and filename.lower().endswith(".txt")):
        continue

    # Read the content before touching any list, so a failed read cannot
    # leave the metadata lists longer than the text lists
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()

    entry_id_list.append(filename[:8]) # Save filename w/out .txt
    time_period_list.append("24-2") # All periods currently 24-2
    task_list.append("T" + filename[0]) # Concat "T" with task number
    learner_id_list.append(filename[3:8]) # Save 학번 (student number) from filename
    grade_list.append(filename[3]) # Save first digit of 학번

    # Before error tagging, all three versions are the same as the original text
    text_original_list.append(text)
    text_error_list.append(text)
    text_corrected_list.append(text)

In [None]:
# Map each dataframe column name to the list collected for it
dict_corpus = {
    "entry_id": entry_id_list,
    "time_period": time_period_list,
    "task": task_list,
    "learner_id": learner_id_list,
    "grade": grade_list,
    "text_original": text_original_list,
    "text_error": text_error_list,
    "text_corrected": text_corrected_list,
}

In [None]:
# Determine number of values for each key
value_counts = {key: len(values) for key, values in dict_corpus.items()}
print(value_counts)

# Every column must have the same number of values to form a dataframe
if len(set(value_counts.values())) > 1:
    raise ValueError(f"Column lengths differ: {value_counts}")

# Create and show dataframe of corpus data
# The 1-based index is derived from the data instead of a hard-coded row count
n_entries = len(dict_corpus["entry_id"])
df = pd.DataFrame(dict_corpus, index=range(1, n_entries + 1))
df.head(5)

## Adding language_score and content_score to df


In [None]:
# Score containers per task: content scores and language scores
T1_cont_list, T1_lang_list = [], []  # Task 1
T2_cont_list, T2_lang_list = [], []  # Task 2

# Entry ids in the same format as above; they are the merge key for the scores
entry_id_list_T1, entry_id_list_T2 = [], []

In [None]:
# Read in the csv of T1 scores
# student_id is read as text so it can be joined with the "1_" prefix even
# when the ids look numeric (pandas would otherwise infer an integer column)
task1_scores = pd.read_csv("student_scores_task1.csv", dtype={"student_id": str})

# List the output folder once instead of once per csv row
available_files = set(os.listdir(output_folder))

# Empty the lists first so re-running this cell does not append duplicates
for score_values in (entry_id_list_T1, T1_cont_list, T1_lang_list):
    score_values.clear()

# Iterate through each row in the csv
for index, row in task1_scores.iterrows():
    # Combine task number + student_id into the entry id / file name
    entry_id = f"1_{row['student_id']}"
    file_name = entry_id + ".txt"

    # Keep the scores only when the matching text exists in the output folder
    if file_name in available_files:
        entry_id_list_T1.append(entry_id)
        T1_cont_list.append(row["T1_Cont"])
        T1_lang_list.append(row["T1_Lang"])

In [None]:
# Repeat the same process for T2
# Read in the csv of T2 scores
# student_id is read as text so it can be joined with the "2_" prefix even
# when the ids look numeric (pandas would otherwise infer an integer column)
task2_scores = pd.read_csv("student_scores_task2.csv", dtype={"student_id": str})

# List the output folder once instead of once per csv row
available_files = set(os.listdir(output_folder))

# Empty the lists first so re-running this cell does not append duplicates
for score_values in (entry_id_list_T2, T2_cont_list, T2_lang_list):
    score_values.clear()

# Iterate through each row in the csv
for index, row in task2_scores.iterrows():
    # Combine task number + student_id into the entry id / file name
    entry_id = f"2_{row['student_id']}"
    file_name = entry_id + ".txt"

    # Keep the scores only when the matching text exists in the output folder
    if file_name in available_files:
        entry_id_list_T2.append(entry_id)
        T2_cont_list.append(row["T2_Cont"])
        T2_lang_list.append(row["T2_Lang"])

In [None]:
# Score columns: Task 1 values followed by Task 2 values.
# All three columns are concatenated in the same order, so position i
# of each column belongs to the same entry.
dict_scores = {
    "entry_id": entry_id_list_T1 + entry_id_list_T2,
    "content_score": T1_cont_list + T2_cont_list,
    "language_score": T1_lang_list + T2_lang_list,
}

In [None]:
# Determine number of values for each key, same as creating original df above
value_counts = {key: len(values) for key, values in dict_scores.items()}
print(value_counts)

# Every column must have the same number of values to form a dataframe
if len(set(value_counts.values())) > 1:
    raise ValueError(f"Column lengths differ: {value_counts}")

# The 1-based index is derived from the data instead of a hard-coded row count,
# so the cell also works when some texts have no score row
n_scores = len(dict_scores["entry_id"])
df_socres = pd.DataFrame(dict_scores, index=range(1, n_scores + 1))
df_socres.head(5)

In [None]:
# Attach the scores to the corpus dataframe.
# Left join on "entry_id" (the filename-based id): every corpus row is kept
entries_total = pd.merge(df, df_socres, how="left", on="entry_id")

In [None]:
# Count missing values per column to check the merge; a non-zero count
# marks a column with NaN values
missing_per_column = entries_total.isna().sum()
print(missing_per_column)

In [None]:
# Show the first rows of the merged corpus dataframe (head defaults to 5 rows)
entries_total.head()

In [None]:
# Save the dataframe of corpus data as csv
entries_total.to_csv(path_or_buf="entries_total_raw.csv")

## Convert the dataframe into XML format and save it


In [None]:
# Container for the XML <entry> strings, one per corpus row
xml_entries = list()

In [None]:
# Build one XML <entry> string per row of the saved csv
# Start from an empty list so re-running this cell does not duplicate entries
xml_entries = []

# Open the csv written from the dataframe
with open("entries_total_raw.csv", newline='', encoding="utf-8") as csvfile:
    # Create DictReader to treat/read each csv row like a dictionary
    # The csv columns become its keys, and the cells become their values
    reader = csv.DictReader(csvfile)
    # For each row of the csv file
    for row in reader:
        # Create each xml entry using string formatting.
        # The text cells were already XML-escaped when the .txt files were
        # converted, so they are inserted as they are.
        xml_entries.append(f'''\t\t<entry id="{row['entry_id']}" time_period="{row['time_period']}" task="{row['task']}" content_score="{row['content_score']}" language_score="{row['language_score']}"> 
            <learner id="{row['learner_id']}" grade="{row['grade']}"/> 
            <text_original>{row['text_original']}</text_original>
            <text_error>{row['text_error']}</text_error>
            <text_corrected>{row['text_corrected']}</text_corrected>
        </entry>
        ''')

In [None]:
# Enclose every entry in the <writings> root element, one entry per line group
joined_entries = "\n".join(xml_entries)
xml_content = f"<writings>\n{joined_entries}\n</writings>"

In [None]:
# Write the assembled XML document to disk
with open("entries_total_raw.xml", mode="w", encoding="utf-8") as xml_file:
    xml_file.write(xml_content)

## Choose 20 random files for verification


In [None]:
# Create a sorted list of the files in output_folder
# os.listdir returns names in arbitrary order; sorting gives the seeded
# sample below the same input on every machine and run
all_file_names = sorted(os.listdir(output_folder))

In [None]:
# Fix the random generator's seed so the sampled files can be reproduced
SEED = 1002
random.seed(SEED)

In [None]:
# Select random file names for verification
num_files = 20
# Use num_files (not a repeated literal) and never ask for more files
# than exist, which would make random.sample raise ValueError
sample_size = min(num_files, len(all_file_names))
random_files = random.sample(all_file_names, sample_size)

In [None]:
# List the sampled file names, one per line
for sampled_name in random_files:
    print(sampled_name)