### Save path, text, their split version, file names (text ids)


In [1]:
import glob
import os

import nltk
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\seohy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# PATH of folder containing learner texts in txt
folder_path = r"C:\Users\seohy\nlplearnerdata\raw_text"

In [3]:
# Source: https://github.com/braj29/Text-to-Conll/blob/main/texts_to_conll.py
# Copy-pasted and adjusted the code for converting txt to conllu
# Get paths of each learner text from folder_path
def get_paths(path):
    paths_ = glob.glob(path + "/*.txt")
    return paths_

def load_text(txt_path):
    with open(txt_path, "r", encoding="utf-8") as infile:
        content = infile.read()
    return content

In [4]:
txt_paths = get_paths(folder_path) # PATHs of .txt files
texts = [] # raw text
sentences = [] # raw text into sentences
names = [] # names of files (text id)

for txt_path in txt_paths:
    text = load_text(txt_path)

    texts.append(text)
    sentences.append(sent_tokenize(text))
    
    # Source: https://medium.com/@emoome82/extracting-file-names-without-extensions-in-python-caabe8532f92
    name, extension = os.path.splitext(os.path.basename(txt_path))
    names.append(name)

In [5]:
# Verify path, sentences, and names
print(txt_paths[:3])
print(texts[:3])
print(sentences[:3])
print(names[:3])

['C:\\Users\\seohy\\nlplearnerdata\\raw_text\\1_A20210.txt', 'C:\\Users\\seohy\\nlplearnerdata\\raw_text\\1_A20503.txt', 'C:\\Users\\seohy\\nlplearnerdata\\raw_text\\1_A30406.txt']
["I will introduce my favorite things. First, I love cats. Though I don't have a cat, they looks so cute. Next, I love green. Green is color of nature. I think green is a very comfortable color. Last, I like music. Music makes me satisfied. Pop songs are exciting, and Kpops are exciting, too.", "I will introduce my favorite k-pop group. They are boynextdoor. Boynextdoor is a team that is included six members. Every members are all rounder. Especially, they are good at singing. They use only hand mic every time. They always do their best and that's the reason why i like them a lot.", 'I like listen to music because it makes me comfortable. and I love puppy. when i see the puppy, I feel happy. Lastly I like band LUCY". When I listening the LUCY\'s music,I feel happy. They give the 위로 to many people."']
[['I wi

### Language-tool-python


In [6]:
# Set Java PATH for current ipynb session for language_tool_python local server
import os

os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-25"
os.environ["PATH"] += os.pathsep + os.path.join(os.environ["JAVA_HOME"], "bin")

# The following code should return the path to the java.exe file
import shutil
print(shutil.which("java"))

C:\Program Files\Common Files\Oracle\Java\javapath\java.EXE


In [7]:
# Import libraries
import language_tool_python
from language_tool_python.utils import classify_matches
import pandas as pd
import openpyxl

In [8]:
# Download and set up local server 
tool = language_tool_python.LanguageTool("en-US")

In [None]:
# Running the text on the model
for path, text, name in zip(txt_paths, texts, names):
    match = tool.check(text)

    # Convert and output results into dataframe and excel
    df_language_tool_python = pd.DataFrame([m.__dict__ for m in match])
    df_language_tool_python.to_excel(f"C:\\Users\\seohy\\nlplearnerdata\\references\\learner_errors\\ref_language_tool_python\\{name}.xlsx")

### gector


In [10]:
from transformers import AutoTokenizer
from gector.modeling import GECToR
from gector.predict import predict, load_verb_dict
import torch

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Set display width for the columns of df
pd.set_option("display.max_colwidth", None)

In [13]:
model_id = "gotutiyan/gector-xlnet-large-cased-5k"
model = GECToR.from_pretrained(model_id)

tokenizer = AutoTokenizer.from_pretrained(model_id)

encode, decode = load_verb_dict(r"C:\Users\seohy\nlplearnerdata\references\learner_errors\data\verb-from-vocab.txt")

In [None]:
for srcs, name in zip(sentences, names):
    # The corrected sentence with their tags
    corrected = None

    try:
        # inference in no_grad mode
        with torch.no_grad():
            corrected = predict(
                model, tokenizer, srcs,
                encode, decode,
                keep_confidence=0.0,
                min_error_prob=0.0,
                n_iteration=5,
                batch_size=2,
            )
    except RuntimeError as e:
        print("Error during inference:", e)
        # you could try fallback: smaller batch size or CPU mode here
        # e.g. model.cpu(), batch_size=1, etc.

    gector_df = pd.DataFrame({
        "Gector_correction": corrected
    })

    gector_df.to_excel(f"C:\\Users\\seohy\\nlplearnerdata\\references\\learner_errors\\ref_gector\\{name}.xlsx")

Iteratoin 0. the number of to_be_processed: 9
Iteratoin 1. the number of to_be_processed: 2
Iteratoin 0. the number of to_be_processed: 7
Iteratoin 1. the number of to_be_processed: 6
Iteratoin 2. the number of to_be_processed: 3
Iteratoin 3. the number of to_be_processed: 1
Iteratoin 0. the number of to_be_processed: 6
Iteratoin 1. the number of to_be_processed: 5
Iteratoin 2. the number of to_be_processed: 1
Iteratoin 0. the number of to_be_processed: 8
Iteratoin 1. the number of to_be_processed: 1
Iteratoin 0. the number of to_be_processed: 3
Iteratoin 1. the number of to_be_processed: 3
Iteratoin 0. the number of to_be_processed: 8
Iteratoin 1. the number of to_be_processed: 6
Iteratoin 2. the number of to_be_processed: 3
Iteratoin 0. the number of to_be_processed: 9
Iteratoin 1. the number of to_be_processed: 4
Iteratoin 0. the number of to_be_processed: 6
Iteratoin 1. the number of to_be_processed: 4
Iteratoin 2. the number of to_be_processed: 3
Iteratoin 3. the number of to_be_p

### Gramformer


In [15]:
from gramformer import Gramformer
import torch

In [16]:
gf_1 = Gramformer(models = 1, use_gpu = False) # 1 = corrector, 2 = detector



[Gramformer] Grammar error correct/highlight model loaded..


In [17]:
for srcs, name in zip(sentences, names):
    data = []

    for sentence in srcs:    
        corrected = gf_1.correct(sentence, max_candidates=1)
        for c in corrected:
            data.append((c))

    df_gramformer = pd.DataFrame(data, columns=["Gramformer_correction"])
    df_gramformer.to_csv(f"C:\\Users\\seohy\\nlplearnerdata\\references\\learner_errors\\ref_gramformer\\{name}.csv")

### Combining the three dfs into one df and save as csv


In [18]:
for name in names:
    # Read in
    df_gector = pd.read_excel(f"ref_gector/{name}.xlsx", index_col = 0)
    df_gector.loc[-1] = df_gector.columns 
    df_gector.index = df_gector.index + 1
    df_gector = df_gector.sort_index()
    df_gector.columns = ["message"]

    df_gramformer = pd.read_csv(f"ref_gramformer/{name}.csv", index_col = 0)
    df_gramformer.loc[-1] = df_gramformer.columns 
    df_gramformer.index = df_gramformer.index + 1
    df_gramformer = df_gramformer.sort_index()
    df_gramformer.columns = ["replacements"]
    
    df_language_tool_python = pd.read_excel(f"ref_language_tool_python/{name}.xlsx", index_col = 0)

    df_gector_gramformer = pd.concat([df_gector, df_gramformer], axis = 1)

    # Combine all dataframes into one dataframe
    df_total = pd.concat([df_language_tool_python, df_gector_gramformer.reindex(columns = df_language_tool_python.columns)], axis = 0, ignore_index = True)

    # Output combined df into out_total folder
    df_total.to_csv(f"ref_results/{name}.csv")