# 配置 LanguageTool 的本地服务器模式

In [None]:
必须先配置 LanguageTool 的本地服务器模式，才能使用它。

**LanguageTool 作为服务器运行**
- LanguageTool 可以作为 HTTP 服务器运行，提供 REST API 接口。
- 运行命令：java -cp languagetool.jar org.languagetool.server.HTTPServer --port 8081
- 此后，其他应用程序（如 Python 脚本）可以通过 HTTP 请求与 LanguageTool 服务器通信。

## !!!重要：配置 LanguageTool 本地服务器模式的步骤

**首先，检查 Java 环境**

确保你的系统中安装了 Java 运行时环境（JRE）或 Java 开发工具包（JDK），因为 LanguageTool 依赖 Java 运行。  
运行以下命令检查 Java 是否已安装：  
java -version  
如果没有安装 Java，请下载并安装 Java SE Runtime Environment (JRE)。  

**其次，启动本地 LanguageTool 服务器**  

1. 下载 LanguageTool 的离线版本

访问 LanguageTool 官方下载页面[https://languagetool.org/download/LanguageTool-stable.zip] ，手动下载最新版本的 LanguageTool ZIP 文件。

2. 解压到指定目录

将下载的 ZIP 文件解压到指定目录（例如 V:\20240920\way1\LanguageTool-6.5）目录  

3. 手动启动 LanguageTool 服务器

打开命令行，导航到 LanguageTool 的目录（例如 V:\20240920\way1\LanguageTool-6.5）。  
运行命令启动服务器：java -cp languagetool.jar org.languagetool.server.HTTPServer --port 8081 ，这将启动一个本地服务器，监听 8081 端口。  

**最后，在代码中连接到本地服务器**  

1. 显式指定指定本地服务器的 URL，运行速度慢：
language_tool = language_tool_python.LanguageTool('en-us', remote_server='http://localhost:8081/')

2. 隐式指定指定缺省的本地服务器的 URL，运行速度快：
language_tool = language_tool_python.LanguageTool('en-us')：

python

# v1 微调韩国小姑娘版，速度快

In [11]:
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import language_tool_python

# 初始化 LanguageTool,连接到本地服务器
language_tool = language_tool_python.LanguageTool('en-us')
# language_tool = language_tool_python.LanguageTool('en-us', remote_server='http://localhost:8081/')

def correct_spelling(text):
    matches = language_tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, matches) # 速度快
    return corrected_text.lower()

def correct_array_with_threads(text_array):
    corrected_results = [None] * len(text_array)
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(correct_spelling, str(text)): idx for idx, text in enumerate(text_array)}
        for future in tqdm(as_completed(futures), total=len(futures), desc="correcting typos", mininterval=0.5):
            idx = futures[future]
            try:
                corrected_results[idx] = future.result()
            except Exception as e:
                corrected_results[idx] = text_array[idx]

    return corrected_results

df = pd.read_csv(r"V:\20240920\way1\2_processed_data.csv")

text = df["processed_text"]

result = correct_array_with_threads(text)

# 关闭 LanguageTool
language_tool.close()

# df.loc["processed_text"] = result   # pandas2.0以后，这行代码有问题，应该改为：df["processed_text"] = result
df["processed_text"] = result

df.to_csv(r"V:\20240920\way1\3_correct_text_test.csv", index=False)

correcting typos: 100%|████████████████████████████████████████████████████████████| 5583/5583 [14:55<00:00,  6.24it/s]


# v2 deepseek改进版，速度慢

In [None]:
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import language_tool_python

# 初始化 LanguageTool,连接到本地服务器
language_tool = language_tool_python.LanguageTool('en-us')
# language_tool = language_tool_python.LanguageTool('en-us', remote_server='http://localhost:8081/')

def correct_spelling(text):
    """
    修正文本中的拼写和语法错误
    """
    matches = language_tool.check(text)
    corrected_text = language_tool.correct(text)  # 使用 correct 方法修正文本，速度慢
    return corrected_text.lower()  # 返回小写形式的修正文本

def correct_array_with_threads(text_array):
    """
    使用多线程修正文本数组
    """
    corrected_results = [None] * len(text_array)
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(correct_spelling, str(text)): idx for idx, text in enumerate(text_array)}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Correcting typos"):
            idx = futures[future]
            try:
                corrected_results[idx] = future.result()
            except Exception as e:
                print(f"Error processing text at index {idx}: {e}")
                corrected_results[idx] = text_array[idx]  # 如果出错，保留原始文本

    return corrected_results

# 读取 CSV 文件
df = pd.read_csv(r"V:\20240920\way1\2_processed_data.csv")

# 获取需要处理的文本列
text = df["processed_text"].tolist()  # 将 Pandas Series 转换为列表

# 使用多线程修正文本
result = correct_array_with_threads(text)

# 关闭 LanguageTool
language_tool.close()

# 将修正后的结果写回 DataFrame
df["processed_text"] = result  # 直接赋值给列

# 保存到新的 CSV 文件
df.to_csv(r"V:\20240920\way1\3_correct_text_test.csv", index=False)


Correcting typos:   9%|█████▋                                                       | 525/5583 [02:39<25:32,  3.30it/s]


# V3a 我的改进版-基于deepseek改进版，速度慢
增加了：多线程运行的线程数量=CPU线程数量，因为实测发现这样速度快

In [1]:
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import language_tool_python
import multiprocessing

# 获取 CPU 的内核数量
cpu_count = multiprocessing.cpu_count()
thread_count = cpu_count

# 初始化 LanguageTool,连接到本地服务器
language_tool = language_tool_python.LanguageTool('en-us')
# language_tool = language_tool_python.LanguageTool('en-us', remote_server='http://localhost:8081/') 

# 初始化 LanguageTool
# language_tool = language_tool_python.LanguageTool('en-us')

def correct_spelling(text):
    """
    修正文本中的拼写和语法错误
    """
    matches = language_tool.check(text)
    corrected_text = language_tool.correct(text)  # 使用 correct 方法修正文本，速度慢
    return corrected_text.lower()  # 返回小写形式的修正文本

def correct_array_with_threads(text_array):
    """
    使用多线程修正文本数组
    """
    corrected_results = [None] * len(text_array)
    with ThreadPoolExecutor(max_workers=thread_count) as executor:
        futures = {executor.submit(correct_spelling, str(text)): idx for idx, text in enumerate(text_array)}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Correcting typos"):
            idx = futures[future]
            try:
                corrected_results[idx] = future.result()
            except Exception as e:
                print(f"Error processing text at index {idx}: {e}")
                corrected_results[idx] = text_array[idx]  # 如果出错，保留原始文本

    return corrected_results

# 读取 CSV 文件
df = pd.read_csv(r"V:\20240920\way1\2_processed_data.csv")

# 获取需要处理的文本列
text = df["processed_text"].tolist()  # 将 Pandas Series 转换为列表

# 使用多线程修正文本
result = correct_array_with_threads(text)

# 将修正后的结果写回 DataFrame
df["processed_text"] = result  # 直接赋值给列

# 保存到新的 CSV 文件
df.to_csv(r"V:\20240920\way1\3_correct_text_test.csv", index=False)

# 关闭 LanguageTool
language_tool.close()

Correcting typos: 100%|████████████████████████████████████████████████████████████| 5583/5583 [36:53<00:00,  2.52it/s]


# V3b 我的改进版-基于韩国小姑娘版，速度快
增加了：多线程运行的线程数量=CPU线程数量，因为实测发现这样速度快

In [12]:
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import language_tool_python
import multiprocessing

# 获取 CPU 的内核数量
cpu_count = multiprocessing.cpu_count()
thread_count = cpu_count

# 初始化 LanguageTool,连接到本地服务器
language_tool = language_tool_python.LanguageTool('en-us')
# language_tool = language_tool_python.LanguageTool('en-us', remote_server='http://localhost:8081/')

def correct_spelling(text):
    matches = language_tool.check(text)
    # corrected_text = language_tool.correct(text)  # 使用 correct 方法修正文本，速度慢
    corrected_text = language_tool_python.utils.correct(text, matches) # 速度快
    return corrected_text.lower()

def correct_array_with_threads(text_array):
    corrected_results = [None] * len(text_array)
    with ThreadPoolExecutor(max_workers=thread_count) as executor:
        futures = {executor.submit(correct_spelling, str(text)): idx for idx, text in enumerate(text_array)}
        for future in tqdm(as_completed(futures), total=len(futures), desc="correcting typos", mininterval=0.5):
            idx = futures[future]
            try:
                corrected_results[idx] = future.result()
            except Exception as e:
                corrected_results[idx] = text_array[idx]

    return corrected_results

df = pd.read_csv(r"V:\20240920\way1\2_processed_data.csv")

text = df["processed_text"].tolist()  # 将 Pandas Series 转换为列表

result = correct_array_with_threads(text)

# 关闭 LanguageTool
language_tool.close()

# df.loc["processed_text"] = result   # pandas2.0以后，这行代码有问题，应该改为：df["processed_text"] = result
df["processed_text"] = result

df.to_csv(r"V:\20240920\way1\3_correct_text_test.csv", index=False)

correcting typos: 100%|████████████████████████████████████████████████████████████| 5583/5583 [07:21<00:00, 12.63it/s]


# test tips

## !!!重要：配置 LanguageTool 本地服务器模式的步骤

**首先，检查 Java 环境**

确保你的系统中安装了 Java 运行时环境（JRE）或 Java 开发工具包（JDK），因为 LanguageTool 依赖 Java 运行。  
运行以下命令检查 Java 是否已安装：  
java -version  
如果没有安装 Java，请下载并安装 Java SE Runtime Environment (JRE)。  

**其次，启动本地 LanguageTool 服务器**  

1. 下载 LanguageTool 的离线版本

访问 LanguageTool 官方下载页面[https://languagetool.org/download/LanguageTool-stable.zip] ，手动下载最新版本的 LanguageTool ZIP 文件。

2. 解压到指定目录

将下载的 ZIP 文件解压到指定目录（例如 V:\20240920\way1\LanguageTool-6.5）目录  

3. 手动启动 LanguageTool 服务器

打开命令行，导航到 LanguageTool 的目录（例如 V:\20240920\way1\LanguageTool-6.5）。  
运行命令启动服务器：java -cp languagetool.jar org.languagetool.server.HTTPServer --port 8081 ，这将启动一个本地服务器，监听 8081 端口。  

**最后，在代码中连接到本地服务器**  

1. 显式指定指定本地服务器的 URL，运行速度慢：
language_tool = language_tool_python.LanguageTool('en-us', remote_server='http://localhost:8081/')

2. 隐式指定指定缺省的本地服务器的 URL，运行速度快：
language_tool = language_tool_python.LanguageTool('en-us')：

python

## 2way of correct single sentence

### my code

In [7]:
import language_tool_python

# Initialize LanguageTool
language_tool = language_tool_python.LanguageTool('en-US')

# Input text
text = "Ir would help woman and mothers to get better paying jobs in todays marketplaaaaace."

## way 1： use language_tool_python.utils.correct()
print("\n ----way 1： use language_tool_python.utils.correct()----")

# Check and correct the text
matches = language_tool.check(text)
corrected_text = language_tool_python.utils.correct(text, matches)

# Print the corrected text
print("Original Text:")
print(text)
print("\nCorrected Text:")
print(corrected_text)

## way 2： use tool.correct()
print("\n ----way 2： use language_tool_python.LanguageTool('en-US').correct()----")

# Initialize LanguageTool
language_tool = language_tool_python.LanguageTool('en-US')

# Input text
text = "Ir would help woman and mothers to get better paying jobs in today's marketplaaaaace."

# Check and correct the text
matches = language_tool.check(text)
corrected_text = language_tool.correct(text)

# Print the corrected text
print("Original Text:")
print(text)
print("\nCorrected Text:")
print(corrected_text)

# Close LanguageTool
tool.close()


 ----way 1： use language_tool_python.utils.correct()----
Original Text:
Ir would help woman and mothers to get better paying jobs in todays marketplaaaaace.

Corrected Text:
Ir would help woman and mothers to get better paying jobs in today's marketplaaaaace.

 ----way 2： use language_tool_python.LanguageTool('en-US').correct()----
Original Text:
Ir would help woman and mothers to get better paying jobs in today's marketplaaaaace.

Corrected Text:
Ir would help woman and mothers to get better paying jobs in today's marketplaaaaace.


### chatGPT version

In [8]:
import language_tool_python

# Initialize LanguageTool for English (US)
tool = language_tool_python.LanguageTool('en-US')

# Text with errors
text = "Ir would help woman and mothers to get better paying jobs in todays marketplaaaaace."

# Check for mistakes
matches = tool.check(text)

# Correct the text
corrected_text = language_tool_python.utils.correct(text, matches)

# Print the corrected text
print("Corrected Text:", corrected_text)

# Close LanguageTool
tool.close()

Corrected Text: Ir would help woman and mothers to get better paying jobs in today's marketplaaaaace.


In [7]:
language_tool = language_tool_python.LanguageTool('en-us')

In [None]:
# 关闭 LanguageTool
language_tool.close()

### TextBlob

In [9]:
from textblob import TextBlob

# Text with errors
text = "Ir would help woman and mothers to get better paying jobs in todays marketplaaaaace."

# Create a TextBlob object
blob = TextBlob(text)

# Correct the text
corrected_text = blob.correct()

# Print the corrected text
print("Corrected Text:", corrected_text)


Corrected Text: Or would help woman and mothers to get better paying jobs in today marketplaaaaace.


### first language_tool_python for grammar checks then TextBlob for spelling corrections.

In [10]:
import language_tool_python
from textblob import TextBlob

# Initialize LanguageTool for English (US)
tool = language_tool_python.LanguageTool('en-US')

# Text with errors
text = "Ir would help woman and mothers to get better paying jobs in todays marketplaaaaace."

# Step 1: Use language_tool_python to check and fix grammar issues
matches = tool.check(text)
corrected_text = language_tool_python.utils.correct(text, matches)

# Step 2: Use TextBlob to correct spelling
blob = TextBlob(corrected_text)
final_corrected_text = blob.correct()

# Print the final corrected text
print("Final Corrected Text:", final_corrected_text)


Final Corrected Text: Or would help woman and mothers to get better paying jobs in today's marketplaaaaace.


### first spelling with TextBlob then apply language_tool_python for grammar corrections

In [11]:
import language_tool_python
from textblob import TextBlob

# Initialize LanguageTool for English (US)
tool = language_tool_python.LanguageTool('en-US')

# Text with errors
text = "Ir would help woman and mothers to get better paying jobs in todays marketplaaaaace."

# Step 1: Use TextBlob to fix spelling mistakes
blob = TextBlob(text)
corrected_spelling_text = str(blob.correct())

# Step 2: Use language_tool_python for grammar issues (after spelling correction)
matches = tool.check(corrected_spelling_text)
final_corrected_text = language_tool_python.utils.correct(corrected_spelling_text, matches)

# Print the final corrected text
print("Final Corrected Text:", final_corrected_text)


Final Corrected Text: Or would help woman and mothers to get better paying jobs in today marketplaaaaace.
