In [31]:
import pandas as pd
import os
import logging
import joblib
import docx
import fitz
import PyPDF2
import tkinter as tk
import csv
from tkinter import filedialog, messagebox, scrolledtext
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from deep_translator import GoogleTranslator
from PyPDF2 import PdfReader
from docx import Document
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Shared log format for the whole notebook: timestamp, level, message.
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO)

# Folder that holds the per-language dataset files (<lang>.tsv / <lang>.csv).
# The original machine-specific location remains the default; set the
# LANG_DATASETS_DIR environment variable to run the notebook elsewhere
# without editing the code.
directory = os.environ.get(
    "LANG_DATASETS_DIR",
    "C:/Users/pillya/Documents/ish/vkr/datasets",
)

def convert_tsv_to_csv(directory):
    """Convert every ``.tsv`` file in ``directory`` into a sibling ``.csv`` file.

    Each tab-separated file is re-written as a comma-separated file with the
    same base name. Only the trailing ``.tsv`` extension is replaced, so a
    name such as ``my.tsv_dump.tsv`` becomes ``my.tsv_dump.csv``.

    Args:
        directory: Path of the folder to scan (not recursive).

    Errors raised while converting a single file are logged and do not stop
    the remaining files from being processed.
    """
    suffix = '.tsv'
    for filename in os.listdir(directory):
        if not filename.endswith(suffix):
            continue
        tsv_file = os.path.join(directory, filename)
        # Strip only the final extension; str.replace would also rewrite
        # any earlier ".tsv" occurrence inside the file name.
        csv_file = os.path.join(directory, filename[:-len(suffix)] + '.csv')
        try:
            with open(tsv_file, 'r', newline='', encoding='utf-8') as infile, \
                 open(csv_file, 'w', newline='', encoding='utf-8') as outfile:
                reader = csv.reader(infile, delimiter='\t')
                csv.writer(outfile).writerows(reader)
            logging.info(f'Converted {tsv_file} → {csv_file}')
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            logging.error(f"Error converting {tsv_file}: {e}")

def process_csv_files(directory):
    """Give every ``.csv`` file in ``directory`` the header ``text,value``.

    Each file is read as header-less data, its two columns are named
    ``text`` and ``value``, and the result is written back over the same file.

    The function is idempotent: a file whose first row already is
    ``text,value`` is left untouched, so re-running the cell does not push the
    header down into the data as an extra row.

    Args:
        directory: Path of the folder to scan (not recursive).

    Errors for a single file (unreadable file, column count other than two,
    ...) are logged and do not stop the remaining files from being processed.
    """
    header = ['text', 'value']
    for filename in os.listdir(directory):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(directory, filename)
        try:
            df = pd.read_csv(file_path, header=None, encoding='utf-8')
            # Already processed on an earlier run: the header is the first row.
            if len(df) > 0 and [str(cell) for cell in df.iloc[0].tolist()] == header:
                logging.info(f'{filename} already has a header, skipped')
                continue
            df.columns = header
            df.to_csv(file_path, index=False, encoding='utf-8')
            logging.info(f'Processed and overwritten {filename}')
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            logging.error(f"Error processing {filename}: {e}")

def load_dataset(file_path):
    """Read the UTF-8 encoded CSV at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_csv(file_path, encoding='utf-8')
    return frame

def clean_dataset(df):
    """Drop rows whose ``text`` or ``value`` is missing, blank or non-printable.

    The two columns of ``df`` are treated as ``text`` and ``value``. The
    caller's frame is left untouched: renaming and filtering happen on a copy.

    Args:
        df: Two-column DataFrame (any column names).

    Returns:
        A new DataFrame with columns ``['text', 'value']`` containing only the
        rows where both cells are present, non-blank after stripping, and made
        solely of printable characters (so cells with newlines or control
        characters are removed).
    """
    cleaned = df.copy()  # do not rename columns on the caller's object
    cleaned.columns = ['text', 'value']
    cleaned = cleaned.dropna(subset=['text', 'value'])

    # Compare on string views so numeric cells do not break the .str accessor
    # or the printable check; the returned frame keeps the original values.
    text = cleaned['text'].astype(str)
    value = cleaned['value'].astype(str)

    keep = (
        (text.str.strip() != '')
        & (value.str.strip() != '')
        & text.map(str.isprintable).astype(bool)
        & value.map(str.isprintable).astype(bool)
    )
    return cleaned.loc[keep]

def load_and_clean_dataset(directory, lang, max_rows=5000, random_state=42):
    """Load, clean and down-sample the dataset of a single language.

    Reads ``<directory>/<lang>.csv``, cleans it with ``clean_dataset``, keeps
    only the ``text`` column, adds a ``label`` column equal to ``lang`` and
    draws a random sample so that every language contributes a comparable
    number of rows.

    Args:
        directory: Folder containing the per-language CSV files.
        lang: Language label; also the CSV file name without extension.
        max_rows: Upper bound on the number of rows kept (default 5000).
            ``None`` keeps all rows (still shuffled).
        random_state: Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        DataFrame with columns ``['text', 'label']``, or ``None`` when
        anything goes wrong (the error is logged).
    """
    file_path = os.path.join(directory, f'{lang}.csv')
    try:
        df = clean_dataset(load_dataset(file_path))
        df = df[['text']].copy()
        df['label'] = lang
        # Cap the number of rows to keep the classes balanced.
        sample_size = len(df) if max_rows is None else min(max_rows, len(df))
        df = df.sample(n=sample_size, random_state=random_state)
        logging.info(f'Cleaned {lang}.csv — {len(df)} rows; label={lang}')
        return df
    except Exception as e:
        # Best effort: a broken language file must not abort the other ones.
        logging.error(f"Error in load_and_clean_dataset({lang}): {e}")
        return None
    
def load_and_clean_datasets(directory, languages):
    """Load and clean all language datasets concurrently.

    One ``load_and_clean_dataset`` task per language is run on a thread pool.
    Results are returned in the order of ``languages``; languages whose
    loading failed (result ``None``) are left out.
    """
    with ThreadPoolExecutor() as pool:
        results = list(
            pool.map(lambda lang: load_and_clean_dataset(directory, lang), languages)
        )
    return [frame for frame in results if frame is not None]

In [32]:
# 1) Convert the raw files and normalise their headers
convert_tsv_to_csv(directory)
process_csv_files(directory)

# One language per CSV file: the label is the file name without its extension.
# Sorted so the row order (and therefore the split below) does not depend on
# the order in which the operating system lists the directory.
languages = sorted(
    os.path.splitext(f)[0] for f in os.listdir(directory) if f.endswith('.csv')
)

# Load and clean every language
dfs = load_and_clean_datasets(directory, languages)
if not dfs:
    # Fail with a clear message instead of an obscure vectorizer error below.
    raise RuntimeError(f"No usable datasets were loaded from {directory}")

# Collect texts and labels
all_texts = []
all_labels = []
for df in dfs:
    all_texts.extend(df['text'].tolist())
    all_labels.extend(df['label'].tolist())

# 2) TF-IDF features + label encoding
tfidf = TfidfVectorizer(max_features=20000)
# NOTE: .toarray() turns the sparse matrix into a dense
# (n_samples x n_features) array, which can take several GB of RAM.
X = tfidf.fit_transform(all_texts).toarray()

y_encoder = LabelEncoder()
y = y_encoder.fit_transform(all_labels)
# The encoder is intentionally NOT written to 'label_encoder.joblib' here:
# doing so would overwrite the encoder saved next to a previously trained
# model before the artifacts are loaded, pairing that model with a
# possibly different set of classes. It is saved after training instead.

# 3) Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4) Модель Keras
def build_model(input_dim, num_classes):
    """Build and compile the dense classifier used for language detection.

    Architecture: Dense(512, relu) -> Dropout(0.5) -> Dense(256, relu)
    -> Dropout(0.5) -> Dense(num_classes, softmax).

    Args:
        input_dim: Length of one input feature vector.
        num_classes: Number of output classes.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    stack = [
        Dense(512, activation='relu', input_shape=(input_dim,)),
        Dropout(0.5),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(stack)
    # Labels are integer class ids, hence the "sparse" loss variant.
    compile_options = {
        'optimizer': 'adam',
        'loss': 'sparse_categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    network.compile(**compile_options)
    return network

# Trained artifacts live next to the notebook (current working directory).
ARTIFACT_DIR = os.getcwd()
MODEL_FILE = os.path.join(ARTIFACT_DIR, 'keras_lang_model.h5')
VECT_FILE = os.path.join(ARTIFACT_DIR, 'tfidf_vectorizer.joblib')
ENCOD_FILE = os.path.join(ARTIFACT_DIR, 'label_encoder.joblib')

# Try to reuse previously saved artifacts; fall back to training on failure.
# SECURITY NOTE: joblib.load unpickles the file and can execute arbitrary
# code, so only load artifacts produced by this notebook / a trusted source.
loaded = False
try:
    saved_model = load_model(MODEL_FILE)
    logging.info("Loaded existing Keras model")
    saved_tfidf = joblib.load(VECT_FILE)
    saved_encoder = joblib.load(ENCOD_FILE)
except Exception as e:
    # The exception is passed through a %s placeholder: without one, logging
    # fails to format the record and prints a "Logging error" traceback.
    logging.info("Не удалось загрузить артефакты, будет произведено обучение: %s", e)
else:
    # Swap the artifacts in only when all three loaded, so a partial failure
    # cannot leave a loaded vectorizer paired with a freshly trained model.
    model, tfidf, y_encoder = saved_model, saved_tfidf, saved_encoder
    logging.info("Загружены готовые файлы: модель, векторизатор, энкодер")
    loaded = True

if not loaded:
    logging.info("Начинаем обучение модели Keras...")
    model = build_model(X_train.shape[1], len(y_encoder.classes_))
    # Stop when the validation loss has not improved for 3 epochs and roll
    # back to the best weights seen.
    es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model.fit(
        X_train, y_train,
        validation_split=0.1,
        epochs=20,
        batch_size=32,
        callbacks=[es]
    )
    model.save(MODEL_FILE)
    joblib.dump(tfidf, VECT_FILE)
    joblib.dump(y_encoder, ENCOD_FILE)
    logging.info("Trained and saved new Keras model")

# NOTE(review): X_test was vectorised with the TF-IDF fitted in this session.
# When artifacts were loaded from disk, this score is only meaningful if the
# datasets are unchanged since the saved model was trained.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
logging.info(f"Test accuracy: {acc:.2%}")

2025-05-05 21:44:04,835 INFO: Converted C:/Users/pillya/Documents/ish/vkr/datasets\ger.tsv → C:/Users/pillya/Documents/ish/vkr/datasets\ger.csv
2025-05-05 21:44:04,856 INFO: Converted C:/Users/pillya/Documents/ish/vkr/datasets\gre.tsv → C:/Users/pillya/Documents/ish/vkr/datasets\gre.csv
2025-05-05 21:44:04,870 INFO: Converted C:/Users/pillya/Documents/ish/vkr/datasets\rus.tsv → C:/Users/pillya/Documents/ish/vkr/datasets\rus.csv
2025-05-05 21:44:04,885 INFO: Converted C:/Users/pillya/Documents/ish/vkr/datasets\tur.tsv → C:/Users/pillya/Documents/ish/vkr/datasets\tur.csv
2025-05-05 21:44:04,923 INFO: Processed and overwritten ger.csv
2025-05-05 21:44:04,951 INFO: Processed and overwritten gre.csv
2025-05-05 21:44:04,969 INFO: Processed and overwritten rus.csv
2025-05-05 21:44:04,986 INFO: Processed and overwritten tur.csv
2025-05-05 21:44:05,130 INFO: Cleaned tur.csv — 5000 rows; label=tur
2025-05-05 21:44:05,176 INFO: Cleaned rus.csv — 5000 rows; label=rus
2025-05-05 21:44:05,178 INFO: 

Epoch 1/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.3894 - loss: 1.2628 - val_accuracy: 0.5444 - val_loss: 0.9348
Epoch 2/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.8120 - loss: 0.5406 - val_accuracy: 0.5719 - val_loss: 0.8698
Epoch 3/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - accuracy: 0.8633 - loss: 0.3047 - val_accuracy: 0.5994 - val_loss: 0.8523
Epoch 4/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - accuracy: 0.8729 - loss: 0.2867 - val_accuracy: 0.5913 - val_loss: 0.8324
Epoch 5/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.8821 - loss: 0.2639 - val_accuracy: 0.6006 - val_loss: 0.8293
Epoch 6/20
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.8774 - loss: 0.2632 - val_accuracy: 0.5981 - val_loss: 0.8477
Epoch 7/20
[1m450/450

2025-05-05 21:45:16,270 INFO: Trained and saved new Keras model
2025-05-05 21:45:16,605 INFO: Test accuracy: 60.15%


In [38]:
# 5. Функции извлечения, предсказания и перевода
def extract_text_from_file(file_path):
    """Return the plain text contained in a ``.txt``, ``.pdf`` or ``.docx`` file.

    The format is chosen from the file extension (case-insensitive).

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text. For ``.docx`` only paragraph text is collected,
        joined with newlines.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.txt':
        # 'utf-8-sig' reads plain UTF-8 as before and additionally drops a
        # leading byte-order mark, which would otherwise end up in the text
        # as a stray '\ufeff' character.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()

    if ext == '.pdf':
        with fitz.open(file_path) as doc:
            return "".join(page.get_text() for page in doc)

    if ext == '.docx':
        doc = Document(file_path)
        return '\n'.join(para.text for para in doc.paragraphs)

    raise ValueError(f"Формат файла {ext} не поддерживается.")

def detect_language_from_file(file_path: str) -> str:
    """Predict the language label of the text stored in ``file_path``.

    The text is extracted with ``extract_text_from_file``, vectorised with the
    module-level ``tfidf`` and classified by the module-level ``model``; the
    most probable class is mapped back to its label through ``y_encoder``.

    Raises:
        ValueError: If no text (or only whitespace) could be extracted.
    """
    content = extract_text_from_file(file_path)
    if content.strip() == "":
        raise ValueError("Файл пустой или текст не извлечён.")

    features = tfidf.transform([content]).toarray()
    class_probs = model.predict(features)[0]  # single document -> first row
    best_index = class_probs.argmax()
    return y_encoder.inverse_transform([best_index])[0]

def _split_for_translation(text, max_chars):
    """Split ``text`` into pieces of at most ``max_chars`` characters.

    Pieces are cut at the last newline or space inside the limit when there is
    one, otherwise at the limit itself. Text that already fits (or a
    non-positive limit) yields ``[text]`` unchanged.
    """
    if max_chars < 1 or len(text) <= max_chars:
        return [text]

    chunks = []
    rest = text
    while len(rest) > max_chars:
        window = rest[:max_chars]
        cut = max(window.rfind('\n'), window.rfind(' '))
        if cut <= 0:
            cut = max_chars  # no whitespace to break on: hard split
        piece = rest[:cut].strip()
        if piece:
            chunks.append(piece)
        rest = rest[cut:].lstrip()
    if rest.strip():
        chunks.append(rest)
    return chunks or [text]


def translate_text(text: str, source_label: str, target_labels: list,
                   max_chars: int = 4500) -> dict:
    """Translate ``text`` from the source language into each target language.

    Labels are the dataset labels used as keys of ``LABEL_TO_ISO``; they are
    mapped to the language codes passed to ``GoogleTranslator``.

    Args:
        text: Text to translate.
        source_label: Label of the language ``text`` is written in.
        target_labels: Labels of the languages to translate into.
        max_chars: Maximum number of characters sent per request. Longer text
            is split into chunks that are translated one by one and joined
            with newlines, because the translator rejects inputs of 5000
            characters or more. Text within the limit is sent unchanged in a
            single request.

    Returns:
        Dict mapping every target label to its translation, to
        ``"Unsupported target label"`` when the label is unknown, or to an
        error message when the translation failed.

    Raises:
        ValueError: If ``source_label`` is not a known label.
    """
    # ISO code of the source language
    src_iso = LABEL_TO_ISO.get(source_label)
    if not src_iso:
        raise ValueError(f"Unknown source language label: {source_label}")

    chunks = _split_for_translation(text, max_chars)

    translations = {}
    for tgt_label in target_labels:
        tgt_iso = LABEL_TO_ISO.get(tgt_label)
        if not tgt_iso:
            translations[tgt_label] = "Unsupported target label"
            continue

        try:
            translator = GoogleTranslator(source=src_iso, target=tgt_iso)
            if len(chunks) == 1:
                translations[tgt_label] = translator.translate(chunks[0])
            else:
                translations[tgt_label] = "\n".join(
                    translator.translate(chunk) or "" for chunk in chunks
                )
        except Exception as e:
            # One failing target must not prevent the others from being shown.
            translations[tgt_label] = f"Ошибка перевода: {e}"
    return translations

# 6. Tkinter GUI
# Dataset label -> language code handed to the translator.
LABEL_TO_ISO = dict(rus='ru', ger='de', gre='el', tur='tr')

# Offer only the labels that the label encoder actually knows.
SUPPORTED_LANGS = [lang for lang in LABEL_TO_ISO if lang in y_encoder.classes_]

root = tk.Tk()
root.title("Language Detector with Keras")

# Path of the file picked by the user; None until a file has been chosen.
file_path = None

def browse_file():
    """Ask the user for a file and remember it in the global ``file_path``.

    Cancelling the dialog keeps the previously chosen file instead of
    replacing it with an empty value.
    """
    global file_path
    selected = filedialog.askopenfilename(
        filetypes=[("Text files", "*.txt"),
                   ("PDF", "*.pdf"),
                   ("Word", "*.docx")]
    )
    if not selected:
        return  # dialog cancelled: leave the current selection as it is
    file_path = selected
    messagebox.showinfo("Файл выбран", os.path.basename(file_path))

def run_detection():
    """Detect the language of the selected file and show its translations.

    Warns when no file is selected. Otherwise the detected label and one
    translation per ticked language (all supported languages when none is
    ticked) are written to ``result_box``. Any error is shown in a dialog.
    """
    if not file_path:
        messagebox.showwarning("Внимание", "Сначала выберите файл.")
        return

    try:
        detected = detect_language_from_file(file_path)
        result_box.delete(1.0, tk.END)
        result_box.insert(tk.END, f"Определённый язык: {detected}\n\n")

        content = extract_text_from_file(file_path)
        ticked = [code for code, flag in lang_vars.items() if flag.get()]
        # Nothing ticked means "translate into everything".
        targets = ticked or SUPPORTED_LANGS

        results = translate_text(content, detected, targets)
        for target, translated in results.items():
            result_box.insert(tk.END, f"{target}: {translated}\n\n")
    except Exception as err:
        messagebox.showerror("Ошибка", str(err))

# Layout: top row with the file button and one checkbox per language
frame = tk.Frame(root)
frame.pack(padx=10, pady=10, fill=tk.X)

browse_button = tk.Button(frame, text="Выбрать файл", command=browse_file)
browse_button.pack(side=tk.LEFT, padx=5)

# One BooleanVar per supported language, ticked by default.
lang_vars = {code: tk.BooleanVar(value=True) for code in SUPPORTED_LANGS}
for code, var in lang_vars.items():
    tk.Checkbutton(frame, text=code, variable=var).pack(side=tk.LEFT, padx=2)

detect_button = tk.Button(root, text="Определить и перевести", command=run_detection)
detect_button.pack(side=tk.LEFT, pady=5)

# Output area for the detected language and the translations
result_box = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=80, height=20)
result_box.pack(padx=10, pady=10)

# Blocks this cell until the window is closed.
root.mainloop()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


In [34]:
# Show the class labels known to the label encoder.
print(y_encoder.classes_)

['ger' 'gre' 'rus' 'tur']
