In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

# === 1. Load data ===
# One spreadsheet per group: the control group and two further groups that are
# merged into the single class 'D' below.
df_control = pd.read_excel('norm.xlsx')
df_md = pd.read_excel('dep.xlsx')
df_pd = pd.read_excel('sz.xlsx')

# Table with the selected features and their normalized importances
important_features_df = pd.read_excel('important.xlsx')

# Names of the selected features, plus a feature-name -> importance lookup
important_features = important_features_df['Feature'].tolist()
feature_importance = dict(zip(important_features_df['Feature'], important_features_df['normalized importance']))

# Attach the target label to every row before concatenating the groups
df_control['label'] = 'control'
df_md['label'] = 'D'
df_pd['label'] = 'D'

df = pd.concat([df_control, df_md, df_pd], ignore_index=True)

# Take an explicit copy: the columns of X are overwritten below, and assigning
# into a bare selection of `df` triggers pandas' SettingWithCopyWarning and
# leaves it ambiguous whether `df` itself is modified.
X = df[important_features].copy()
y = df['label']

# LabelEncoder numbers the classes in sorted order, so 'D' -> 0, 'control' -> 1
le = LabelEncoder()
y = le.fit_transform(y)

# Weight every feature column by its importance
for feature in important_features:
    X[feature] = X[feature] * feature_importance[feature]

# === 2. Train/test split ===
# Stratified so both splits keep the class proportions of the full data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# === 3. Class balancing with SMOTE ===
# Only the training split is resampled; the test split is left untouched
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# === 4. Feature scaling ===
# The scaler is fitted on the balanced training data and reused for the test set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

# joblib is otherwise first imported in a later cell; importing it here keeps
# this cell runnable on a fresh kernel instead of failing with NameError.
import joblib
joblib.dump(scaler, 'scaler.pkl')

# === 5. Model definitions ===
# Each tunable model is wrapped in a 3-fold grid search over a small grid.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_params = {'n_estimators': [100, 200], 'max_depth': [5, 10, None]}
rf_grid = GridSearchCV(rf, rf_params, cv=3, n_jobs=-1)

lr = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)
lr_params = {'C': [0.01, 0.1, 1, 10]}
lr_grid = GridSearchCV(lr, lr_params, cv=3, n_jobs=-1)

# probability=True so the SVM exposes predict_proba
svm = SVC(class_weight='balanced', probability=True, random_state=42)
svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
svm_grid = GridSearchCV(svm, svm_params, cv=3, n_jobs=-1)

# `use_label_encoder` is dropped: the installed xgboost reports it as an unused
# parameter, and the labels are already integer-encoded by LabelEncoder.
# The target has two classes, so the binary 'logloss' metric replaces the
# multi-class 'mlogloss'.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 5, 7]}
xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, n_jobs=-1)

# CatBoost is used with its default hyper-parameters (no grid search)
cat = CatBoostClassifier(verbose=0, random_seed=42)

# === 6. Fit and evaluate the individual models ===
models = {
    'Random Forest': rf_grid,
    'Logistic Regression': lr_grid,
    'SVM': svm_grid,
    'XGBoost': xgb_grid,
    'CatBoost': cat,
}

# One (model name, accuracy, macro F1) tuple per evaluated model
results = []

# Per-class F1 scores, keyed by model name
class_f1_scores = {model_name: [] for model_name in models}

# These models are trained and evaluated on the standardized features;
# every other model sees the unscaled (balanced) features.
scaled_input_models = ['Logistic Regression', 'SVM']

for name, model in models.items():
    wants_scaled = name in scaled_input_models
    train_features = X_train_scaled if wants_scaled else X_train_balanced
    test_features = X_test_scaled if wants_scaled else X_test

    model.fit(train_features, y_train_balanced)
    y_pred = model.predict(test_features)

    results.append((
        name,
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average='macro'),
    ))
    # average=None yields one F1 value per class
    class_f1_scores[name] = f1_score(y_test, y_pred, average=None)

# === 7. Ensembles ===
def _score_ensemble(label, predictions):
    """Record test-set scores for an ensemble and return (accuracy, macro F1).

    Appends a (label, accuracy, macro F1) row to `results` and stores the
    per-class F1 scores under `label` in `class_f1_scores`.
    """
    accuracy = accuracy_score(y_test, predictions)
    macro_f1 = f1_score(y_test, predictions, average='macro')
    results.append((label, accuracy, macro_f1))
    class_f1_scores[label] = f1_score(y_test, predictions, average=None)
    return accuracy, macro_f1


# Soft-voting ensemble of the tuned random forest, the tuned XGBoost model and
# CatBoost, fitted on the unscaled balanced training data
voting = VotingClassifier(
    estimators=[
        ('rf', rf_grid.best_estimator_),
        ('xgb', xgb_grid.best_estimator_),
        ('cat', cat),
    ],
    voting='soft',
)
voting.fit(X_train_balanced, y_train_balanced)
voting_pred = voting.predict(X_test)
voting_acc, voting_f1 = _score_ensemble('Voting Ensemble', voting_pred)

# Stacking ensemble: tuned logistic regression and SVM as base learners with a
# random forest on top, fitted on the standardized training data
stacking = StackingClassifier(
    estimators=[
        ('lr', lr_grid.best_estimator_),
        ('svm', svm_grid.best_estimator_),
    ],
    final_estimator=RandomForestClassifier(random_state=42),
)
stacking.fit(X_train_scaled, y_train_balanced)
stacking_pred = stacking.predict(X_test_scaled)
stacking_acc, stacking_f1 = _score_ensemble('Stacking Ensemble', stacking_pred)

# === 8. Report ===
# Summary table: one row per model with accuracy and macro F1
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Macro F1-score'])
print(results_df)

# Per-class F1 for every model
print("\nF1 Score для каждого класса:")
for model_name, per_class_f1 in class_f1_scores.items():
    print(f"\n{model_name}:")
    # Map positions 0..k-1 back to the original class labels in one call
    class_names = le.inverse_transform(list(range(len(per_class_f1))))
    for class_name, score in zip(class_names, per_class_f1):
        print(f"  {class_name}: {score}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = X[feature] * feature_importance[feature]
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



                 Model  Accuracy  Macro F1-score
0        Random Forest  0.792135        0.786467
1  Logistic Regression  0.780899        0.774925
2                  SVM  0.775281        0.768711
3              XGBoost  0.780899        0.773100
4             CatBoost  0.792135        0.786467
5      Voting Ensemble  0.808989        0.803404
6    Stacking Ensemble  0.747191        0.742104

F1 Score для каждого класса:

Random Forest:
  D: 0.7516778523489933
  control: 0.821256038647343

Logistic Regression:
  D: 0.738255033557047
  control: 0.8115942028985508

SVM:
  D: 0.7297297297297297
  control: 0.8076923076923077

XGBoost:
  D: 0.7310344827586207
  control: 0.8151658767772512

CatBoost:
  D: 0.7516778523489933
  control: 0.821256038647343

Voting Ensemble:
  D: 0.7702702702702703
  control: 0.8365384615384616

Stacking Ensemble:
  D: 0.7058823529411765
  control: 0.7783251231527094


In [None]:
import joblib

# Persist the fitted Voting Classifier to disk so it can be reloaded for inference
joblib.dump(voting, 'voting_classifier_model.pkl')

['voting_classifier_model.pkl']

In [None]:
import numpy as np
import re
import joblib
import spacy
from ruts import ReadabilityStats

# Load the spaCy pipeline for Russian; count_pos() uses it for part-of-speech tags
nlp = spacy.load("ru_core_news_sm")

# Функции для расчета метрик
def calculate_flesch_kincaid(text):
    """Return the Flesch-Kincaid grade of `text` as reported by ruts.

    Args:
        text: The text to analyse.

    Returns:
        The 'flesch_kincaid_grade' entry of ReadabilityStats.get_stats().
    """
    # The statistics are computed once; a second get_stats() call whose result
    # is thrown away only repeats the work.
    return ReadabilityStats(text).get_stats()['flesch_kincaid_grade']

def count_syllables(word):
    """Return the number of Russian vowel letters in `word` (case-insensitive).

    The vowel count serves as the syllable count; characters outside the
    Russian vowel set contribute nothing.
    """
    russian_vowels = "аеёиоуыэюя"
    return len([letter for letter in word.lower() if letter in russian_vowels])

def count_complex_words(words):
    """Return how many entries of `words` have at least three syllables.

    Syllables are counted with count_syllables().
    """
    return sum(1 for candidate in words if count_syllables(candidate) >= 3)

def calculate_gunning_fog_index(text):
    """Return the Gunning fog index of `text`, rounded to two decimals.

    Computed as 0.4 * (words per sentence + 100 * share of complex words).
    Sentences are the non-empty fragments between '.', '!', '?' and '…';
    words are runs of Cyrillic letters and hyphens. Returns 0 when the text
    has no sentences or no words.
    """
    # Count only fragments that still contain something after stripping
    sentence_count = sum(1 for fragment in re.split(r'[.!?…]', text) if fragment.strip())

    tokens = re.findall(r'\b[А-Яа-яЁё\-]+\b', text)
    word_count = len(tokens)
    hard_word_count = count_complex_words(tokens)

    if sentence_count == 0 or word_count == 0:
        return 0

    words_per_sentence = word_count / sentence_count
    hard_word_percentage = 100 * (hard_word_count / word_count)
    return round(0.4 * (words_per_sentence + hard_word_percentage), 2)

def calculate_ttr(text):
    """Return the type-token ratio of `text`: distinct words / total words.

    Words are `\\w+` runs of the lower-cased text. Returns 0 when the text
    contains no words.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

def count_pos(text):
    """Return the share of nouns, pronouns and adverbs among the tokens of `text`.

    The text is parsed with the module-level spaCy pipeline `nlp`. The
    denominator is the number of tokens that are neither punctuation nor
    whitespace; when it is zero every ratio is 0.

    Returns:
        dict with the keys 'noun_ratio', 'pronoun_ratio' and 'adverb_ratio'.
    """
    tag_counts = {'NOUN': 0, 'PRON': 0, 'ADV': 0}
    counted_tokens = 0

    # Single pass: tally the tags of interest and the denominator together
    for token in nlp(text):
        if token.pos_ in tag_counts:
            tag_counts[token.pos_] += 1
        if not token.is_punct and not token.is_space:
            counted_tokens += 1

    if counted_tokens == 0:
        return {'noun_ratio': 0, 'pronoun_ratio': 0, 'adverb_ratio': 0}

    return {
        'noun_ratio': tag_counts['NOUN'] / counted_tokens,
        'pronoun_ratio': tag_counts['PRON'] / counted_tokens,
        'adverb_ratio': tag_counts['ADV'] / counted_tokens,
    }

def calculate_objectivity_coefficient(text):
    """Return tagged content words divided by meaningful words of `text`.

    The numerator counts tokens whose POS tag is a noun, verb, adjective or
    adverb tag from the set below. The denominator counts alphabetic tokens
    that are not in the Russian stop-word list. Returns 0 when the
    denominator is zero.

    NOTE(review): word_tokenize, pos_tag and stopwords are not imported in the
    visible cell; presumably they come from NLTK. Confirm they are in scope.
    NOTE(review): the tag names look like Penn Treebank tags; confirm that the
    tagger in use emits them for Russian input.
    """
    tokens = word_tokenize(text.lower())

    content_tags = {'NN', 'NNS', 'NNP', 'NNPS',
                    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                    'JJ', 'JJR', 'JJS',
                    'RB', 'RBR', 'RBS'}

    tagged_word_count = 0
    for _, tag in pos_tag(tokens):
        if tag in content_tags:
            tagged_word_count += 1

    russian_stop_words = set(stopwords.words('russian'))
    meaningful_word_count = sum(
        1 for token in tokens
        if token.isalpha() and token not in russian_stop_words
    )

    if meaningful_word_count == 0:
        return 0
    return tagged_word_count / meaningful_word_count

def calculate_dynamism_coefficient(text):
    """Return the ratio of verb tokens to noun/adjective/pronoun tokens in `text`.

    When there are no noun/adjective/pronoun tokens the result is
    float('inf') if at least one verb was found, otherwise 0.

    NOTE(review): word_tokenize and pos_tag are not imported in the visible
    cell; presumably they come from NLTK. Confirm they are in scope.
    """
    tagged_tokens = pos_tag(word_tokenize(text.lower()))

    verb_tags = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    nominal_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'PRP', 'PRP$', 'WP', 'WP$'}

    verbs = 0
    nominals = 0
    for _, tag in tagged_tokens:
        if tag in verb_tags:
            verbs += 1
        if tag in nominal_tags:
            nominals += 1

    if nominals != 0:
        return verbs / nominals
    # No nominal tokens: the ratio is undefined, so report inf (verbs present) or 0
    return float('inf') if verbs > 0 else 0

# Predict the class of a text with a fitted model
def predict_text_class(text, model, *, feature_weights=None, scaler=None):
    """Classify `text` and return "Норма" or "Клиническая группа".

    Eight text metrics are computed and passed to `model.predict` in this
    order: Flesch-Kincaid grade, Gunning fog index, type-token ratio, noun
    ratio, pronoun ratio, adverb ratio, dynamism coefficient, objectivity
    coefficient.

    Three things differ from a naive "scale and predict" pipeline:

    * Label mapping. The training labels are encoded with LabelEncoder, which
      numbers classes in sorted order: 'D' -> 0 and 'control' -> 1. Code 0 is
      therefore the clinical group and code 1 is the norm.
    * Scaling. The saved Voting Classifier is fitted on unscaled features, so
      no scaler is applied unless one is passed explicitly (for a model that
      was trained on standardized features).
    * Importance weights. The training features are multiplied by their
      normalized importances, so the same weights have to be supplied here
      through `feature_weights` for the inputs to match.

    NOTE(review): confirm that the feature order above matches the column
    order used for training before passing `feature_weights`.
    NOTE(review): the dynamism coefficient can be float('inf'); the model may
    reject such input.

    Args:
        text: The text to classify.
        model: A fitted estimator exposing `predict`.
        feature_weights: Optional sequence of eight per-feature multipliers,
            in the feature order listed above. Applied before scaling.
        scaler: Optional fitted transformer exposing `transform`; applied
            after the weights when given.

    Returns:
        "Клиническая группа" when the predicted code is 0, otherwise "Норма".

    Raises:
        ValueError: If `feature_weights` does not contain one weight per feature.
    """
    # Compute the metrics
    flesch_kincaid = calculate_flesch_kincaid(text)
    gunning_fog = calculate_gunning_fog_index(text)
    ttr = calculate_ttr(text)
    pos_counts = count_pos(text)
    dynamism = calculate_dynamism_coefficient(text)
    objectivity = calculate_objectivity_coefficient(text)

    # Assemble a single-row feature matrix
    features = np.array([[
        flesch_kincaid,
        gunning_fog,
        ttr,
        pos_counts['noun_ratio'],
        pos_counts['pronoun_ratio'],
        pos_counts['adverb_ratio'],
        dynamism,
        objectivity,
    ]], dtype=float)

    # Reproduce the training-time weighting of the features
    if feature_weights is not None:
        weights = np.asarray(feature_weights, dtype=float)
        if weights.shape != (features.shape[1],):
            raise ValueError(
                f"feature_weights must contain {features.shape[1]} values, "
                f"got shape {weights.shape}"
            )
        features = features * weights

    # Standardize only for models that were trained on standardized features
    if scaler is not None:
        features = scaler.transform(features)

    # predict() returns one label per row; take the single row's code
    predicted_code = int(np.ravel(model.predict(features))[0])

    # 0 == 'D' (clinical group), 1 == 'control' (norm)
    if predicted_code == 0:
        return "Клиническая группа"
    return "Норма"

# Load the persisted Voting Classifier
model = joblib.load('voting_classifier_model.pkl')

# Example usage: read a text from the user, classify it and print the verdict
user_input = input("Введите текст для классификации: ")
classification_result = predict_text_class(user_input, model)
print(f"Результат классификации: {classification_result}")

Введите текст для классификации: Это не гвозди называется, это по-другому как-то называется, я забыла как. Надо взять ножку... надо взять две ножки и прицепить их к основе. Для этого надо использовать два гвоздя. И вставить их в специальные отверстия. Потом мы смотрим на две ножки и там есть еще два отверстия, и мы должны вставить туда палочки. Потом поставить туда еще две ножки, забить туда гвозди, забить гвозди... забить гвозди в палочки и поставить маленький гамак или что это. Завернуть их в палочки, которые расположены между ножками. Всё.
Результат классификации: Норма


