<a href="https://colab.research.google.com/github/Vakhranev/MDB/blob/main/%D0%9F%D1%80%D0%BE%D0%B2%D0%B5%D1%80%D0%BA%D0%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install lexical-diversity

Collecting lexical-diversity
  Downloading lexical_diversity-0.1.1-py3-none-any.whl (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lexical-diversity
Successfully installed lexical-diversity-0.1.1


In [3]:
import os
import math
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize
from collections import Counter
from math import comb
from lexical_diversity import lex_div as ld

class MetricsCalculator:
    """Compute lexical-diversity figures for lemma lists and for text files.

    The lemma-level methods (`calculate_ttr`, `calculate_vocd_d`) work on any
    sized sequence of hashable items; `process_files` reads files, tokenizes
    and lemmatizes them with NLTK, and collects one value per metric per file.
    """

    def __init__(self, threshold=0.8):
        """Store the coverage threshold used by `calculate_vocd_d`.

        Args:
            threshold: Share of all tokens (compared against
                cumulative_count / total_count) that the most frequent
                lemmas must reach.
        """
        self.threshold = threshold
        # Per-metric dictionaries; no method of this class writes to them.
        self.ttr_data = {}
        self.vocd_data = {}
        self.hd_data = {}
        self.mtld_data = {}

    def preprocess_text(self, text):
        """Strip `<...>` tags and flatten the text onto a single line.

        Args:
            text: Raw text.

        Returns:
            The text with every non-greedy `<...>` match removed and every
            newline replaced by a single space.
        """
        text = re.sub(r'<.*?>', '', text)
        # A newline separates words; deleting it outright would glue the last
        # word of one line to the first word of the next ("red\napple" ->
        # "redapple") and distort every token-based metric downstream.
        text = re.sub(r'\n', ' ', text)
        return text

    def calculate_ttr(self, lemmas):
        """Return the type-token ratio: distinct lemmas / total lemmas.

        Args:
            lemmas: Sized sequence of hashable lemmas.

        Returns:
            The ratio as a float, or 0.0 for an empty sequence (instead of
            raising ZeroDivisionError).
        """
        total_lemmas = len(lemmas)
        if total_lemmas == 0:
            return 0.0
        return len(set(lemmas)) / total_lemmas

    def calculate_vocd_d(self, lemmas):
        """Return the fraction of distinct lemmas needed to cover `threshold`.

        Lemmas are ranked by descending frequency; counts are accumulated
        until their share of all tokens reaches `self.threshold`, and the
        number of lemmas consumed is divided by the number of distinct lemmas.

        NOTE(review): despite the name, this is a frequency-coverage ratio,
        not a curve-fitted statistic -- confirm this is the intended measure.

        Args:
            lemmas: Sized sequence of hashable lemmas.

        Returns:
            A float in (0, 1], or None when the threshold is never reached
            (empty input, or a threshold above 1).
        """
        total_count = len(lemmas)
        if total_count == 0:
            return None

        freq_dict = Counter(lemmas)
        distinct_count = len(freq_dict)

        cumulative_freq = 0
        # most_common() yields (lemma, count) pairs in descending count order;
        # only the counts matter, so the order among ties is irrelevant.
        for rank, (_, freq) in enumerate(freq_dict.most_common(), start=1):
            cumulative_freq += freq
            if cumulative_freq / total_count >= self.threshold:
                return rank / distinct_count

        return None

    def process_files(self, file_paths):
        """Compute the coverage ratio, HD-D and MTLD for each file.

        Each file is read as UTF-8, preprocessed, tokenized with
        `word_tokenize`, and reduced to lowercased WordNet lemmas of the
        purely alphabetic tokens.

        Args:
            file_paths: Iterable of paths to text files.

        Returns:
            Three lists (vocd_values, hd_values, mtld_values), each holding
            one entry per path in input order. An entry of vocd_values is
            None when `calculate_vocd_d` returns None for that file.
        """
        vocd_values = []
        hd_values = []
        mtld_values = []

        # The lemmatizer does not depend on the file, so build it once.
        lemmatizer = nltk.WordNetLemmatizer()

        for filepath in file_paths:
            # Only the read needs the handle; close it before the heavy work.
            with open(filepath, "r", encoding="utf-8") as file:
                text = file.read()

            tokens = word_tokenize(self.preprocess_text(text))
            lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalpha()]

            vocd_values.append(self.calculate_vocd_d(lemmas))
            hd_values.append(ld.hdd(lemmas))
            mtld_values.append(ld.mtld(lemmas))

        return vocd_values, hd_values, mtld_values

    @staticmethod
    def _mean(values):
        """Average the entries of `values` that are not None; None if there are none."""
        present = [value for value in values if value is not None]
        if not present:
            return None
        return sum(present) / len(present)

    def calculate_avg_metrics(self, vocd_values, hd_values, mtld_values):
        """Average each metric across files.

        None entries (which `calculate_vocd_d` produces for empty input) are
        left out of the average instead of raising TypeError.

        Args:
            vocd_values: Per-file coverage ratios, possibly containing None.
            hd_values: Per-file HD-D values.
            mtld_values: Per-file MTLD values.

        Returns:
            A tuple (avg_vocd, avg_hd, avg_mtld). A component is None when its
            list holds no usable value (instead of raising ZeroDivisionError).
        """
        avg_vocd = self._mean(vocd_values)
        avg_hd = self._mean(hd_values)
        avg_mtld = self._mean(mtld_values)

        return avg_vocd, avg_hd, avg_mtld

if __name__ == "__main__":
    # Set the paths to your input files (1.txt, 2.txt and 3.txt).
    file_paths = ["1.txt", "2.txt", "3.txt"]

    metrics_calculator = MetricsCalculator(threshold=0.72)
    vocd_values, hd_values, mtld_values = metrics_calculator.process_files(file_paths)

    # process_files appends one value per path, in order, so the path list
    # can be zipped with the three result lists directly.
    for i, (path, vocd, hd, mtld) in enumerate(
        zip(file_paths, vocd_values, hd_values, mtld_values), start=1
    ):
        # calculate_vocd_d returns None when a file yields no lemmas;
        # formatting None with ".4f" would raise TypeError.
        vocd_text = "n/a" if vocd is None else f"{vocd:.4f}"
        print(f"File {i} ({path}):")
        print(f"  voc-D: {vocd_text}")
        print(f"  HD-D: {hd:.4f}")
        print(f"  MTLD: {mtld}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


File 1 (1.txt):
  voc-D: 1.0000
  HD-D: 0.0000
  MTLD: 3.1111111111111116
File 2 (2.txt):
  voc-D: 0.8000
  HD-D: 0.0000
  MTLD: 0.0
File 3 (3.txt):
  voc-D: 0.8000
  HD-D: 0.0000
  MTLD: 5.6000000000000005
