<a href="https://colab.research.google.com/github/l-Monarch-l/Laborat/blob/main/LAB_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import re
import nltk
import string
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [16]:
# Download the NLTK data packages (tokenizer models, stop-word lists, WordNet data).
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)
# Kept as the cell's trailing expression so its return value is still displayed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
# Load the raw corpus file into a single string
with open('scp_object.txt', mode='r', encoding='utf-8') as corpus_file:
    content = corpus_file.read()

In [11]:
# Split the corpus on the '====' separator, trim each chunk, and drop the empty ones
articles = list(filter(None, map(str.strip, content.split('===='))))

In [12]:
# Parse metadata and body text out of every article
_ORG_RE = re.compile(r'Organization:\s*(.*)')
_NAME_RE = re.compile(r'Name:\s*(.*)')
# The description ends at the first blank line, or at the end of the article
# when no blank line follows it (otherwise such articles lose their description).
_DESC_RE = re.compile(r'Description:\s*(.*?)(?:\n\n|\Z)', re.DOTALL)


def _first_group(pattern, article):
    """Return the stripped first capture group of ``pattern`` in ``article``, or None."""
    match = pattern.search(article)
    return match.group(1).strip() if match else None


def parse_article(article):
    """Split one raw article into its metadata fields and body text.

    Args:
        article: Text of a single article, already stripped of the '====' separator.

    Returns:
        dict with keys 'organization', 'name', 'description' and 'text'.
        A metadata value is None when its header is not found in the article.
        'text' is everything after the first 'Description:' marker when a
        non-empty description was found, otherwise the whole article.
    """
    desc = _first_group(_DESC_RE, article)

    # Body text: everything after the Description header
    if desc:
        text = article.split('Description:', 1)[-1].strip()
    else:
        text = article.strip()

    return {
        'organization': _first_group(_ORG_RE, article),
        'name': _first_group(_NAME_RE, article),
        'description': desc,
        'text': text,
    }


data = [parse_article(article) for article in articles]

In [13]:
# Build a DataFrame from the parsed records (one row per dict in `data`)
df = pd.DataFrame(data)
# Preview the first rows
print(df.head())

     organization     name                                        description  \
0  SCP Foundation  SCP-001  THE FOLLOWING FILES HAVE BEEN CLASSIFIED\nTOP ...   
1  SCP Foundation  SCP-002                                   rating: +2187+–x   
2  SCP Foundation  SCP-003                                   rating: +1009+–x   
3  SCP Foundation  SCP-004                                   rating: +1331+–x   
4  SCP Foundation  SCP-005                                    rating: +871+–x   

                                                text  
0  THE FOLLOWING FILES HAVE BEEN CLASSIFIED\nTOP ...  
1  rating: +2187+–x\n\n\nSCP-002 in its containme...  
2  rating: +1009+–x\n\n\nA close up of SCP-003's ...  
3  rating: +1331+–x\n\n\nSCP-004-1\n\n\nItem #: S...  
4  rating: +871+–x\n\n\nA close up of SCP-005\n\n...  


In [14]:
# Translation table that deletes every ASCII punctuation character; built once
# and shared by all calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Lazily filled cache so the stop-word set and lemmatizer are created once
# instead of once per document.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _NLP_CACHE:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Fill the cache only after both objects exist, so a failure cannot
        # leave it half-initialised.
        _NLP_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def preprocess_text(text: object) -> str:
    """Normalise a document for vectorisation.

    Steps: delete ASCII punctuation, lowercase, tokenize with ``word_tokenize``,
    drop English stop words, lemmatize the remaining tokens.

    Args:
        text: Raw document. Any non-string value (e.g. None) is treated as empty.

    Returns:
        The surviving lemmatized tokens joined by single spaces, or "" for
        non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Remove punctuation and convert to lower case
    cleaned = text.translate(_PUNCTUATION_TABLE).lower()

    # Tokenization
    tokens = word_tokenize(cleaned)

    # Stop-word removal and lemmatization
    stop_words, lemmatizer = _nlp_resources()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

In [17]:
# Run the preprocessing function over every article body
df['processed_text'] = df['text'].map(preprocess_text)

In [18]:
# Example analysis: number of articles per organization
# NOTE(review): rows whose organization failed to parse (None) are presumably
# excluded from these counts — verify against the total row count.
print(df['organization'].value_counts())

organization
SCP Foundation    2995
Name: count, dtype: int64


In [19]:
# Text length: number of whitespace-separated words in each raw article
df['text_length'] = df['text'].str.split().str.len()
print(df['text_length'].describe())

count     2996.000000
mean      1293.693258
std        979.096622
min         96.000000
25%        740.750000
50%       1062.000000
75%       1568.000000
max      27649.000000
Name: text_length, dtype: float64


In [20]:
# TF-IDF features over the preprocessed text; max_features=1000 caps the vocabulary size
vectorizer = TfidfVectorizer(max_features=1000)
# Fitted on every row of df, i.e. on the full corpus (before any train/test split)
X = vectorizer.fit_transform(df['processed_text'])

In [21]:
# Cluster the TF-IDF vectors into 5 groups; random_state=42 fixes the seed passed to KMeans.
# NOTE(review): n_init is left at the library default, which has changed between
# scikit-learn releases — consider setting it explicitly for reproducibility.
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X)

# Store each article's cluster label in df and preview it next to the article name
df['cluster'] = clusters
print(df[['name', 'cluster']].head())

      name  cluster
0  SCP-001        2
1  SCP-002        2
2  SCP-003        2
3  SCP-004        2
4  SCP-005        2


In [22]:
# Compare the cluster labels with the organization labels via the Adjusted Rand Index.
if 'organization' in df.columns:
    # Encode organization names as integer codes
    df['org_code'] = pd.factorize(df['organization'])[0]
    ari = adjusted_rand_score(labels_true=df['org_code'], labels_pred=df['cluster'])
    print(f"Adjusted Rand Index: {ari:.2f}")

Adjusted Rand Index: -0.00


In [23]:
# First split off the test set: 20% of all rows; the remaining 80% is train+val.
# NOTE: X comes from a vectorizer fitted on the whole corpus, so the TF-IDF
# vocabulary and IDF weights already reflect the validation/test documents.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, df['organization'], test_size=0.2, random_state=42
)

# Then split train+val into train and val: 0.25 of the remaining 80% is 20% of
# the full data, giving a 60/20/20 train/val/test split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

In [35]:
# Report the number of rows in each split, one line per split
print(
    "Размеры выборок:",
    f"Train: {X_train.shape[0]}",
    f"Val:   {X_val.shape[0]}",
    f"Test:  {X_test.shape[0]}",
    sep="\n",
)

Размеры выборок:
Train: 1797
Val:   599
Test:  600


In [39]:
# Per-cluster report: size, first article names, organizations, mean text length,
# most frequent terms and an excerpt of the first article.
for cluster_num in sorted(df['cluster'].unique()):
    cluster_data = df[df['cluster'] == cluster_num]

    print(f"\n Кластер {cluster_num} (статей: {len(cluster_data)})")
    print("━" * 50)

    # First three article names in the cluster (in DataFrame order)
    print("\n Топ-3 статьи:")
    for name in cluster_data['name'].head(3):
        print(f"  {name}")

    if 'organization' in df:
        org_counts = cluster_data['organization'].value_counts().head(3)
        print("\n Топ организаций:")
        for org, count in org_counts.items():
            print(f"  {org}: {count} статей")

    avg_length = cluster_data['text_length'].mean()
    print(f"\n Средняя длина текста: {avg_length:.1f} слов")

    if 'processed_text' in df:
        # Use a separate name: rebinding the global `vectorizer` here would replace
        # the 1000-feature model that produced `X` with a 5-feature one.
        keyword_vectorizer = TfidfVectorizer(max_features=5)
        # Only the vocabulary is needed, so fit without building the TF-IDF matrix.
        # max_features=5 keeps the five most frequent terms of the cluster;
        # get_feature_names_out() lists them alphabetically.
        keyword_vectorizer.fit(cluster_data['processed_text'])
        keywords = ", ".join(keyword_vectorizer.get_feature_names_out())
        print(f"\n Ключевые слова: {keywords}")

    # First 100 characters of the cluster's first article
    sample_text = cluster_data.iloc[0]['text'][:100] + "..."
    print(f"\n Пример текста:\n  «{sample_text}»")

    print("━" * 50)


 Кластер 0 (статей: 467)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

 Топ-3 статьи:
  SCP-007
  SCP-012
  SCP-013

 Топ организаций:
  SCP Foundation: 467 статей

 Средняя длина текста: 1084.7 слов

 Ключевые слова: effect, licensing, see, subject, test

 Пример текста:
  «rating: +711+–x
Item #: SCP-007
Object Class: Euclid
Special Containment Procedures: SCP-007 is to b...»
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

 Кластер 1 (статей: 191)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

 Топ-3 статьи:
  SCP-019
  SCP-021
  SCP-059

 Топ организаций:
  SCP Foundation: 191 статей

 Средняя длина текста: 1010.9 слов

 Ключевые слова: containment, host, licensing, specimen, subject

 Пример текста:
  «rating: +572+–x


SCP-019


Item #: SCP-019
Object Class: Keter
Special Containment Procedures: SCP-...»
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

 Кластер 2 (статей: 1397)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

 Топ-3 статьи:
  SCP-001
  SCP-00