## Visualizations

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from collections import Counter
import string
from umap import UMAP
import json

# Apply matplotlib's built-in 'dark_background' style sheet to every figure created below.
plt.style.use('dark_background')

In [2]:
# Load the parsed course table; the cells below read its 'price', 'students',
# 'title', 'about' and 'skills' columns.
df = pd.read_csv('../data/stepik_parsed.csv')

### Prices

In [32]:
# Price histogram with upper outliers removed.
# Outlier fence: Q3 + 1.5 * IQR; rows above it are excluded from the chart.
Q1, Q3 = (df['price'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

main_data = df.loc[df['price'] <= upper_bound]

# Summary statistics are reused by the annotation box and by the reference lines.
median_price = main_data['price'].median()
mean_price = main_data['price'].mean()

chart_title = (
    '<b>Распределение цен на курсы (без выбросов)</b><br>'
    f'<sub>Граница выбросов: {int(upper_bound)} руб. | Всего курсов: {len(main_data)}</sub>'
)

fig = px.histogram(
    main_data,
    x='price',
    nbins=50,
    title=chart_title,
    labels={'price': 'Цена (рубли)', 'count': 'Количество курсов'},
    color_discrete_sequence=['#29b6f6'],
    opacity=0.85,
    marginal=None,
    width=1000,
    height=600,
)

# Boxed text in the top-right corner of the plotting area (paper coordinates).
stats_box = dict(
    x=0.95,
    y=0.95,
    xref='paper',
    yref='paper',
    text=f"<b>Статистика:</b><br>Медиана: {median_price:.0f} руб.<br>Среднее: {mean_price:.0f} руб.",
    showarrow=False,
    align='left',
    bordercolor='black',
    borderwidth=1,
    borderpad=4,
    bgcolor='white',
)

layout_options = dict(
    hovermode='x unified',
    xaxis_title_font=dict(size=14),
    yaxis_title_font=dict(size=14),
    title_font=dict(size=18),
    template='plotly_white',
    bargap=0.1,
    showlegend=False,
    annotations=[stats_box],
)
fig.update_layout(**layout_options)

fig.update_traces(
    hovertemplate="<b>Диапазон:</b> %{x} руб.<br><b>Количество курсов:</b> %{y}",
    marker=dict(line=dict(width=1, color='white')),
)

# Dashed vertical markers for the median and the mean, each labelled in its own colour.
reference_lines = (
    ('Медиана', median_price, '#ff7043'),
    ('Среднее', mean_price, '#66bb6a'),
)
for label, position, line_color in reference_lines:
    fig.add_vline(
        x=position,
        line_dash="dash",
        line_color=line_color,
        annotation_text=label,
        annotation_position="top",
        annotation_font_color=line_color,
    )

fig.write_html(
    "../plots/price_distribution_main.html",
    auto_play=False,
    include_plotlyjs='cdn',
)

### Popularity

In [4]:
# The 20 rows with the highest 'students' value, ordered from largest to smallest.
top20 = df.sort_values('students', ascending=False).head(20)

In [35]:
# Horizontal bar chart of the 20 most popular courses, written straight to disk.
bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top20, y='title', x='students', ax=bar_ax)

bar_ax.set_title('Топ-20 курсов по количеству студентов', pad=20)
bar_ax.set_xlabel('Количество студентов')
bar_ax.set_ylabel('')  # course titles on the y axis need no axis label

bar_fig.tight_layout()
bar_ax.grid(True, linestyle=':', linewidth=0.6)

bar_fig.savefig('../plots/top20_popular.png', dpi=120, bbox_inches='tight')
plt.close(bar_fig)

### Word Cloud

In [6]:
# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')
russian_stopwords = set(stopwords.words('russian'))

# Filler words that dominate course descriptions but carry no topical meaning.
# Grouped by lemma so that missing or misspelled inflections are easy to spot.
custom_stopwords = {
    # the course / platform itself
    'курс', 'курса', 'курсе', 'курсов', 'stepik',
    # generic "you will learn ..." wording
    'научитесь', 'научиться', 'изучение', 'изучения', 'изучить', 'изучите',
    'узнаете', 'сможете', 'будете', 'поможет', 'позволит', 'понимать',
    'создавать', 'начать', 'хочет', 'содержит', 'предназначен', 'посвящен',
    # "язык" / "основы"
    'язык', 'языка', 'языку', 'языке', 'основы',
    # forms of "свой"
    'свой', 'своего', 'своему', 'своим', 'своем', 'своя', 'своей', 'свою',
    'свое', 'свои',
    # forms of "этот" ("эту" was previously misspelled as "жту")
    'это', 'этого', 'этому', 'этот', 'этим', 'эта', 'эту', 'этой', 'эти',
    'этими', 'этих',
    # forms of "который"
    'которые', 'который', 'которых', 'которыми', 'которого', 'которому',
    'которым', 'котором', 'которая', 'которую', 'которой', 'которое',
    # forms of "весь"
    'весь', 'всего', 'всему', 'всем',
    # miscellaneous
    'вас', 'вам', 'вы', 'тех', 'кто', 'чему', 'можно', 'также', 'ещё', 'еще',
    'часть', 'заданий', 'самых', 'каждый', 'помощью',
}

# Reuse the set built above instead of reading the corpus a second time.
all_stopwords = russian_stopwords | custom_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
# Word cloud weighted by TF-IDF
def _strip_punct_and_digits(raw_text):
    """Lower-case ``raw_text`` and drop ASCII punctuation and digit characters."""
    return ''.join(
        ch for ch in raw_text.lower()
        if ch not in string.punctuation and not ch.isdigit()
    )


tfidf = TfidfVectorizer(
    max_features=150,
    stop_words=list(all_stopwords),
    preprocessor=_strip_punct_and_digits
)

# A missing title or description would turn the whole concatenation into NaN,
# and the preprocessor would then fail on a float; treat missing text as empty.
course_texts = df['title'].fillna('').astype(str) + ' ' + df['about'].fillna('').astype(str)

tfidf_matrix = tfidf.fit_transform(course_texts)
feature_names = tfidf.get_feature_names_out()

# Average weight of every term over all courses, flattened to a 1-D array.
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
word_weights = dict(zip(feature_names, mean_tfidf))

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='black',
    colormap='viridis',
    contour_width=1,
    contour_color='steelblue'
).generate_from_frequencies(word_weights)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout()

plt.savefig('../plots/wordcloud_tfidf.png', dpi=300, bbox_inches='tight', pad_inches=0.1)
plt.close()

In [23]:
# Word cloud from raw word counts (no TF-IDF weighting)
# Missing titles/descriptions become empty strings: a NaN in the concatenated
# series is a float, and str.join raises TypeError on it.
text = ' '.join(df['title'].fillna('').astype(str) + ' ' + df['about'].fillna('').astype(str))

def preprocess_text(text):
    """Split ``text`` into lower-cased words suitable for frequency counting.

    ASCII punctuation and digit characters are removed first (not replaced by
    spaces), the remainder is lower-cased and split on whitespace, and words
    that are in ``all_stopwords`` or shorter than three characters are dropped.

    Returns the surviving words as a list, in their original order.
    """
    kept_chars = []
    for symbol in text:
        if symbol in string.punctuation or symbol.isdigit():
            continue
        kept_chars.append(symbol)

    tokens = ''.join(kept_chars).lower().split()
    return [token for token in tokens if len(token) > 2 and token not in all_stopwords]

# Count the cleaned words of the whole corpus.
words = preprocess_text(text)
word_freq = Counter(words)

# Discard the 8 most frequent words so they do not dominate the picture.
# most_common() returns a list, so removing keys while iterating over it is safe.
for dominant_word, _count in word_freq.most_common(8):
    word_freq.pop(dominant_word)

cloud_options = dict(
    width=800,
    height=400,
    background_color='black',
    stopwords=all_stopwords,
    colormap='viridis',
    max_words=150,
    contour_width=1,
    contour_color='steelblue',
)
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(word_freq)

# Render the cloud without axes and write it straight to disk.
cloud_fig, cloud_ax = plt.subplots(figsize=(12, 6))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_fig.tight_layout()

cloud_fig.savefig('../plots/wordcloud_filtered.png', dpi=300, bbox_inches='tight', pad_inches=0.1)
plt.close(cloud_fig)

### Clustering (K-Means)

In [9]:
# Load the multilingual sentence-embedding model used for all course embeddings below.
# NOTE(review): trust_remote_code=True allows the loader to run Python code shipped in the
# model repository; confirm the repository (ideally a pinned revision) is trusted.
model = SentenceTransformer('Alibaba-NLP/gte-multilingual-base', trust_remote_code=True)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
def _join_text_fields(row, fields=('title', 'about', 'skills')):
    """Join the non-missing text fields of one course row with single spaces.

    Missing values (None/NaN) and blank strings are skipped. Wrapping every
    field in ``str()`` first would turn NaN into the literal word "nan", which
    then leaks into the embeddings and the TF-IDF vocabulary.
    Assumes each field holds a scalar (string or missing value).
    """
    parts = []
    for field in fields:
        value = row[field]
        if pd.isna(value):
            continue
        value = str(value)
        if value.strip():
            parts.append(value)
    return ' '.join(parts)


df['combined_text'] = df.apply(_join_text_fields, axis=1)

In [11]:
# Encode every combined course text with the sentence-embedding model.
# NOTE(review): later cells assume one embedding per DataFrame row, in row order.
embeddings = model.encode(df['combined_text'].tolist())

In [42]:
def determine_optimal_clusters(embeddings, max_k=15):
    """Choose a K-Means cluster count from the elbow and silhouette heuristics.

    Both heuristics are evaluated for every k in ``2..max_k`` (inclusive).
    If they disagree, the rounded mean of the two suggestions is used; if the
    elbow method finds no knee, the silhouette optimum is used on its own.

    Side effects: writes ``../plots/elbow_method.png``,
    ``../plots/silhouette_analysis.png`` and a short markdown rationale to
    ``../plots/cluster_choice_explanation.md``; prints the two suggestions
    when they differ.

    Args:
        embeddings: feature matrix accepted by ``KMeans.fit`` (one row per sample).
        max_k: largest number of clusters to evaluate.

    Returns:
        The selected number of clusters as an ``int``.
    """
    # --- 1. Elbow method (distortion) -------------------------------------
    elbow_visualizer = KElbowVisualizer(
        KMeans(random_state=42),
        # Yellowbrick expands a 2-tuple to a half-open range, so the upper
        # bound must be max_k + 1 to cover the same k values as the
        # silhouette scan below.
        k=(2, max_k + 1),
        metric='distortion',
        timings=False
    )
    plt.grid(True, linestyle=':', linewidth=0.6)
    elbow_visualizer.fit(embeddings)
    elbow_k = elbow_visualizer.elbow_value_  # None when no knee is detected
    elbow_visualizer.show(outpath="../plots/elbow_method.png")
    plt.close()

    # --- 2. Silhouette scan -------------------------------------------------
    k_values = range(2, max_k + 1)
    silhouette_scores = []
    for k in k_values:
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(embeddings)
        silhouette_scores.append(silhouette_score(embeddings, labels))

    best_index = int(np.argmax(silhouette_scores))
    best_silhouette_k = k_values[best_index]
    best_silhouette_score = silhouette_scores[best_index]

    fig, ax = plt.subplots(figsize=(7, 5))
    ax.plot(k_values, silhouette_scores, 'bo-')
    ax.grid(True, linestyle=':', linewidth=0.6)
    ax.set_xlabel('Number of clusters')
    ax.set_ylabel('Silhouette Score')
    ax.set_title('Silhouette Analysis for Optimal k')
    ax.axvline(x=best_silhouette_k, color='r', linestyle='--')
    # x is in data units, y is a fraction of the axes height, so the label
    # stays inside the plot whatever the score range is. It is placed on the
    # side of the marker line that has more room.
    label_on_left = best_silhouette_k > (2 + max_k) / 2
    ax.text(
        best_silhouette_k + (-0.5 if label_on_left else 0.5),
        0.1,
        f'Optimal k={best_silhouette_k} (score={best_silhouette_score:.3f})',
        color='red',
        ha='right' if label_on_left else 'left',
        transform=ax.get_xaxis_transform()
    )
    fig.savefig("../plots/silhouette_analysis.png")
    plt.close(fig)

    # --- 3. Reconcile the two suggestions -----------------------------------
    if elbow_k is None:
        final_k = best_silhouette_k
        elbow_label = "не определено"
        elbow_sentence = "Явный изгиб на графике не обнаружен."
        choice_note = "по silhouette score"
        print(f"Метод локтя не определил k, используется silhouette: k={final_k}")
    else:
        elbow_label = f"k={elbow_k}"
        elbow_sentence = f"На графике наблюдается изгиб при k={elbow_k}."
        choice_note = "среднее округленное"
        if elbow_k != best_silhouette_k:
            # np.round rounds exact halves to the nearest even integer.
            final_k = int(np.round((elbow_k + best_silhouette_k) / 2))
            print(f"Метод локтя предложил k={elbow_k}, silhouette - k={best_silhouette_k}")
            print(f"Компромиссное значение: k={final_k}")
        else:
            final_k = elbow_k
    final_k = int(final_k)

    # Built line by line so that no line starts with indentation: markdown
    # renders lines indented by four spaces as a code block.
    explanation = "\n".join([
        f"#### Обоснование выбора числа кластеров (k={final_k})",
        "",
        "##### 1. Метод локтя",
        elbow_sentence,
        "",
        "##### 2. Silhouette Score",
        f"Максимальное значение silhouette score ({best_silhouette_score:.3f}) достигается при k={best_silhouette_k}.",
        "",
        "##### 3. Согласование результатов",
        f"- Метод локтя: {elbow_label}",
        f"- Silhouette: k={best_silhouette_k}",
        f"- **Выбранное значение**: k={final_k} ({choice_note})",
        "",
    ])

    # Explicit UTF-8: the report is Cyrillic and must not depend on the OS locale.
    with open("../plots/cluster_choice_explanation.md", "w", encoding="utf-8") as f:
        f.write(explanation)

    return final_k

In [43]:
# Choose the number of clusters (also writes the elbow/silhouette plots and the rationale file).
optimal_k = determine_optimal_clusters(embeddings)

Метод локтя предложил k=9, silhouette - k=14
Компромиссное значение: k=12


In [14]:
# Fit the final K-Means model with the chosen k and store each course's cluster label.
kmeans = KMeans(n_clusters=optimal_k, random_state=42).fit(embeddings)
df['cluster'] = kmeans.labels_

# Report clustering quality for the selected k.
final_score = silhouette_score(embeddings, df['cluster'])
print(f"Silhouette Score для k={optimal_k}: {final_score:.3f}")

Silhouette Score для k=12: 0.055


In [15]:
# Save the table, including the added 'combined_text' and 'cluster' columns, without the index.
df.to_csv("../data/stepik_analyzed.csv", index=False)

### UMAP 

In [16]:
# Reload the clustered table from disk.
# NOTE(review): the cells below still use the in-memory `embeddings`, so they assume the CSV
# rows are in the same order as when the embeddings were computed; confirm when starting here.
df = pd.read_csv('../data/stepik_analyzed.csv')

In [49]:
# Project the embeddings to three dimensions for an interactive scatter plot.
# random_state pins UMAP's stochastic optimisation so the layout is the same
# on every run (consistent with the seeded K-Means above).
umap_3d = UMAP(
    n_components=3,
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42
)
umap_3d_embeds = umap_3d.fit_transform(embeddings)

df[['x', 'y', 'z']] = umap_3d_embeds

cluster_ids = sorted(df['cluster'].unique())

# The default Plotly palette has 10 colours; with more clusters two of them
# would share a colour, so switch to the 24-colour palette when needed.
cluster_colors = px.colors.qualitative.Plotly
if len(cluster_ids) > len(cluster_colors):
    cluster_colors = px.colors.qualitative.Dark24

# Initial viewpoint, also restored by the reset button below.
default_camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=1.5, y=1.5, z=0.6)
)


def _scene_axis(axis_title):
    """Return the shared styling for one 3-D axis with the given title."""
    return dict(
        title=dict(text=axis_title),
        backgroundcolor='rgba(240,240,240,0.5)',
        gridcolor='black',
        gridwidth=2,
        showbackground=True,
        linecolor='black',
        linewidth=3
    )


def _course_hover_text(row):
    """Build the hover label (title, price, student count) for one course."""
    return f"{row['title']}<br>Цена: {row['price']} руб<br>Студентов: {row['students']:,}"


fig_3d = go.Figure()

# One trace per cluster so that each gets its own colour and legend entry.
for position, cluster in enumerate(cluster_ids):
    cluster_df = df[df['cluster'] == cluster]
    fig_3d.add_trace(
        go.Scatter3d(
            x=cluster_df['x'],
            y=cluster_df['y'],
            z=cluster_df['z'],
            mode='markers',
            marker=dict(
                size=7,
                sizemode='diameter',
                sizeref=0.1,
                symbol='circle',
                color=cluster_colors[position % len(cluster_colors)],
                opacity=0.9,
                line=dict(width=1.5, color='#333')
            ),
            name=f'Кластер {cluster}',
            hovertext=cluster_df.apply(_course_hover_text, axis=1),
            hoverinfo='text'
        )
    )

fig_3d.update_layout(
    scene=dict(
        xaxis=_scene_axis('Ось X'),
        yaxis=_scene_axis('Ось Y'),
        zaxis=_scene_axis('Ось Z'),
        camera=default_camera,
        bgcolor='white'
    ),
    width=1100,
    height=700,
    legend=dict(
        title=dict(text='<b>Кластеры</b>', font=dict(size=14)),
        itemsizing='constant',
        font=dict(size=12)
    ),
    margin=dict(l=0, r=0, b=0, t=50),
    paper_bgcolor='white',
    font=dict(family="Arial", color="black"),
    # Single button that puts the camera back to its initial position.
    updatemenus=[
        dict(
            type="buttons",
            buttons=[
                dict(
                    label="Сброс",
                    method="relayout",
                    args=["scene.camera", default_camera]
                )
            ],
            direction="left",
            pad={"r": 10, "t": 10},
            showactive=False,
            x=0.05,
            xanchor="left",
            y=0,
            yanchor="top"
        )
    ]
)

fig_3d.write_html("../plots/umap_3d_visualization.html", include_plotlyjs='cdn')

### Word Cloud for Clusters

In [52]:
# Rank the vocabulary of every cluster by its summed TF-IDF weight.
# (TfidfVectorizer and pyplot are already imported in the first cell.)
vectorizer = TfidfVectorizer(max_features=100, stop_words=list(all_stopwords))
# After the CSV round trip an empty text is read back as NaN; vectorize it as ''.
tfidf_matrix = vectorizer.fit_transform(df['combined_text'].fillna(''))
feature_names = vectorizer.get_feature_names_out()

cluster_labels = df['cluster'].to_numpy()

top_words_per_cluster = {}
for cluster in sorted(df['cluster'].unique()):
    # Reuse the rows of the already fitted matrix instead of vectorizing the
    # same texts a second time; the values are identical.
    row_positions = np.flatnonzero(cluster_labels == cluster)
    sum_tfidf = np.asarray(tfidf_matrix[row_positions].sum(axis=0)).ravel()
    # Indices of the 30 heaviest terms, heaviest first.
    top_indices = sum_tfidf.argsort()[-30:][::-1]
    top_words = [(str(feature_names[i]), float(sum_tfidf[i])) for i in top_indices]
    # Plain int keys keep the dict JSON-serialisable.
    top_words_per_cluster[int(cluster)] = top_words

In [53]:
# One word-cloud image per cluster, built from its top TF-IDF terms.
# (WordCloud is already imported in the first cell.)
for cluster, words in top_words_per_cluster.items():
    # Keep only terms that actually occur in the cluster: if every weight were
    # zero, WordCloud would divide by a zero maximum frequency.
    cluster_weights = {word: weight for word, weight in words if weight > 0}
    if not cluster_weights:
        print(f"Кластер {cluster}: нет ключевых слов, облако слов пропущено")
        continue

    wc = WordCloud(width=600, height=300, background_color='black')
    wc.generate_from_frequencies(cluster_weights)

    cluster_fig, cluster_ax = plt.subplots(figsize=(8, 4))
    cluster_ax.imshow(wc, interpolation='bilinear')
    cluster_ax.axis("off")
    cluster_ax.set_title(f"Кластер {cluster}: Ключевые слова", pad=12)
    cluster_fig.savefig(f"../plots/cluster_{cluster}_wordcloud.png", bbox_inches='tight')
    # Close each figure explicitly so they do not accumulate across the loop.
    plt.close(cluster_fig)

In [None]:
# json cannot serialise numpy integer keys, so rebuild the mapping with plain ints.
top_words_per_cluster_strkeys = {}
for cluster_id, ranked_words in top_words_per_cluster.items():
    top_words_per_cluster_strkeys[int(cluster_id)] = ranked_words

# Serialise first, then write: keeps Cyrillic readable and keys in sorted order.
serialized_top_words = json.dumps(
    top_words_per_cluster_strkeys,
    ensure_ascii=False,
    sort_keys=True,
    indent=4,
)
with open('../data/cluster_top_words.json', 'w', encoding='utf-8') as f:
    f.write(serialized_top_words)