In [1]:
from collections import Counter

import pandas as pd
from sklearn.cluster import DBSCAN, KMeans, OPTICS
# Display tweaks: print wide frames on a single line and show up to 200
# characters per cell so that comment texts are not cut off early.
pd.options.display.expand_frame_repr = False
pd.options.display.max_colwidth = 200

In [2]:
# Read the comment table and discard the 'Unnamed: 0' column right away.
features_path = 'comments_features.csv'
comments = pd.read_csv(features_path)
comments = comments.drop(columns=['Unnamed: 0'])
# Everything except the two text columns and the like count goes into the
# matrix handed to the clustering algorithms.
non_feature_columns = ['text', 'likes', 'text_cleaned']
X = comments.drop(columns=non_feature_columns)

In [3]:
%%time
def cluster(X, method, params, label, comments):
    """Fit a clustering estimator on ``X`` and attach its labels to ``comments``.

    Parameters
    ----------
    X : array-like
        Feature matrix passed unchanged to ``fit``. It must have one row per
        row of ``comments``, in the same order.
    method : callable
        Estimator factory (e.g. ``DBSCAN``), invoked as ``method(**params)``.
        The fitted object must expose a ``labels_`` attribute.
    params : dict
        Keyword arguments forwarded to ``method``.
    label : str
        Name of the column that receives the cluster labels.
    comments : pandas.DataFrame
        Frame to annotate. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``comments`` with the additional column ``label``.

    Side effects
    ------------
    Prints a ``Counter`` with the number of rows per cluster label.
    """
    clusters = method(**params).fit(X)
    labels = clusters.labels_
    # Annotate a copy: mutating the caller's frame would silently change frames
    # that earlier cells already displayed and makes re-running cells order-dependent.
    labeled = comments.copy()
    labeled[label] = labels
    print(Counter(labels))
    return labeled
    
# Only the first `n` rows are clustered (presumably to keep the run time
# manageable -- confirm before raising it).
n = 5000
X = X[:n]
# `.loc` slices by label and includes the end point, hence `n-1` to get n rows.
comments = comments.loc[:n-1, ['text']]
dbscan_params = {'eps': 2.2, 'min_samples': 5}
comments_labeled = cluster(X, DBSCAN, dbscan_params, 'DBSCAN', comments)

Counter({1: 3228, 0: 1002, -1: 740, 2: 16, 3: 6, 5: 4, 4: 4})
CPU times: total: 17.7 s
Wall time: 17.7 s


### DBSCAN cluster 0: roughly 8–16 of the 20 comments shown are about Kazakhstan and about 4 are about COVID-19

In [4]:
# First 20 comments that DBSCAN assigned to cluster 0.
comments_labeled.loc[comments_labeled['DBSCAN'] == 0].head(20)

Unnamed: 0,text,DBSCAN
0,"–ê —Å–µ–π—á–∞—Å, —Å–≤–µ–∂–∞—è —Ä—É–±—Ä–∏–∫–∞: ""–ö–æ—Ä–æ–Ω–∞–Ω–æ–≤–æ—Å—Ç–∏""",0
1,–ê –ø–æ—á–µ–º—É —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –Ω–∞—á–∞–ª–∞ —Ä–æ—Å—Ç–∏ ? –í–∞–∫—Ü–∏–Ω–∞ –ø–µ—Ä–µ—Å—Ç–∞–ª–∞ –ø–æ–º–æ–≥–∞—Ç—å ?,0
2,"–ò –≤—Å—ë –Ω–µ —Ç–µ –º—Ä—É—Ç, –∫—Ç–æ –Ω–∞–¥–æ.",0
6,–ß—ë —Ç–æ –≤—Å—ë —Ä–µ–∑–∫–æ –±—É—Å—Ç–∞–Ω—É–ª–æ—Å—å,0
10,–ó–µ–ª—é –ø—Ä–µ–¥—É–ø—Ä–µ–¥–∏–ª–∏. –ë—É–¥–µ—Ç –∫–∞–∫ –≤ –ö–∞–∑–∞—Ö—Å—Ç–∞–Ω–µ,0
12,"–ù–æ –Ω–µ —Å–∫–∞–∑–∞–ª–∏ —Å–∞–º–æ–µ –≥–ª–∞–≤–Ω–æ–µ, —É–π–¥–µ—Ç –ª–∏ –ù–∞–∑–∞—Ä–±–∞–µ–≤",0
15,"–ê —Å–µ–π—á–∞—Å, —Å–≤–µ–∂–∞—è —Ä—É–±—Ä–∏–∫–∞: ""–ö–æ—Ä–æ–Ω–∞–Ω–æ–≤–æ—Å—Ç–∏""",0
16,–ê –ø–æ—á–µ–º—É —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –Ω–∞—á–∞–ª–∞ —Ä–æ—Å—Ç–∏ ? –í–∞–∫—Ü–∏–Ω–∞ –ø–µ—Ä–µ—Å—Ç–∞–ª–∞ –ø–æ–º–æ–≥–∞—Ç—å ?,0
17,"–ò –≤—Å—ë –Ω–µ —Ç–µ –º—Ä—É—Ç, –∫—Ç–æ –Ω–∞–¥–æ.",0
29,–ß—ë —Ç–æ –≤—Å—ë —Ä–µ–∑–∫–æ –±—É—Å—Ç–∞–Ω—É–ª–æ—Å—å,0


### DBSCAN cluster 1: COVID-19
As we can see, 15 out of the 20 comments shown are about COVID-19.

In [5]:
# First 20 comments that DBSCAN assigned to cluster 1.
comments_labeled.loc[comments_labeled['DBSCAN'] == 1].head(20)

Unnamed: 0,text,DBSCAN
3,"–õ–µ–æ–Ω–∏–¥–∞ –ö—É—Ä–∞–≤–ª—ë–≤–∞ –≥–æ—Å–ø–∏—Ç–∞–ª–∏–∑–∏—Ä–æ–≤–∞–ª–∏ 5 —è–Ω–≤–∞—Ä—è –≤ –≤–∏—Ä—É—Å—è—Ç–Ω–∏–∫ –ü—Ä–æ—Ü–µ–Ω–∫–æ. –õ–µ–ø–∏–ª—ã-—É–±–∏–π—Ü—ã —Å–æ–æ–±—â–∞—é—Ç, —à–æ –∞—Ä—Ç–∏—Å—Ç –Ω–∞—Ö–æ–¥–∏—Ç—Å—è –≤ –æ—Ç–¥–µ–ª–µ–Ω–∏–∏ —Ä–µ–∞–Ω–∏–º–∞—Ü–∏–∏ –≤ —Ç—è–∂–µ–ª–æ–º —Å–æ—Å—Ç–æ—è–Ω–∏–∏, –∫–æ—Ç–æ—Ä–æ–µ –Ω–µ –º–µ–Ω—è–µ—Ç—Å—è —É–∂–µ –Ω–µ—Å–∫–æ–ª—å–∫–æ –¥–Ω–µ–π.",1
4,"–¢–∞–∫ —ç—Ç–æ –≤—Å—ë —Å–º–µ—à–Ω–æ, —Ç–æ —á—Ç–æ –≤ –†–æ—Å—Å–∏–∏ –≤—Å—ë –ø–æ —Å—Ü–µ–Ω–∞—Ä–∏—é..... –°–µ–π—á–∞—Å –Ω–∞–≤–µ—Ä–Ω–æ–µ —Ä–æ—Å—Ç –¥–æ–ª–≥–æ –±—É–¥–µ—Ç –¥–æ 9 –º–∞—è, –∞ –Ω–∞ 9 –º–∞—è —Å–Ω–æ–≤–∞ –ø–æ–±–µ–¥–∏–º –∫–æ—Ä–æ–Ω–æ–≤–∏—Ä—É—Å",1
7,"–í–∏—Ä—É—Å–æ–ª–æ–≥: –ì–∏–±—Ä–∏–¥–Ω–æ–≥–æ –≤–∞—Ä–∏–∞–Ω—Ç–∞ –∫–æ—Ä–æ–Ω–∞–≤–∏—Ä—É—Å–∞ ¬´–¥–µ–ª—å—Ç–∞–∫—Ä–æ–Ω¬ª –Ω–µ —Å—É—â–µ—Å—Ç–≤—É–µ—Ç, –æ–Ω –Ω–µ —Å–º–æ–∂–µ—Ç —Ç–µ–±–µ –Ω–∞–≤—Ä–µ–¥–∏—Ç—å –ì–∏–±—Ä–∏–¥–Ω—ã–π –≤–∞—Ä–∏–∞–Ω—Ç –∫–æ—Ä–æ–Ω–∞–≤–∏—Ä—É—Å–∞ ¬´–¥–µ–ª—å—Ç–∞–∫—Ä–æ–Ω¬ª:",1
8,"–ê —ç—Ç–æ –Ω–µ –≤ –°–®–ê –≤—Å–µ ""–≤–∞–∫—Ü–∏–Ω–∏—Ä–æ–≤–∞–Ω—ã"" –ø–æ —Å–∞–º–æ–µ –Ω–µ —Ö–æ—á—É?",1
11,–ù–æ —Å 1 –∏—é–ª—è —Ü–µ–Ω—ã –Ω–∞ —Ç–æ–ø–ª–∏–≤–æ –≤—Å—ë —Ä–∞–≤–Ω–æ –ø–æ–¥–Ω–∏–º—É—Ç,1
13,"–ü–ø—Ü, –Ω–∞ —á—É–∂–∏—Ö —à—Ç—ã–∫–∞—Ö —É–¥–µ—Ä–∂–∞–ª—Å—è —É –≤–ª–∞—Å—Ç–∏",1
14,"–ü—Ä–∏—à–ª–∏, –ø–æ—Ç–æ–ø—Ç–∞–ª–∏, –ø–æ–ø–ª–µ–≤–∞–ª–∏ –Ω–∞ –∑–µ–º–ª—é –∏ –æ–±—Ä–∞—Ç–Ω–æ –¥–æ–º–æ–π, –≤–æ—Ç –∏ –≤—Å—è ¬´–∫–æ–Ω—Ç—Ä-—Ç–µ—Ä—Ä–æ—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∞—è –æ–ø–µ—Ä–∞—Ü–∏—è¬ª :)",1
18,"–õ–µ–æ–Ω–∏–¥–∞ –ö—É—Ä–∞–≤–ª—ë–≤–∞ –≥–æ—Å–ø–∏—Ç–∞–ª–∏–∑–∏—Ä–æ–≤–∞–ª–∏ 5 —è–Ω–≤–∞—Ä—è –≤ –≤–∏—Ä—É—Å—è—Ç–Ω–∏–∫ –ü—Ä–æ—Ü–µ–Ω–∫–æ. –õ–µ–ø–∏–ª—ã-—É–±–∏–π—Ü—ã —Å–æ–æ–±—â–∞—é—Ç, —à–æ –∞—Ä—Ç–∏—Å—Ç –Ω–∞—Ö–æ–¥–∏—Ç—Å—è –≤ –æ—Ç–¥–µ–ª–µ–Ω–∏–∏ —Ä–µ–∞–Ω–∏–º–∞—Ü–∏–∏ –≤ —Ç—è–∂–µ–ª–æ–º —Å–æ—Å—Ç–æ—è–Ω–∏–∏, –∫–æ—Ç–æ—Ä–æ–µ –Ω–µ –º–µ–Ω—è–µ—Ç—Å—è —É–∂–µ –Ω–µ—Å–∫–æ–ª—å–∫–æ –¥–Ω–µ–π.",1
19,"–¢–∞–∫ —ç—Ç–æ –≤—Å—ë —Å–º–µ—à–Ω–æ, —Ç–æ —á—Ç–æ –≤ –†–æ—Å—Å–∏–∏ –≤—Å—ë –ø–æ —Å—Ü–µ–Ω–∞—Ä–∏—é..... –°–µ–π—á–∞—Å –Ω–∞–≤–µ—Ä–Ω–æ–µ —Ä–æ—Å—Ç –¥–æ–ª–≥–æ –±—É–¥–µ—Ç –¥–æ 9 –º–∞—è, –∞ –Ω–∞ 9 –º–∞—è —Å–Ω–æ–≤–∞ –ø–æ–±–µ–¥–∏–º –∫–æ—Ä–æ–Ω–æ–≤–∏—Ä—É—Å",1
20,"–£–º–Ω–æ–∂–∞–π –Ω–∞ —Ç—Ä–∏, –Ω–µ –ø—Ä–æ–º–∞—Ö–Ω—ë—à—å—Å—è",1


In [6]:
# Partition the same feature matrix into three KMeans clusters; the fixed
# random_state keeps the cluster numbering reproducible between runs.
kmeans_params = {'n_clusters': 3, 'random_state': 42}
comments_labeled = cluster(X, KMeans, kmeans_params, 'Kmeans', comments_labeled)

Counter({1: 2098, 0: 1700, 2: 1202})


### KMeans cluster 0: 20 out of 20 comments are about COVID-19

In [35]:
# Comments 21-40 (positions 20..39) of KMeans cluster 0.
# NOTE(review): the heading above reports "20/20" -- confirm whether it refers
# to this slice or to the first 20 rows of the cluster.
comments_labeled.loc[comments_labeled['Kmeans'] == 0].iloc[20:40]

Unnamed: 0,text,DBSCAN,Kmeans
54,–ù–∞ —ç—Ç—É –º–∏—Å—Å–∏—é –û–î–ö–ë –¥–ª–∏–Ω–æ–π –≤ 4 –¥–Ω—è –ø–æ—à–ª–∏ –Ω–∞—à–∏ —Å –≤–∞–º–∏ –Ω–∞–ª–æ–≥–∏. –í–æ—Ç –∫–∞–∫ –ø—Ä–µ–∫—Ä–∞—Å–Ω–æ –¥—Ä—É–∂–∏—Ç—å —Å –¥–∏–∫—Ç–∞—Ç–æ—Ä–∞–º–∏.,1,0
55,–õ—É—á—à–µ–µ –∑–∞ —Å–µ–≥–æ–¥–Ω—è—à–Ω–µ–µ –≤—ã—Å—Ç—É–ø–ª–µ–Ω–∏–µ –¢–æ–∫–∞–µ–≤–∞ *–µ—â—ë –±–æ–ª–µ–µ –≤–∞–∂–Ω—ã—Ö –∏ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã—Ö —Ä–µ—Ñ–æ—Ä–º –Ω–µ —É–∫–∞–∑–∞–ª–∏ –≤ –ø–æ—Å—Ç–µ. –ù—É –≤ –ø—Ä–∏–Ω—Ü–∏–ø–µ –æ–Ω–∏ –≤–∞–∂–Ω—ã –∏ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã —Ç–æ–ª—å–∫–æ –∫–∞–∑–∞—Ö—Å—Ç–∞–Ω—Ü–∞–º,1,0
56,"–ù–∏—á–µ–≥–æ –Ω–µ –ø–æ–Ω—è–ª, –ø–æ—á–µ–º—É –Ω–∞ —Å—Ç—Ä–∞–Ω—É –Ω–∞–ø–∞–ª–∏ —Ç–µ—Ä—Ä–æ—Ä–∏—Å—Ç—ã, –∞ ""–ø—Ä–µ–∑–∏–¥–µ–Ω—Ç —Å—Ç—Ä–∞–Ω—ã –≤–≤—ë–ª –ø—è—Ç–∏–ª–µ—Ç–Ω–∏–π –º–æ—Ä–∞—Ç–æ—Ä–∏–π –Ω–∞ –ø–æ–≤—ã—à–µ–Ω–∏–µ –∑–∞—Ä–ø–ª–∞—Ç –º–∏–Ω–∏—Å—Ç—Ä–∞–º –∏ –¥–µ–ø—É—Ç–∞—Ç–∞–º"" - —ç—Ç–æ –≤—Å–µ–≥–¥–∞ —Ç–∞–∫ —Ä–∞–±–æ—Ç–∞–µ—Ç –∫–æ–≥–¥–∞ –Ω–∞ —Å—Ç—Ä–∞–Ω—É –Ω–∞–ø–∞–¥–∞—é—Ç —Ç–µ...",1,0
58,"–ü–ª–∞–Ω–∏—Ä—É–µ—Ç—Å—è –ª–∏ –≤–≤–æ–¥ –≤–æ–π—Å–∫ –û–î–ö–ë –≤ –†–æ—Å—Å–∏—é –¥–ª—è –ø—Ä–µ–≤–µ–Ω—Ç–∏–≤–Ω–æ–π –∑–∞—â–∏—Ç—ã –æ—Ç –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã—Ö —Ç–µ—Ä—Ä–æ—Ä–∏—Å—Ç–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –≤–µ—Ä–æ–ª–æ–º–Ω–æ –ø–æ–≤—ã—à–∞—é—Ç —Ü–µ–Ω—ã? –í–æ–∑–º–æ–∂–Ω–æ —Ç–∞–∫–∂–µ —Å–ª–µ–¥—É–µ—Ç –∑–∞—Ä–∞–Ω–µ–µ –≤–∑—è—Ç—å –ø–æ–¥ —Å—Ç—Ä–∞–∂—É –ø–∏–∞–Ω–∏—Å—Ç–æ–≤, –ø–æ–∫–∞ –æ–Ω–∏ –≤...",-1,0
62,"–ù—É —Ä–∞–∑ –∑–∞—Ä–ø–ª–∞—Ç—ã –Ω–µ –±—É–¥—É—Ç —Ä–∞—Å—Ç–∏ —É —á–∏–Ω–æ–≤–Ω–∏–∫–æ–≤, —Ç–æ –≤–æ–∑—Ä–∞—Å—Ç—ë—Ç –º–∏–Ω–∏–º–∞–ª—å–Ω–∞—è —Å—É–º–º–∞ –≤–∑—è—Ç–æ–∫, –ª–æ–ª.",1,0
64,"–ü–æ–ª—É—á–∞–µ—Ç—Å—è, —á—Ç–æ –¢–æ–∫–∞–µ–≤ –≤—ã–ø–æ–ª–Ω—è–µ—Ç —Ç—Ä–µ–±–æ–≤–∞–Ω–∏—è —Ç–µ—Ä—Ä–æ—Ä–∏—Å—Ç–æ–≤: 1) –Ω–∞—Ä—É—à–∏–ª –ö–æ–Ω—Å—Ç–∏—Ç—É—Ü–∏—é, —Å–º–µ—Å—Ç–∏–≤ –ù–∞–∑–∞—Ä–±–∞–µ–≤–∞ 2) —Å–Ω–∏–∑–∏–ª —Ü–µ–Ω—ã –Ω–∞ –≥–∞–∑ 3) –≤ –æ—Ç—Å—Ç–∞–≤–∫—É –æ—Ç–ø—Ä–∞–≤–∏–ª –ø—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–æ 4) –¥—Ä—É–≥–∏–µ –ø–ª—é—à–∫–∏",1,0
66,–í–æ–π—Å–∫–∞ –ø—É —Å–æ —à–≤–∞–±—Ä–∞–º–∏ –ø–æ–ø—Ä–æ—Å–∏–ª–∏ –Ω–∞ –≤—ã—Ö–æ–¥ü§£üòÇ,1,0
68,–ü—Ä–µ–¥–ª–∞–≥–∞—é –æ—Å—Ç–∞–≤–∏—Ç—å –Ω–∞—à–∏ —Ç–∞–Ω–∫–∏ –≤ —Å–µ–≤–µ—Ä–Ω–æ–º –ö–∞–∑–∞—Ö—Å—Ç–∞–Ω–µ –ù–∞–º –æ–Ω –Ω—É–∂–Ω–µ–µ,1,0
69,"–¢–æ–∫–∞–µ–≤—É , —Å—Ç–∞–≤–ª–µ–Ω–Ω–∏–∫—É –ù–∞–∑–∞—Ä–±–∞–µ–≤–∞, —Ç–µ–ø–µ—Ä—å –ø—Ä–∏–¥—ë—Ç—Å—è –ø—Ä–æ–≤–æ–¥–∏—Ç—å —Å–≤–æ—é –ø–æ–ª–∏—Ç–∏–∫—É –≤ —Å—Ç—Ä–∞–Ω–µ, –Ω–µ –æ–≥–ª—è–¥—ã–≤–∞—è—Å—å –Ω–∞ —à–µ—Ñ–∞...–†–∞–∑—É–º–µ–µ—Ç—Å—è, –µ—Å–ª–∏ –æ–Ω –Ω–∞–º–µ—Ä–µ–Ω –æ—Å—Ç–∞—Ç—å—Å—è –ø—Ä–µ–∑–∏–¥–µ–Ω—Ç–æ–º –ö–∞–∑–∞—Ö—Å—Ç–∞–Ω–∞...",1,0
73,"–õ–µ–Ω—Ç–∞—á- –ø–æ–∑–æ—Ä–Ω–æ–µ —É–π–æ–±–∏—â–µ. –í —Ç–µ–ª–µ–≥–µ –±–∞–Ω—è—Ç –≤—Å–µ—Ö, –∫—Ç–æ –∏–º–µ–µ—Ç –ø—Ä–æ—Ç–∏–≤–æ–ø–æ–ª–æ–∂–Ω–æ–µ –º–Ω–µ–Ω–∏–µ, —Ç–∞–º –æ–¥–Ω–∏ –º–∞–π–¥–∞–Ω—É—Ç—ã–µ –∫–∞–∫–ª—ã –∏ –ª–∏–±–µ—Ä–∞—Å—Ç—ã –æ—Å—Ç–∞–ª–∏—Å—å)",1,0


### KMeans cluster 1 also seems to be about COVID-19, mixed with other topics

In [9]:
# First 20 comments that KMeans assigned to cluster 1.
comments_labeled.loc[comments_labeled['Kmeans'] == 1].head(20)

Unnamed: 0,text,DBSCAN,Kmeans
4,"–¢–∞–∫ —ç—Ç–æ –≤—Å—ë —Å–º–µ—à–Ω–æ, —Ç–æ —á—Ç–æ –≤ –†–æ—Å—Å–∏–∏ –≤—Å—ë –ø–æ —Å—Ü–µ–Ω–∞—Ä–∏—é..... –°–µ–π—á–∞—Å –Ω–∞–≤–µ—Ä–Ω–æ–µ —Ä–æ—Å—Ç –¥–æ–ª–≥–æ –±—É–¥–µ—Ç –¥–æ 9 –º–∞—è, –∞ –Ω–∞ 9 –º–∞—è —Å–Ω–æ–≤–∞ –ø–æ–±–µ–¥–∏–º –∫–æ—Ä–æ–Ω–æ–≤–∏—Ä—É—Å",1,1
8,"–ê —ç—Ç–æ –Ω–µ –≤ –°–®–ê –≤—Å–µ ""–≤–∞–∫—Ü–∏–Ω–∏—Ä–æ–≤–∞–Ω—ã"" –ø–æ —Å–∞–º–æ–µ –Ω–µ —Ö–æ—á—É?",1,1
9,–¶–∏—Ä–∫. –£–∂–µ —Å–∞–º–∏ –≤ –ø–æ–∫–∞–∑–∞–Ω–∏—è—Ö –ø—É—Ç–∞—é—Ç—Å—è.,-1,1
11,–ù–æ —Å 1 –∏—é–ª—è —Ü–µ–Ω—ã –Ω–∞ —Ç–æ–ø–ª–∏–≤–æ –≤—Å—ë —Ä–∞–≤–Ω–æ –ø–æ–¥–Ω–∏–º—É—Ç,1,1
13,"–ü–ø—Ü, –Ω–∞ —á—É–∂–∏—Ö —à—Ç—ã–∫–∞—Ö —É–¥–µ—Ä–∂–∞–ª—Å—è —É –≤–ª–∞—Å—Ç–∏",1,1
14,"–ü—Ä–∏—à–ª–∏, –ø–æ—Ç–æ–ø—Ç–∞–ª–∏, –ø–æ–ø–ª–µ–≤–∞–ª–∏ –Ω–∞ –∑–µ–º–ª—é –∏ –æ–±—Ä–∞—Ç–Ω–æ –¥–æ–º–æ–π, –≤–æ—Ç –∏ –≤—Å—è ¬´–∫–æ–Ω—Ç—Ä-—Ç–µ—Ä—Ä–æ—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∞—è –æ–ø–µ—Ä–∞—Ü–∏—è¬ª :)",1,1
19,"–¢–∞–∫ —ç—Ç–æ –≤—Å—ë —Å–º–µ—à–Ω–æ, —Ç–æ —á—Ç–æ –≤ –†–æ—Å—Å–∏–∏ –≤—Å—ë –ø–æ —Å—Ü–µ–Ω–∞—Ä–∏—é..... –°–µ–π—á–∞—Å –Ω–∞–≤–µ—Ä–Ω–æ–µ —Ä–æ—Å—Ç –¥–æ–ª–≥–æ –±—É–¥–µ—Ç –¥–æ 9 –º–∞—è, –∞ –Ω–∞ 9 –º–∞—è —Å–Ω–æ–≤–∞ –ø–æ–±–µ–¥–∏–º –∫–æ—Ä–æ–Ω–æ–≤–∏—Ä—É—Å",1,1
20,"–£–º–Ω–æ–∂–∞–π –Ω–∞ —Ç—Ä–∏, –Ω–µ –ø—Ä–æ–º–∞—Ö–Ω—ë—à—å—Å—è",1,1
21,"–ú–æ—Å–∫–≤–∞ –ø–æ—à–ª–∞ –≤ —Ä–æ—Å—Ç. –ü—Ä–∏–≤–µ—Ç, –æ–º–∏–∫—Ä–æ–Ω!",-1,1
22,–í–∞–º –Ω–µ –Ω–∞–¥–æ–µ–ª–∞ —ç—Ç–∞ –µ—Ä–µ—Å—å?,1,1


### KMeans cluster 2: roughly 8–16 of the 20 comments shown are about Kazakhstan and about 4 are about COVID-19

In [10]:
# First 20 comments that KMeans assigned to cluster 2.
comments_labeled.loc[comments_labeled['Kmeans'] == 2].head(20)

Unnamed: 0,text,DBSCAN,Kmeans
0,"–ê —Å–µ–π—á–∞—Å, —Å–≤–µ–∂–∞—è —Ä—É–±—Ä–∏–∫–∞: ""–ö–æ—Ä–æ–Ω–∞–Ω–æ–≤–æ—Å—Ç–∏""",0,2
1,–ê –ø–æ—á–µ–º—É —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –Ω–∞—á–∞–ª–∞ —Ä–æ—Å—Ç–∏ ? –í–∞–∫—Ü–∏–Ω–∞ –ø–µ—Ä–µ—Å—Ç–∞–ª–∞ –ø–æ–º–æ–≥–∞—Ç—å ?,0,2
2,"–ò –≤—Å—ë –Ω–µ —Ç–µ –º—Ä—É—Ç, –∫—Ç–æ –Ω–∞–¥–æ.",0,2
6,–ß—ë —Ç–æ –≤—Å—ë —Ä–µ–∑–∫–æ –±—É—Å—Ç–∞–Ω—É–ª–æ—Å—å,0,2
10,–ó–µ–ª—é –ø—Ä–µ–¥—É–ø—Ä–µ–¥–∏–ª–∏. –ë—É–¥–µ—Ç –∫–∞–∫ –≤ –ö–∞–∑–∞—Ö—Å—Ç–∞–Ω–µ,0,2
12,"–ù–æ –Ω–µ —Å–∫–∞–∑–∞–ª–∏ —Å–∞–º–æ–µ –≥–ª–∞–≤–Ω–æ–µ, —É–π–¥–µ—Ç –ª–∏ –ù–∞–∑–∞—Ä–±–∞–µ–≤",0,2
15,"–ê —Å–µ–π—á–∞—Å, —Å–≤–µ–∂–∞—è —Ä—É–±—Ä–∏–∫–∞: ""–ö–æ—Ä–æ–Ω–∞–Ω–æ–≤–æ—Å—Ç–∏""",0,2
16,–ê –ø–æ—á–µ–º—É —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –Ω–∞—á–∞–ª–∞ —Ä–æ—Å—Ç–∏ ? –í–∞–∫—Ü–∏–Ω–∞ –ø–µ—Ä–µ—Å—Ç–∞–ª–∞ –ø–æ–º–æ–≥–∞—Ç—å ?,0,2
17,"–ò –≤—Å—ë –Ω–µ —Ç–µ –º—Ä—É—Ç, –∫—Ç–æ –Ω–∞–¥–æ.",0,2
29,–ß—ë —Ç–æ –≤—Å—ë —Ä–µ–∑–∫–æ –±—É—Å—Ç–∞–Ω—É–ª–æ—Å—å,0,2


### DBSCAN cluster 0 is a subcluster of KMeans cluster 2 (KMeans cluster 2 contains every DBSCAN cluster 0 comment plus 200 more)

In [13]:
def cluster_difference(comments_labeled, clusters_names, clusters_numbers):
    """Compare the membership of two clusters taken from two label columns.

    Parameters
    ----------
    comments_labeled : pandas.DataFrame
        Frame holding one label column per clustering.
    clusters_names : sequence of str
        The two label column names, e.g. ``['DBSCAN', 'Kmeans']``.
    clusters_numbers : sequence
        The cluster label to pick from each column, in the same order.

    Returns
    -------
    tuple of pandas.Index
        ``(shared, only_first, only_second)``: index labels of the rows that are
        in both clusters, only in the first one, and only in the second one.
    """
    first_column, second_column = clusters_names[0], clusters_names[1]
    first_label, second_label = clusters_numbers[0], clusters_numbers[1]

    in_first = comments_labeled[first_column] == first_label
    in_second = comments_labeled[second_column] == second_label
    first_members = comments_labeled[in_first].index
    second_members = comments_labeled[in_second].index

    return (
        first_members.intersection(second_members),
        first_members.difference(second_members),
        second_members.difference(first_members),
    )

def cluster_similarity(func):
    """Wrap ``func`` so that it prints the sizes of the three index sets it returns.

    ``func`` is called as ``func(comments_labeled, clusters_names,
    clusters_numbers)`` and must return three sized objects (intersection and
    the two one-sided differences). The wrapper prints one summary line and
    returns ``None``.
    """
    template = 'intersection: {0:d}; {3:s} - {4:s} = {1:d}; {4:s} - {3:s} = {2:d}'

    def show_info(comments_labeled, clusters_names, clusters_numbers):
        shared, only_first, only_second = func(comments_labeled, clusters_names, clusters_numbers)
        sizes = (len(shared), len(only_first), len(only_second))
        # Positions 0-2 are the sizes, positions 3-4 the two clustering names.
        print(template.format(*sizes, *clusters_names))

    return show_info

# Compare DBSCAN cluster 0 with KMeans cluster 2.
clusters_names = ['DBSCAN', 'Kmeans']
clusters_numbers = [0, 2]
show_cluster_overlap = cluster_similarity(cluster_difference)
show_cluster_overlap(comments_labeled, clusters_names, clusters_numbers)

intersection: 1002; DBSCAN - Kmeans = 0; Kmeans - DBSCAN = 200


In [17]:
# Index labels of the rows shared by both clusters, only in the first, only in the second.
intersection, diff0, diff1 = cluster_difference(comments_labeled, clusters_names, clusters_numbers)
# `diff1` holds index *labels*, so select with label-based `.loc`; positional
# `.iloc` returns the right rows only while labels coincide with row positions.
comments_labeled.loc[diff1].head(20)

Unnamed: 0,text,DBSCAN,Kmeans
88,–ü–æ —Ñ–∞–∫—Ç—É –Ω–∏—á–µ–≥–æ –Ω–µ –∏–∑–º–µ–Ω–∏–ª–∏ —Ç–æ–ª–∫–æ–º üòê,-1,2
188,–ü–æ —Ñ–∞–∫—Ç—É –Ω–∏—á–µ–≥–æ –Ω–µ –∏–∑–º–µ–Ω–∏–ª–∏ —Ç–æ–ª–∫–æ–º üòê,-1,2
252,–ù–∞–∫–æ–Ω–µ—Ü —Ç–æ –Ω–∞ –ª–µ–Ω—Ç–∞—á–µ –≥–æ–¥–Ω–∞—è –º—É–∑—ã–∫–∞.,-1,2
269,–≠—Ö –∫–∞—Å—Å–µ—Ç–Ω–∏–∫ —Å—Ç–æ–ª—å–∫–æ –ø–ª—ë–Ω–∫–∏ —Å –Ω–∏–º–∏ –ø–æ–∂–µ–≤–∞–ª.,-1,2
274,–ì–æ—Ä—à–æ–∫ –∫–∞–ª–∞.—Å–ø–∞—Å–∏–±–æ —á—Ç–æ –ø–∞–¥–æ—Ö,-1,2
283,–†–∞–∑ –¥–≤–∞ —Ç—Ä–∏ —á–µ—Ç—ã—Ä–µ –ø—è—Ç—å...,2,2
293,–†–∞–∑ –¥–≤–∞ —Ç—Ä–∏ —á–µ—Ç—ã—Ä–µ –ø—è—Ç—å,2,2
316,–í–æ—Ç —Ç–æ–ª—å–∫–æ —á—Ç–æ –ø–æ –¥–æ—Ä–æ–≥–µ –Ω–∞ —Ä–∞–±–æ—Ç—É —Å–ª—É—à–∞–ª –∏—Ö –ø–µ—Å–Ω–∏),-1,2
317,–ù–µ –ø–∞–Ω–∫ –≥—Ä—É–ø–ø–∞ –∞ –≥–æ–≤–Ω–æ—Ä–∏ ü§£‚òù,-1,2
325,–í—Å—ë –º—ã –≤ –¥–µ—Ç—Å—Ç–≤–µ –±—ã–ª–∏ –ø–∞–Ω–∫–∏ —Ö–æ—Ç—è –±—ã –≤ —Å–∞–º–æ–º —Ä–∞–Ω–Ω–µ–º ;)!,-1,2


In [27]:
# Compare DBSCAN cluster 1 with KMeans cluster 0.
clusters_names = ['DBSCAN', 'Kmeans']
clusters_numbers = [1, 0]
cluster_similarity(cluster_difference)(comments_labeled, clusters_names, clusters_numbers)
intersection, diff0, diff1 = cluster_difference(comments_labeled, clusters_names, clusters_numbers)
# `diff0` (rows only in the DBSCAN cluster) holds index *labels*, so select with
# label-based `.loc`; positional `.iloc` is right only while labels equal positions.
comments_labeled.loc[diff0].head(20)

intersection: 1386; DBSCAN - Kmeans = 1842; Kmeans - DBSCAN = 314


Unnamed: 0,text,DBSCAN,Kmeans
4,"–¢–∞–∫ —ç—Ç–æ –≤—Å—ë —Å–º–µ—à–Ω–æ, —Ç–æ —á—Ç–æ –≤ –†–æ—Å—Å–∏–∏ –≤—Å—ë –ø–æ —Å—Ü–µ–Ω–∞—Ä–∏—é..... –°–µ–π—á–∞—Å –Ω–∞–≤–µ—Ä–Ω–æ–µ —Ä–æ—Å—Ç –¥–æ–ª–≥–æ –±—É–¥–µ—Ç –¥–æ 9 –º–∞—è, –∞ –Ω–∞ 9 –º–∞—è —Å–Ω–æ–≤–∞ –ø–æ–±–µ–¥–∏–º –∫–æ—Ä–æ–Ω–æ–≤–∏—Ä—É—Å",1,1
8,"–ê —ç—Ç–æ –Ω–µ –≤ –°–®–ê –≤—Å–µ ""–≤–∞–∫—Ü–∏–Ω–∏—Ä–æ–≤–∞–Ω—ã"" –ø–æ —Å–∞–º–æ–µ –Ω–µ —Ö–æ—á—É?",1,1
11,–ù–æ —Å 1 –∏—é–ª—è —Ü–µ–Ω—ã –Ω–∞ —Ç–æ–ø–ª–∏–≤–æ –≤—Å—ë —Ä–∞–≤–Ω–æ –ø–æ–¥–Ω–∏–º—É—Ç,1,1
13,"–ü–ø—Ü, –Ω–∞ —á—É–∂–∏—Ö —à—Ç—ã–∫–∞—Ö —É–¥–µ—Ä–∂–∞–ª—Å—è —É –≤–ª–∞—Å—Ç–∏",1,1
14,"–ü—Ä–∏—à–ª–∏, –ø–æ—Ç–æ–ø—Ç–∞–ª–∏, –ø–æ–ø–ª–µ–≤–∞–ª–∏ –Ω–∞ –∑–µ–º–ª—é –∏ –æ–±—Ä–∞—Ç–Ω–æ –¥–æ–º–æ–π, –≤–æ—Ç –∏ –≤—Å—è ¬´–∫–æ–Ω—Ç—Ä-—Ç–µ—Ä—Ä–æ—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∞—è –æ–ø–µ—Ä–∞—Ü–∏—è¬ª :)",1,1
19,"–¢–∞–∫ —ç—Ç–æ –≤—Å—ë —Å–º–µ—à–Ω–æ, —Ç–æ —á—Ç–æ –≤ –†–æ—Å—Å–∏–∏ –≤—Å—ë –ø–æ —Å—Ü–µ–Ω–∞—Ä–∏—é..... –°–µ–π—á–∞—Å –Ω–∞–≤–µ—Ä–Ω–æ–µ —Ä–æ—Å—Ç –¥–æ–ª–≥–æ –±—É–¥–µ—Ç –¥–æ 9 –º–∞—è, –∞ –Ω–∞ 9 –º–∞—è —Å–Ω–æ–≤–∞ –ø–æ–±–µ–¥–∏–º –∫–æ—Ä–æ–Ω–æ–≤–∏—Ä—É—Å",1,1
20,"–£–º–Ω–æ–∂–∞–π –Ω–∞ —Ç—Ä–∏, –Ω–µ –ø—Ä–æ–º–∞—Ö–Ω—ë—à—å—Å—è",1,1
22,–í–∞–º –Ω–µ –Ω–∞–¥–æ–µ–ª–∞ —ç—Ç–∞ –µ—Ä–µ—Å—å?,1,1
23,"¬´–ó–ê–ë–£–î–¨, –£–î–ê–õ–Å–ù–ö–ò –ë–û–õ–¨–®–ï –ù–ï –ë–£–î–ï–¢¬ª - –≥–æ–≤–æ—Ä–∏—Ç –º–Ω–µ –Ω–∞—á–∞–ª—å–Ω–∏–∫",1,1
31,"–ê —ç—Ç–æ –Ω–µ –≤ –°–®–ê –≤—Å–µ ""–≤–∞–∫—Ü–∏–Ω–∏—Ä–æ–≤–∞–Ω—ã"" –ø–æ —Å–∞–º–æ–µ –Ω–µ —Ö–æ—á—É?",1,1


In [32]:
# Compare DBSCAN cluster 0 with KMeans cluster 0.
clusters_names = ['DBSCAN', 'Kmeans']
clusters_numbers = [0, 0]
cluster_similarity(cluster_difference)(comments_labeled, clusters_names, clusters_numbers)
intersection, diff0, diff1 = cluster_difference(comments_labeled, clusters_names, clusters_numbers)
# `diff1` (rows only in the KMeans cluster) holds index *labels*, so select with
# label-based `.loc`; positional `.iloc` is right only while labels equal positions.
comments_labeled.loc[diff1].head(20)

intersection: 0; DBSCAN - Kmeans = 1002; Kmeans - DBSCAN = 1700


Unnamed: 0,text,DBSCAN,Kmeans
3,"–õ–µ–æ–Ω–∏–¥–∞ –ö—É—Ä–∞–≤–ª—ë–≤–∞ –≥–æ—Å–ø–∏—Ç–∞–ª–∏–∑–∏—Ä–æ–≤–∞–ª–∏ 5 —è–Ω–≤–∞—Ä—è –≤ –≤–∏—Ä—É—Å—è—Ç–Ω–∏–∫ –ü—Ä–æ—Ü–µ–Ω–∫–æ. –õ–µ–ø–∏–ª—ã-—É–±–∏–π—Ü—ã —Å–æ–æ–±—â–∞—é—Ç, —à–æ –∞—Ä—Ç–∏—Å—Ç –Ω–∞—Ö–æ–¥–∏—Ç—Å—è –≤ –æ—Ç–¥–µ–ª–µ–Ω–∏–∏ —Ä–µ–∞–Ω–∏–º–∞—Ü–∏–∏ –≤ —Ç—è–∂–µ–ª–æ–º —Å–æ—Å—Ç–æ—è–Ω–∏–∏, –∫–æ—Ç–æ—Ä–æ–µ –Ω–µ –º–µ–Ω—è–µ—Ç—Å—è —É–∂–µ –Ω–µ—Å–∫–æ–ª—å–∫–æ –¥–Ω–µ–π.",1,0
5,"‚úπ –í –°–®–ê –Ω–æ–≤—ã–π –∞–Ω—Ç–∏—Ä–µ–∫–æ—Ä–¥ ‚Äî 1,13 –º–∏–ª–ª–∏–æ–Ω–∞ –Ω–æ–≤—ã—Ö –∑–∞—Ä–∞–∂–µ–Ω–∏–π –∑–∞ —Å—É—Ç–∫–∏; –≠—Ç–æ –µ—â—ë –Ω–µ –ø—Ä–µ–¥–µ–ª.",-1,0
7,"–í–∏—Ä—É—Å–æ–ª–æ–≥: –ì–∏–±—Ä–∏–¥–Ω–æ–≥–æ –≤–∞—Ä–∏–∞–Ω—Ç–∞ –∫–æ—Ä–æ–Ω–∞–≤–∏—Ä—É—Å–∞ ¬´–¥–µ–ª—å—Ç–∞–∫—Ä–æ–Ω¬ª –Ω–µ —Å—É—â–µ—Å—Ç–≤—É–µ—Ç, –æ–Ω –Ω–µ —Å–º–æ–∂–µ—Ç —Ç–µ–±–µ –Ω–∞–≤—Ä–µ–¥–∏—Ç—å –ì–∏–±—Ä–∏–¥–Ω—ã–π –≤–∞—Ä–∏–∞–Ω—Ç –∫–æ—Ä–æ–Ω–∞–≤–∏—Ä—É—Å–∞ ¬´–¥–µ–ª—å—Ç–∞–∫—Ä–æ–Ω¬ª:",1,0
18,"–õ–µ–æ–Ω–∏–¥–∞ –ö—É—Ä–∞–≤–ª—ë–≤–∞ –≥–æ—Å–ø–∏—Ç–∞–ª–∏–∑–∏—Ä–æ–≤–∞–ª–∏ 5 —è–Ω–≤–∞—Ä—è –≤ –≤–∏—Ä—É—Å—è—Ç–Ω–∏–∫ –ü—Ä–æ—Ü–µ–Ω–∫–æ. –õ–µ–ø–∏–ª—ã-—É–±–∏–π—Ü—ã —Å–æ–æ–±—â–∞—é—Ç, —à–æ –∞—Ä—Ç–∏—Å—Ç –Ω–∞—Ö–æ–¥–∏—Ç—Å—è –≤ –æ—Ç–¥–µ–ª–µ–Ω–∏–∏ —Ä–µ–∞–Ω–∏–º–∞—Ü–∏–∏ –≤ —Ç—è–∂–µ–ª–æ–º —Å–æ—Å—Ç–æ—è–Ω–∏–∏, –∫–æ—Ç–æ—Ä–æ–µ –Ω–µ –º–µ–Ω—è–µ—Ç—Å—è —É–∂–µ –Ω–µ—Å–∫–æ–ª—å–∫–æ –¥–Ω–µ–π.",1,0
24,–í—Å–µ–º —É–∂–µ –¥–∞–≤–Ω–æ –Ω–∞–ø–ª–µ–≤–∞—Ç—å –∞–¥–µ–∫–≤–∞—Ç–Ω—ã–º!!!!!! –ö–æ—Ä–æ–Ω—Ä—à–∏–∑—ã —É—Å–ø–æ–∫–æ–π—Ç–µ—Å—å –≤—ã –Ω–µ –ø–æ–±–µ–¥–∏—Ç–µ!!!!!!!,1,0
25,"–õ–æ–ª, –ø—Ä–∏–µ—Ö–∞–ª–∏ –≤—Å–µ... –©–∞—Å –¥–æ 6–∫ –ø–æ–¥–Ω–∏–º–µ—Ç—Å—è –∏ –∑–∞–≥–æ–≤–æ—Ä—è—Ç –æ–ø—è—Ç—å –ø—Ä–æ –Ω–µ—Ä–∞–±–æ—á–∏–µ –¥–Ω–∏",1,0
26,"–ü—Ä–∏–∫–æ–ª –±—É–¥–µ—Ç, –∫–æ–≥–¥–∞ –±—É–¥–µ—Ç –ø–æ–¥–≥–æ–ª–æ–≤–Ω–∞—è 100% –≤–∞–∫—Ü–∏–Ω–∞—Ü–∏—è –∏ –≤—Å–µ –±—É–¥—É—Ç –≤—Å—ë —Ä–∞–≤–Ω–æ –¥–æ—Ö–Ω—É—Ç—å –∏ –æ—Ç –∫–æ—Ä–æ–Ω—ã –∏ –æ—Ç –ø–æ—Å–ª–µ–¥—Å—Ç–≤–∏–π –ø—Ä–∏–≤–∏–≤–∫–∏: –∏–Ω—Å—É–ª—å—Ç—ã, –∏–Ω—Ñ–∞—Ä–∫—Ç—ã, —Ç—Ä–æ–º–±—ã –∏ —Ç–¥. –ò —ç—Ç–∏ —Ç–∞–∫–∏–µ: –ú—ã –æ—Ñ—ã–±–ª–∏—Ñ—å",-1,0
27,"–ö–æ–º–ø–∞–Ω–∏—è –≤—ã–ø—É—Å–∫–∞—é—â–∞—è –ø—Ä–∏–≤–∏–≤–∫—É, –∑–∞ –ø–æ–±–æ—á–Ω—ã–µ —ç—Ñ—Ñ–µ–∫—Ç—ã - –æ—Ç–≤–µ—Ç—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç–∏ –Ω–µ –Ω–µ—Å–µ—Ç –£—á—Ä–µ–∂–¥–µ–Ω–∏–µ, –≤ –∫–æ—Ç–æ—Ä–æ–º —Ç–µ–±–µ —Å—Ç–∞–≤—è—Ç —ç—Ç—É –ø—Ä–∏–≤–∏–≤–∫—É - –æ—Ç–≤–µ—Ç—Å—Ç–≤–µ–Ω–Ω–æ—Å—Ç–∏ –Ω–µ –Ω–µ—Å–µ—Ç –¢—ã, –Ω–µ –≤–∞–∫—Ü–∏–Ω–∏—Ä–æ–≤–∞–Ω - –±–µ–∑–æ—Ç–≤–µ—Å—Ç–≤–µ–Ω–Ω—ã–π",1,0
28,"‚úπ –í –°–®–ê –Ω–æ–≤—ã–π –∞–Ω—Ç–∏—Ä–µ–∫–æ—Ä–¥ ‚Äî 1,13 –º–∏–ª–ª–∏–æ–Ω–∞ –Ω–æ–≤—ã—Ö –∑–∞—Ä–∞–∂–µ–Ω–∏–π –∑–∞ —Å—É—Ç–∫–∏; –≠—Ç–æ –µ—â—ë –Ω–µ –ø—Ä–µ–¥–µ–ª.",-1,0
30,"–í–∏—Ä—É—Å–æ–ª–æ–≥: –ì–∏–±—Ä–∏–¥–Ω–æ–≥–æ –≤–∞—Ä–∏–∞–Ω—Ç–∞ –∫–æ—Ä–æ–Ω–∞–≤–∏—Ä—É—Å–∞ ¬´–¥–µ–ª—å—Ç–∞–∫—Ä–æ–Ω¬ª –Ω–µ —Å—É—â–µ—Å—Ç–≤—É–µ—Ç, –æ–Ω –Ω–µ —Å–º–æ–∂–µ—Ç —Ç–µ–±–µ –Ω–∞–≤—Ä–µ–¥–∏—Ç—å –ì–∏–±—Ä–∏–¥–Ω—ã–π –≤–∞—Ä–∏–∞–Ω—Ç –∫–æ—Ä–æ–Ω–∞–≤–∏—Ä—É—Å–∞ ¬´–¥–µ–ª—å—Ç–∞–∫—Ä–æ–Ω¬ª:",1,0
