In [None]:
import pandas as pd
import jieba
from tqdm.auto import tqdm

import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans

import plotly.express as px

In [None]:
class VeryImportantTalk(object):
    ''' Sentence-level word segmentation of a text file.

    After construction:
      - self.content holds the raw text read from disk;
      - self.df has one row per sentence, with the sentence text in
        'Sentence' and its list of kept words in 'Cuts'.
    '''
    def __init__(self, path='VeryImportantTalk.txt', encoding='utf-8'):
        ''' Read the text at [path] (decoded with [encoding]) and build self.df.

        Raises whatever open()/read() raise for a missing or undecodable file.
        '''
        # Context manager so the file handle is closed even if read() raises.
        with open(path, 'r', encoding=encoding) as f:
            content = f.read()

        self.content = content
        self.df = pd.DataFrame(columns=['Sentence', 'Cuts'])

        # Order matters: cut_sentences() reads the 'Sentence' column
        # that get_sentences() fills in.
        self.get_sentences()
        self.cut_sentences()

    def get_sentences(self, raw=None):
        ''' Split raw text [raw] into sentences.

        [raw] defaults to self.content. The listed delimiters are replaced
        by newlines and the text is then split on whitespace, so any
        whitespace already present in the text also ends a sentence.

        Side effect: overwrites the 'Sentence' column of self.df.
        Returns the list of sentences.
        '''
        if raw is None:
            raw = self.content

        # NOTE(review): 'a，' only matches a literal 'a' followed by a
        # full-width comma; it looks like a way to switch comma splitting
        # off without deleting the entry -- confirm before changing.
        for delimiter in ['。', '！', 'a，']:
            raw = raw.replace(delimiter, '\n')
        sentences = raw.split()

        self.df['Sentence'] = sentences

        return sentences

    def cut_sentences(self, sentences=None):
        ''' Segment every sentence in [sentences] into words with jieba.

        [sentences] defaults to the 'Sentence' column of self.df.
        Words of a single character are dropped. A sentence left with
        fewer than two words yields an empty list, so the result stays
        aligned one-to-one with the input sentences.

        Side effect: overwrites the 'Cuts' column of self.df.
        Returns the list of word lists.
        '''
        if sentences is None:
            sentences = self.df['Sentence'].values

        cuts = []
        for s in tqdm(sentences):
            # Keep only the words longer than one character
            words = [e for e in jieba.lcut(s) if len(e) > 1]

            # A sentence needs at least two kept words to be useful;
            # otherwise store an empty placeholder to preserve alignment
            cuts.append(words if len(words) > 1 else [])

        self.df['Cuts'] = cuts

        return cuts
    

In [None]:
# Load the talk from the default path; the constructor also splits it into
# sentences and segments each sentence into words.
vit = VeryImportantTalk()
# Display the sentence table ('Sentence' and 'Cuts' columns).
vit.df

In [None]:
# Independent copy of the sentence table, used by the analysis cells below.
df = vit.df.copy()
df

In [None]:
# Inverted index: word -> list of the sentence indices it occurs in
# (a sentence is listed once per occurrence of the word in its 'Cuts').
occurrences = {}
for sentence_idx in tqdm(df.index):
    for token in df.loc[sentence_idx, 'Cuts']:
        occurrences.setdefault(token, []).append(sentence_idx)

# One row per word, ordered from the most to the least frequent.
details = pd.DataFrame({'Belong': occurrences})
details['Count'] = details['Belong'].map(len)
details = details.sort_values(by='Count', ascending=False)

# Append one zero-filled column per word; together these columns form the
# word-by-word matrix that is later populated with co-occurrence counts.
details[details.index] = 0

details

In [None]:
# Fill the word-by-word matrix: for every sentence a word belongs to,
# add one to the columns of all the words kept for that sentence.
for word in tqdm(details.index):
    for sentence_idx in details.loc[word, 'Belong']:
        sentence_words = df.loc[sentence_idx, 'Cuts']
        details.loc[word, sentence_words] += 1

details

In [None]:
# Word-by-word count matrix (rows and columns both follow details.index).
X = details[details.index].to_numpy()

# Standardise every row (zero mean, unit variance), then scale it so that
# its largest entry equals 1.
axis = 1
X = (X - np.mean(X, axis=axis, keepdims=True)) / np.std(X, axis=axis, keepdims=True)
X = X / np.max(X, axis=axis, keepdims=True)

# t-SNE embeds the rows of its input, so after the transpose each word is
# described by its column of the normalised matrix.
X = X.transpose()

# t-SNE is stochastic: pin the seed so that the embedding, and the clusters
# computed from it afterwards, are the same on every Restart & Run All.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X)
print(X_embedded.shape)

fig = px.scatter(x=X_embedded[:, 0], y=X_embedded[:, 1], title='Manifolder of Words')
fig.show()

In [None]:
# Cluster the words in the 2-D embedding space.
kmeans = KMeans(n_clusters=7, random_state=0)
labels = kmeans.fit_predict(X_embedded)

# Hover text: the first ten words of the point's cluster, in details.index
# order. It is built once per cluster instead of once per point, and joined
# into a plain string so the tooltip shows the words themselves rather than
# the repr of a pandas Index.
hover_by_cluster = {
    label: ', '.join(details.index[labels == label][:10])
    for label in np.unique(labels)
}
hover_names = [hover_by_cluster[label] for label in labels]

fig = px.scatter(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    color=labels,
    hover_name=hover_names,
    title='Cluster of Words',
)
fig.show()

In [None]:
# Long-format table with one row per (word, sentence) occurrence, tagged
# with the cluster label of the word.
records = [
    {'Sentence': sentence_idx, 'Label': label, 'Word': word}
    for word, label in tqdm(zip(details.index, labels))
    for sentence_idx in details.loc[word, 'Belong']
]

gantt = pd.DataFrame(records)
gantt

In [None]:
# Where each word occurs across the sentences, coloured by cluster label.
px.scatter(
    gantt,
    x='Sentence',
    y='Word',
    color='Label',
    title='Word Distribution',
)

In [None]:
# Which clusters are present in each sentence: one horizontal band per label.
px.scatter(
    gantt,
    x='Sentence',
    y='Label',
    color='Label',
    title='Word Gantt',
)

In [None]:
# One figure per cluster: the sentences in which each of its words occurs.
for cluster in np.unique(labels):
    members = gantt[gantt['Label'] == cluster]

    # Title = cluster label followed by the distinct words of the cluster
    title = '{}: {}'.format(cluster, ','.join(members['Word'].unique()))
    print(title)

    fig = px.scatter(
        members,
        x='Sentence',
        y='Word',
        hover_name='Word',
        hover_data=['Word', 'Label'],
        title=title,
    )
    fig.show()