In [None]:
import pandas as pd
import jieba
from tqdm.auto import tqdm

import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans

import plotly.express as px
import plotly.graph_objects as go

In [None]:
class VeryImportantTalk(object):
    ''' The Analysis of the VeryImportantTalk

    Reads a text file, splits it into sentences and cuts every sentence
    into words with jieba.  The result is kept in [self.df], one row per
    sentence, with the columns 'Sentence' (the text) and 'Cuts' (the list
    of kept words).
    '''

    def __init__(self, path='VeryImportantTalk.txt', encoding='utf-8'):
        ''' Initialization

        Args:
            path: Path of the text file to read.
            encoding: Encoding used to decode the file.
        '''
        # A context manager closes the file handle as soon as the content
        # is read, instead of leaving it open until garbage collection.
        with open(path, 'r', encoding=encoding) as f:
            content = f.read()

        df = pd.DataFrame(columns=['Sentence', 'Cuts'])
        self.content = content
        self.df = df

        self.get_sentences()
        self.cut_sentences()

    def get_sentences(self, raw=None):
        ''' Get Sentences from Raw Content [raw]

        Args:
            raw: Text to split; the loaded file content is used when None.

        Returns:
            The list of sentences, which is also stored in the 'Sentence'
            column of [self.df].
        '''
        if raw is None:
            raw = self.content

        # Turn every sentence delimiter into a line break.
        # NOTE(review): 'a，' only matches a Latin "a" followed by a
        # full-width comma -- it looks like a typo for another punctuation
        # mark; left unchanged, confirm the intended delimiter.
        for e in ['。', '！', 'a，']:
            raw = raw.replace(e, '\n')

        # split() without arguments breaks on ANY whitespace, so spaces and
        # tabs inside a sentence separate it as well, and empty pieces are
        # dropped.
        sentences = raw.split()

        self.df['Sentence'] = sentences

        return sentences

    def cut_sentences(self, sentences=None):
        ''' Cut every Sentence in the List of Sentences [sentences]

        Args:
            sentences: Sentences to cut; the 'Sentence' column of [self.df]
                is used when None.

        Returns:
            One word list per sentence (an empty list for sentences with
            fewer than two kept words), which is also stored in the 'Cuts'
            column of [self.df].
        '''
        if sentences is None:
            sentences = self.df['Sentence'].values

        cuts = []
        for s in tqdm(sentences):
            # Cut the Sentence,
            # and Discard the one-char Words
            tmp = [e for e in jieba.lcut(s) if len(e) > 1]

            # Append the Words,
            # and Discard the one-word Sentence;
            # an empty list keeps [cuts] aligned with the rows of [self.df]
            cuts.append(tmp if len(tmp) > 1 else [])

        self.df['Cuts'] = cuts

        return cuts
    

In [None]:
# Build the analysis object with its default path and encoding; the
# constructor reads the file, splits it into sentences and cuts them.
vit = VeryImportantTalk()
vit.df

In [None]:
# Take a copy of the sentence table for the analysis in the cells below.
df = vit.df.copy()
df

In [None]:
# Inverted index: word -> row labels of the sentences whose 'Cuts' contain
# it (a label is listed once per occurrence of the word in that sentence).
word_to_rows = dict()
for idx in tqdm(df.index):
    for word in df.loc[idx, 'Cuts']:
        word_to_rows.setdefault(word, []).append(idx)

# One row per word: the sentence labels it belongs to and how many there
# are, ordered from the most to the least frequent word.
details = (
    pd.DataFrame(dict(Belong=word_to_rows))
    .assign(Count=lambda frame: frame['Belong'].map(len))
    .sort_values(by='Count', ascending=False)
)

# Add one zero-filled column per word (a word-by-word block of counters).
details[details.index] = 0

details

In [None]:
# Start from zero so that re-running this cell yields the same counts
# instead of adding a second pass on top of the previous one.
details[details.index] = 0

# For every word w and every sentence j it belongs to, bump the counter of
# each word found in that sentence: row w accumulates how often the other
# words show up in the sentences of w.
for w in tqdm(details.index):
    for j in details.loc[w, 'Belong']:
        details.loc[w, df.loc[j, 'Cuts']] += 1

details

In [None]:
# Word-by-word block of counters: one row and one column per word.
X = details[details.index].to_numpy()

# Standardise every row (zero mean, unit standard deviation), then rescale
# it so that its largest value is 1.
axis = 1
X = (X - np.mean(X, axis=axis, keepdims=True)) / np.std(X, axis=axis, keepdims=True)
X = X / np.max(X, axis=axis, keepdims=True)

# Transpose, so the columns of the normalised matrix become the samples
# handed to t-SNE.
X = X.transpose()

# t-SNE is stochastic: a fixed random_state makes the embedding, and every
# result derived from it in the cells below, reproducible across runs.
X_embedded = TSNE(n_components=2, random_state=0).fit_transform(X)
print(X_embedded.shape)

In [None]:
n = len(X_embedded)

# Row j holds the Euclidean distance from embedded point j to every point;
# the distance of a point to itself is replaced by infinity.
rows = []
for j in tqdm(range(n)):
    row = np.linalg.norm(X_embedded - X_embedded[j, :], axis=1)
    row[j] = np.inf
    rows.append(row)

dist_matrix = np.array(rows)
dist_matrix.shape

In [None]:
# Grow a spanning tree over the embedded points greedily (the construction
# used by Prim's algorithm): start from point 0 and its nearest neighbour,
# then repeatedly attach the not-yet-connected point that is closest to any
# point already connected.
a = 0
b = np.argmin(dist_matrix[a])
route = [(a, b)]  # edges as (point already connected, newly attached point)
inside = [a, b]  # points already connected
remain = [e for e in range(n) if e not in inside]  # points still to attach
for _ in tqdm(range(n-2)):
    # Distances from every connected point (rows) to every open point (columns)
    mat = dist_matrix[inside][:, remain]
    # Position of the smallest distance, translated back to point indices
    pos = np.unravel_index(np.argmin(mat), mat.shape)
    a = inside[pos[0]]
    b = remain[pos[1]]
    inside.append(b)
    remain.remove(b)
    route.append((a, b))
    
route

In [None]:
# Two records per edge (one for each endpoint) that share the edge number
# as 'group', so the edges can be told apart when plotted.
# tqdm wraps the list itself (not the enumerate object) so that it knows
# the length and can show a total and an ETA.
route_records = [
    dict(
        x=X_embedded[node][0],
        y=X_embedded[node][1],
        group=j
    )
    for j, edge in enumerate(tqdm(route))
    for node in edge
]

route_df = pd.DataFrame(route_records)
route_df

In [None]:
# Scatter plot of the 2-D embedding, one marker per embedded point.
fig1 = px.scatter(x=X_embedded[:, 0], y=X_embedded[:, 1], title='Manifolder of Words')
fig1.show()
# Keep the trace data visible: a later cell reuses fig1.data[0].
fig1.data

In [None]:
# Line plot of the route: colouring by 'group' gives every edge its own
# trace, i.e. one two-point segment per edge.
fig2 = px.line(route_df, x='x', y='y', color='group')
fig2.show()
# Keep the trace data visible: a later cell reuses the traces in fig2.data.
fig2.data

In [None]:
# Overlay the scatter of the points (fig1) with the route edges (fig2).
fig = go.Figure()
fig.add_trace(fig1.data[0])
for d in fig2.data:
    fig.add_trace(d)

# Style the combined figure's own copies of the edge traces (everything
# after the scatter trace) rather than the traces that belong to fig2, so
# fig2 keeps its colours and legend if it is shown again.
for trace in fig.data[1:]:
    trace.update(line=dict(color='gray'), showlegend=False)

fig.update_layout({'title': 'Connection Graph'})
fig.show()

In [None]:
# Connection degree of every point: the number of route edges touching it.
count = np.zeros(n)
for edge in tqdm(route):
    for node in edge:
        count[node] += 1
count

In [None]:
# Bar chart of the connection degrees in ascending order.
px.bar(sorted(count), title='Connection Degree')

In [None]:
# Words in the row order of [details]
words = details.index.values

# Bucket the words by connection degree, highest degree first.
groups = {
    degree: words[count == degree]
    for degree in sorted(np.unique(count), reverse=True)
}

M = max(groups)
M, groups

In [None]:
# For every word with one of the selected connection degrees, collect the
# words it is directly linked to in the route.
top_links = dict()

for k in [8, 5]:
    # Which degrees occur depends on the embedding, so a selected degree
    # may be missing from [groups]; report and skip it instead of failing
    # with a KeyError.
    if k not in groups:
        print('No word with connection degree', k)
        continue

    for g in groups[k]:
        # argwhere returns a 2-D array; index the first match explicitly,
        # since converting a non-0-d array with int() is deprecated in NumPy.
        i = int(np.argwhere(words == g)[0, 0])
        print(g, i)
        # Every edge touching i contributes the word at its other endpoint.
        top_links[g] = [
            words[v if u == i else u]
            for u, v in route
            if i in (u, v)
        ]

top_links