In [1]:
import pandas as pd
import numpy as np

from plotly.offline import download_plotlyjs, init_notebook_mode,  iplot, plot
init_notebook_mode(connected=True)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import re

import plotly.graph_objs as go

In [2]:
# Read the summaries table from the CSV file in the working directory.
summary_csv_path = 'Summary.csv'
Summary_df = pd.read_csv(summary_csv_path)

In [3]:
# Lower-case every entry of the `Summary` column; the result is a NumPy string array.
summary_values = list(Summary_df['Summary'])
texts_list = np.char.lower(summary_values)

In [4]:
# Punctuation and newline characters that are blanked out before vectorising.
# The backslash is written as an explicit "\\": the earlier "\]" spelling was an
# invalid escape sequence (a warning on current Python, slated to become an
# error) even though it denoted the same two characters, "\" followed by "]".
symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"

# A single translation table maps every symbol to a space, so each text is
# cleaned in one pass instead of one `replace` call per symbol. The output is
# the same because the replacement character (a space) is not itself a symbol.
symbol_to_space = str.maketrans(symbols, ' ' * len(symbols))
cleaned_text = [text.translate(symbol_to_space) for text in texts_list]

In [5]:
# Tunable settings shared by the clustering, projection and plotting cells.
num_clusters = 3        # number of clusters requested from k-means
max_iterations = 500    # iteration cap handed to k-means
num_pc = 3              # number of principal components to keep

# Marker colour for each cluster label (0, 1, 2).
cmap = dict(enumerate(('blue', 'red', 'green')))

In [6]:
# Word-level TF-IDF over 1- to 3-grams of the cleaned texts.
tfidf_options = dict(
    analyzer="word",
    use_idf=True,
    smooth_idf=True,
    ngram_range=(1, 3),
)
tf_idf = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the document-term matrix in one step.
tf_idf_data = tf_idf.fit_transform(cleaned_text)

In [7]:
# K-means estimator configured from the settings cell.
#
# The `precompute_distances`, `n_jobs` and `algorithm="full"` arguments were
# dropped: recent scikit-learn releases no longer accept them and raise on
# construction/fit. Leaving `algorithm` at the library default keeps the
# classic Lloyd iteration for the sparse TF-IDF input used in this notebook.
#
# `n_init=10` pins the number of restarts to the historical default, because
# newer releases changed what the default means; this keeps results comparable
# across scikit-learn versions.
k_kmeans = KMeans(
    n_clusters=num_clusters,
    max_iter=max_iterations,
    n_init=10,
    random_state=42,
)

In [8]:
# Fit k-means on the sparse TF-IDF matrix and keep the cluster label of each row.
labels = k_kmeans.fit_predict(tf_idf_data)

# Dense copy of the TF-IDF features for the PCA cell.
# `toarray()` returns a plain ndarray; `todense()` returned an np.matrix, which
# recent scikit-learn estimators refuse as input.
data = tf_idf_data.toarray()


In [9]:
# Project the dense features onto the first `num_pc` principal components.
# - np.asarray strips an np.matrix wrapper (no-op for a plain ndarray), since
#   recent scikit-learn releases reject np.matrix input.
# - random_state makes the projection repeatable when PCA's automatically
#   chosen solver is the randomized one.
pc = PCA(n_components=num_pc, random_state=42).fit_transform(np.asarray(data))

In [10]:
# One coordinate list per principal component. The three-way unpack requires
# `pc` to have exactly three columns.
x, y, z = (list(component) for component in pc.T)

# Colour of each point, looked up from the cluster label of the same row.
c = [cmap[labels[row]] for row in range(len(pc))]

# 3-D scatter of the projected points, coloured by cluster.
marker_style = dict(size=12, color=c, opacity=0.65)
trace1 = go.Scatter3d(x=x, y=y, z=z, mode='markers', marker=marker_style)

# NOTE(review): `data` is rebound here to the list of traces, replacing the
# feature matrix assigned to the same name in an earlier cell; re-running the
# PCA cell after this one would therefore see the traces instead.
data = [trace1]
layout = go.Layout(title='Scatter plot between principal components')
fig = go.Figure(data=data, layout=layout)
iplot(fig)