In [58]:
import os
import math
import openai
import json
from Keys import openai_keys
import re
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
import joblib
import numpy as np
import pandas as pd
import random
from keybert import KeyBERT
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from collections import Counter
import plotly.express as px
import plotly.offline as pyo
import tensorflow as tf
from tensorflow.keras.models import load_model

openai.organization = openai_keys['organization']
openai.api_key = openai_keys['api_key']
embedding_model = "text-embedding-ada-002"


def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']


In [21]:
# Load the JSON data from the file
with open('Source/embeddings-youtube-markass_brownie.json', 'r') as json_file:
    embeddings = json.load(json_file)

In [22]:
clf = joblib.load('models/embeddingsForest_4.pkl')
NNmodel = load_model('models/embeddingsNN')

In [23]:
class Analysis:
    def __init__(self, title, text, embedding, sentiment=1, probas=[0, 0, 0]):
        self.title = title
        self.raw = text
        self.embedding = embedding
        self.sentiment = sentiment
        self.cluster = None
        self.probs = probas
        self.keywords = Counter()


class Cluster:
    def __init__(self, clusterIndex, element):
        self.clusterIndex = clusterIndex
        self.elements = [element]
        self.clusterName = ""
        self.keywords = Counter()
        self.clusterSize = (0, 0)
        self.clusterBuffer = element.embedding
        self.sentimentDistribution = np.array([1, 0, 0])
        self.sentimentDistribution[element.sentiment] += 1
        self.subClusters = dict()

    def calculateClusterSize(self):
        tmp = np.sum(np.square(self.clusterBuffer - np.mean(self.clusterBuffer, axis=0, keepdims=True)), axis=1)
        self.clusterSize = (np.mean(tmp).item(), np.max(tmp).item())

    def addPoint(self, point):
        self.elements.append(point)
        self.clusterBuffer = np.vstack((self.clusterBuffer, point.embedding))
        self.sentimentDistribution[point.sentiment] += 1

In [24]:
sentiments = list()
iter = 0
for analysis in embeddings:
    emb = np.array(analysis["embedding"]).reshape((1, -1))
    # y_pr = clf.predict_proba(emb)
    # y = clf.predict(emb).item()
    y_pr = NNmodel.predict(emb)
    y = np.argmax(y_pr)
    sentiments.append(Analysis(analysis["title"], analysis["text"], emb, sentiment=y, probas=y_pr))
    iter += 1
    print(str(iter) + "/" + str(len(embeddings)))

1/1466
2/1466
3/1466
4/1466
5/1466
6/1466
7/1466
8/1466
9/1466
10/1466
11/1466
12/1466
13/1466
14/1466
15/1466
16/1466
17/1466
18/1466
19/1466
20/1466
21/1466
22/1466
23/1466
24/1466
25/1466
26/1466
27/1466
28/1466
29/1466
30/1466
31/1466
32/1466
33/1466
34/1466
35/1466
36/1466
37/1466
38/1466
39/1466
40/1466
41/1466
42/1466
43/1466
44/1466
45/1466
46/1466
47/1466
48/1466
49/1466
50/1466
51/1466
52/1466
53/1466
54/1466
55/1466
56/1466
57/1466
58/1466
59/1466
60/1466
61/1466
62/1466
63/1466
64/1466
65/1466
66/1466
67/1466
68/1466
69/1466
70/1466
71/1466
72/1466
73/1466
74/1466
75/1466
76/1466
77/1466
78/1466
79/1466
80/1466
81/1466
82/1466
83/1466
84/1466
85/1466
86/1466
87/1466
88/1466
89/1466
90/1466
91/1466
92/1466
93/1466
94/1466
95/1466
96/1466
97/1466
98/1466
99/1466
100/1466
101/1466
102/1466
103/1466
104/1466
105/1466
106/1466
107/1466
108/1466
109/1466
110/1466
111/1466
112/1466
113/1466
114/1466
115/1466
116/1466
117/1466
118/1466
119/1466
120/1466
121/1466
122/1466
123/1466
1

In [116]:
X = np.zeros((len(sentiments), sentiments[0].embedding.shape[1]))
for i in range(len(sentiments)):
    X[i] = sentiments[i].embedding

clustering = DBSCAN(eps=0.46, min_samples=3).fit(X)
X_left = list()
mapping = dict()
clusters = dict()
for i in range(len(sentiments)):
    if clustering.labels_[i] == -1:
        X_left.append(sentiments[i].embedding)
        mapping[len(X_left) - 1] = i
    else:
        if clustering.labels_[i] in clusters:
            clusters[clustering.labels_[i]].addPoint(sentiments[i])
        else:
            clusters[clustering.labels_[i]] = Cluster(clustering.labels_[i], sentiments[i])
        sentiments[i].cluster = clusters[clustering.labels_[i]]

clusters_n = len(set(clustering.labels_)) - 1
clustering_left = KMeans(n_clusters=32, init='k-means++', tol=1e-7, max_iter=1000).fit(np.array(X_left).reshape((-1, sentiments[0].embedding.shape[1])))
for i in range(len(X_left)):
    if clustering_left.labels_[i] + clusters_n in clusters:
        clusters[clustering_left.labels_[i] + clusters_n].addPoint(sentiments[mapping[i]])
    else:
        clusters[clustering_left.labels_[i] + clusters_n] = Cluster(clustering_left.labels_[i] + clusters_n, sentiments[mapping[i]])
    sentiments[mapping[i]].cluster = clusters[clustering_left.labels_[i] + clusters_n]


# clustering = DBSCAN(eps=0.12, min_samples=3, metric='cosine').fit(X)
# clustering = KMeans(n_clusters=32, init='k-means++',tol=1e-7, max_iter=1000, ).fit(X)
# clustering = GaussianMixture(n_components=128, n_init=1).fit(X)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=6.



In [95]:
clusters = dict()
# labels_ = clustering.predict(X)
for i in range(len(sentiments)):
    # sentiments[i].cluster = labels_[i]
    if clustering.labels_[i] in clusters:
        clusters[clustering.labels_[i]].addPoint(sentiments[i])
    else:
        clusters[clustering.labels_[i]] = Cluster(clustering.labels_[i], sentiments[i])
    sentiments[i].cluster = clusters[clustering.labels_[i]]

for clust in clusters.values():
    clust.calculateClusterSize()
    clusterAmount = int(
        np.exp(9 * clust.clusterSize[0]) * int(len(clust.elements) / 64))
    # if clusterAmount > 1:
    #     X_ = np.zeros((len(clust.elements), clust.elements[0].embedding.shape[1]))
    #     for i in range(len(clust.elements)):
    #         X_[i] = clust.elements[i].embedding
    #     subclustering = KMeans(n_clusters=clusterAmount, init='random').fit(X_)
    #
    #     for i in range(len(clust.elements)):
    #         if subclustering.labels_[i] in clust.subClusters:
    #             clust.subClusters[subclustering.labels_[i]].addPoint(clust.elements[i])
    #         else:
    #             clust.subClusters[subclustering.labels_[i]] = Cluster(subclustering.labels_[i], clust.elements[i])
    #
    #         clust.elements[i].cluster = clust.subClusters[subclustering.labels_[i]]
    #     for subclust in clust.subClusters.values():
    #         subclust.calculateClusterSize()

In [117]:
iter = 0
keywordsSet = set()
kw_model = KeyBERT(model='all-mpnet-base-v2')
for cluster in clusters.values():
    # prompts = [{"role": "system", "content":
    #     'Given several comments from a single cluster, name that cluster. Two individual comments are separated by three new lines. Cluster name should summarize the common idea of comments in a short phrase. The opinion of comments should be clear from the cluster name. Write the name only.'}]
    # delimeter = "\n\n\n"
    # prompt = list()
    # for _ in range(8):
    #     i = random.randint(0, len(cluster.elements) - 1)
    #     if len(str(cluster.elements[i].raw)) < 5000:
    #         prompt.append(str(cluster.elements[i].raw))
    # prompt = delimeter.join(prompt)
    # prompts.append({"role": "user", "content": prompt})
    # chat = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=prompts)
    # cluster.clusterName = chat.choices[0].message.content

    # txt = " ; ".join(list(set([cluster.elements[i].title for i in range(1, len(cluster.elements))]))) + " ; ".join([cluster.elements[i].raw for i in range(1, len(cluster.elements))])
    # keywords = kw_model.extract_keywords(txt, top_n=5, use_maxsum=True)
    # keywords.extend(kw_model.extract_keywords(txt, keyphrase_ngram_range=(2, 2), top_n=3, use_maxsum=True, exc))
    keywords = Counter()
    clusterPosts = set()
    for element in cluster.elements:
        kwords1 = [i[0] for i in kw_model.extract_keywords(element.raw, top_n=5, use_maxsum=True)]
        kwords2 = [i[0] for i in
                   kw_model.extract_keywords(element.raw, keyphrase_ngram_range=(2, 2), top_n=2, use_maxsum=True)]
        keywords.update(kwords1)
        keywords.update(kwords2)
        element.keywords = Counter(kwords1)
        clusterPosts.update([element.title])
    for title in clusterPosts:
        keywords.update([i[0] for i in kw_model.extract_keywords(title, top_n=3)])
        keywords.update([i[0] for i in kw_model.extract_keywords(title, keyphrase_ngram_range=(2, 2), top_n=1)])
    cluster.keywords = keywords

    for subclust in cluster.subClusters.values():
        keywords = Counter()
        for element in subclust.elements:
            keywords.update(element.keywords)
        subclust.keywords = keywords

    iter += 1
    print(str(iter) + "/" + str(len(clusters)))

KeyboardInterrupt: 

In [55]:
sentimentDictionary = {'positive': list(), 'negative': list(), 'neutral': list()}
for sent in sentiments:
    if sent.sentiment == 2 or (sent.probs[0, 0] - sent.probs[0, 2] <= 0.05):
        sentimentDictionary['positive'].append(sent)
    elif sent.sentiment == 1 or (
            -0.05 <= sent.probs[0, 0] - sent.probs[0, 1] <= 0.05 and -0.05 <= sent.probs[0, 2] - sent.probs[
        0, 1] <= 0.05):
        sentimentDictionary['neutral'].append(sent)
    elif sent.sentiment == 0:
        sentimentDictionary['negative'].append(sent)

In [44]:
relevantKeywords = Counter()
for sent in sentimentDictionary['positive']:
    relevantKeywords.update(sent.keywords)

# print(relevantKeywords['chatgpt'])
relevantKeywords = relevantKeywords.most_common()

In [56]:
sentimentClusters = {'positive': list(), 'negative': list(), 'neutral': list()}
for clust in clusters.values():
    if clust.sentimentDistribution[0] > clust.sentimentDistribution[1] and clust.sentimentDistribution[0] > \
            clust.sentimentDistribution[2]:
        sentimentClusters['negative'].append(clust)
    elif clust.sentimentDistribution[1] > clust.sentimentDistribution[0] and clust.sentimentDistribution[1] > \
            clust.sentimentDistribution[2]:
        sentimentClusters['neutral'].append(clust)
    else:
        sentimentClusters['positive'].append(clust)

In [None]:
clusters_ = list(clusters.values())
clusters_.sort(key=lambda x: -len(x.elements))
names = [i.clusterName for i in clusters_]
sizes = [len(i.elements) for i in clusters_]
plt.figure(figsize=(20, 10))  # Set the figure size

# Create the pie chart
palette_color = (sns.blend_palette(["#ffbb55", "#a14819", "#863f19"], len(clusters_)))
# random.shuffle(palette_color)
# plotting data on chart
#explode= list(np.array(list(reversed(range(0, 5)))) / 100) + [0]*(len(sizes)-5)

plt.pie(sizes, labels=None, colors=palette_color, autopct=lambda x: (str(round(x, 1)) + "%") if x > 3 else "",
        startangle=90, counterclock=False, textprops={'fontsize': 16, 'fontweight': 'bold'},
        wedgeprops={'edgecolor': 'black', 'linestyle': '-'}, radius=1, center=(0, 0))

# Create a custom legend-like structure for sector names
legend_labels = [f"{round(100 * size / sum(sizes), 1)}%: {label}" for label, size in zip(names, sizes)]
plt.legend(legend_labels, loc="center left", bbox_to_anchor=(1, 0.5))

plt.savefig("Product/chatGPT/plot.png")
# displaying chart
plt.show()

In [28]:
tsne = TSNE(n_components=2, random_state=69)
X_tsne = tsne.fit_transform(X)

In [118]:
Hover_Info = [0] * len(sentiments)
for i in range(len(sentiments)):
    # Hover_Info[i] = sentiments[i].raw + "\n" + "[" +sentiments[i].title + "]"
    Hover_Info[i] = sentiments[i].raw
Y_packed = [0] * len(sentiments)

for i in range(len(sentiments)):
    Y_packed[i] = sentiments[i].sentiment
fig = px.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], color=Y_packed, hover_name=Hover_Info)
fig.update_layout(
    title="t-SNE of Sentiments",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
pyo.plot(fig, filename='Product/Iphone15/Sentiments_markass.html', auto_open=False)

for i in range(len(sentiments)):
    Y_packed[i] = sentiments[i].cluster.clusterIndex
df = pd.DataFrame(np.hstack((X_tsne, np.array(Y_packed).reshape(-1, 1))), columns=['x', 'y', 'color'])
df['color'] = df['color'].astype('str')
# df = pd.DataFrame(X_tsne, columns=['x', 'y'])
colors_ = cm.nipy_spectral(np.linspace(0, 1, len(clusters)))
# Convert the colors from RGBA to a format accepted by Plotly
colors = ['rgb' + str(tuple(int(c * 255) for c in color[:-1])) for color in colors_]
fig = px.scatter(df, x='x', y='y', color='color', color_discrete_sequence=colors, hover_name=Hover_Info)
fig.update_layout(
    title="t-SNE of Clusters",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
pyo.plot(fig, filename='Product/Iphone15/Clusters_markass.html', auto_open=False)

'Product/Iphone15/Clusters_markass.html'