In [1]:
import os
import math
import time

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import MarianMTModel, MarianTokenizer
import openai
import json
from Keys import openai_keys
import re
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
import joblib
import numpy as np
import pandas as pd
import random
from keybert import KeyBERT
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from collections import Counter
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
import tensorflow as tf
from tensorflow.keras.models import load_model

# Configure the OpenAI client. The credentials are read from the project-local
# `Keys` module (imported above as `openai_keys`).
openai.organization = openai_keys['organization']
openai.api_key = openai_keys['api_key']
# Name of the embedding model; identical to the default `model` of get_embedding below.
embedding_model = "text-embedding-ada-002"


def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: String to embed. Newline characters are replaced by spaces
            before the request is made.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'data'`` list.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']


In [2]:
# Read the stored records (each one is later accessed via its "title", "text"
# and "embedding" keys) and parse them from JSON.
with open('Source/embeddings-timur.json', 'r') as json_file:
    embeddings = json.loads(json_file.read())

In [3]:
# Load two persisted models from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
# `clf` is only referenced from commented-out code in the cells shown here.
clf = joblib.load('models/embeddingsForest_4.pkl')
# Keras model; its predict() output is used below to assign a sentiment to every text.
NNmodel = load_model('models/embeddingsNN.h5')

In [4]:
class Analysis:
    """One analysed text together with its embedding and derived attributes.

    Attributes:
        title: Title the text belongs to.
        raw: The text itself.
        embedding: Embedding vector of the text.
        sentiment: Class index of the sentiment (defaults to 1).
        cluster: Cluster object the text was assigned to, ``None`` until clustered.
        probs: Per-class scores of the sentiment model (three entries).
        tsne: 2-D projection of the embedding, ``None`` until computed.
    """

    def __init__(self, title, text, embedding, sentiment=1, probas=None):
        """Store the given values.

        Args:
            title: Title the text belongs to.
            text: The raw text.
            embedding: Embedding vector of ``text``.
            sentiment: Sentiment class index.
            probas: Per-class scores; ``None`` (the default) means ``[0, 0, 0]``.
        """
        self.title = title
        self.raw = text
        self.embedding = embedding
        self.sentiment = sentiment
        self.cluster = None
        # Build a fresh list per instance: a list literal used as the default
        # argument would be shared (and mutated) across all instances.
        self.probs = [0, 0, 0] if probas is None else probas
        self.tsne = None


class Cluster:
    """A group of analysed texts that received the same cluster label.

    Attributes:
        clusterIndex: Label of the cluster.
        elements: Members of the cluster, in insertion order.
        clusterName: Human-readable name; empty until it is assigned elsewhere.
        keywords: Counter of keywords; empty on creation.
        clusterSize: ``(mean, max)`` squared distance of the members to the
            cluster centroid, as computed by ``calculateClusterSize``.
        clusterBuffer: 2-D array with one row per member embedding.
        sentimentDistribution: Number of members per sentiment class index.
        subClusters: Dictionary of sub-clusters; empty on creation.
    """

    def __init__(self, clusterIndex, element):
        """Create a cluster that contains ``element`` as its first member.

        Args:
            clusterIndex: Label of the cluster.
            element: Object exposing ``embedding`` (1-D vector) and
                ``sentiment`` (class index 0, 1 or 2).
        """
        self.clusterIndex = clusterIndex
        self.elements = [element]
        self.clusterName = ""
        self.keywords = Counter()
        self.clusterSize = (0, 0)
        # Keep the buffer 2-D from the start. With a bare 1-D vector,
        # calculateClusterSize fails on a single-member cluster because it
        # reduces over axis=1.
        self.clusterBuffer = np.array(element.embedding, ndmin=2)
        # Every class starts at zero so the counts add up to len(self.elements).
        self.sentimentDistribution = np.zeros(3, dtype=int)
        self.sentimentDistribution[element.sentiment] += 1
        self.subClusters = dict()

    def calculateClusterSize(self):
        """Compute, store and return the spread of the cluster.

        Returns:
            Tuple ``(mean, max)`` of the squared Euclidean distances between
            every member embedding and the centroid of all member embeddings.
        """
        centroid = np.mean(self.clusterBuffer, axis=0, keepdims=True)
        squared_distances = np.sum(np.square(self.clusterBuffer - centroid), axis=1)
        self.clusterSize = (np.mean(squared_distances).item(), np.max(squared_distances).item())
        return self.clusterSize

    def addPoint(self, point):
        """Add ``point`` to the cluster and update buffer and sentiment counts.

        Args:
            point: Object exposing ``embedding`` and ``sentiment`` like the
                element passed to the constructor.
        """
        self.elements.append(point)
        self.clusterBuffer = np.vstack((self.clusterBuffer, point.embedding))
        self.sentimentDistribution[point.sentiment] += 1

In [5]:
# Wrap every loaded record in an Analysis object and collect the embeddings in
# one matrix so the network scores all texts in a single predict() call.
# (The former unused `iter = 0` assignment was dropped: it shadowed the builtin
# `iter` for every later cell.)
sentiments = list()
X = list()
for analysis in embeddings:
    emb = np.array(analysis["embedding"])
    X.append(emb)
    sentiments.append(Analysis(analysis["title"], analysis["text"], emb))
X = np.array(X)

# y_pr: one row of model outputs per text.
# y: index of the largest output per row, kept as a column (keepdims=True).
y_pr = NNmodel.predict(X)
y = np.argmax(y_pr, axis=1, keepdims=True)
for item, label, scores in zip(sentiments, y, y_pr):
    item.sentiment = label.item()
    item.probs = scores





In [6]:
# Stage 1: density-based clustering. DBSCAN gives the label -1 to every point
# it treats as noise; those points are collected in X_left for stage 2.
clustering = DBSCAN(eps=0.46, min_samples=3).fit(X)
X_left = list()
mapping = dict()   # row index in X_left -> index in `sentiments`
clusters = dict()  # cluster label -> Cluster
for i in range(len(sentiments)):
    label = clustering.labels_[i]
    if label == -1:
        mapping[len(X_left)] = i
        X_left.append(sentiments[i].embedding)
        continue
    if label in clusters:
        clusters[label].addPoint(sentiments[i])
    else:
        clusters[label] = Cluster(label, sentiments[i])
    sentiments[i].cluster = clusters[label]

# Offset added to the stage-2 labels so they never collide with DBSCAN labels.
# Taken from the largest DBSCAN label rather than from the number of distinct
# labels, which is off by one when DBSCAN reports no noise.
clusters_n = int(clustering.labels_.max()) + 1

# Stage 2: split the noise points with KMeans. KMeans rejects fewer samples
# than clusters, so cap the cluster count and skip the stage when nothing is left.
n_left_clusters = min(24, len(X_left))
if n_left_clusters > 0:
    # random_state makes the assignment reproducible across runs
    # (same seed as the t-SNE projection further down).
    clustering_left = KMeans(n_clusters=n_left_clusters, init='k-means++', tol=1e-7, max_iter=1000,
                             random_state=69).fit(
        np.array(X_left).reshape((-1, len(sentiments[0].embedding))))
    for i in range(len(X_left)):
        label = clustering_left.labels_[i] + clusters_n
        member = sentiments[mapping[i]]
        if label in clusters:
            clusters[label].addPoint(member)
        else:
            clusters[label] = Cluster(label, member)
        member.cluster = clusters[label]

for clust in clusters.values():
    clust.calculateClusterSize()



Could not find the number of physical cores for the following reason:
found 0 physical cores < 1

  File "C:\Users\Admin\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(

KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=6.



In [32]:
# Sort every analysed text into one of three sentiment buckets.
# NOTE(review): the first test has no lower bound on probs[0] - probs[2], so a
# text predicted as class 1 whose class-0 score does not exceed its class-2
# score by more than 0.05 also lands in 'positive' - confirm this is intended.
sentimentDictionary = {label: [] for label in ('positive', 'negative', 'neutral')}
for sent in sentiments:
    scores = sent.probs
    if sent.sentiment == 2 or scores[0] - scores[2] <= 0.05:
        bucket = 'positive'
    elif sent.sentiment == 1 or (-0.05 <= scores[0] - scores[1] <= 0.05
                                 and -0.05 <= scores[2] - scores[1] <= 0.05):
        bucket = 'neutral'
    elif sent.sentiment == 0:
        bucket = 'negative'
    else:
        # No rule matched: the text is left out of every bucket.
        continue
    sentimentDictionary[bucket].append(sent)

In [30]:
# Label every cluster by the sentiment class that strictly dominates its
# distribution; anything without a strict class-0 or class-1 majority
# (including ties) is filed under 'positive'.
sentimentClusters = {label: [] for label in ('positive', 'negative', 'neutral')}
for clust in clusters.values():
    n_negative, n_neutral, n_positive = clust.sentimentDistribution
    if n_negative > max(n_neutral, n_positive):
        dominant = 'negative'
    elif n_neutral > max(n_negative, n_positive):
        dominant = 'neutral'
    else:
        dominant = 'positive'
    sentimentClusters[dominant].append(clust)

In [7]:
# Project the embeddings to two dimensions with t-SNE (fixed random_state) and
# attach each resulting row to the Analysis object at the same position.
tsne = TSNE(n_components=2, random_state=69)
X_tsne = tsne.fit_transform(X)
for item, coords in zip(sentiments, X_tsne):
    item.tsne = coords

In [29]:
def insert_line_breaks(text, max_length=75):
    """Wrap ``text`` at word boundaries, joining the lines with ``<br>``.

    Words are never split. A word longer than ``max_length`` is placed on a
    line of its own; every other line is at most ``max_length`` characters
    long, counting the single spaces that join its words.

    Args:
        text: Text to wrap; any run of whitespace separates words.
        max_length: Maximum line length in characters.

    Returns:
        The wrapped text, or an empty string when ``text`` contains no words.
    """
    lines = []
    current_words = []
    current_length = 0
    for word in text.split():
        # A joining space is only needed when the line already holds a word.
        needed = len(word) + (1 if current_words else 0)
        if current_words and current_length + needed > max_length:
            lines.append(' '.join(current_words))
            current_words = [word]
            current_length = len(word)
        else:
            current_words.append(word)
            current_length += needed
    # Flush the last line; skipped for empty input so no empty line is emitted.
    if current_words:
        lines.append(' '.join(current_words))
    return '<br>'.join(lines)

def cap_str(text, cap_len=75):
    """Shorten ``text`` to at most ``cap_len`` characters plus an ellipsis.

    Args:
        text: String to shorten.
        cap_len: Maximum number of characters kept from ``text``.

    Returns:
        ``text`` unchanged when it has at most ``cap_len`` characters,
        otherwise its first ``cap_len`` characters followed by ``'...'``.
    """
    # `<=` rather than `<`: a text of exactly cap_len characters loses nothing,
    # so it must not get an ellipsis.
    if len(text) <= cap_len:
        return text
    return text[:cap_len] + '...'

def _write_tsne_scatter(frame, color_column, hover_names, annotations, filename, **color_kwargs):
    """Draw one t-SNE scatter plot, write it to ``filename`` and return the figure.

    Args:
        frame: DataFrame with the columns ``x``, ``y`` and ``color_column``.
        color_column: Name of the column that selects the marker colour.
        hover_names: One hover title per row of ``frame``.
        annotations: Layout annotations drawn on top of the markers.
        filename: Path of the HTML file to write.
        **color_kwargs: Colour options forwarded to ``px.scatter``.
    """
    figure = px.scatter(frame, x='x', y='y', color=color_column, hover_name=hover_names, **color_kwargs)
    figure.update_traces(marker={'size': 9})
    figure.update_layout(
        xaxis_title="",
        yaxis_title="",
        template='plotly_white',
        hoverlabel=dict(font_size=15),
        margin=dict(l=0, r=0, t=0, b=0),
        annotations=annotations
    )
    figure.update_traces(marker=dict(showscale=False))
    figure.update_layout(showlegend=False)
    pyo.plot(figure, filename=filename, auto_open=False)
    return figure


# Per-point data for the plots: hover text, sentiment class and cluster label.
Hover_Info = ["[" + item.title + "]<br>" + insert_line_breaks(item.raw) for item in sentiments]
sent_packed = [item.sentiment for item in sentiments]
clust_packed = [item.cluster.clusterIndex for item in sentiments]

# hstack promotes everything to float, so after astype('str') the sentiment
# column holds '0.0', '1.0' and '2.0'.
df = pd.DataFrame(np.hstack((X_tsne, np.array(sent_packed).reshape(-1, 1), np.array(clust_packed).reshape(-1, 1))),
                  columns=['x', 'y', 'sentiment', 'cluster'])
df['sentiment'] = df['sentiment'].astype('str')
df['cluster'] = df['cluster'].astype('str')

# Only clusters at least as large as the 26th-largest one get a label. With
# fewer than 26 clusters the smallest size becomes the threshold, so every
# cluster is labelled instead of raising IndexError.
cluster_sizes = sorted((len(clust.elements) for clust in clusters.values()), reverse=True)
capSize = cluster_sizes[min(25, len(cluster_sizes) - 1)] if cluster_sizes else 0

# One text annotation per sufficiently large cluster, placed at the mean t-SNE
# position of its members. The text is clust.clusterName, which is an empty
# string until the summarisation cell has filled it in.
annotations = []
for clust in clusters.values():
    if len(clust.elements) < capSize:
        continue

    pos = np.mean(np.array([elem.tsne for elem in clust.elements]), axis=0, keepdims=True)
    annotations.append(go.layout.Annotation(
        x=pos[0, 0], y=pos[0, 1],
        text=insert_line_breaks(cap_str(clust.clusterName, cap_len=160)),
        showarrow=False, ax=0, ay=0, bgcolor='rgba(0, 0, 0, 0.5)', borderpad=0,
        font=dict(family="Arial, sans-serif", size=11, color='white')))

# Sentiment plot. Plotly hands out colours from a sequence in order of first
# appearance in the data; the explicit map ties each colour to its class index
# (0 -> purple, 1 -> gray, 2 -> orange) regardless of row order.
sentiment_palette = ["purple", "gray", "orange"]
sentiment_color_map = {str(float(index)): color for index, color in enumerate(sentiment_palette)}
fig = _write_tsne_scatter(df, 'sentiment', Hover_Info, annotations, 'Product/Timur/Sentiments.html',
                          color_discrete_sequence=sentiment_palette,
                          color_discrete_map=sentiment_color_map)

# Cluster plot: one colour per cluster, sampled evenly from the nipy_spectral
# colormap and converted from RGBA floats to the 'rgb(r, g, b)' strings Plotly accepts.
colors_ = cm.nipy_spectral(np.linspace(0, 1, len(clusters)))
colors = ['rgb' + str(tuple(int(c * 255) for c in color[:-1])) for color in colors_]
fig = _write_tsne_scatter(df, 'cluster', Hover_Info, annotations, 'Product/Timur/Clusters.html',
                          color_discrete_sequence=colors)

'Product/Timur/Clusters.html'

In [11]:

# Use the GPU when one is visible and fall back to the CPU otherwise, instead
# of failing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Each tokenizer is loaded from the same checkpoint as its model so the
# vocabulary is guaranteed to match.
SUMMARY_CHECKPOINT = 't5-small'
TRANSLATION_CHECKPOINT = 'Helsinki-NLP/opus-mt-ru-en'
summary_model = T5ForConditionalGeneration.from_pretrained(SUMMARY_CHECKPOINT).to(device)
summary_tokenizer = T5Tokenizer.from_pretrained(SUMMARY_CHECKPOINT)
translation_model = MarianMTModel.from_pretrained(TRANSLATION_CHECKPOINT).to(device)
translation_tokenizer = MarianTokenizer.from_pretrained(TRANSLATION_CHECKPOINT)

# Name every cluster: translate a sample of its texts, then summarise the translation.
for clust in clusters.values():
    # Sample up to 10 texts. Texts over 500 characters are always skipped;
    # in clusters with more than 6 members the limit is 300 characters.
    inp = []
    for elem in clust.elements:
        if (len(elem.raw) > 300 and len(clust.elements) > 6) or len(elem.raw) > 500:
            continue
        inp.append(elem.raw)
        if len(inp) >= 10:
            break
    if not inp:
        # Every member exceeded the limits. Use the start of the first one
        # rather than translating and summarising an empty string.
        inp.append(clust.elements[0].raw[:300])

    inputs = translation_tokenizer('\n'.join(inp), return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        translated = translation_model.generate(**inputs).cpu()
    translated_text = translation_tokenizer.decode(translated[0], skip_special_tokens=True)

    input_ids = summary_tokenizer("summarize: " + translated_text, return_tensors="pt",
                                  truncation=True).input_ids.to(device)
    with torch.no_grad():
        summary_ids = summary_model.generate(
            input_ids,
            max_length=30,  # Maximum length of the generated summary
            min_length=15,  # Minimum length of the generated summary
            length_penalty=0.25,  # Higher values encourage longer summaries
            num_beams=2,  # Number of beams for beam search
            early_stopping=False,  # Do not stop as soon as num_beams candidates are finished
            repetition_penalty=1,  # 1 means no repetition penalty
        ).cpu()
    # decode() defines no `max_words_length` option, so the stray keyword
    # that used to be passed here was removed.
    summary = summary_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    clust.clusterName = summary


Recommended: pip install sacremoses.



In [None]:
# Stand-alone check of the translation model on a single string.
model_name = 'Helsinki-NLP/opus-mt-ru-en'
# Model and inputs must live on the same device; previously the model stayed on
# the CPU while the inputs were moved to 'cuda', which fails inside generate().
device = 'cuda' if torch.cuda.is_available() else 'cpu'
translation_model = MarianMTModel.from_pretrained(model_name).to(device)
translation_tokenizer = MarianTokenizer.from_pretrained(model_name)

# Text to translate.
# NOTE(review): the checkpoint name says ru-en, but this sample is already
# English - replace it with Russian text for a meaningful check.
text = 'I couldnt figure it out for a long time, and in eight minutes, things got back to normal.'

# Tokenize the text and translate
inputs = translation_tokenizer(text, return_tensors="pt", truncation=True).to(device)
translated = translation_model.generate(**inputs)

# Decode the translated text
translated_text = translation_tokenizer.decode(translated[0], skip_special_tokens=True)
print(translated_text)

In [9]:
# Serialise every cluster (name, sentiment counts and, per member, its text,
# sentiment and t-SNE position) and write the list to disk as JSON.
json_clusters = []
for clust in clusters.values():
    json_elements = [
        {"text": elem.raw, "sentiment": elem.sentiment, "position": elem.tsne.tolist()}
        for elem in clust.elements
    ]
    cluster_record = {
        "cluster_name": clust.clusterName,
        "cluster_sentiment": clust.sentimentDistribution.tolist(),
        "elements": json_elements,
    }
    json_clusters.append(cluster_record)

with open('Product/Temp/markus.json', 'w') as json_file:
    json.dump(json_clusters, json_file, indent=4)

In [17]:
# Sanity check: display whether PyTorch reports CUDA as available.
torch.cuda.is_available()

True