In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
from datetime import datetime

# Read the API key from the environment instead of embedding it in the notebook.
# Falls back to an empty string (the previous value) when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

ModuleNotFoundError: No module named 'openai'

In [None]:
def json_to_dataframe(json_file):
    """Load a file with one record per line into a DataFrame.

    Each non-blank line is parsed as strict JSON first. A line that is not valid
    JSON (for example a Python-style dict written with single quotes) is parsed
    with ``ast.literal_eval``, which accepts literals only and never executes
    code, unlike ``eval``.

    Args:
        json_file: Path to a UTF-8 text file holding one record per line.

    Returns:
        A pandas DataFrame with one row per parsed line.

    Raises:
        ValueError, SyntaxError: If a line is neither valid JSON nor a Python literal.
    """
    import ast  # local import: only needed for the non-strict-JSON fallback

    data = []
    with open(json_file, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                record = ast.literal_eval(line)
            data.append(record)

    return pd.DataFrame(data)

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Request embeddings for a string or a list of strings.

    Newlines are replaced with spaces before the text is sent.

    Args:
        text: A single string, or a list of strings to embed in one request.
        model: Name of the embedding model passed to ``openai.Embedding.create``.

    Returns:
        For a string: the embedding taken from the first item of the response.
        For a list: a list with one embedding per input string, in input order
        (an empty list yields an empty list without calling the API).
        Any other value is forwarded unchanged as a one-element input and the
        first embedding of the response is returned.
    """
    if isinstance(text, list):
        if not text:
            return []
        cleaned = [x.replace("\n", " ") for x in text]
        # Send the list itself as the batch; wrapping it in another list would
        # produce a nested list of strings rather than a batch of inputs.
        response = openai.Embedding.create(input=cleaned, model=model)
        # Order by the item's index so results line up with the inputs.
        ordered = sorted(response['data'], key=lambda item: item['index'])
        return [item['embedding'] for item in ordered]

    if isinstance(text, str):
        text = text.replace("\n", " ")

    return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']



def embed_summaries(df: pd.DataFrame, column_name: str) -> pd.Series:
    """Return a Series holding the embedding of every value in ``df[column_name]``.

    ``get_embedding`` is called once per row with the
    'text-embedding-ada-002' model.
    """
    def _embed_one(value):
        return get_embedding(value, model='text-embedding-ada-002')

    return df[column_name].apply(_embed_one)

In [None]:
# Location of the line-delimited reviews file. Set REVIEWS_DATA_PATH to point the
# notebook at another copy; the original local path is kept as the default.
data_path = os.environ.get(
    "REVIEWS_DATA_PATH",
    "/Users/maglionejuanmartin/code/reviews_clusterization_generative/data/Cell_Phones_and_Accessories_5 copy.json",
)

In [None]:
# Parse the file at data_path (one record per line) into a DataFrame and display it.
df = json_to_dataframe(data_path)
df

In [None]:
# Embed every value of the reviewText column and store the result on the frame.
# embed_summaries applies get_embedding row by row, so this cell makes one
# embedding request per review.
df["review_embeddings"] = embed_summaries(df,"reviewText")

## K Means

In [None]:
# Display the frame again, now including the review_embeddings column.
df

In [None]:
# Print the embeddings column on its own as a plain-text Series.
print(df["review_embeddings"])

In [None]:
# Stack the per-review embeddings vertically into one array and show its shape.
matrix = np.vstack(df["review_embeddings"].tolist())
matrix.shape

## Clustering

In [None]:
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans

### Elbow Visualizer

In [None]:
# Elbow analysis: hand a seeded KMeans estimator to yellowbrick's KElbowVisualizer
# with k=(2, 10), fit it on the embedding matrix and render the plot used to
# choose the number of clusters.
km = KMeans(random_state=42)
visualizer = KElbowVisualizer(km, k=(2, 10))
visualizer.fit(matrix)
visualizer.show()

### Silhouette Visualizer

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# A 9 x 2 grid of axes: one panel for each cluster count from 2 to 19.
fig, ax = plt.subplots(9, 2, figsize=(30, 60))
for i in range(2, 20):
    # Panels are filled row by row: i=2 goes to ax[0][0], i=19 to ax[8][1].
    row, col = (i // 2) - 1, i % 2

    # KMeans instance for the current number of clusters.
    km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)

    # Bind a SilhouetteVisualizer to this panel, fit it and report its score.
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[row][col])
    visualizer.fit(matrix)
    print("i value:", i, "Score:", visualizer.silhouette_score_ * 100)

In [None]:
# Number of clusters for the final model.
n_cluster = 7

# fit() returns the fitted estimator, so construction and fitting are chained.
kmeans = KMeans(n_clusters=n_cluster, init="k-means++", random_state=42).fit(matrix)

# Record the cluster index assigned to each row of the frame.
labels = kmeans.labels_
df["Cluster"] = labels

In [None]:
import openai

# Maximum number of reviews sampled from each cluster for the prompt
rev_per_cluster = 5

# Ask the chat model to name and describe each cluster from a sample of its reviews
for i in range(n_cluster):
    cluster_rows = df[df.Cluster == i]

    # Nothing to sample from: report it and move on instead of failing in .sample()
    if cluster_rows.empty:
        print(f"Cluster {i} | (no reviews assigned)")
        print("-" * 100)
        continue

    # Sample once, capped at the cluster size so small clusters do not raise,
    # and reuse the same rows for the prompt and for the optional examples below.
    sample_size = min(rev_per_cluster, len(cluster_rows))
    sample_cluster_rows = cluster_rows.sample(sample_size, random_state=42)

    # str() keeps the join from failing on non-string values such as NaN
    review_texts = [str(values) for values in sample_cluster_rows.reviewText.values]
    reviews = "\n".join(review_texts)

    # Create a prompt for OpenAI to describe what's common in the reviews
    # NOTE(review): max_tokens=64 is a tight budget for a name plus a description;
    # raise it if the descriptions come back cut off.
    response = openai.ChatCompletion.create(
        model="gpt-4",   # update the model as per requirements
        messages=[
            {
                "role": "user",
                "content": f'What do the following customer reviews have in common? Give me the following: CATEGORY NAME and CATEGORY_DESCRIPTION about them\n\nCustomer reviews:\n"""\n{reviews}\n"""\n\nCATEGORY_NAME:\n\nCATEGORY_DESCRIPTION'
            }
        ],
        temperature=0,
        max_tokens=64,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    # The model's reply is used as the cluster's category text
    category = response['choices'][0]['message']['content']

    # Sampled reviews joined into one string, available for the print below
    common_details = " $ ".join(review_texts)

    # Cluster index and category | If example reviews wanted to be shown, add {common_details}
    print(f"Cluster {i} | {category}")

    # Print a separator for each cluster
    print("-" * 100)

## Plotting

In [None]:
# Import necessary modules
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Initialize the t-SNE model
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init="random", learning_rate=200)

# Fit and transform the data using t-SNE algorithm, reduce the dimensionality to 2
vis_dims2 = tsne.fit_transform(matrix)

# Extract x and y coordinates of each point in the transformed 2D data
x = vis_dims2[:, 0]
y = vis_dims2[:, 1]

# The first five colors are the original ones; the rest cover additional clusters.
# Colors are reused cyclically if there are more clusters than entries.
palette = ["purple", "green", "red", "blue", "orange",
           "brown", "cyan", "magenta", "olive", "gray"]

# Plot every cluster present in the data, not just as many as there are colors
cluster_labels = df.Cluster.to_numpy()
for position, category in enumerate(sorted(set(cluster_labels))):
    color = palette[position % len(palette)]

    # Get x and y coordinates of points of current cluster
    in_cluster = cluster_labels == category
    xs = x[in_cluster]
    ys = y[in_cluster]

    # Plot the points of the current cluster with some transparency
    plt.scatter(xs, ys, color=color, alpha=0.3, label=f"Cluster {category}")

    # Mark the mean position of the cluster's points with a cross marker
    plt.scatter(xs.mean(), ys.mean(), marker="x", color=color, s=100)

# Add legend and title to the plot
plt.legend()
plt.title("Clusters identified visualized in language 2d using t-SNE")
plt.show()

## Exporting to xlsx

In [None]:
# Remove ASCII control characters from string cells before export. The original
# pattern removed only \x1a; this range covers it together with the other control
# characters of the same kind (tab, newline and carriage return are kept).
df = df.replace({r'[\x00-\x08\x0b\x0c\x0e-\x1f]': ''}, regex=True)

# Create the output directory if needed; exist_ok avoids the check-then-create race
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Timestamped file name so repeated exports do not overwrite each other
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = os.path.join(output_dir, f"output_{current_time}.xlsx")

# NOTE(review): the review_embeddings column holds long lists; confirm they are
# written to the sheet in a usable form, or drop the column before exporting.
df.to_excel(filename)