<a href="https://colab.research.google.com/github/karnav-patel/google-colab/blob/main/PrivacyReview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_samples, silhouette_score

In [None]:
# Load the Universal Sentence Encoder (v4) module from TensorFlow Hub.
# `embed` is later called with a list of strings (see embed_text).
USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(USE_MODULE_URL)

In [None]:
def embed_text(text):
    """Return the embedding of a single string.

    The module-level ``embed`` model is called with a one-element list, the
    result is converted to a NumPy array, and its first (only) row is
    returned.
    """
    batch = embed([text])
    vectors = batch.numpy()
    return vectors[0]

# Load the review table and keep only the rows whose category is 'Parenting'.
data = pd.read_csv('PrivacyReviews.csv')
# Take an explicit copy of the filtered rows: a 'cluster' column is assigned
# to `data` further down, and assigning to a filtered slice triggers pandas'
# SettingWithCopyWarning.
data = data[data['category'] == 'Parenting'].copy()
# One embedding per review, stacked row-wise in the same order as `data`.
review_embeddings = np.array([embed_text(review) for review in data['review']])

# Alternative input (disabled): positive reviews from the IMDB dataset
# data = pd.read_csv('IMDB Dataset.csv')
# data = data[data['sentiment'] == 'positive']
# reviews = data['review']
# review_embeddings = np.array([embed_text(review) for review in reviews])



def k_means_plus_plus(k, embeddings):
    """Choose ``k`` initial centres by k-means++ seeding under cosine distance.

    The first centre is a uniformly random row of ``embeddings``. Every
    further centre is a row drawn with probability proportional to the squared
    cosine distance (``1 - cosine similarity``) between that row and its most
    similar already-chosen centre.

    Args:
        k: Number of centres to pick.
        embeddings: 2-D array holding one embedding per row.

    Returns:
        Array of shape ``(k, embeddings.shape[1])``; each row is a copy of a
        row of ``embeddings``.
    """
    n_samples, n_features = embeddings.shape
    centers = np.zeros((k, n_features))
    centers[0] = embeddings[np.random.randint(0, n_samples)]
    for i in range(1, k):
        similarities = cosine_similarity(embeddings, centers[:i])
        nearest_distance = 1 - np.max(similarities, axis=1)
        weights = nearest_distance ** 2
        total = np.sum(weights)
        if total > 0:
            cumulative = np.cumsum(weights / total)
        else:
            # Every row coincides with an existing centre, so the weights
            # cannot be normalised (0 / 0). Fall back to a uniform draw
            # instead of leaving an all-zero centre behind.
            cumulative = np.arange(1, n_samples + 1) / n_samples
        r = np.random.rand()
        # First index whose cumulative probability exceeds r. The clamp covers
        # the case where rounding leaves cumulative[-1] marginally below r.
        j = min(int(np.searchsorted(cumulative, r, side='right')), n_samples - 1)
        centers[i] = embeddings[j]
    return centers

def assign_clusters(embeddings, centers):
    """Label every embedding with the index of its most cosine-similar centre.

    Returns a 1-D array with one label per row of ``embeddings``.
    """
    scores = cosine_similarity(embeddings, centers)
    labels = np.argmax(scores, axis=1)
    return labels

def update_centers(k, clusters, embeddings):
    """Recompute each centre as the mean of the embeddings assigned to it.

    Args:
        k: Number of clusters.
        clusters: Per-row cluster labels for ``embeddings``.
        embeddings: 2-D array holding one embedding per row.

    Returns:
        Array of shape ``(k, embeddings.shape[1])``. The row of a cluster
        that has no members is left as all zeros.
    """
    dim = embeddings.shape[1]
    centroids = np.zeros((k, dim))
    for label in range(k):
        assigned = embeddings[clusters == label]
        if len(assigned) == 0:
            continue
        centroids[label] = np.mean(assigned, axis=0)
    return centroids

def k_means_clustering(k, embeddings, max_iters=100):
    """Cluster ``embeddings`` into ``k`` groups with cosine-similarity k-means.

    Centres are seeded with ``k_means_plus_plus``. Assignment and centre
    update then alternate until the centres stop moving (``np.allclose``) or
    ``max_iters`` updates have been performed.

    Args:
        k: Number of clusters.
        embeddings: 2-D array holding one embedding per row.
        max_iters: Upper bound on the number of centre updates. With ``0``
            the initial seeding and its assignment are returned.

    Returns:
        ``(clusters, centers)``: the per-row labels and the ``(k, dim)``
        centres. The labels are always the assignment under the returned
        centres.
    """
    centers = k_means_plus_plus(k, embeddings)
    # Assign once up front so `clusters` is defined even when max_iters == 0.
    clusters = assign_clusters(embeddings, centers)
    for _ in range(max_iters):
        new_centers = update_centers(k, clusters, embeddings)
        # A cluster without members would otherwise get an all-zero centre,
        # which has cosine similarity 0 to every point. Keep its previous
        # centre instead.
        empty = np.bincount(clusters, minlength=k) == 0
        new_centers[empty] = centers[empty]
        if np.allclose(centers, new_centers):
            break
        centers = new_centers
        # Re-assign right after moving the centres so the returned labels
        # match the returned centres even when the iteration cap is hit.
        clusters = assign_clusters(embeddings, centers)
    return clusters, centers

# Model selection: run k-means for k = 2..10 and keep the clustering with the
# highest summarisation metric  distk * Mk, where
#   distk = smallest cosine distance between two distinct centres, and
#   Mk    = percentage of samples whose silhouette exceeds the mean silhouette.
# These definitions match the later revision of this loop in the same file.
best_k = 0
best_metric = -np.inf
best_clusters = None
best_centers = None

for k in range(2, 11):
    clusters, centers = k_means_clustering(k, review_embeddings)
    # The silhouette is only defined for 2 <= n_labels <= n_samples - 1;
    # scikit-learn raises ValueError otherwise, so skip such clusterings
    # before computing it.
    n_labels = np.unique(clusters).size
    if not 1 < n_labels < len(clusters):
        continue
    silhouette_values = silhouette_samples(review_embeddings, clusters, metric='cosine')
    silhouette_avg = silhouette_values.mean()
    print(k)
    print(silhouette_avg)
    print("===================")
    center_similarity = cosine_similarity(centers)
    # Mask the self-similarities so the maximum is taken over distinct pairs.
    np.fill_diagonal(center_similarity, -np.inf)
    distk = 1 - np.max(center_similarity)
    # The per-sample silhouettes are needed here: comparing the scalar
    # average with 0 can only ever yield 0 or 1.
    Mk = (silhouette_values > silhouette_avg).sum() / len(clusters) * 100
    metric = distk * Mk
    if metric > best_metric:
        best_k = k
        best_metric = metric
        best_clusters = clusters
        best_centers = centers

# Report: for each cluster of the selected clustering, print up to ten member
# reviews, ordered from most to least cosine-similar to the cluster centre.
if best_k > 0:
    data['cluster'] = best_clusters
    for cluster in range(best_k):
        in_cluster = (data['cluster'] == cluster).to_numpy()
        print(f'\nCluster {cluster} - Top 10 reviews:')
        if not in_cluster.any():
            # cosine_similarity rejects an array with zero rows, so an empty
            # cluster gets its header only.
            continue
        cluster_reviews = data.loc[in_cluster, 'review']
        cluster_embeddings = review_embeddings[in_cluster]
        similarities = cosine_similarity(cluster_embeddings, [best_centers[cluster]])
        # Ascending argsort reversed gives descending similarity; keep ten.
        top_indices = np.argsort(similarities[:, 0])[::-1][:10]
        top_reviews = cluster_reviews.iloc[top_indices]
        for review in top_reviews:
            print(review)
else:
    print("No effective clustering could be achieved.")


In [93]:
def embed_text(text):
    """Embed one string with the module-level ``embed`` model.

    ``text`` is wrapped in a single-element list for the model call; the
    first row of the resulting NumPy array is returned.
    """
    encoded = embed([text])
    first_row = encoded.numpy()[0]
    return first_row

# Read the review table and restrict it to the 'Parenting' category.
data = pd.read_csv('PrivacyReviews.csv')
# Copy the filtered rows explicitly: `data` receives a 'cluster' column later
# in this cell, and writing to a filtered slice raises pandas'
# SettingWithCopyWarning.
data = data[data['category'] == 'Parenting'].copy()
# Embed every review; rows of the array follow the row order of `data`.
review_embeddings = np.array([embed_text(review) for review in data['review']])

# Alternative input (disabled): positive reviews from the IMDB dataset
# data = pd.read_csv('IMDB Dataset.csv')
# data = data[data['sentiment'] == 'positive']
# reviews = data['review']
# review_embeddings = np.array([embed_text(review) for review in reviews])

# Custom k-means++ initialization using cosine similarity
def k_means_plus_plus(k, embeddings):
    """Seed ``k`` cluster centres with k-means++ using cosine distance.

    The first centre is a row of ``embeddings`` picked uniformly at random.
    Each subsequent centre is a row sampled with probability proportional to
    the square of ``1 - cosine similarity`` to its most similar centre chosen
    so far.

    Args:
        k: Number of centres to pick.
        embeddings: 2-D array holding one embedding per row.

    Returns:
        Array of shape ``(k, embeddings.shape[1])``; each row is a copy of a
        row of ``embeddings``.
    """
    n_samples, n_features = embeddings.shape
    centers = np.zeros((k, n_features))
    first = np.random.randint(0, n_samples)
    centers[0] = embeddings[first]
    for i in range(1, k):
        similarities = cosine_similarity(embeddings, centers[:i])
        squared_distance = (1 - np.max(similarities, axis=1)) ** 2
        total = np.sum(squared_distance)
        if total > 0:
            cumulative = np.cumsum(squared_distance / total)
        else:
            # All rows coincide with an already-chosen centre: 0 / 0 would
            # produce NaN probabilities and leave this centre at zero, so
            # sample uniformly instead.
            cumulative = np.arange(1, n_samples + 1) / n_samples
        r = np.random.rand()
        # Index of the first cumulative probability greater than r, clamped
        # because floating-point cumsum can end slightly below 1.
        chosen = min(int(np.searchsorted(cumulative, r, side='right')), n_samples - 1)
        centers[i] = embeddings[chosen]
    return centers

# Cluster assignment and center update functions
def assign_clusters(embeddings, centers):
    """Return, for each embedding, the index of the centre it is most similar to.

    Similarity is cosine similarity; the result has one label per row of
    ``embeddings``.
    """
    pairwise = cosine_similarity(embeddings, centers)
    nearest = np.argmax(pairwise, axis=1)
    return nearest

def update_centers(k, clusters, embeddings):
    """Return the mean embedding of every cluster.

    Args:
        k: Number of clusters.
        clusters: Per-row cluster labels for ``embeddings``.
        embeddings: 2-D array holding one embedding per row.

    Returns:
        Array of shape ``(k, embeddings.shape[1])``; rows belonging to
        clusters with no members remain all zeros.
    """
    means = np.zeros((k, embeddings.shape[1]))
    for idx in range(k):
        group = embeddings[clusters == idx]
        if len(group) == 0:
            continue
        means[idx] = np.mean(group, axis=0)
    return means

# K-means clustering algorithm
def k_means_clustering(k, embeddings, max_iters=100):
    """Run cosine-similarity k-means on ``embeddings``.

    Starts from ``k_means_plus_plus`` seeds and alternates centre updates and
    re-assignment until the centres no longer change (``np.allclose``) or
    ``max_iters`` updates have been made.

    Args:
        k: Number of clusters.
        embeddings: 2-D array holding one embedding per row.
        max_iters: Maximum number of centre updates; ``0`` returns the seeds
            together with their assignment.

    Returns:
        ``(clusters, centers)``, where ``clusters`` is the assignment of each
        row under the returned ``centers``.
    """
    centers = k_means_plus_plus(k, embeddings)
    # Initial assignment; also guarantees `clusters` exists if max_iters == 0.
    clusters = assign_clusters(embeddings, centers)
    for _ in range(max_iters):
        new_centers = update_centers(k, clusters, embeddings)
        # Clusters that lost all members keep their previous centre rather
        # than collapsing to the zero vector, whose cosine similarity to
        # every point is 0.
        has_no_members = np.bincount(clusters, minlength=k) == 0
        new_centers[has_no_members] = centers[has_no_members]
        if np.allclose(centers, new_centers):
            break
        centers = new_centers
        # Keep labels in step with the centres that will be returned.
        clusters = assign_clusters(embeddings, centers)
    return clusters, centers

# Find the best k using the summarization metric
# Track the best clustering seen so far. The score for a given k is
# distk * Mk, where distk is the smallest cosine distance between two distinct
# centres and Mk is the percentage of samples whose silhouette is above the
# mean silhouette.
best_k = 0
best_metric = -np.inf
best_clusters = None
best_centers = None

for k in range(2, 11):
    clusters, centers = k_means_clustering(k, review_embeddings)
    # scikit-learn's silhouette functions raise ValueError unless
    # 2 <= n_labels <= n_samples - 1, so skip degenerate clusterings.
    n_labels = np.unique(clusters).size
    if not 1 < n_labels < len(clusters):
        continue
    silhouette_values = silhouette_samples(review_embeddings, clusters, metric='cosine')
    # silhouette_score is the mean of the per-sample values; reuse them
    # instead of computing the pairwise distances a second time.
    silhouette_avg = silhouette_values.mean()
    Mk = (silhouette_values > silhouette_avg).sum() / len(clusters) * 100
    center_similarity = cosine_similarity(centers)
    # Exclude self-similarity with -inf rather than 0: with 0 on the diagonal
    # the maximum is capped at 0 whenever all centre pairs are negatively
    # similar.
    np.fill_diagonal(center_similarity, -np.inf)
    distk = 1 - np.max(center_similarity)
    metric = distk * Mk
    if metric > best_metric:
        best_k = k
        best_metric = metric
        best_clusters = clusters
        best_centers = centers

# Output results
# For every cluster of the selected clustering, print its header followed by
# up to ten member reviews in order of decreasing cosine similarity to the
# cluster centre.
if best_k > 0:
    data['cluster'] = best_clusters
    for cluster in range(best_k):
        member_mask = (data['cluster'] == cluster).to_numpy()
        print(f'\nCluster {cluster} - Top 10 reviews:')
        if not member_mask.any():
            # Nothing to rank; cosine_similarity would raise on zero rows.
            continue
        cluster_reviews = data.loc[member_mask, 'review']
        cluster_embeddings = review_embeddings[member_mask]
        similarities = cosine_similarity(cluster_embeddings, [best_centers[cluster]])
        # Reverse the ascending argsort to rank from most to least similar.
        top_indices = np.argsort(similarities[:, 0])[::-1][:10]
        top_reviews = cluster_reviews.iloc[top_indices]
        for review in top_reviews:
            print(review)
else:
    print("No effective clustering could be achieved.")


Cluster 0 - Top 10 reviews:
It makes it difficult to control what personal data is shared.
Uses data for purposes other than stated at the time of collection.

Cluster 1 - Top 10 reviews:
This app requires too many permissions to access my contacts and location.

Cluster 2 - Top 10 reviews:
Tracks you even when the app is not in use, very creepy.

Cluster 3 - Top 10 reviews:
Constantly asks for permissions to access the camera and microphone.

Cluster 4 - Top 10 reviews:
Sells data to advertisers without explicit consent.

Cluster 5 - Top 10 reviews:
Too invasive, it tracks your location all the time!

Cluster 6 - Top 10 reviews:
Does not respect user's privacy preferences.
The privacy settings are too complicated to understand.

Cluster 7 - Top 10 reviews:
I don't like that this app shares my data with third parties.

Cluster 8 - Top 10 reviews:
The app collects personal information but does not secure it properly.
