In [24]:
import numpy as np
from sklearn.datasets import fetch_kddcup99
from sklearn.cluster import KMeans
from sklearn.random_projection import GaussianRandomProjection
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import pandas as pd


data = fetch_kddcup99()
D = data.data
encoder = LabelEncoder()


df = pd.DataFrame(D)

categorical_features = [1, 2, 3]
# print("Categorical Features:", categorical_features)

for col in categorical_features:
    df[col] = encoder.fit_transform(df[col])

D_encoded = df.to_numpy()

D = D_encoded
D = np.array(D, dtype=np.float64)

print(D)

n, d = D.shape
print(n, d)

k = 15
reduced_dim = 20

def compute_loss(data, centroids):
    loss = 0
    for point in data:
        nearest_centroid = centroids[np.argmin(np.linalg.norm(point - centroids, axis=1))]
        loss += np.sum((point - nearest_centroid) ** 2)
    return loss

losses_original = []
losses_reduced = []

for trial in range(5):
    # jl_transformer = GaussianRandomProjection(n_components=reduced_dim)
    # E = jl_transformer.fit_transform(D_numeric)

    M = np.random.normal(loc=0.0, scale=1/np.sqrt(reduced_dim), size=(d, reduced_dim))
    E = np.dot(D, M)

    print(E.shape)

    kmeans_reduced = KMeans(n_clusters=k, random_state=42).fit(E)
    A = kmeans_reduced.cluster_centers_
    # labels_reduced = kmeans_reduced.labels_
    A_projected = np.dot(A, np.linalg.pinv(M))
    
    kmeans_original = KMeans(n_clusters=k, random_state=42).fit(D)
    B = kmeans_original.cluster_centers_
    # labels_original = kmeans_original.labels_
    
    
    loss_reduced = compute_loss(D, A_projected)
    loss_original = compute_loss(D, B)
    
    losses_reduced.append(loss_reduced)
    losses_original.append(loss_original)


print(losses_original)
print(losses_reduced)

# trials = range(1, 6)
# plt.bar(trials, losses_original, width=0.4, label='Original Data Loss')
# plt.bar([t + 0.4 for t in trials], losses_reduced, width=0.4, label='Reduced Data Loss')
# plt.xlabel('Trial')
# plt.ylabel('Loss')
# plt.xticks([t + 0.2 for t in trials], labels=trials)
# plt.legend()
# plt.title('Comparison of Clustering Losses')
# plt.show()


[[0.0e+00 1.0e+00 2.2e+01 ... 0.0e+00 0.0e+00 0.0e+00]
 [0.0e+00 1.0e+00 2.2e+01 ... 0.0e+00 0.0e+00 0.0e+00]
 [0.0e+00 1.0e+00 2.2e+01 ... 0.0e+00 0.0e+00 0.0e+00]
 ...
 [0.0e+00 1.0e+00 2.2e+01 ... 1.0e-02 0.0e+00 0.0e+00]
 [0.0e+00 1.0e+00 2.2e+01 ... 1.0e-02 0.0e+00 0.0e+00]
 [0.0e+00 1.0e+00 2.2e+01 ... 1.0e-02 0.0e+00 0.0e+00]]
494021 41
(494021, 20)
(494021, 20)
(494021, 20)
(494021, 20)
(494021, 20)
[3244311308453.997, 3244311308453.997, 3244311308453.997, 3244311308453.997, 3244311308453.997]
[2.223191092525457e+17, 2.5846676191739357e+17, 2.8906780586699526e+17, 2.249260440405625e+17, 2.8535490993890925e+17]


In [25]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_kddcup99
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.sparse import random as sparse_random
from sklearn.preprocessing import LabelEncoder


# Fetch the dataset
data = fetch_kddcup99()
D = data.data
encoder = LabelEncoder()


df = pd.DataFrame(D)

categorical_features = [1, 2, 3]
# print("Categorical Features:", categorical_features)

for col in categorical_features:
    df[col] = encoder.fit_transform(df[col])

D_encoded = df.to_numpy()

D = D_encoded
D = np.array(D, dtype=np.float64)

y= data.target
#label encoding 
y = encoder.fit_transform(y)
y = np.array(y, dtype=np.float64)

# Get dimensions
n, d = D.shape

# Define JL Matrix Generator
def generate_sparse_jl_matrix(rows, cols, sparsity=0.1):
    return sparse_random(rows, cols, density=sparsity, data_rvs=np.random.randn).toarray()

losses_original=[]
losses_reduced=[]

for _ in range(5):
    # Generate sparse JL matrix
    M = generate_sparse_jl_matrix(10, n)

    # Compute projections
    E = M @ D
    z = M @ y

    # Linear regression on original data
    reg_original = LinearRegression().fit(D, y)
    b = reg_original.coef_

    # Linear regression on JL-projected data
    reg_jl = LinearRegression().fit(E, z)
    a = reg_jl.coef_

    # Compute losses
    loss_a = mean_squared_error(y, D @ a)
    loss_b = mean_squared_error(y, D @ b)

    losses_original.append(loss_b)
    losses_reduced.append(loss_a)


print(losses_original)
print(losses_reduced)

# Visualize results
# import pandas as pd
# import matplotlib.pyplot as plt

# # Prepare results for visualization
# df_results = pd.DataFrame(loss_results)
# df_results.index.name = 'Trial'

# # Plot results
# df_results.plot(kind='bar', figsize=(10, 6))
# plt.title('Loss Comparison for Original and JL-Projected Data')
# plt.xlabel('Trial')
# plt.ylabel('Mean Squared Error')
# plt.legend(['Loss (a)', 'Loss (b)'])
# plt.grid(True)
# plt.show()

[58.47988423603355, 58.47988423603355, 58.47988423603355, 58.47988423603355, 58.47988423603355]
[27490.476127654023, 30.90236745401269, 40.78288976264282, 46.407715437351904, 27.09586344940576]
