In [1]:
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.cluster import KMeans
import torch
from torch import nn
from torch.optim import Adam

In [2]:
# Autoencoder for numerical data
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim),
            nn.Sigmoid(),
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# Step 1: Load the datasets
data_files = []
for root, dirs, files in os.walk('data'):
    for file in files:
        if file.endswith('.csv'):
            data_files.append(os.path.join(root, file))

In [3]:
# Initialize the results list
results = []

for file in data_files:
    # Create a dictionary for each dataset
    result = {'dataset_name': file}

    # Load the dataset
    dataset = pd.read_csv(file)
    result['data'] = dataset

    # One-Hot Encoding for categorical features
    dataset = pd.get_dummies(dataset)

    # PCA for numerical data
    pca = PCA(n_components=2)
    result['pca_embeddings'] = pca.fit_transform(dataset)

    # t-SNE for numerical data
    tsne = TSNE(n_components=2)
    result['t-SNE_embeddings'] = tsne.fit_transform(dataset)

    # UMAP for numerical data
    umap = UMAP(n_components=2)
    result['umap_embeddings'] = umap.fit_transform(dataset)

    # Define the size of the encoded representations
    encoding_dim = 32

    # Define the autoencoder model
    autoencoder = Autoencoder(dataset.shape[1], encoding_dim)

    # Define the optimizer and loss function
    optimizer = Adam(autoencoder.parameters())
    criterion = nn.BCELoss()

    # Convert the dataset to PyTorch tensors
    dataset_torch = torch.tensor(dataset.values, dtype=torch.float32)
    
    # Normalize the data to be between 0 and 1
    dataset_torch = (dataset_torch - dataset_torch.min()) / (dataset_torch.max() - dataset_torch.min())
    
    # Train the autoencoder
    for epoch in range(50):
        autoencoder.train()
        optimizer.zero_grad()
        outputs = autoencoder(dataset_torch)
        loss = criterion(outputs, dataset_torch)
        loss.backward()
        optimizer.step()

    # Switch the model to evaluation mode
    autoencoder.eval()

    # Generate the embeddings
    with torch.no_grad():
        result['autoencoder_embeddings'] = autoencoder.encoder(dataset_torch).numpy()


    # Clusterize the embeddings
    kmeans = KMeans(n_clusters=10)
    result['pca_cluster'] = kmeans.fit_predict(result['pca_embeddings'])
    result['t-SNE_cluster'] = kmeans.fit_predict(result['t-SNE_embeddings'])
    result['umap_cluster'] = kmeans.fit_predict(result['umap_embeddings'])
    result['autoencoder_cluster'] = kmeans.fit_predict(result['autoencoder_embeddings'])

    # Append the result to the results list
    results.append(result)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [None]:
result