In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.manifold import TSNE


# Charger les données depuis le fichier Excel
file_path = '../train/classif_test.xlsx'
data = pd.read_excel(file_path)

# Afficher un aperçu des données
print(data.head())
print(data.dtypes)

# Séparer les caractéristiques et les étiquettes
features = data.drop(columns=['ID', 'bug type', 'species'])
labels = data['bug type']

# Imputer les valeurs manquantes avec la moyenne de chaque colonne
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features)

# Standardiser les données
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Appliquer la PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(features_scaled)

# Créer un DataFrame pour les composantes principales
pca_df = pd.DataFrame(data=principal_components, columns=['Principal Component 1', 'Principal Component 2'])
pca_df['Bug Type'] = labels

# Afficher la variance expliquée
explained_variance = pca.explained_variance_ratio_
print(f"Variance expliquée par les premières 2 composantes: {explained_variance.cumsum()}")

# Graphique PCA
plt.figure(figsize=(10, 8))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue='Bug Type', data=pca_df, palette='viridis', alpha=0.6)
plt.title('PCA Projection of Bugs by Type')
plt.xlabel(f'Principal Component 1 ({explained_variance[0]:.2f} variance)')
plt.ylabel(f'Principal Component 2 ({explained_variance[1]:.2f} variance)')
plt.legend(title='Bug Type')
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: '../train/classif_test.xlsx'

In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(features_scaled)


[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 250 samples in 0.001s...
[t-SNE] Computed neighbors for 250 samples in 0.218s...
[t-SNE] Computed conditional probabilities for sample 250 / 250
[t-SNE] Mean sigma: 2.117288
[t-SNE] KL divergence after 250 iterations with early exaggeration: 52.488396
[t-SNE] KL divergence after 300 iterations: 0.722935


In [None]:
# Unique labels and their corresponding colors
unique_labels = np.unique(labels)
colors = plt.cm.get_cmap('viridis', len(unique_labels))

# t-SNE plotting
plt.figure(figsize=(12, 8))
for i, label in enumerate(unique_labels):
    indices = [j for j, x in enumerate(labels) if x == label]
    plt.scatter(tsne_results[indices, 0], tsne_results[indices, 1], color=colors(i), label=label, alpha=0.5)

plt.title('t-SNE Projection of Data')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

AttributeError: module 'matplotlib.cm' has no attribute 'get_cmap'