In [6]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import KernelPCA,FastICA, PCA
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

# Read the raw character sheet from disk.
df = pd.read_csv('Smash Ult.csv')

# The leading column is pulled out as a plain Python list (used later as the
# character labels); the working frame keeps every other column.
character_names = df.iloc[:, 0].to_list()
df = df.drop(columns=df.columns[0])

# Free-text columns that get vectorized further down. Missing entries are
# replaced with '' so every cell in these columns is a usable string.
text_columns = [
    'Summary',
    'Attributes',
    'Strengths',
    'Weaknesses',
    'Overall',
]
df[text_columns] = df[text_columns].fillna('')

# A single vectorizer object is reused; fit_transform is invoked once per
# column, so each column is fitted on its own text.
tfidf = TfidfVectorizer(stop_words='english')

# One TF-IDF matrix per text column, in the order given by text_columns.
tfidf_features = [
    tfidf.fit_transform(df[column].astype(str))
    for column in text_columns
]

# Densify each per-column matrix and join them side by side, giving one
# feature row per row of df.
features = np.concatenate(
    [matrix.toarray() for matrix in tfidf_features],
    axis=1,
)

# Drop rows whose feature vector contains NaN, but remember WHICH rows
# survive. The mask is reused below so that cluster labels and character
# names are attached to the rows they were computed from, instead of being
# assigned to the first len(features) rows regardless of what was dropped.
valid_rows = ~np.isnan(features).any(axis=1)
features = features[valid_rows]

# Cluster main data
if features.shape[0] == 0:
    # Raise rather than print + exit(): exit() is an interactive-shell helper,
    # whereas an exception stops execution at this point with a clear message
    # and guarantees nothing below runs without `clusters` being defined.
    raise ValueError("No valid data for clustering")

# n_clusters is capped by the number of rows so tiny inputs still fit.
# n_init is spelled out so the result does not depend on the library default.
kmeans = KMeans(n_clusters=min(3, features.shape[0]), n_init=10, random_state=42)
clusters = kmeans.fit_predict(features)

# Save clustered data. The Series is indexed by the surviving rows only, so
# pandas aligns each label with its own row; dropped rows get NaN in 'Cluster'.
df['Cluster'] = pd.Series(clusters, index=df.index[valid_rows])
df.to_csv('clustered_smash_data.csv', index=False)

# Project the features onto two components with PCA.
# NOTE: the `tsne*` names are kept so existing references keep working, but
# the projection used here is PCA, not t-SNE.
tsne = PCA(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(features)

# Combine character names with the 2-D projection, keeping only the names of
# rows that survived the NaN filter so names and coordinates stay aligned.
kept_names = [name for name, keep in zip(character_names, valid_rows) if keep]
tsne_df = pd.DataFrame({
    'Character': kept_names,
    'TSNE1': tsne_results[:, 0],
    'TSNE2': tsne_results[:, 1],
})

# Cluster the projected points (separately from the clustering of the full
# feature matrix above).
tsne_clusters = KMeans(
    n_clusters=min(5, tsne_results.shape[0]),
    n_init=10,
    random_state=42,
).fit_predict(tsne_results)
tsne_df['Cluster'] = tsne_clusters

# Visualization: scatter the 2-D projection, coloured by the cluster label
# stored in tsne_df, and label every point with its character name.
# The coordinates in tsne_df come from PCA (the TSNE* column names are
# historical), so the title and axis labels name PCA.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(tsne_df['TSNE1'], tsne_df['TSNE2'], c=tsne_df['Cluster'], cmap='viridis')

# Iterate positionally over the three columns so each label is paired with
# its own coordinates regardless of what the DataFrame index looks like.
for name, x_coord, y_coord in zip(tsne_df['Character'], tsne_df['TSNE1'], tsne_df['TSNE2']):
    ax.annotate(name, (x_coord, y_coord), fontsize=8)

ax.set_title('PCA Visualization of Smash Ultimate Characters')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.tight_layout()
# Output file name is unchanged so anything that reads it keeps working.
fig.savefig('smash_ultimate_tsne.png', dpi=400)
# Close this specific figure so it is released and not rendered inline again.
plt.close(fig)

print("Processing complete. Check 'clustered_smash_data.csv' for clustered data and 'smash_ultimate_tsne.png' for visualization.")

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Processing complete. Check 'clustered_smash_data.csv' for clustered data and 'smash_ultimate_tsne.png' for visualization.
