In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import os

# Build a matplotlib image artist from a file on disk
def get_image(path, zoom=0.4):
    """Read the image at ``path`` and wrap it in an ``OffsetImage``.

    ``zoom`` is passed through unchanged as the ``OffsetImage`` zoom factor.
    """
    pixels = plt.imread(path)
    return OffsetImage(pixels, zoom=zoom)

# Read the raw table; the first column is taken as the character names
data = pd.read_csv('RawData.csv')
character_names = data.iloc[:, 0].tolist()
main_data = data.iloc[:, 1:]

# Split the remaining columns by dtype
numeric_columns = main_data.select_dtypes(include=[np.number]).columns
text_columns = main_data.select_dtypes(exclude=[np.number]).columns

# Numeric pipeline: fill gaps with the column mean, then standardize
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
imputed_values = imputer.fit_transform(main_data[numeric_columns])
scaled_values = scaler.fit_transform(imputed_values)
numeric_data = pd.DataFrame(scaled_values, columns=numeric_columns)

# Show the (rows, columns) shape of the non-numeric part; it is not used below
print(main_data[text_columns].shape)

# Assemble the feature matrix (only the numeric frame is included)
processed_data = pd.concat([numeric_data], axis=1)

# Fit k-means (8 clusters, fixed seed) on the feature matrix and read back
# the label assigned to each row
kmeans = KMeans(n_clusters=8, random_state=42)
kmeans.fit(processed_data)
main_clusters = kmeans.labels_

# Write the features together with their cluster label to disk
clustered_data = processed_data.assign(Cluster=main_clusters)
clustered_data.to_csv('main_data.csv', index=False)

# Apply TSNE
# A fixed seed makes the embedding (and so the saved CSV and the plot)
# repeatable across runs, matching the seeded KMeans step. scikit-learn
# requires perplexity < n_samples, so its default of 30 is capped for small
# tables; with more than 30 rows the value is unchanged.
tsne = TSNE(
    n_components=3,
    perplexity=min(30.0, len(processed_data) - 1),
    random_state=42,
)
tsne_result = tsne.fit_transform(processed_data)

# Combine TSNE results with character names
tsne_df = pd.DataFrame(tsne_result, columns=['x', 'y', 'z'])
tsne_df['Entity'] = character_names
tsne_df.to_csv("pros_data.csv")

# Create a graph with images
plt.figure(figsize=(20, 16))
# alpha=0 makes the markers invisible; presumably they are drawn only so the
# axes limits cover every point before the images are placed on top.
scatter = plt.scatter(tsne_df['x'], tsne_df['y'], alpha=0)
axes = plt.gca()  # fetched once instead of on every iteration

# Only the first two embedding components are plotted.
for character, x_pos, y_pos in zip(tsne_df['Entity'], tsne_df['x'], tsne_df['y']):
    # str() keeps a blank (NaN) or numeric name from crashing on .lower()
    file_stem = str(character).lower().replace(' ', '_')
    img_path = os.path.join('images', f"{file_stem}.png")
    # Keep the try body to the one call that can hit a missing file
    try:
        portrait = get_image(img_path)
    except FileNotFoundError:
        print(f"Image not found for {character}")
        continue
    axes.add_artist(AnnotationBbox(portrait, (x_pos, y_pos), frameon=False))

plt.title('Political Compass', fontsize=36)
plt.xlabel('Component 1', fontsize=36)
plt.ylabel('Component 2', fontsize=36)
plt.tight_layout()
plt.savefig('clustering_images.png', dpi=300, bbox_inches='tight')
plt.close()

# Report the files this script actually writes
print("Processing complete. Results saved to 'main_data.csv', 'pros_data.csv' and 'clustering_images.png'.")

(32, 0)
Image not found for Xi Jingping
Image not found for Martin Luther King Jr.
Image not found for Malcolm X
Image not found for Napoleon
Image not found for Vladmir Putin
Image not found for Vladmir Lenin
Image not found for Karl Marx
Image not found for Joseph Stalin
Image not found for Hugo Chavez
Image not found for Adolf Hitler
Image not found for Augusto Pinochet
Image not found for Julius Caesar
