In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import os
import logging
from typing import List, Tuple

# Configure logging once for the whole script: INFO level and above, with each
# record rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)

def load_and_preprocess_data(file_path: str) -> Tuple[pd.DataFrame, List[str]]:
    """
    Read a CSV file and separate its first column from the remaining columns.

    The first column is returned as a plain list of names; every other column
    is returned unchanged as the feature table. No cleaning or scaling is
    performed here.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        Tuple[pd.DataFrame, List[str]]: All columns except the first, and the
        values of the first column as a list.
    """
    logger.info(f"Loading data from {file_path}")
    raw = pd.read_csv(file_path)

    # Column 0 carries the names; columns 1.. carry the features.
    features = raw.iloc[:, 1:]
    names = raw.iloc[:, 0].tolist()

    logger.info(f"Data loaded. Shape: {features.shape}")
    return features, names

def create_preprocessing_pipeline() -> Pipeline:
    """
    Build the two-step pipeline applied to numeric columns.

    Returns:
        Pipeline: An unfitted scikit-learn pipeline that first fills missing
        values with the column mean (``SimpleImputer``) and then applies
        ``StandardScaler``.
    """
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
    return Pipeline(steps)

def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    Turn the raw columns into a purely numeric feature table.

    Numeric columns are mean-imputed and standardized with the pipeline from
    ``create_preprocessing_pipeline``. All non-numeric columns are joined into
    one space-separated string per row and encoded with TF-IDF, keeping at
    most 100 terms.

    Args:
        data (pd.DataFrame): Raw input data.

    Returns:
        pd.DataFrame: The scaled numeric columns (keeping their original
        names) followed, when text columns exist, by one ``tfidf_<i>`` column
        per TF-IDF term. Numeric columns that contain no values at all are
        omitted. The result carries a fresh 0..n-1 index.
    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    text_columns = data.select_dtypes(exclude=[np.number]).columns

    # SimpleImputer(strategy='mean') discards columns that are entirely
    # missing, which would leave fewer output columns than labels. Dropping
    # them up front keeps the transformed values and their names aligned.
    numeric_frame = data[numeric_columns].dropna(axis=1, how='all')

    if numeric_frame.shape[1] > 0:
        numeric_pipeline = create_preprocessing_pipeline()
        numeric_data = pd.DataFrame(
            numeric_pipeline.fit_transform(numeric_frame),
            columns=numeric_frame.columns,
        )
    else:
        # The pipeline rejects a zero-feature input, so return an empty frame
        # with the right number of rows instead of fitting it.
        numeric_data = pd.DataFrame(index=pd.RangeIndex(len(data)))

    if text_columns.empty:
        return numeric_data

    logger.info(f"Text columns found: {text_columns}")
    tfidf = TfidfVectorizer(max_features=100)  # Adjust max_features as needed
    # One document per row: every text cell, with missing values as ''.
    documents = data[text_columns].fillna('').astype(str).agg(' '.join, axis=1)
    tfidf_matrix = tfidf.fit_transform(documents).toarray()

    # max_features is only an upper bound; a small vocabulary yields fewer
    # columns, so the labels are derived from the actual matrix width.
    text_data = pd.DataFrame(
        tfidf_matrix,
        columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
    )
    return pd.concat([numeric_data, text_data], axis=1)

def cluster_data(data: pd.DataFrame, n_clusters: int = 8) -> np.ndarray:
    """
    Assign every row of ``data`` to a K-means cluster.

    Args:
        data (pd.DataFrame): Preprocessed data.
        n_clusters (int): Number of clusters to form.

    Returns:
        np.ndarray: Cluster labels, as returned by ``KMeans.fit_predict``.
    """
    # A fixed seed and explicit n_init are passed so repeated runs use the
    # same configuration.
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = model.fit_predict(data)
    return labels

def apply_dimensionality_reduction(data: pd.DataFrame) -> np.ndarray:
    """
    Project the data onto three components with KernelPCA.

    Note: the reducer used here is ``KernelPCA`` with its default kernel
    settings, not t-SNE.

    Args:
        data (pd.DataFrame): Preprocessed data.

    Returns:
        np.ndarray: Reduced data with three components per row.
    """
    reducer = KernelPCA(n_components=3, random_state=42)
    embedding = reducer.fit_transform(data)
    return embedding

def get_image(path: str, zoom: float = 0.4) -> OffsetImage:
    """
    Read an image file and wrap it for use as a plot annotation.

    Args:
        path (str): Path to the image file.
        zoom (float): Zoom factor passed to ``OffsetImage``.

    Returns:
        OffsetImage: The loaded image wrapped with the given zoom.
    """
    pixels = plt.imread(path)
    return OffsetImage(pixels, zoom=zoom)

def create_scatter_plot(tsne_df: pd.DataFrame, output_path: str):
    """
    Draw each entity's image at its (x, y) position and save the figure.

    Images are looked up as ``images/<name>.png``, where ``<name>`` is the
    entity name lower-cased with spaces replaced by underscores. An entity
    whose image file is missing is skipped with a warning.

    Args:
        tsne_df (pd.DataFrame): Frame with ``x``, ``y`` and ``Entity`` columns.
        output_path (str): Path to save the output image.
    """
    fig, ax = plt.subplots(figsize=(20, 16))
    try:
        # Fully transparent markers: nothing is drawn, but the axes limits
        # are fitted to the data (artists added via add_artist do not update
        # the data limits on their own).
        ax.scatter(tsne_df['x'], tsne_df['y'], alpha=0)

        # zip pairs the columns by position, so rows are matched correctly
        # whatever index the frame carries.
        for character, x, y in zip(tsne_df['Entity'], tsne_df['x'], tsne_df['y']):
            # str() lets non-string names (e.g. numeric IDs) build a file name.
            file_stem = str(character).lower().replace(' ', '_')
            img_path = os.path.join('images', f"{file_stem}.png")
            try:
                image = get_image(img_path)
            except FileNotFoundError:
                logger.warning(f"Image not found for {character}")
                continue
            ax.add_artist(AnnotationBbox(image, (x, y), frameon=False))

        ax.set_title('Political Compass', fontsize=36)
        ax.set_xlabel('Component 1', fontsize=36)
        ax.set_ylabel('Component 2', fontsize=36)
        fig.tight_layout()
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    logger.info(f"Scatter plot saved to {output_path}")

def main(
    input_path: str = 'RawData.csv',
    clustered_output_path: str = 'main_data.csv',
    reduced_output_path: str = 'pros_data.csv',
    plot_output_path: str = 'clustering_images.png',
) -> None:
    """
    Run the whole pipeline: load, preprocess, cluster, reduce and plot.

    Every path defaults to the file name that used to be hard-coded, so
    calling ``main()`` without arguments reads and writes the same files as
    before.

    Args:
        input_path (str): CSV file to read; its first column holds the names.
        clustered_output_path (str): CSV that receives the preprocessed data
            plus a ``Cluster`` column.
        reduced_output_path (str): CSV that receives the ``x``/``y``/``z``
            components plus an ``Entity`` column.
        plot_output_path (str): Image file that receives the scatter plot.
    """
    # Load and preprocess data
    main_data, character_names = load_and_preprocess_data(input_path)
    processed_data = preprocess_data(main_data)

    # Cluster the processed data and save it alongside the labels. A copy is
    # used so the 'Cluster' column does not leak into the reduction step.
    clustered_data = processed_data.copy()
    clustered_data['Cluster'] = cluster_data(processed_data)
    clustered_data.to_csv(clustered_output_path, index=False)
    logger.info(f"Clustered data saved to '{clustered_output_path}'")

    # Apply dimensionality reduction and attach the names for plotting
    reduced = apply_dimensionality_reduction(processed_data)
    reduced_df = pd.DataFrame(reduced, columns=['x', 'y', 'z'])
    reduced_df['Entity'] = character_names
    reduced_df.to_csv(reduced_output_path, index=False)
    # The "t-SNE" wording is kept so existing log output stays unchanged.
    logger.info(f"t-SNE results saved to '{reduced_output_path}'")

    # Create and save the scatter plot
    create_scatter_plot(reduced_df, plot_output_path)

# Run the pipeline only when this file is executed as a script, then log a
# final completion message once main() has returned.
if __name__ == "__main__":
    main()
    logger.info("Processing complete.")

2024-08-30 09:04:50,477 - INFO - Loading data from RawData.csv
2024-08-30 09:04:50,480 - INFO - Data loaded. Shape: (32, 20)
2024-08-30 09:04:50,506 - INFO - Clustered data saved to 'main_data.csv'
2024-08-30 09:04:50,510 - INFO - t-SNE results saved to 'pros_data.csv'
2024-08-30 09:04:52,082 - INFO - Scatter plot saved to clustering_images.png
2024-08-30 09:04:52,083 - INFO - Processing complete.
