In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import os
import spacy

In [6]:
# Function to load and resize image
def get_image(path, zoom=0.3):
    return OffsetImage(plt.imread(path), zoom=zoom)
def preprocess_data():
    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")

    # Function to lemmatize text using spaCy
    def lemmatize_text(text):
        doc = nlp(text)
        return " ".join([token.lemma_ for token in doc])
    

    # Load the data
    data = pd.read_csv('raw_data.csv')

    # Extract character names and remove from main data
    character_names = data.iloc[:, 0].tolist()
    main_data = data.iloc[:, 1:]

    # Identify numeric and text columns
    numeric_columns = main_data.select_dtypes(include=[np.number]).columns
    text_columns = main_data.select_dtypes(exclude=[np.number]).columns

    # Preprocess numeric columns
    scaler = StandardScaler()
    imputer = SimpleImputer(strategy='mean')
    numeric_data = pd.DataFrame(scaler.fit_transform(imputer.fit_transform(main_data[numeric_columns])), 
                                columns=numeric_columns)

    print(main_data[text_columns].shape)

    # Preprocess text columns using TfidfVectorizer and spaCy lemmatization
    tfidf = TfidfVectorizer()  # You can adjust max_features as needed
    text_data = main_data[text_columns].fillna('')
    text_data_combined = text_data.apply(lambda x: ' '.join(x), axis=1)
    lemmatized_text = text_data_combined.apply(lemmatize_text)
    tfidf_matrix = tfidf.fit_transform(lemmatized_text)
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

    # Combine processed data
    processed_data = pd.concat([numeric_data,tfidf_df], axis=1)
    return [processed_data,character_names]

def decompose_data(processed_data,character_names):
    # Apply TSNE
    processed_data

    tsne = PCA(n_components=1)
    tsne_1d = tsne.fit_transform(processed_data)
    tsne_1d = pd.DataFrame(tsne_1d, columns=['x'])
    tsne_1d.insert(loc = 0,
          column = "Character",
          value = character_names)
    tsne_1d.to_csv("data-1d.csv",index=False)

    tsne2 = PCA(n_components=2)
    tsne_2d = tsne2.fit_transform(processed_data)
    tsne_2d = pd.DataFrame(tsne_2d, columns=['x', 'y'])
    tsne_2d.insert(loc = 0,
          column = "Character",
          value = character_names)
    tsne_2d.to_csv("data-2d.csv",index=False)

    tsne3 = KernelPCA(n_components=3)
    tsne_3d = tsne3.fit_transform(processed_data)
    tsne_3d = pd.DataFrame(tsne_3d, columns=['x', 'y', 'z'])
    tsne_3d.insert(loc = 0,
          column = "Character",
          value = character_names)
    tsne_3d.to_csv("data-3d.csv",index=False)

    # Create a graph with images
    plt.figure(figsize=(20, 16))
    scatter = plt.scatter(tsne_2d['x'], tsne_2d['y'], alpha=0)

    for i, character in enumerate(tsne_2d['Character']):
        try:
            img_path = os.path.join('pokemon_images', f"{character.lower().replace(' ', '_')}.png")
            ab = AnnotationBbox(get_image(img_path), (tsne_2d['x'][i], tsne_2d['y'][i]), frameon=False)
            plt.gca().add_artist(ab)
        except FileNotFoundError:
            print(f"Image not found for {character}")

    plt.title('KPCA Clustering of Pokemon Characters', fontsize=36)
    plt.xlabel('Component 1', fontsize=36)
    plt.ylabel('Component 2', fontsize=36)
    plt.tight_layout()
    plt.savefig('KPCA_clustering_images.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("Processing complete. Results saved to 'clustered_main_data.csv' and 'TSNE_clustering_images.png'.")

In [7]:
df,chars = preprocess_data()
decompose_data(df,chars)

(68, 22)
Processing complete. Results saved to 'clustered_main_data.csv' and 'TSNE_clustering_images.png'.


In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np

def perform_cluster_analysis(df, output_csv_path, elbow_graph_path, max_clusters=10):
    df = df.fillna(-1)
    # Step 1: Separate labels (assumed to be in the first column)
    labels = df.iloc[:, 0]
    data = df.iloc[:, 1:]

    # Step 2: Standardize the data (since KMeans is sensitive to scale)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    # Step 3: Perform the Elbow Method to find the optimal number of clusters
    inertia = []
    cluster_range = range(1, max_clusters+1)
    
    for k in cluster_range:
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(scaled_data)
        inertia.append(kmeans.inertia_)
    
    # Step 4: Plot the Elbow graph and save it
    plt.figure(figsize=(8, 6))
    plt.plot(cluster_range, inertia, 'bo-', color='b')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Inertia')
    plt.title('Elbow Method for Optimal k')
    plt.savefig(elbow_graph_path)
    plt.close()

    # Step 5: Choose the optimal number of clusters manually (for now, based on the Elbow graph)
    # Alternatively, you could implement automatic elbow detection with additional logic
    optimal_k = int(input("Enter the optimal number of clusters based on the elbow graph: "))

    # Step 6: Perform KMeans clustering with the chosen number of clusters
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    clusters = kmeans.fit_predict(scaled_data)

    df = df.fillna(-1)
    # Step 7: Create a new dataframe to hold the labels and cluster assignments
    df['clusters'] = clusters

    # Step 8: Save the new dataframe with cluster assignments to CSV
    df.to_csv(output_csv_path, index=False)
    
    print(f"Cluster analysis complete. Results saved to {output_csv_path} and Elbow graph to {elbow_graph_path}.")

# Test Raw Data
#df = pd.read_csv('raw_data.csv')
#perform_cluster_analysis(df, 'raw_clustered_output.csv', 'raw_elbow_graph.png')
# Test 1-D Data
df = pd.read_csv('data-1d.csv')
perform_cluster_analysis(df, '1d_clustered_output.csv', '1d_elbow_graph.png')


  plt.plot(cluster_range, inertia, 'bo-', color='b')


Cluster analysis complete. Results saved to 1d_clustered_output.csv and Elbow graph to 1d_elbow_graph.png.
