In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
# Load data
data = pd.read_csv(
    "sample_data/English_Courses_Classified_1937-1939_2024.csv"
)  # Replace with your actual file path

# Build a Lecturer x Category count matrix: one row per lecturer, one column
# per category, each cell holding the number of CSV rows for that pair
# (0 where a lecturer has no rows in a category).
lecturer_category_df = (
    data.groupby(["Lecturer", "Category"]).size().unstack(fill_value=0)
)

# Perform KMeans clustering on lecturers, using the per-category counts as
# the feature vector of each lecturer.
num_clusters = 5  # Choose an appropriate number of clusters
kmeans_lecturer = KMeans(n_clusters=num_clusters, random_state=0)
lecturer_clusters = kmeans_lecturer.fit_predict(lecturer_category_df)

# Add cluster information to the dataframe
lecturer_category_df["Cluster"] = lecturer_clusters

# Perform PCA for visualization and add Lecturer names.
# "Cluster" is dropped first so that the 2-D projection is computed from the
# category counts only, not from the cluster label that was just appended.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(lecturer_category_df.drop("Cluster", axis=1))
lecturer_category_df["Component 1"] = pca_result[:, 0]
lecturer_category_df["Component 2"] = pca_result[:, 1]
lecturer_category_df["Lecturer"] = lecturer_category_df.index

# Plot with detailed labels for lecturers
plt.figure(figsize=(14, 8))
sns.scatterplot(
    data=lecturer_category_df,
    x="Component 1",
    y="Component 2",
    hue="Cluster",
    palette="viridis",
    s=100,
    edgecolor="k",
)
# Annotate every point with its lecturer name. The dataframe is indexed by
# Lecturer labels rather than 0..n-1 positions, so `series[i]` with an integer
# `i` would depend on pandas' deprecated positional fallback (or hit the wrong
# row if the labels were integers). Iterating the values avoids the lookup.
for x_pos, y_pos, lecturer_name in zip(
    lecturer_category_df["Component 1"],
    lecturer_category_df["Component 2"],
    lecturer_category_df["Lecturer"],
):
    plt.text(x_pos, y_pos, lecturer_name, fontsize=9)
plt.title("Clustering of Lecturers by Course Category")
plt.xlabel("Course Category Dimension 1")
plt.ylabel("Course Category Dimension 2")
plt.legend(title="Cluster")
plt.show()

# Prepare data for clustering based on Category and Lecturers who taught it.
# `data` is the DataFrame read from the CSV earlier in this notebook. The
# result has one row per category and one column per lecturer, each cell
# holding the number of CSV rows for that pair (0 where there are none).
category_lecturer_df = (
    data.groupby(["Category", "Lecturer"]).size().unstack(fill_value=0)
)

# Perform KMeans clustering on categories, using the per-lecturer counts as
# the feature vector of each category.
kmeans_category = KMeans(n_clusters=3, random_state=0)
category_clusters = kmeans_category.fit_predict(category_lecturer_df)

# Add cluster information to the dataframe
category_lecturer_df["Cluster"] = category_clusters

# Perform PCA for visualization and add Category names.
# "Cluster" is dropped first so that the 2-D projection is computed from the
# lecturer counts only, not from the cluster label that was just appended.
pca_category = PCA(n_components=2)
pca_result_category = pca_category.fit_transform(
    category_lecturer_df.drop("Cluster", axis=1)
)
category_lecturer_df["Component 1"] = pca_result_category[:, 0]
category_lecturer_df["Component 2"] = pca_result_category[:, 1]
category_lecturer_df["Category"] = category_lecturer_df.index

# Plot with detailed labels for categories
plt.figure(figsize=(14, 8))
sns.scatterplot(
    data=category_lecturer_df,
    x="Component 1",
    y="Component 2",
    hue="Cluster",
    palette="viridis",
    s=100,
    edgecolor="k",
)
# Annotate every point with its category name. The dataframe is indexed by
# Category labels rather than 0..n-1 positions, so `series[i]` with an integer
# `i` would depend on pandas' deprecated positional fallback (or hit the wrong
# row if the labels were integers). Iterating the values avoids the lookup.
for x_pos, y_pos, category_name in zip(
    category_lecturer_df["Component 1"],
    category_lecturer_df["Component 2"],
    category_lecturer_df["Category"],
):
    plt.text(x_pos, y_pos, category_name, fontsize=9)
plt.title("Clustering of Course Categories by Lecturers")
plt.xlabel("Lecturer Dimension 1")
plt.ylabel("Lecturer Dimension 2")
plt.legend(title="Cluster")
plt.show()