In [2]:
import pandas as pd

# Read the exported dataset from the working directory into a DataFrame
working_data = pd.read_csv(filepath_or_buffer='simplified_working_data.csv')

In [3]:
# Dump the loaded frame as plain text; pandas elides the middle rows ("...")
# and wraps wide column groups onto continuation blocks ("\").
print(working_data)

     ProjectReference                                              Title  \
0        AH/X000516/1                                    India 75 Events   
1        AH/N006011/1  The action-based brain: a provocation to philo...   
2        AH/M010813/1  Imprint. A Forensic and Historical Investigati...   
3        AH/K007084/1  Structural and typological variation in the di...   
4        AH/L007290/1  Judging images: the making, management and con...   
...               ...                                                ...   
2265     AH/M010589/1  Digital Technology and Human Vulnerability: To...   
2266     AH/P005039/1  Pinter Histories and Legacies: The Impact of H...   
2267     AH/K005936/1  Cultures, Communities and Connections in the H...   
2268     AH/V015761/1  Licensing System for 3D Printing in China - im...   
2269     AH/N504518/1  AHRC Leadership Fellowship Connected Communiti...   

       StartDate     EndDate  AwardPounds  \
0     2022-02-01  2022-12-31        40175 

In [7]:
from sklearn.cluster import KMeans
from tqdm import tqdm

# Replace every missing value in the frame with 0 so K-Means receives a
# matrix without NaNs. Rebinding gives the same result as inplace=True.
working_data = working_data.fillna(0)

# Feature matrix: every column from position 7 onward. 'Cluster' is excluded
# explicitly because this cell appends it to working_data; without the
# exclusion, re-running the cell would feed the previous labels back into
# the clustering as an extra feature.
feature_columns = working_data.columns[7:].drop('Cluster', errors='ignore')
X = working_data[feature_columns]

k = 15

# n_init is pinned to 10, the default this run was using, so the clustering
# does not change when the scikit-learn default for n_init changes.
# random_state fixes the centroid initialisation for reproducibility.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)

# fit_predict is a single blocking call, so this bar cannot show intermediate
# progress: it only jumps from 0 to 100 once clustering has finished.
with tqdm(total=100) as pbar:
    pbar.set_description("K-Means Clustering")
    working_data['Cluster'] = kmeans.fit_predict(X)
    pbar.update(100)



  super()._check_params_vs_input(X, default_n_init=10)
K-Means Clustering: 100%|███████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 187.65it/s]


In [8]:
# Print the frame again; this cell follows the clustering cell that assigns
# working_data['Cluster'].
print(working_data)

     ProjectReference                                              Title  \
0        AH/X000516/1                                    India 75 Events   
1        AH/N006011/1  The action-based brain: a provocation to philo...   
2        AH/M010813/1  Imprint. A Forensic and Historical Investigati...   
3        AH/K007084/1  Structural and typological variation in the di...   
4        AH/L007290/1  Judging images: the making, management and con...   
...               ...                                                ...   
2265     AH/M010589/1  Digital Technology and Human Vulnerability: To...   
2266     AH/P005039/1  Pinter Histories and Legacies: The Impact of H...   
2267     AH/K005936/1  Cultures, Communities and Connections in the H...   
2268     AH/V015761/1  Licensing System for 3D Printing in China - im...   
2269     AH/N504518/1  AHRC Leadership Fellowship Connected Communiti...   

       StartDate     EndDate  AwardPounds  \
0     2022-02-01  2022-12-31        40175 

In [13]:
# Boolean mask of the rows assigned to cluster 2
in_cluster_2 = working_data['Cluster'].eq(2)

# Titles of those rows, keeping their original row labels
cluster_2_titles = working_data.loc[in_cluster_2, 'Title']

print(cluster_2_titles)

31      Feeling Towns: the role of place and identity ...
48      Digital Engagement for Heritage-led Urban Rege...
54      Creating Space: A re-evaluation of the role of...
63                                Voices of War and Peace
91      Heritage-led urban regeneration: a scalable mo...
                              ...                        
2107    Creating a UK community partner network: build...
2141    Women's leadership in designing social innovat...
2228                                   Creative Peninsula
2249    Performing Abergavenny: creating a connected c...
2263            Our Criminal Past: Our Criminal Ancestors
Name: Title, Length: 77, dtype: object


In [25]:
# Number of rows per cluster, largest cluster first, as a two-column frame.
# The columns are named explicitly ('Cluster' for the label, 'Count' for the
# size) rather than by overwriting the column list by position.
cluster_counts = (
    working_data['Cluster']
    .value_counts()
    .rename_axis('Cluster')
    .reset_index(name='Count')
)

print(cluster_counts)

    Cluster  Count
0        11    273
1         4    251
2        13    219
3        10    187
4        14    175
5         0    143
6         8    139
7         7    136
8         3    128
9         6    122
10        9    117
11        1    117
12        5    100
13       12     86
14        2     77


In [30]:
# Topic-score columns picked by position: the slice 7:39 covers positions
# 7 through 38, i.e. the 8th through 39th columns (32 columns in total).
topic_columns = working_data.columns[7:39]

# Per-cluster mean of each topic score, rounded to two decimals, with the
# cluster sizes from cluster_counts joined on the 'Cluster' key.
cluster_means = (
    working_data
    .groupby('Cluster')[topic_columns]
    .mean()
    .round(2)
    .merge(cluster_counts, on='Cluster')
)

print(cluster_means)


    Cluster  Topic 1  Topic 2  Topic 3  Topic 4  Topic 5  Topic 6  Topic 7  \
0         0     0.01     0.00     0.00     0.00      0.0     0.14     0.07   
1         1     0.00     0.00     0.00     0.00      0.0     0.66     0.04   
2         2     0.01     0.00     0.00     0.00      0.0     0.19     0.07   
3         3     0.02     0.00     0.01     0.00      0.0     0.04     0.04   
4         4     0.02     0.00     0.00     0.01      0.0     0.38     0.09   
5         5     0.01     0.00     0.00     0.00      0.0     0.03     0.74   
6         6     0.02     0.01     0.02     0.04      0.0     0.02     0.02   
7         7     0.04     0.01     0.00     0.01      0.0     0.08     0.05   
8         8     0.03     0.00     0.00     0.00      0.0     0.06     0.06   
9         9     0.38     0.00     0.00     0.00      0.0     0.06     0.13   
10       10     0.01     0.00     0.00     0.01      0.0     0.04     0.01   
11       11     0.02     0.00     0.00     0.01      0.0     0.2

In [31]:
# Write both result tables to CSV without the row index:
# the per-cluster summary first, then the full frame with cluster labels.
csv_exports = {
    "cluster_means.csv": cluster_means,
    "working_data_with_topic_distribution.csv": working_data,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)