In [None]:
!pip install -U sentence-transformers
!pip install umap-learn

In [None]:
from tqdm import trange
from datetime import datetime
import random
import umap
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer, util
from google.colab import files

In [None]:
# Sentence-embedding model used below to vectorize the aspect terms.
# The checkpoint name indicates a multilingual model; the aspect tables mix
# English and German terms.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [None]:
# Load the aspect tables produced by the two extraction runs and stack them.
aspects1 = pd.read_csv('PyABSA_FinalOutputWithoutCategory.csv')
aspects2 = pd.read_csv('Approch2_FinalOutputWithoutCategory.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the row labels of the two inputs overlap, so any label-based lookup (.loc)
# on `aspects` would return rows from both files.
aspects = pd.concat([aspects1, aspects2], ignore_index=True)

In [None]:
# Display the combined aspect table (bare last expression -> rich notebook output).
aspects

Unnamed: 0,Category,Language,Aspect,Count,Negative,Neutral,Positive
0,PostDoc,English,system,69,58,6,5
1,PostDoc,English,job,32,13,16,3
2,PostDoc,English,staff,30,18,9,3
3,PostDoc,English,thread,27,2,3,22
4,PostDoc,English,position,27,6,19,2
...,...,...,...,...,...,...,...
481,Unknown,German,wissenschaftler,201,142,44,15
482,Unknown,German,wissenschaftssystem,53,36,13,4
483,Unknown,German,woche,95,47,34,14
484,Unknown,German,zeit,190,119,50,21


In [None]:
# Collapse the combined table to one row per aspect term with its total count.
agg_dict = dict(Count=["sum"])
aspectList = (
    aspects.groupby("Aspect")
    .agg(agg_dict)
    # The list-valued spec yields a ("Count", "sum") column MultiIndex;
    # drop the inner level so the column is plain "Count".
    .droplevel(1, axis=1)
    # Turn the "Aspect" group keys back into a regular column.
    .reset_index()
)

In [None]:
# Display the per-aspect totals (columns: Aspect, Count).
aspectList

Unnamed: 0,Aspect,Count
0,academia,848
1,academic,534
2,account,11
3,ali,14
4,amp,726
...,...,...
241,wort,205
242,year,518
243,zeit,1264
244,zukunft,197


In [None]:
# Embed every unique aspect term. encode() is documented for a string or a
# list of strings, so hand it a plain list instead of a pandas Series; the
# result then does not depend on how the Series happens to be indexed.
aspect_embeddings = model.encode(aspectList["Aspect"].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
def generate_clusters(n_clusters, input_embeddings, random_state=42):
    """Fit a KMeans model with `n_clusters` clusters on `input_embeddings`.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to form.
    input_embeddings : array-like
        Data passed unchanged to ``KMeans.fit``.
    random_state : int or None, default 42
        Seed for the centroid initialisation. A fixed seed makes repeated
        fits on the same data return the same labels, so the clustering
        scored during the search can be reproduced afterwards. The default
        mirrors the fixed seed used by ``generate_umap_embeddings``; pass
        ``None`` to get a differently initialised fit on every call.

    Returns
    -------
    The fitted KMeans estimator (cluster assignments are in ``.labels_``).
    """
    estimator = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    return estimator.fit(input_embeddings)

def generate_umap_embeddings(embeddings, n_neighbors, n_components):
    """Reduce `embeddings` to `n_components` dimensions with UMAP.

    The reducer uses the euclidean metric and a fixed random_state of 42.

    Parameters
    ----------
    embeddings : array-like
        Input vectors, passed unchanged to ``fit_transform``.
    n_neighbors : int
        UMAP ``n_neighbors`` setting.
    n_components : int
        Dimensionality of the returned embedding.

    Returns
    -------
    The output of ``umap.UMAP(...).fit_transform(embeddings)``.
    """
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        metric='euclidean',
        random_state=42,
    )
    return reducer.fit_transform(embeddings)

def score_clusters(embeddings, clusters):
    """Score a fitted clustering.

    Parameters
    ----------
    embeddings : array-like
        The data the clustering was fitted on; forwarded to ``silhouette_score``.
    clusters : object
        Fitted estimator exposing the per-sample assignments as ``labels_``.

    Returns
    -------
    tuple ``(score, cost, label_count)``
        score : silhouette score, or 0 when it is not computed (see below).
        cost : fraction of samples carrying the label -1 (treated as
            "unclassified"), or 0 when no sample has that label.
        label_count : number of distinct labels, including -1 if present.
    """
    labels, counts = np.unique(clusters.labels_, return_counts=True)
    total_num = len(clusters.labels_)
    n_labels = len(labels)

    # Percentage of unclassified points. np.unique returns the labels sorted,
    # so a -1 label can only sit in the first slot. The size check keeps an
    # empty label array from raising IndexError.
    if n_labels > 0 and labels[0] == -1:
        cost = counts[0] / total_num
    else:
        cost = 0

    # Clustering score. silhouette_score raises ValueError unless the number
    # of labels is at most n_samples - 1, so a clustering that puts every
    # sample in its own cluster falls back to 0 like the too-few-labels case.
    if 2 < n_labels < total_num:
        score = silhouette_score(embeddings, clusters.labels_)
    else:
        score = 0

    return score, cost, n_labels

def random_search(embeddings = None, cluster_params_space = None, use_umap = False, umap_params_space=None, num_evals = 10, print_time_logs = False, seed = None):
    """Randomly sample clustering hyper-parameters and score each draw.

    Every evaluation draws ``n_clusters`` from ``cluster_params_space`` and,
    when ``use_umap`` is true, ``n_neighbors`` and ``n_components`` from
    ``umap_params_space``. The data is optionally reduced with
    ``generate_umap_embeddings``, clustered with ``generate_clusters`` and
    scored with ``score_clusters``.

    Parameters
    ----------
    embeddings : array-like
        Vectors to cluster. Required.
    cluster_params_space : mapping
        Must provide a sequence of candidates under ``'n_clusters'``. Required.
    use_umap : bool, default False
        Reduce the embeddings with UMAP before clustering.
    umap_params_space : mapping, optional
        Must provide sequences under ``'n_neighbors'`` and ``'n_components'``.
        Required when ``use_umap`` is true.
    num_evals : int, default 10
        Number of random draws to evaluate.
    print_time_logs : bool, default False
        Print how long each stage of each evaluation took.
    seed : int, optional
        Seed for a private sampler so the same candidates are drawn on every
        call. With the default ``None`` the module-level ``random`` state is
        used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per evaluation with the sampled parameters plus ``score``,
        ``cost`` and ``label_count``, sorted by ``score`` in descending order.
        The index still holds the evaluation number, so take the best row by
        position (``.iloc[0]``), not by label.

    Raises
    ------
    ValueError
        If a required argument is missing.
    """
    if embeddings is None:
        raise ValueError("embeddings must be provided")
    if cluster_params_space is None:
        raise ValueError("cluster_params_space must be provided")
    if use_umap and umap_params_space is None:
        raise ValueError("umap_params_space must be provided when use_umap=True")

    # A private generator keeps a seeded search from disturbing (or being
    # disturbed by) other users of the global `random` state.
    sampler = random if seed is None else random.Random(seed)

    columns = ['n_clusters', 'score', 'cost', 'label_count']
    if use_umap:
        columns = ['n_neighbors', 'n_components'] + columns

    checkpoint = None
    if print_time_logs:
        checkpoint = datetime.now()
        print(checkpoint)

    def log_elapsed(stage):
        # Print the time since the previous checkpoint, then start a new one.
        nonlocal checkpoint
        if print_time_logs:
            print(stage + " took " + str(datetime.now() - checkpoint))
            checkpoint = datetime.now()

    results = []

    for _ in trange(num_evals):
        record = {}

        # Umap specific params
        if use_umap:
            record['n_neighbors'] = sampler.choice(umap_params_space['n_neighbors'])
            record['n_components'] = sampler.choice(umap_params_space['n_components'])
            updated_embeddings = generate_umap_embeddings(embeddings = embeddings,
                                                          n_neighbors = record['n_neighbors'],
                                                          n_components = record['n_components'])
            log_elapsed("UMAP Embeddings")
        else:
            updated_embeddings = embeddings

        # Clustering specific params
        record['n_clusters'] = sampler.choice(cluster_params_space['n_clusters'])
        clusters = generate_clusters(n_clusters = record['n_clusters'],
                                     input_embeddings = updated_embeddings)
        log_elapsed("Clustering")

        record['score'], record['cost'], record['label_count'] = score_clusters(updated_embeddings, clusters)
        log_elapsed("Scoring")

        results.append(record)

    # `columns` fixes the column order (and the header when there are no rows).
    result_df = pd.DataFrame(results, columns=columns)

    return result_df.sort_values(by='score', ascending=False)

In [None]:
# Candidate values sampled by the search.
cluster_params_space = {
    "n_clusters": range(10, 40)
}

umap_params_space = {
    "n_neighbors": range(2, 15),
    "n_components": range(2, 15)
}

# random_search() draws its candidates with the `random` module; seed it so
# the same 30 parameter combinations are sampled each time this cell runs.
random.seed(42)

random_search_output = random_search(embeddings=aspect_embeddings,
                                     cluster_params_space=cluster_params_space,
                                     use_umap=True,
                                     umap_params_space=umap_params_space,
                                     num_evals=30)

100%|██████████| 30/30 [00:59<00:00,  1.97s/it]


In [None]:
# Display the search results; random_search() returns them sorted by score, best first.
random_search_output

Unnamed: 0,n_neighbors,n_components,n_clusters,score,cost,label_count
15,2,6,33,0.655057,0,33
18,3,11,31,0.655001,0,31
19,3,9,39,0.629541,0,39
14,3,12,20,0.577579,0,20
2,4,13,30,0.548848,0,30
16,6,10,36,0.528296,0,36
3,4,8,39,0.522271,0,39
25,5,14,37,0.521695,0,37
7,2,9,20,0.508059,0,20
29,10,4,39,0.486649,0,39


In [None]:
# random_search() returns its rows sorted by score (best first) but keeps the
# evaluation number as the index. Select the best run by position: a
# hard-coded label such as .loc[15] only points at the winner of one
# particular run and picks an arbitrary row after any re-run.
best_params = random_search_output.iloc[0]

print("Computing output for following params-")
print(best_params)

# The row is a float Series, so cast the sampled parameters back to int.
umap_embeddings = generate_umap_embeddings(embeddings = aspect_embeddings,
                                           n_neighbors = int(best_params['n_neighbors']),
                                           n_components = int(best_params['n_components']))

# Clustering specific params
clusters = generate_clusters(n_clusters = int(best_params['n_clusters']),
                             input_embeddings = umap_embeddings)

# One row per aspect term with its cluster label and total frequency.
df = pd.DataFrame({'Aspect': aspectList["Aspect"], 'Cluster': clusters.labels_, 'Frequency': aspectList['Count']})

# Per cluster: number of aspect terms and their summed frequency.
agg_dict = {'Aspect': ['count'], 'Frequency': ['sum']}
df1 = df.groupby('Cluster').agg(agg_dict).reset_index()
df1

Computing output for following params-
n_neighbors      2.000000
n_components     6.000000
n_clusters      33.000000
score            0.655057
cost             0.000000
label_count     33.000000
Name: 15, dtype: float64


Unnamed: 0_level_0,Cluster,Aspect,Frequency
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
0,0,6,997
1,1,8,5639
2,2,10,2013
3,3,20,6149
4,4,7,316
5,5,4,1235
6,6,30,8711
7,7,5,1213
8,8,12,1543
9,9,10,2367


In [None]:
# Inspect the aspect terms assigned to cluster 0.
df.loc[df['Cluster'].eq(0)]

Unnamed: 0,Aspect,Cluster,Frequency
10,arbeitsbedingungen,0,276
46,condition,0,103
47,conditions,0,32
185,situation,0,562
186,staat,0,13
239,working conditions,0,11


In [None]:
# Name each cluster after its most frequent aspect term.
# The string 'max' selects pandas' own group maximum; passing the Python
# builtin `max` is deprecated in recent pandas releases.
idx = df.groupby('Cluster')['Frequency'].transform('max') == df['Frequency']
# Several aspects can tie for a cluster's maximum. Keep only the first one per
# cluster so the merge on 'Cluster' that follows yields exactly one category
# per aspect instead of duplicating every row of the tied cluster.
cateogries = df[idx].drop_duplicates(subset='Cluster').sort_values('Cluster')[['Aspect', 'Cluster']]
cateogries

Unnamed: 0,Aspect,Cluster
185,situation,0
100,jahr,1
146,politik,2
7,arbeit,3
65,ergebnis,4
149,postdoc,5
74,forschung,6
22,befristung,7
137,nachwuchs,8
158,professur,9


In [None]:
# Attach each cluster's representative aspect to every aspect in that cluster.
# Both frames have an 'Aspect' column, so the merge suffixes them:
# 'Aspect_x' is the member term, 'Aspect_y' the cluster representative.
final_categories = pd.merge(df, cateogries, how='inner', on='Cluster', suffixes=('_x', '_y'))
final_categories

Unnamed: 0,Aspect_x,Cluster,Frequency,Aspect_y
0,academia,17,848,hochschule
1,academic,17,534,hochschule
2,hochschule,17,1112,hochschule
3,hochschulen,17,112,hochschule
4,semester,17,170,hochschule
...,...,...,...,...
241,wissenschaftlerinnen,13,26,wissenschaft
242,wissenschaftssystem,13,499,wissenschaft
243,track,32,139,track
244,way,32,64,track


In [None]:
# Give the merge-suffixed columns their final names.
final_categories = final_categories.rename(
    columns={"Aspect_x": "Aspect", "Aspect_y": "Aspect_Category"}
)

In [None]:
# Display the aspect -> category table after the column rename.
final_categories

Unnamed: 0,Aspect,Cluster,Frequency,Aspect_Category
0,academia,17,848,hochschule
1,academic,17,534,hochschule
2,hochschule,17,1112,hochschule
3,hochschulen,17,112,hochschule
4,semester,17,170,hochschule
...,...,...,...,...
241,wissenschaftlerinnen,13,26,wissenschaft
242,wissenschaftssystem,13,499,wissenschaft
243,track,32,139,track
244,way,32,64,track


In [None]:
# Write the aspect -> category mapping to CSV (without the index column) and
# pass the file to Colab's download helper.
final_categories.to_csv("Final_Aspect_Categories.csv", index=False)
files.download("Final_Aspect_Categories.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Assign Categories to Aspects

In [None]:
import pandas as pd
from google.colab import files
# Load the aspect -> category mapping from "Final_Aspect_Categories.csv",
# the file the clustering section saves.
final_categories = pd.read_csv("Final_Aspect_Categories.csv")

In [None]:
# Add the cluster-derived category to the PyABSA aspect table.
aspectList1 = pd.read_csv('PyABSA_FinalOutputWithoutCategory.csv')
# Inner join: rows whose Aspect has no entry in final_categories are dropped.
finalList1 = pd.merge(
    aspectList1,
    final_categories.loc[:, ['Aspect', 'Aspect_Category']],
    how='inner',
    on='Aspect',
).loc[:, ['Category', 'Language', 'Aspect', 'Count', 'Aspect_Category', 'Negative', 'Neutral', 'Positive']]

finalList1.to_csv("PyABSA_FinalOutputWithCategory.csv", index=False)
files.download("PyABSA_FinalOutputWithCategory.csv")

In [None]:
# Display the PyABSA table with its added Aspect_Category column.
finalList1

Unnamed: 0,Category,Language,Aspect,Count,Aspect_Category,Negative,Neutral,Positive
0,PostDoc,English,system,69,system,58,6,5
1,PostDoc,German,system,177,system,151,12,14
2,PhDStudent,German,system,44,system,34,5,5
3,Prof,English,system,33,system,24,8,1
4,Prof,German,system,141,system,121,8,12
...,...,...,...,...,...,...,...,...
377,Unknown,German,lehrer,11,professur,7,2,2
378,Unknown,German,beruf,11,forschung,6,3,2
379,Unknown,German,tag,11,tag,4,4,3
380,Unknown,German,account,11,forschung,8,2,1


In [None]:
# Add the cluster-derived category to the "Approch2" aspect table.
aspectList2 = pd.read_csv('Approch2_FinalOutputWithoutCategory.csv')
# Inner join: rows whose Aspect has no entry in final_categories are dropped.
finalList2 = pd.merge(
    aspectList2,
    final_categories.loc[:, ['Aspect', 'Aspect_Category']],
    how='inner',
    on='Aspect',
).loc[:, ['Category', 'Language', 'Aspect', 'Count', 'Aspect_Category', 'Negative', 'Neutral', 'Positive']]

finalList2.to_csv("Approch2_FinalOutputWithCategory.csv", index=False)
files.download("Approch2_FinalOutputWithCategory.csv")

In [None]:
# Display the "Approch2" table with its added Aspect_Category column.
finalList2

Unnamed: 0,Category,Language,Aspect,Count,Aspect_Category,Negative,Neutral,Positive
0,Others,English,academia,89,hochschule,43,28,18
1,PhD Student,English,academia,82,hochschule,47,12,23
2,PostDoc,English,academia,339,hochschule,200,65,74
3,Professor,English,academia,189,hochschule,122,24,43
4,Unknown,English,academia,110,hochschule,72,21,17
...,...,...,...,...,...,...,...,...
481,Professor,German,habil,61,bereich,43,11,7
482,Professor,German,liebe,53,bereich,26,14,13
483,Professor,German,punkt,90,forschung,43,30,17
484,Professor,German,sicherheit,56,chance,31,16,9


In [None]:
# Add the cluster-derived category to the "Approch1" aspect table; all of its
# original columns are kept and Aspect_Category is appended.
aspectList3 = pd.read_csv('Approch1_FinalOutputWithoutCategory.csv')
# Inner join: rows whose Aspect has no entry in final_categories are dropped.
finalList3 = pd.merge(
    aspectList3,
    final_categories.loc[:, ['Aspect', 'Aspect_Category']],
    how='inner',
    on='Aspect',
)

finalList3.to_csv("Approch1_FinalOutputWithCategory.csv", index=False)
files.download("Approch1_FinalOutputWithCategory.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Display the "Approch1" table with its added Aspect_Category column.
finalList3

Unnamed: 0,Language,Category,Aspect,Count,Negative,Neutral,Positive,Negative_Probability,Neutral_Probability,Positive_Probability,Aspect_Category
0,English,Others,academia,89,36,43,10,0.431878,0.369647,0.198475,hochschule
1,English,PhD Student,academia,82,46,16,20,0.456661,0.313883,0.229457,hochschule
2,English,PostDoc,academia,339,133,166,40,0.431449,0.377166,0.191385,hochschule
3,English,Professor,academia,189,86,77,26,0.479376,0.340549,0.180076,hochschule
4,English,Unknown,academia,110,63,39,8,0.517638,0.339126,0.143236,hochschule
...,...,...,...,...,...,...,...,...,...,...,...
481,German,Professor,habil,61,33,24,4,0.470883,0.378086,0.151030,bereich
482,German,Professor,liebe,53,10,15,28,0.313172,0.329581,0.357246,bereich
483,German,Professor,punkt,90,21,47,22,0.287479,0.426971,0.285550,forschung
484,German,Professor,sicherheit,56,13,37,6,0.355706,0.438087,0.206207,chance
