In [None]:
!pip install -U sentence-transformers
!pip install umap-learn

In [None]:
from tqdm import trange
from datetime import datetime
import random
import umap
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer, util
from google.colab import files

In [None]:
# Load the pretrained sentence-embedding model by name (a multilingual checkpoint, per its
# identifier); the aspect terms are embedded with it further down.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

Downloading (…)5f450/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading (…)966465f450/README.md:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading (…)6465f450/config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5f450/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading (…)966465f450/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)465f450/modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

In [None]:
# Load the aspect tables produced by two extraction runs and stack them into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it both inputs
# keep their own row labels, so the labels repeat in the result.
aspects1 = pd.read_csv('PyABSA_FinalOutputWithoutCategory.csv')
aspects2 = pd.read_csv('Approch1_FinalOutputWithoutCategory.csv')
aspects = pd.concat([aspects1, aspects2], ignore_index=True)

In [None]:
# Preview the combined aspect table.
aspects

Unnamed: 0,Category,Language,Aspect,Count,Negative,Neutral,Positive,Negative_Probability,Neutral_Probability,Positive_Probability
0,PostDoc,English,system,69,58,6,5,,,
1,PostDoc,English,job,32,13,16,3,,,
2,PostDoc,English,staff,30,18,9,3,,,
3,PostDoc,English,thread,27,2,3,22,,,
4,PostDoc,English,position,27,6,19,2,,,
...,...,...,...,...,...,...,...,...,...,...
476,Unknown,German,wissenschaftler,201,96,92,13,0.464702,0.415603,0.119695
477,Unknown,German,wissenschaftssystem,53,28,21,4,0.517908,0.370315,0.111777
478,Unknown,German,woche,95,26,56,13,0.363728,0.442244,0.194029
479,Unknown,German,zeit,190,79,96,15,0.420591,0.413967,0.165442


In [None]:
# Collapse the stacked table to one row per aspect term with its total count.
agg_dict = {"Count": ["sum"]}
aspectList = (
    aspects
    .groupby("Aspect")
    .agg(agg_dict)
    .droplevel(1, axis=1)   # drop the aggregation-name level ("sum") from the column labels
    .reset_index()          # turn the "Aspect" group key back into a regular column
)

In [None]:
# Preview the per-aspect totals.
aspectList

Unnamed: 0,Aspect,Count
0,academia,847
1,academic,534
2,account,11
3,ali,14
4,antrag,167
...,...,...
241,wort,205
242,year,519
243,zeit,1264
244,zukunft,198


In [None]:
# Embed every unique aspect term. The column is converted to a plain list of strings so
# the encoder does not depend on the Series having a 0..n-1 index when it indexes its input.
aspect_embeddings = model.encode(aspectList["Aspect"].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
def generate_clusters(n_clusters, input_embeddings, random_state=42):
    """Fit k-means on the given embeddings and return the fitted estimator.

    Args:
        n_clusters: Number of clusters to form.
        input_embeddings: Data passed unchanged to ``KMeans.fit``.
        random_state: Seed forwarded to ``KMeans`` so repeated runs produce the same
            clustering. Pass ``None`` for a non-deterministic initialisation.

    Returns:
        The fitted ``KMeans`` object; cluster assignments are available as ``labels_``.
    """
    return KMeans(n_clusters=n_clusters,
                  n_init=10,
                  random_state=random_state).fit(input_embeddings)

def generate_umap_embeddings(embeddings, n_neighbors, n_components, random_state=42):
    """Project embeddings to a lower-dimensional space with UMAP.

    Args:
        embeddings: Input data passed unchanged to ``UMAP.fit_transform``.
        n_neighbors: UMAP ``n_neighbors`` setting.
        n_components: Number of output dimensions.
        random_state: Seed forwarded to UMAP. Defaults to 42, the value that was
            previously fixed inside the function, so existing callers are unaffected.

    Returns:
        The transformed embeddings returned by ``fit_transform``.
    """
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        n_components=n_components,
                        metric='euclidean',
                        random_state=random_state)
    return reducer.fit_transform(embeddings)

def score_clusters(embeddings, clusters):
    """Score a fitted clustering.

    Args:
        embeddings: The data the clustering was fitted on (passed to ``silhouette_score``).
        clusters: Fitted clustering object exposing a ``labels_`` sequence.

    Returns:
        Tuple ``(score, cost, label_count)``:
        * ``score`` - silhouette score, or 0 when it is not defined for this labelling;
        * ``cost`` - fraction of points labelled ``-1`` (treated as unclassified), else 0;
        * ``label_count`` - number of distinct labels.
    """
    labels, counts = np.unique(clusters.labels_, return_counts=True)
    total_num = len(clusters.labels_)

    # Nothing was clustered: avoid indexing into an empty label array.
    if total_num == 0:
        return 0, 0, 0

    # Percentage of unclassified points. np.unique returns sorted labels, so a -1 label,
    # if present, is always first.
    if labels[0] == -1:
        cost = counts[0] / total_num
    else:
        cost = 0

    # Clustering score. The silhouette coefficient is only defined when there are at least
    # 2 distinct labels and fewer labels than samples; silhouette_score raises otherwise.
    if 2 <= len(labels) < total_num:
        score = silhouette_score(embeddings, clusters.labels_)
    else:
        score = 0

    return score, cost, len(labels)

def random_search(embeddings = None, cluster_params_space = None, use_umap = False, umap_params_space=None, num_evals = 10, print_time_logs = False, seed = None):
    """Randomly sample clustering (and optionally UMAP) settings and score each trial.

    Args:
        embeddings: Data to cluster.
        cluster_params_space: Mapping with an ``'n_clusters'`` sequence to sample from.
        use_umap: When True, reduce the embeddings with UMAP before clustering.
        umap_params_space: Mapping with ``'n_neighbors'`` and ``'n_components'`` sequences
            to sample from; required when ``use_umap`` is True.
        num_evals: Number of random trials.
        print_time_logs: When True, print the start time and the duration of each stage.
        seed: Optional seed for the parameter sampling. ``None`` (default) keeps the
            previous behaviour of drawing from the global ``random`` module state.

    Returns:
        DataFrame with one row per trial (sampled parameters, ``score``, ``cost``,
        ``label_count``), sorted by ``score`` in descending order.

    Raises:
        ValueError: If a required input is missing.
    """
    if embeddings is None or cluster_params_space is None:
        raise ValueError("embeddings and cluster_params_space are required")
    if use_umap and umap_params_space is None:
        raise ValueError("umap_params_space is required when use_umap=True")

    # A private generator makes the sampling reproducible without touching global state.
    rng = random if seed is None else random.Random(seed)

    columns = ['n_clusters', 'score', 'cost', 'label_count']
    if use_umap:
        columns = ['n_neighbors', 'n_components'] + columns

    results = []

    if print_time_logs:
        start_time = datetime.now()
        print(start_time)

    for _ in trange(num_evals):
        row = []

        # Umap specific params
        if use_umap:
            n_neighbors = rng.choice(umap_params_space['n_neighbors'])
            n_components = rng.choice(umap_params_space['n_components'])

            updated_embeddings = generate_umap_embeddings(embeddings = embeddings,
                                                          n_neighbors = n_neighbors,
                                                          n_components = n_components)
            row += [n_neighbors, n_components]

            if print_time_logs:
                end_time = datetime.now()
                print("UMAP Embeddings took " + str(end_time - start_time))
                start_time = datetime.now()
        else:
            updated_embeddings = embeddings

        # Clustering specific params
        n_clusters = rng.choice(cluster_params_space['n_clusters'])
        clusters = generate_clusters(n_clusters = n_clusters,
                                     input_embeddings = updated_embeddings)

        if print_time_logs:
            end_time = datetime.now()
            print("Clustering took " + str(end_time - start_time))
            start_time = datetime.now()

        score, cost, label_count = score_clusters(updated_embeddings, clusters)

        if print_time_logs:
            end_time = datetime.now()
            print("Scoring took " + str(end_time - start_time))
            start_time = datetime.now()

        row += [n_clusters, score, cost, label_count]
        results.append(row)

    result_df = pd.DataFrame(results, columns=columns)

    return result_df.sort_values(by='score', ascending=False)

In [None]:
# Candidate values the random search samples from.
cluster_params_space = dict(n_clusters=range(10, 40))
umap_params_space = dict(
    n_neighbors=range(2, 15),
    n_components=range(10, 25),
)

# 30 random trials; each one reduces the aspect embeddings with UMAP and then clusters them.
random_search_output = random_search(
    embeddings=aspect_embeddings,
    cluster_params_space=cluster_params_space,
    use_umap=True,
    umap_params_space=umap_params_space,
    num_evals=30,
)

100%|██████████| 30/30 [00:44<00:00,  1.48s/it]


In [None]:
# Trials returned by random_search(), ordered by score (best first).
random_search_output

Unnamed: 0,n_neighbors,n_components,n_clusters,score,cost,label_count
12,3,22,33,0.646961,0,33
27,4,13,34,0.586909,0,34
22,2,10,27,0.573776,0,27
9,3,16,18,0.561004,0,18
5,4,10,38,0.554423,0,38
2,5,23,31,0.50981,0,31
7,7,10,28,0.504473,0,28
29,5,11,25,0.504023,0,25
3,5,22,16,0.503679,0,16
25,6,22,22,0.50351,0,22


In [None]:
# random_search() returns its trials sorted by score, best first, so the winning
# configuration is the first row. Selecting it by position keeps this cell valid on every
# run; a hard-coded index label only matches the one run in which it was looked up.
best_params = random_search_output.iloc[0]

print("Computing output for following params-")
print(best_params)

# Rebuild the UMAP projection with the selected settings. The row holds floats
# (it mixes integer parameters with the score), hence the int() casts.
umap_embeddings = generate_umap_embeddings(embeddings = aspect_embeddings,
                                           n_neighbors = int(best_params['n_neighbors']),
                                           n_components = int(best_params['n_components']))

# Cluster the reduced embeddings with the selected number of clusters.
clusters = generate_clusters(n_clusters = int(best_params['n_clusters']),
                             input_embeddings = umap_embeddings)

# One row per aspect: its assigned cluster and its total count.
df = pd.DataFrame({'Aspect': aspectList["Aspect"], 'Cluster': clusters.labels_, 'Frequency': aspectList['Count']})

# Per-cluster summary: number of member aspects and their summed frequency.
agg_dict = {'Aspect': ['count'], 'Frequency': ['sum']}
df1 = df.groupby('Cluster').agg(agg_dict).reset_index()
df1

Computing output for following params-
n_neighbors      3.000000
n_components    22.000000
n_clusters      33.000000
score            0.646961
cost             0.000000
label_count     33.000000
Name: 12, dtype: float64


Unnamed: 0_level_0,Cluster,Aspect,Frequency
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum
0,0,12,5593
1,1,10,8147
2,2,4,1615
3,3,3,150
4,4,5,2094
5,5,3,705
6,6,21,6152
7,7,4,89
8,8,7,1814
9,9,3,110


In [None]:
# Spot-check the members of a single cluster (label 14 here). The labels come from the
# k-means fit stored in `clusters`, so which aspects carry label 14 depends on that fit.
df[df['Cluster'] == 14]

Unnamed: 0,Aspect,Cluster,Frequency
31,bezahlung,14,60
69,finanzierung,14,129
78,geld,14,799
88,grundfinanzierung,14,47
111,kosten,14,52
135,money,14,12
151,preis,14,23
230,wert,14,30


In [None]:
# Choose one representative aspect per cluster: the member with the highest frequency.
# idxmax() yields exactly one row label per cluster (the first one when frequencies tie),
# so a cluster can never contribute two representatives and duplicate rows in later merges.
# It also avoids passing the builtin `max` to transform(), which newer pandas warns about.
# NOTE: the name `cateogries` (sic) is kept because the following cell refers to it.
idx = df.groupby('Cluster')['Frequency'].idxmax()
cateogries = df.loc[idx, ['Aspect', 'Cluster']].sort_values('Cluster')
cateogries

Unnamed: 0,Aspect,Cluster
100,jahr,0
232,wissenschaft,1
243,zeit,2
64,ergebnis,3
122,leute,4
173,research,5
6,arbeit,6
109,konferenz,7
126,mensch,8
128,ministerin,9


In [None]:
# Attach each cluster's representative aspect to every member of that cluster.
# Both frames have an 'Aspect' column, so pandas suffixes them: Aspect_x is the member,
# Aspect_y the representative.
# validate='many_to_one' makes pandas raise if a cluster has more than one representative,
# which would otherwise silently duplicate aspect rows in the result.
final_categories = df.merge(cateogries, on='Cluster', validate='many_to_one')
final_categories

Unnamed: 0,Aspect_x,Cluster,Frequency,Aspect_y
0,academia,31,847,academia
1,academic,31,534,academia
2,semester,31,172,academia
3,student,31,134,academia
4,studierenden,31,11,academia
...,...,...,...,...
241,wissenschaftlerinnen,1,26,wissenschaft
242,wissenschaftssystem,1,499,wissenschaft
243,un,32,141,uni
244,uni,32,1944,uni


In [None]:
# Replace the merge suffixes with descriptive names: the member aspect and the
# representative aspect of its cluster. Rebinding (rather than inplace=True) returns a new
# frame and leaves no half-mutated object behind if the cell is interrupted or re-run.
final_categories = final_categories.rename(
    columns={"Aspect_x": "Aspect", "Aspect_y": "Aspect_Category"}
)

In [None]:
# Preview the aspect -> category mapping after the rename.
final_categories

Unnamed: 0,Aspect,Cluster,Frequency,Aspect_Category
0,academia,31,847,academia
1,academic,31,534,academia
2,semester,31,172,academia
3,student,31,134,academia
4,studierenden,31,11,academia
...,...,...,...,...
241,wissenschaftlerinnen,1,26,wissenschaft
242,wissenschaftssystem,1,499,wissenschaft
243,un,32,141,uni
244,uni,32,1944,uni


In [None]:
# Write the aspect -> category mapping to CSV (without the row index) and pass the file to
# google.colab's files.download.
final_categories.to_csv("Final_Aspect_Categories.csv", index=False)
files.download("Final_Aspect_Categories.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Assign Categories to Aspects

In [None]:
# NOTE(review): these names are already imported at the top of the notebook; presumably they
# are repeated so this section can be run on its own from the saved CSV - confirm.
import pandas as pd
from google.colab import files
# Reload the aspect -> category mapping written by the clustering section.
final_categories = pd.read_csv("Final_Aspect_Categories.csv")

In [None]:
# Add the cluster-level category to every PyABSA aspect row.
# validate='many_to_one' makes pandas raise if an aspect occurs more than once in
# final_categories, which would otherwise silently duplicate rows here.
# The merge is an inner join (pandas default): rows whose aspect has no category are dropped.
aspectList1 = pd.read_csv('PyABSA_FinalOutputWithoutCategory.csv')
finalList1 = aspectList1.merge(
    final_categories[['Aspect', 'Aspect_Category']],
    on='Aspect',
    validate='many_to_one',
)
# Keep a fixed column order for the exported file.
finalList1 = finalList1[['Category', 'Language', 'Aspect', 'Count', 'Aspect_Category', 'Negative', 'Neutral', 'Positive']]

finalList1.to_csv("PyABSA_FinalOutputWithCategory.csv", index=False)
files.download("PyABSA_FinalOutputWithCategory.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the categorised PyABSA table.
finalList1

Unnamed: 0,Category,Language,Aspect,Count,Aspect_Category,Negative,Neutral,Positive
0,PostDoc,English,system,69,wissenschaft,58,6,5
1,PostDoc,German,system,177,wissenschaft,151,12,14
2,PhDStudent,German,system,44,wissenschaft,34,5,5
3,Prof,English,system,33,wissenschaft,24,8,1
4,Prof,German,system,141,wissenschaft,121,8,12
...,...,...,...,...,...,...,...,...
377,Unknown,German,lehrer,11,professur,7,2,2
378,Unknown,German,beruf,11,projekt,6,3,2
379,Unknown,German,tag,11,mittelbau,4,4,3
380,Unknown,German,account,11,forschung,8,2,1


In [None]:
# Add the cluster-level category to every aspect row of the 'Approch2' output.
# validate='many_to_one' makes pandas raise if an aspect occurs more than once in
# final_categories, which would otherwise silently duplicate rows here.
# NOTE(review): the clustering section of this notebook reads only the PyABSA and
# 'Approch1' files, and this merge is an inner join (pandas default), so any aspect that
# occurs only in this file is dropped - confirm that is intended.
aspectList2 = pd.read_csv('Approch2_FinalOutputWithoutCategory.csv')
finalList2 = aspectList2.merge(
    final_categories[['Aspect', 'Aspect_Category']],
    on='Aspect',
    validate='many_to_one',
)
# Keep a fixed column order for the exported file.
finalList2 = finalList2[['Category', 'Language', 'Aspect', 'Count', 'Aspect_Category', 'Negative', 'Neutral', 'Positive']]

finalList2.to_csv("Approch2_FinalOutputWithCategory.csv", index=False)
files.download("Approch2_FinalOutputWithCategory.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the categorised 'Approch2' table.
finalList2

Unnamed: 0,Category,Language,Aspect,Count,Aspect_Category,Negative,Neutral,Positive
0,Others,English,academia,89,academia,42,29,18
1,PhD Student,English,academia,81,academia,47,11,23
2,PostDoc,English,academia,339,academia,200,63,76
3,Professor,English,academia,189,academia,122,24,43
4,Unknown,English,academia,110,academia,71,21,18
...,...,...,...,...,...,...,...,...
476,Professor,German,habil,62,hanna,43,11,8
477,Professor,German,liebe,55,hanna,25,16,14
478,Professor,German,punkt,90,stelle,43,30,17
479,Professor,German,sicherheit,56,grund,31,16,9


In [None]:
# Add the cluster-level category to every aspect row of the 'Approch1' output; all original
# columns are kept and Aspect_Category is appended.
# validate='many_to_one' makes pandas raise if an aspect occurs more than once in
# final_categories, which would otherwise silently duplicate rows here.
# The merge is an inner join (pandas default): rows whose aspect has no category are dropped.
aspectList3 = pd.read_csv('Approch1_FinalOutputWithoutCategory.csv')
finalList3 = aspectList3.merge(
    final_categories[['Aspect', 'Aspect_Category']],
    on='Aspect',
    validate='many_to_one',
)

finalList3.to_csv("Approch1_FinalOutputWithCategory.csv", index=False)
files.download("Approch1_FinalOutputWithCategory.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the categorised 'Approch1' table.
finalList3

Unnamed: 0,Language,Category,Aspect,Count,Negative,Neutral,Positive,Negative_Probability,Neutral_Probability,Positive_Probability,Aspect_Category
0,English,Others,academia,89,36,43,10,0.431364,0.370752,0.197885,academia
1,English,PhD Student,academia,81,46,15,20,0.461662,0.309522,0.228817,academia
2,English,PostDoc,academia,339,135,164,40,0.435762,0.373692,0.190546,academia
3,English,Professor,academia,189,86,77,26,0.473850,0.344916,0.181235,academia
4,English,Unknown,academia,110,68,34,8,0.541797,0.318578,0.139625,academia
...,...,...,...,...,...,...,...,...,...,...,...
476,German,Professor,habil,62,35,24,3,0.470221,0.381689,0.148090,hanna
477,German,Professor,liebe,55,10,15,30,0.310330,0.333263,0.356407,hanna
478,German,Professor,punkt,90,21,47,22,0.285982,0.430072,0.283946,stelle
479,German,Professor,sicherheit,56,13,37,6,0.355706,0.438087,0.206207,grund
