In [1]:
import numpy as np
import pandas as pd

import umap
import hdbscan
import pprint

from sentence_transformers import SentenceTransformer
import torch

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('punkt')
from nltk import tokenize

[nltk_data] Downloading package punkt to /home1/kasee0/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import transformers

In [3]:
import pickle
with open('./filtered_Main.pkl', 'rb') as f:
    data1=pickle.load(f)
with open('./filtered_Methods.pkl', 'rb') as f2:
    data2=pickle.load(f2)
with open('./filtered_Sum.pkl', 'rb') as f3:
    data3=pickle.load(f3)

In [4]:
# Sentence transformer를 사용해서 Word embedding model을 sentence embedding model로 만드는 방법
from sentence_transformers import SentenceTransformer, models
from torch import nn

word_embedding_model = models.Transformer('allenai/scibert_scivocab_uncased', max_seq_length=512)

pooling_model=models.Pooling(word_embedding_model.get_word_embedding_dimension(),
pooling_mode_max_tokens=False,
pooling_mode_mean_tokens=False,
pooling_mode_mean_sqrt_len_tokens=True)

dense_model= models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),out_features=768,activation_function=nn.Tanh())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
embeddings1 = model.encode(data3, convert_to_tensor=True)
embeddings2 = model.encode(data1, convert_to_tensor=True)
embeddings3 = model.encode(data2, convert_to_tensor=True)
embeddings4=model.encode(data1+data2+data3, convert_to_tensor=True)

In [6]:
umap_neighbors = 12 # local 값 숫자를 줄일 수록 local에 집중
umap_n_components = 12

umap_embeddings_12d = umap.UMAP(n_neighbors=umap_neighbors,
                                n_components=umap_n_components,
                                n_epochs=14000,
                                min_dist=0.1,
                                low_memory=False,
                                learning_rate=0.5,
                                verbose=True,
                                metric='cosine',
                                spread=3.0,
                                local_connectivity=2,
                                target_metric='l2').fit_transform(embeddings1.cpu())

UMAP(angular_rp_forest=True, learning_rate=0.5, local_connectivity=2, low_memory=False, metric='cosine', n_components=12, n_epochs=14000, n_neighbors=12, spread=3.0, target_metric='l2', verbose=True)
Wed Aug 31 11:23:31 2022 Construct fuzzy simplicial set
Wed Aug 31 11:23:32 2022 Finding Nearest Neighbors
Wed Aug 31 11:23:34 2022 Finished Nearest Neighbor Search
Wed Aug 31 11:23:37 2022 Construct embedding


Epochs completed:   0%|            0/14000 [00:00]

Wed Aug 31 11:24:26 2022 Finished embedding


In [7]:
%%time

hdbscan_minimum_cluster_size = 55
hdbscan_min_samples = 2

cluster = hdbscan.HDBSCAN(min_cluster_size=hdbscan_minimum_cluster_size,
                          min_samples=hdbscan_min_samples,
                          metric='euclidean',
                          cluster_selection_epsilon=0.1,
                          cluster_selection_method='leaf',
                          leaf_size=40,
                          algorithm='best').fit(umap_embeddings_12d)

analyze_clusters= len(pd.Series(cluster.labels_).unique())

df = pd.DataFrame(pd.DataFrame(cluster.labels_).value_counts())

with pd.option_context('display.max_rows', 30, 'display.min_rows', 10, 'display.max_columns',10, 'display.width', 100):
    print(df.head(20))

      0
0      
-1  317
 6  111
 3  104
 1   85
 2   81
 5   79
 4   73
 0   70
CPU times: user 35.3 ms, sys: 2.03 ms, total: 37.3 ms
Wall time: 56.8 ms


In [8]:
umap_neighbors = 12 # local 값 숫자를 줄일 수록 local에 집중
umap_n_components = 3

umap_data_3d = umap.UMAP(n_neighbors=umap_neighbors,
                         n_components=umap_n_components,
                         n_epochs=14000,
                         min_dist=0.1,
                         low_memory=False,
                         learning_rate=0.1,
                         verbose=True,
                         metric='cosine',
                         spread=3.0,
                         local_connectivity=2,
                         target_metric='l2').fit_transform(embeddings1.cpu())

UMAP(angular_rp_forest=True, learning_rate=0.1, local_connectivity=2, low_memory=False, metric='cosine', n_components=3, n_epochs=14000, n_neighbors=12, spread=3.0, target_metric='l2', verbose=True)
Wed Aug 31 11:24:26 2022 Construct fuzzy simplicial set
Wed Aug 31 11:24:27 2022 Finding Nearest Neighbors
Wed Aug 31 11:24:27 2022 Finished Nearest Neighbor Search
Wed Aug 31 11:24:27 2022 Construct embedding


Epochs completed:   0%|            0/14000 [00:00]

Wed Aug 31 11:25:15 2022 Finished embedding


In [9]:
result3d = pd.DataFrame(umap_data_3d, columns=['x', 'y', 'z'])
result3d['sentence'] = data3
result3d['labels'] = cluster.labels_

with pd.option_context('display.max_rows', 30, 'display.min_rows', 10, 'display.max_columns',10, 'display.width', 200,'display.max_colwidth', 100):
    print(result3d.head(20))

            x          y          z                                                                                             sentence  labels
0    8.310966   6.240478  14.725656  RRAM characteristics by measuring more 50 randomly picked devices in a new Cr/GdOx/TiN structure...      -1
1    5.871776   6.644083  11.878378  Resistive switching behaviors in bio‐organic aloe polysaccharides films deposited from a solutio...      -1
2    5.830981   6.681397  11.923270  Resistive switching behaviors in bio‐organic aloe polysaccharides films deposited from a solutio...      -1
3    8.286065   6.269715  14.741472  RRAM characteristics by measuring more 50 randomly picked devices in a new Cr/GdOx/TiN structure...      -1
4    5.428813  11.486077  12.023021  Excellent reliability after 100 k cycles and 10 years’ retention at 85 °C after 10 k cycles were...      -1
5   11.549830  10.361985  11.782401  In conclusion, the C and the R of the TaOx thin film are measured as the reset process progre

In [10]:
docs_df = pd.DataFrame(data3, columns=['Doc'])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})

In [11]:
vectorizer = CountVectorizer(ngram_range=(2,3), stop_words='english')
count = vectorizer.fit_transform(docs_per_topic.Doc.values.tolist())

In [12]:
t = count.toarray()
w = t.sum(axis=1)

tf = np.divide(t.T, w)
sum_t = t.sum(axis=0)

m=len(data3)
idf = np.log(np.divide(m,sum_t)).reshape(-1,1)

In [13]:
tf_idf = np.multiply(tf,idf)
tf_idf = np.array(tf_idf, dtype=np.float32)

In [14]:
# define TOP words in word cluster

n = 8 # display top n words per topic

words = vectorizer.get_feature_names()
labels = list(docs_per_topic.Topic)

tf_idf_transposed = tf_idf.T
indices = tf_idf_transposed.argsort()[:, -n:]

tf = []
top_n_words = {}

def Sort_Tuple(tup):
    tup.sort(key=lambda x: x[1])
    return tup

for i, label in enumerate(labels):
    tt=[]
    for j in indices[i]:
        if tf_idf_transposed[i][j] != 0:
            tt.append((words[j], tf_idf_transposed[i][j]))
        top_n_words.update({label: Sort_Tuple(tt)[::-1]})

In [15]:
def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['Topic'])
                     .Doc
                     .count()
                     .reset_index()
                     .rename({"Topic": "labels", "Doc": "magnitude"}, axis='columns')
                     .sort_values(by='magnitude', ascending=False))
    return topic_sizes

topic_sizes = extract_topic_sizes(docs_df)
topic_sizes

Unnamed: 0,labels,magnitude
0,-1,317
7,6,111
4,3,104
2,1,85
3,2,81
6,5,79
5,4,73
1,0,70


In [16]:
answer2 = []
for i in topic_sizes.labels:
    if i != (-1):
        xx = []
        for x in top_n_words[i][:50]: # top 50 words
            xx.append(x[0])
        listToStr = ' - '.join(map(str,xx))
        answer = (i, listToStr)
        answer2.append(answer)
print('Answer 2 = ', answer2)

Answer 2 =  [(6, 'oxygen vacancies - resistance switching - oxygen vacancy - activation energy - forming process - oxygen atoms - resistive switching - oxygen vacancy concentration'), (3, 'resistive switching - memory devices - bipolar resistive switching - bipolar resistive - switching behavior - fju 23 - self assembly - resistive switching behavior'), (1, 'memory devices - electric field - electronic devices - room temperature - dynamic range - mg ln - protein based - bio integrated'), (2, 'resistive switching - resistance ratio - resistance switching - rs behavior - switching behaviors - pt device - good retention - oxygen vacancies'), (5, 'amorphous structure - set reset - oxygen vacancies - amorphous chalcogenides - highly conductive - pedot nafion - brominated fibers - ots selector'), (4, 'ge0 3se0 - characteristic breakdown - critical nucleus - breakdown time - voltage stress - characteristic breakdown time - phase change materials - change materials'), (0, 'al al2o3 - oxygen va

In [17]:
answer3=[]
for i in topic_sizes.labels:
    #if i != (-1):
    xx = []
    for x in top_n_words[i][:50]: # top 50 words
        xx.append(x[0]) 
    listToStr2 = ' - '.join(map(str,xx)) 
    answer22 = (i, listToStr2)
    answer3.append(answer22)
print('Answer 3 = ', answer3)

Answer 3 =  [(-1, 'resistive switching - memory devices - nonvolatile memory - et al - aloe polysaccharides - switching memory - cross point - resistance switching'), (6, 'oxygen vacancies - resistance switching - oxygen vacancy - activation energy - forming process - oxygen atoms - resistive switching - oxygen vacancy concentration'), (3, 'resistive switching - memory devices - bipolar resistive switching - bipolar resistive - switching behavior - fju 23 - self assembly - resistive switching behavior'), (1, 'memory devices - electric field - electronic devices - room temperature - dynamic range - mg ln - protein based - bio integrated'), (2, 'resistive switching - resistance ratio - resistance switching - rs behavior - switching behaviors - pt device - good retention - oxygen vacancies'), (5, 'amorphous structure - set reset - oxygen vacancies - amorphous chalcogenides - highly conductive - pedot nafion - brominated fibers - ots selector'), (4, 'ge0 3se0 - characteristic breakdown - c

In [18]:
wordcluster2 = dict(answer3)
words2 = pd.DataFrame.from_dict(wordcluster2, orient='index', columns=['Top10_words'])
words2['labels'] = words2.index
new_df2 = pd.merge(topic_sizes, words2, left_on='labels', right_on='labels')

with pd.option_context('display.max_rows', 30, 'display.min_rows', 10, 'display.max_columns',5, 'display.width', 200,'display.max_colwidth', 100):
    print('\nTOP words for each identified cluster: \n\n', new_df2)


TOP words for each identified cluster: 

    labels  magnitude                                                                                          Top10_words
0      -1        317  resistive switching - memory devices - nonvolatile memory - et al - aloe polysaccharides - switc...
1       6        111  oxygen vacancies - resistance switching - oxygen vacancy - activation energy - forming process -...
2       3        104  resistive switching - memory devices - bipolar resistive switching - bipolar resistive - switchi...
3       1         85  memory devices - electric field - electronic devices - room temperature - dynamic range - mg ln ...
4       2         81  resistive switching - resistance ratio - resistance switching - rs behavior - switching behavior...
5       5         79  amorphous structure - set reset - oxygen vacancies - amorphous chalcogenides - highly conductive...
6       4         73  ge0 3se0 - characteristic breakdown - critical nucleus - breakdown time - voltage

In [19]:
outliers3d = result3d.loc[result3d.labels == -1, :]
clustered3d = result3d.loc[result3d.labels != -1, :]

final_df = pd.merge(clustered3d, new_df2, left_on=['labels'], right_on=['labels'])
print(final_df.info())
final_df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 603 entries, 0 to 602
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   x            603 non-null    float32
 1   y            603 non-null    float32
 2   z            603 non-null    float32
 3   sentence     603 non-null    object 
 4   labels       603 non-null    int64  
 5   magnitude    603 non-null    int64  
 6   Top10_words  603 non-null    object 
dtypes: float32(3), int64(2), object(2)
memory usage: 30.6+ KB
None


Unnamed: 0,x,y,z,sentence,labels,magnitude,Top10_words
0,11.549830,10.361985,11.782401,"In conclusion, the C and the R of the TaOx thi...",5,79,amorphous structure - set reset - oxygen vacan...
1,10.020789,10.183317,9.492759,A self-assembled 2D finned CuS was synthesized...,5,79,amorphous structure - set reset - oxygen vacan...
2,12.716655,9.960546,10.801212,Resistive switching was observed in the planar...,5,79,amorphous structure - set reset - oxygen vacan...
3,9.928867,8.912595,12.355476,"In conclusion, the electrical properties of CN...",5,79,amorphous structure - set reset - oxygen vacan...
4,10.837996,10.529210,9.483153,"In summary, we developed a picture describing ...",5,79,amorphous structure - set reset - oxygen vacan...
...,...,...,...,...,...,...,...
598,11.754639,7.972859,16.769522,"In summary, the negative-set resistive switchi...",0,70,al al2o3 - oxygen vacancies - al2o3 interface ...
599,13.730749,7.333508,14.513068,"In this work, NiO thin films were fabricated b...",0,70,al al2o3 - oxygen vacancies - al2o3 interface ...
600,12.622646,6.146099,17.308081,"In conclusion, we have fabricated amorphous a-...",0,70,al al2o3 - oxygen vacancies - al2o3 interface ...
601,13.011495,6.789479,13.838529,Figure 4(a) shows the typical switching curren...,0,70,al al2o3 - oxygen vacancies - al2o3 interface ...


In [20]:
final_df_noise = pd.merge(outliers3d, new_df2, left_on=['labels'], right_on=['labels'])
final_df_noise

Unnamed: 0,x,y,z,sentence,labels,magnitude,Top10_words
0,8.310966,6.240478,14.725656,RRAM characteristics by measuring more 50 rand...,-1,317,resistive switching - memory devices - nonvola...
1,5.871776,6.644083,11.878378,Resistive switching behaviors in bio‐organic a...,-1,317,resistive switching - memory devices - nonvola...
2,5.830981,6.681397,11.923270,Resistive switching behaviors in bio‐organic a...,-1,317,resistive switching - memory devices - nonvola...
3,8.286065,6.269715,14.741472,RRAM characteristics by measuring more 50 rand...,-1,317,resistive switching - memory devices - nonvola...
4,5.428813,11.486077,12.023021,Excellent reliability after 100 k cycles and 1...,-1,317,resistive switching - memory devices - nonvola...
...,...,...,...,...,...,...,...
312,11.583512,5.157137,12.687058,"Typical I−V characteristics of (a) UV-ﬁlms, (b...",-1,317,resistive switching - memory devices - nonvola...
313,11.600218,5.179971,12.726956,a) Distribution of the programmed resistance o...,-1,317,resistive switching - memory devices - nonvola...
314,11.655113,5.147994,12.783219,(a) I−V curves plotted by SCLC in HRS of 400 °...,-1,317,resistive switching - memory devices - nonvola...
315,11.616447,5.238998,12.683295,(a) Cross-sectional SEM images of a-IGZO Non-U...,-1,317,resistive switching - memory devices - nonvola...


In [21]:
# start pandas final_df for labels :

final_df_sorted_labels = final_df.sort_values(by=['labels'], ascending=True)
my_final_df_sorted_labels = final_df_sorted_labels.loc[final_df_sorted_labels['labels']==2]
my_legend_sorted = (final_df_sorted_labels[['labels','magnitude','Top10_words']].drop_duplicates().sort_values(by=['labels'], ascending=True)).style.hide_index()

my_legend2 = (final_df_sorted_labels[['labels','magnitude','Top10_words']].drop_duplicates().sort_values(by=['labels'], ascending=True))
df1 = my_legend2.astype('string')
df1['new']=df1[['labels','magnitude','Top10_words']].agg(' - '.join, axis=1)

legend_list8 = df1['new'].to_list()

df1['magnitude'].astype('int').sum()



603

In [26]:
from collections import Counter

class my_ngrams:
    def __init__(self, words_list, dim_n):
        self.words_list = words_list
        self.dim_n = dim_n

    def generate_ngrams(self):
        ngrams = zip(*[self.words_list[i:] for i in range(self.dim_n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngrams_counter(self):
        my_ngrams_counter = Counter(self.generate_ngrams())
        most_common_my_ngram = my_ngrams_counter.most_common()
        most_common_my_ngram = pd.DataFrame(most_common_my_ngram)
        most_common_my_ngram.columns = ['word','freq']
        most_common_my_ngram['percentage'] = most_common_my_ngram.freq *100 / sum(most_common_my_ngram.freq)
        return most_common_my_ngram

#for t in[0,1,2,3]:
#    my_words2 = final_df_sorted_labels.sentence.loc[final_df_sorted_labels['labels']==t].apply(lambda x: x.split()).tolist()
#    my_words_list2 = [item for sublist in my_words2 for item in sublist]
#    my_words_counter2 = Counter(my_words_list2)
#    gla = my_ngrams(my_words_list2, nn).ngrams_counter()

my_words2 = final_df_sorted_labels.sentence.apply(lambda x: x.split()).tolist()
my_words_list2 = [item for sublist in my_words2 for item in sublist]
my_words_counter2 = Counter(my_words_list2)
nn=3
gla = my_ngrams(my_words_list2, nn).ngrams_counter()
gla.head(12).style.format({'percentage': "{:.3f}%"}).bar(color='#FFA07A', vmin=10, subset=['freq'], align='zero').highlight_max(axis=0,color='lightgreen')

Unnamed: 0,word,freq,percentage
0,"In summary, we",95,0.123%
1,"In conclusion, we",58,0.075%
2,the resistive switching,55,0.071%
3,the formation of,46,0.059%
4,due to the,41,0.053%
5,"summary, we have",38,0.049%
6,the number of,38,0.049%
7,"In summary, the",36,0.046%
8,of oxygen vacancies,36,0.046%
9,attributed to the,34,0.044%


In [23]:
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import plotly.express as px
import plotly.io as pio
pio.renderers
pio.renderers.default = 'iframe'

In [71]:
fig = go.Figure(go.Scatter3d(x=final_df_noise.x,
                             y=final_df_noise.y,
                             z=final_df_noise.z,
                             mode='text+markers',
                             text=final_df_noise.labels,
                             textfont=dict(size=9,family="Arial",color ='rgba(128,128,128,0.0)'),
                             hoverinfo='text',
                             hovertext= final_df_noise.sentence,
                             name = 'NOISE - no activ cluster allocation of these sentences',
                             textposition='middle right',
                             marker=dict(symbol='circle',
                                         sizemin=13,
                                         size=np.array(final_df_noise.magnitude),
                                         sizeref=1400,
                                         sizemode = 'diameter',
                                         color='lightgray'),showlegend=True)
                                         )

for i in final_df['labels'].unique():
    fig.add_trace(go.Scatter3d(x=final_df[final_df.labels == i].x,
                           y=final_df[final_df.labels == i].y,
                           z=final_df[final_df.labels == i].z,
                           mode='text+markers',
                           text=final_df[final_df.labels == i].labels,
                           textfont=dict(size=9,family="Arial",color ='rgba(128,128,128,0.0)'),
                           hoverinfo='text',
                           hovertext= final_df[final_df.labels == i].sentence,
                           name = final_df[final_df.labels == i].Top10_words.values[0],
                           textposition='middle right',
                           marker=dict(symbol='circle',
                                        sizemin=13,
                                        size=np.array(final_df.magnitude),
                                        sizeref=1400,
                                        sizemode = 'diameter',
                                        color=final_df[final_df.labels == i].labels.values[0]),showlegend=True)
                                        )
    
fig.update_scenes(xaxis_visible=False, yaxis_visible=False,zaxis_visible=False )

fig.update_layout(width=1600, height=1000,legend_orientation="h",margin=dict(l=0, r=0, b=0, t=0))
#fig.update_layout(title_font_family='Arial Nova',
#font=dict(family='Arial Nova', size=11, color='rgb(255,239,15)'),
#bordercolor=None,
#borderwidth=0,
#bgcolor='rgba(0,0,0,0.0)',
#legend_title="",
#scene=dict(xaxis=dict(axis),
#yaxis=dict(axis),
#zaxis=dict(axis), bgcolor='rgb(135,21,255)'),
#title={'text': '3D Clusters of pargraph','y':0.97,'x':0.1,'xanchor': 'center','yanchor': 'top'},
#font_family='Arial Nova Light',
#font_size=12,
#font_color='blue',
#title_font_color='rgb(255,239,15)',
#title_font_size=25,
#legend_title_font_color="gray")
fig.show()

In [86]:
final_df_noise.sentence

0      RRAM characteristics by measuring more 50 rand...
1      Resistive switching behaviors in bio‐organic a...
2      Resistive switching behaviors in bio‐organic a...
3      RRAM characteristics by measuring more 50 rand...
4      Excellent reliability after 100 k cycles and 1...
                             ...                        
312    Typical I−V characteristics of (a) UV-ﬁlms, (b...
313    a) Distribution of the programmed resistance o...
314    (a) I−V curves plotted by SCLC in HRS of 400 °...
315    (a) Cross-sectional SEM images of a-IGZO Non-U...
316    Amorphous KN ﬁlms were grown on a TiN−Si subst...
Name: sentence, Length: 317, dtype: object