In [1]:
from nltk.stem import WordNetLemmatizer
import nltk
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmar.lemmatize(token) for token in tokens]

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Beck\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Beck\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Beck\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Beck\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Beck\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
import pandas as pd
import glob, os
from sklearn.feature_extraction.text import TfidfVectorizer
patent=pd.read_csv('./patent2021_01.csv', low_memory=False)
tfidf_vect=TfidfVectorizer(tokenizer=LemNormalize, stop_words='english', ngram_range=(1,2), min_df = 0.05, max_df = 0.9)

feature_vect=tfidf_vect.fit_transform(patent['abstract'].values.astype('U'))



In [3]:
df=pd.DataFrame(feature_vect[0].T.todense(), index=tfidf_vect.get_feature_names(), columns=['TF-IDF'])
df=df.sort_values('TF-IDF', ascending=False)
print(df.head(25))

                     TF-IDF
material           0.478456
direction          0.476167
disposed           0.470082
plurality          0.373363
having             0.352412
includes           0.244487
power              0.000000
method             0.000000
processing         0.000000
process            0.000000
present invention  0.000000
method includes    0.000000
present            0.000000
position           0.000000
nan                0.000000
portion            0.000000
provided           0.000000
output             0.000000
operation          0.000000
network            0.000000
provide            0.000000
according          0.000000
layer              0.000000
structure          0.000000
using              0.000000


In [3]:
feature_vect[:10]

<10x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [7]:
patent['abstract'][0]

'An agricultural implement having a rolling basket which includes a plurality of elongated members arranged in a substantially cylindrical shape. A stationary internal scraper is disposed in the inner chamber of the rolling basket and is adapted for breaking up a material from the inner chamber as the rolling basket is rotated around the major axis of rotation of the rolling basket. The internal scraper extends in an upward direction with respect to the major axis of rotation of the rolling basket with the upper edge of the internal scraper distally spaced away from and above the major axis of rotation of the rolling basket and with the lower edge of the internal scraper distally spaced away from and above the major axis of rotation of the rolling basket. The upper edge of the internal scraper is positioned above the lower edge.'

In [5]:
from keybert import KeyBERT
kw_model = KeyBERT()
kw_model.extract_keywords(patent['title'][0], keyphrase_ngram_range=(1, 1), stop_words='english', highlight=True)

[('scraper', 0.5153),
 ('rolling', 0.4206),
 ('agricultural', 0.4138),
 ('basket', 0.4097),
 ('implement', 0.215)]

In [6]:
from sklearn.cluster import KMeans

# 5개 집합으로 군집화 수행. 예제를 위해 동일한 클러스터링 결과 도출용 random_state=0 
km_cluster = KMeans(n_clusters=9, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_

In [7]:
patent['cluster_label']=cluster_label
patent.head(20)

Unnamed: 0,id,date,title,abstract,cluster_label
0,10881042,2021-01-05,Agricultural implement with a scraper internal...,An agricultural implement having a rolling bas...,4
1,10881043,2021-01-05,Method for increasing the load on driving rear...,A method for increasing the load on driving re...,4
2,10881044,2021-01-05,Planter with full tandem offset pivot,An agricultural machine includes a frame havin...,0
3,10881045,2021-01-05,System and method for prescribing fertilizer a...,A precision agriculture prescription system wh...,7
4,10881046,2021-01-05,Agricultural liquid fertilizer and chemical de...,The present invention relates to a system that...,2
5,10881047,2021-01-05,Robot mower with protruding blades,A robot mower includes a left cutter blade uni...,0
6,10881048,2021-01-05,Oscillating spreading arrangement for a combin...,A combine harvester includes a straw chopper h...,0
7,10881049,2021-01-05,Guiding device and baler,"An agricultural baler includes a frame, a pres...",4
8,10881050,2021-01-05,Baler stuffer selectively operable to provide ...,An agricultural baler includes a baler chassis...,4
9,10881051,2021-01-05,Fluid-cooled LED-based lighting methods and ap...,A fluid-cooled LED-based lighting fixture for ...,4


In [10]:
patent[patent['cluster_label']==0].sort_values(by='title')


Unnamed: 0,id,date,title,abstract,cluster_label
251614,11134813,2021-10-05,2-in-1 nail lamp station,A nail lamp station includes a base lamp assem...,0
192778,11075509,2021-07-27,2-shot molded vapor seal,An electrical box including a flange produced ...,0
251864,11135065,2021-10-05,3-D printed orthopedic implants,A spinal interbody implant is fabricated using...,0
295397,11178945,2021-11-23,360 degree rotatable accessory,Provided is a 360-degree rotatable accessory. ...,0
45523,10926932,2021-02-23,3D flexible bag to be filled for biopharmaceut...,A 3-D flexible bag to be filled with a biophar...,0
...,...,...,...,...,...
185115,11067779,2021-07-20,Zoom lens and image pickup apparatus including...,Provided is a zoom lens including a plurality ...,0
229575,11112588,2021-09-07,Zoom lens and imaging apparatus,"A zoom lens is constituted by, in order from t...",0
325790,11209610,2021-12-28,Zoom lens and imaging apparatus,The zoom lens consists of a positive first len...,0
54293,10935772,2021-03-02,Zoom lens and imaging apparatus,A zoom lens including a front group and a rear...,0


In [11]:
patent[patent['cluster_label']==1].sort_values(by='title')

Unnamed: 0,id,date,title,abstract,cluster_label
331274,D908803,2021-01-26,100 sided die,,1
335103,D912634,2021-03-09,18-key remote control with LED driver,,1
338020,D915551,2021-04-06,2-in-1 water sprinkling can,,1
335492,D913023,2021-03-16,3 tier riser,,1
353518,D931057,2021-09-21,3 tier spice rack,,1
...,...,...,...,...,...
343995,D921531,2021-06-08,Zipper,,1
345234,D922770,2021-06-22,Zipper,,1
334141,D911672,2021-03-02,Zipper jumpsuit,,1
330000,D907529,2021-01-12,Zipper puller,,1


In [12]:
patent[patent['cluster_label']==2].sort_values(by='title')

Unnamed: 0,id,date,title,abstract,cluster_label
289675,11173184,2021-11-16,Bacillus subtilis strain with probiotic activity,The current invention concerns a new B. subtil...,2
259232,11142502,2021-10-12,(+)-morphinans as antagonists of Toll-like rec...,The present invention provides (+)-morphinans ...,2
151182,11033533,2021-06-15,"(2R,4R)-5-(5′-chloro-2′-fluorobiphenyl-4-yl)-2...","In one aspect, the invention relates to a comp...",2
214030,11096926,2021-08-24,"(3AR)-1,3A,8-trimethyl-1,2,3,3A,8,8A-hexahydro...","The invention includes an amount of (3aR)-1,3a...",2
264494,11147814,2021-10-19,"(4-((3R,4R)-3-methoxytetrahydro-pyran-4-ylamin...",The invention provides a salt of a tetrahydrop...,2
...,...,...,...,...,...
79568,10961268,2021-03-30,β-nicotinate ester nucleotides and process for...,The invention provides a compound of formula (...,2
297270,11180830,2021-11-23,"γ, γ′ cobalt based alloys for additive manufac...","The invention relates to gamma, gamma'-cobalt-...",2
273389,11156759,2021-10-26,"μ-LED, μ-LED device, display and method for th...",The invention relates to various aspects of a ...,2
215216,11098121,2021-08-24,“Immune checkpoint intervention” in cancer,The present invention relates to methods for i...,2


In [13]:
patent[patent['cluster_label']==3].sort_values(by='title')

Unnamed: 0,id,date,title,abstract,cluster_label
83085,10964817,2021-03-30,(110) surface orientation for reducing fermi-l...,"A device with improved device performance, and...",3
321977,11205750,2021-12-21,1S1R memory integrated structure with larger s...,The present disclosure provides a 1S1R memory ...,3
269120,11152471,2021-10-19,2-dimensional electron gas and 2-dimensional h...,Semiconductor devices including a first region...,3
304258,11187871,2021-11-30,"2D bi-pod flexure design, mount technique and ...",A bipod flexure mount couples an optic to a ba...,3
12658,10893796,2021-01-19,2D multi-layer thickness measurement,A method for measuring layer thicknesses of a ...,3
...,...,...,...,...,...
253437,11136651,2021-10-05,Zn-Mg alloy plated steel material having excel...,Provided is a Zn—Mg alloy plated steel materia...,3
32704,10913994,2021-02-09,Zn—Al—Mg-based plated steel sheet,"A Zn—Al—Mg-based plated steel sheet has, an al...",3
311868,11195548,2021-12-07,Zoned block command to stream command translator,A method for performing an operation of a memo...,3
207856,11090716,2021-08-17,ϵ-iron oxide type ferromagnetic powder and mag...,Provided is an ε-iron oxide type ferromagnetic...,3


In [8]:
from sklearn.cluster import KMeans

# 3개의 집합으로 군집화 
km_cluster = KMeans(n_clusters=4, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_


# 소속 클러스터를 cluster_label 컬럼으로 할당하고 cluster_label 값으로 정렬
patent['cluster_label'] = cluster_label
patent.sort_values(by='cluster_label')

Unnamed: 0,id,date,title,abstract,cluster_label
277100,11160495,2021-11-02,Sleep apnea monitoring system,The present invention relates to a sleep apnea...,0
25602,10906835,2021-02-02,"Glass compositions, fiberizable glass composit...",The present invention relates generally to gla...,0
247345,11130509,2021-09-28,System and method for detecting a break in a r...,The present invention refers to a method for d...,0
25585,10906818,2021-02-02,Process for back-and-forth washing of adsorpti...,The invention provides methods and systems for...,0
247328,11130492,2021-09-28,"Vehicle control device, vehicle control method...",A vehicle control device includes a recognizer...,0
...,...,...,...,...,...
340661,D918193,2021-05-04,Electronic device,,3
340660,D918192,2021-05-04,Kiosk,,3
340659,D918191,2021-05-04,Aircraft avionics console,,3
340673,D918205,2021-05-04,Mouse,,3


In [9]:
cluster_centers = km_cluster.cluster_centers_
print('cluster_centers shape :',cluster_centers.shape)
print(cluster_centers)

cluster_centers shape : (4, 83)
[[2.90418346e-02 4.72873943e-02 1.42370260e-02 7.86268932e-03
  1.17867013e-02 1.38635449e-02 1.19451608e-02 3.79804974e-03
  8.81304159e-03 1.76930814e-02 3.31305173e-02 9.94935606e-02
  6.97847232e-03 9.37388230e-03 1.28469946e-02 6.06249016e-03
  3.97275321e-03 8.46202736e-03 1.00018600e-02 3.11721964e-03
  7.94388779e-03 3.49118098e-02 8.32190141e-04 1.03215433e-02
  7.14295681e-03 1.47793773e-02 6.26897774e-02 5.91728692e-03
  4.42709265e-03 1.28470062e-02 2.99142912e-02 9.02288685e-03
  1.15506584e-02 2.34769835e-02 1.11304636e-02 2.89675097e-02
  4.47512585e-02 6.58979672e-03 1.28761866e-02 1.70765377e-02
  3.02766882e-02 8.91019908e-03 2.23946961e-03 3.43546932e-01
  1.26979385e-02 3.46436900e-02 1.35395604e-01 3.21505350e-03
  1.51690698e-05 5.14848878e-03 5.74888435e-03 2.93661305e-03
  1.07349502e-02 8.06474588e-03 9.08232540e-03 7.03305505e-03
  2.41244913e-01 2.21111736e-01 4.13770894e-02 5.50650978e-03
  2.02969564e-02 3.05922875e-02 9.7358

In [40]:
# 군집별 top n 핵심단어, 그 단어의 중심 위치 상대값, 대상 파일명들을 반환함. 
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    cluster_details = {}
    
    # cluster_centers array 의 값이 큰 순으로 정렬된 index 값을 반환
    # 군집 중심점(centroid)별 할당된 word 피처들의 거리값이 큰 순으로 값을 구하기 위함.  
    centroid_feature_ordered_ind = cluster_model.cluster_centers_.argsort()[:,::-1]
    
    #개별 군집별로 iteration하면서 핵심단어, 그 단어의 중심 위치 상대값, 대상 파일명 입력
    for cluster_num in range(clusters_num):
        # 개별 군집별 정보를 담을 데이터 초기화. 
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster'] = cluster_num
        
        # cluster_centers_.argsort()[:,::-1] 로 구한 index 를 이용하여 top n 피처 단어를 구함. 
        top_feature_indexes = centroid_feature_ordered_ind[cluster_num, :top_n_features]
        top_features = [ feature_names[ind] for ind in top_feature_indexes ]
        
        # top_feature_indexes를 이용해 해당 피처 단어의 중심 위치 상댓값 구함 
        top_feature_values = cluster_model.cluster_centers_[cluster_num, top_feature_indexes].tolist()
        
        # cluster_details 딕셔너리 객체에 개별 군집별 핵심 단어와 중심위치 상대값, 그리고 해당 파일명 입력
        cluster_details[cluster_num]['top_features'] = top_features
        cluster_details[cluster_num]['top_features_value'] = top_feature_values
        title = cluster_data[cluster_data['cluster_label'] == cluster_num]['title']
        title = title.values.tolist()
        cluster_details[cluster_num]['title'] = title
        
    return cluster_details

In [41]:
def print_cluster_details(cluster_details):
    for cluster_num, cluster_detail in cluster_details.items():
        print('####### Cluster {0}'.format(cluster_num))
        print('Top features:', cluster_detail['top_features'])
        print('Reviews 제목 :',cluster_detail['title'][:7])
        print('==================================================')

In [43]:
feature_names = tfidf_vect.get_feature_names()

cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=patent,\
                                  feature_names=feature_names, clusters_num=4, top_n_features=20 )
print_cluster_details(cluster_details)

####### Cluster 0
Top features: ['invention', 'present', 'present invention', 'relates', 'method', 'comprising', 'provides', 'use', 'disclosure', 'andor', 'using', 'having', 'process', 'device', 'material', 'comprises', 'used', 'provided', 'including', 'embodiment']
Reviews 제목 : ['Agricultural liquid fertilizer and chemical delivery system and method of use', 'Cooling and condensation device for a greenhouse', 'Nutritionally and botanically enhanced mycelial mass', 'Breeding method for tetraploid Ricinus communis', 'Endophytes and related methods', 'Hybrid squash plant named tribute', 'Soybean variety 01072782']
####### Cluster 1
Top features: ['device', 'data', 'method', 'user', 'information', 'based', 'signal', 'image', 'includes', 'control', 'plurality', 'unit', 'configured', 'network', 'provided', 'second', 'using', 'communication', 'value', 'set']
Reviews 제목 : ['Method for increasing the load on driving rear wheels of tractors during soil cultivation', 'System and method for presc

In [10]:
from keybert import KeyBERT
import collections.abc
from collections.abc import MutableMapping



In [23]:
def BERT(label):

    array_text = pd.DataFrame(patent[patent['cluster_label'] == label]['title']).to_numpy()

    bow = []
    from keybert import KeyBERT
    kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
    for j in range(len(array_text)):
        keywords = kw_extractor.extract_keywords(array_text[j][0])
        bow.append(keywords)
    
    new_bow = []
    for i in range(0, len(bow)):
        for j in range(len(bow[i])):
            new_bow.append(bow[i][j])
            
            
    keyword = pd.DataFrame(new_bow, columns=['keyword', 'weight'])
    return keyword.groupby('keyword').agg('sum').sort_values('weight', ascending=False)

In [24]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 12165742084834304620,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 21795504128
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 4921901873796321828
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:0a:00.0, compute capability: 8.6"]

In [25]:
b0=BERT(0)

In [27]:
b0

Unnamed: 0_level_0,weight
keyword,Unnamed: 1_level_1
method,1470.4924
methods,954.8962
device,767.4540
apparatus,502.9318
compositions,476.2132
...,...
bolus,0.1099
aie,0.1040
increases,0.0944
let,0.0580


In [32]:
b0['cluster_label']=0

In [33]:
b0

Unnamed: 0_level_0,weight,cluster_label
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
method,1470.4924,0
methods,954.8962,0
device,767.4540,0
apparatus,502.9318,0
compositions,476.2132,0
...,...,...
bolus,0.1099,0
aie,0.1040,0
increases,0.0944,0
let,0.0580,0


In [28]:
b1=BERT(1)

In [29]:
b1

Unnamed: 0_level_0,weight
keyword,Unnamed: 1_level_1
method,16669.5827
device,14444.1257
apparatus,11849.5854
methods,5985.3599
data,5642.5301
...,...
iort,0.1230
pvt1,0.1198
spoilers,0.1113
midrib,0.1069


In [34]:
b1['cluster_label']=1
b1

Unnamed: 0_level_0,weight,cluster_label
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
method,16669.5827,1
device,14444.1257,1
apparatus,11849.5854,1
methods,5985.3599,1
data,5642.5301,1
...,...,...
iort,0.1230,1
pvt1,0.1198,1
spoilers,0.1113,1
midrib,0.1069,1


In [30]:
b2=BERT(2)

In [31]:
b2

Unnamed: 0_level_0,weight
keyword,Unnamed: 1_level_1
device,12150.3457
method,8012.1922
apparatus,6089.9360
manufacturing,4704.8685
semiconductor,4095.5949
...,...
flowably,0.1590
pyrazino,0.1517
crocin,0.1394
conscious,0.1067


In [35]:
b2['cluster_label']=2
b2

Unnamed: 0_level_0,weight,cluster_label
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
device,12150.3457,2
method,8012.1922,2
apparatus,6089.9360,2
manufacturing,4704.8685,2
semiconductor,4095.5949,2
...,...,...
flowably,0.1590,2
pyrazino,0.1517,2
crocin,0.1394,2
conscious,0.1067,2


In [42]:
final=pd.concat([b0,b1,b2])
final.head(20)

Unnamed: 0_level_0,weight,cluster_label
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
method,1470.4924,0
methods,954.8962,0
device,767.454,0
apparatus,502.9318,0
compositions,476.2132,0
composition,424.056,0
treatment,414.5719,0
compounds,351.8426,0
thereof,296.7578,0
preparation,293.9982,0


In [43]:
final.to_csv('final.csv')