In [70]:
from docx import Document
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\johnt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def read_doc(file_path, patent_start=r'^\s*US[0-9A-Z]*$'):
    doc = Document(file_path)
    labels = []
    patents = []
    current_patent_text = []
    
    for i, paragraph in enumerate(doc.paragraphs):
        if re.search(patent_start, paragraph.text):
            labels.append(paragraph.text)
            if current_patent_text:
                patents.append('\n'.join(current_patent_text))
            current_patent_text = []
        else:
            # If not a patent start, add the paragraph text to the current patent text
            current_patent_text.append(paragraph.text)
            
    # Add the last patent text
    if current_patent_text:
        patents.append('\n'.join(current_patent_text))

    return patents, labels

In [43]:
def preprocess_text(text):
    stop_words = stopwords.words('english')
    stop_words.append('may')
    stop_words = set(stop_words)
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [44]:
def calculate_tfidf(corpus):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()
    return tfidf_matrix, feature_names

In [45]:
def extract_keywords(tfidf_matrix, feature_names, top_n=10):
    document_tfidf = tfidf_matrix[0].toarray()[0]
    keywords = [feature_names[i] for i in document_tfidf.argsort()[-top_n:][::-1]]
    return keywords

In [46]:
patents, labels = read_doc(r'C:\Users\johnt\Downloads\20240106021625210-AMD (2).docx')

In [47]:
for i in range(len(patents)):
    patents[i] = preprocess_text(patents[i])

In [48]:
label2pat = dict(zip(labels, patents))

In [49]:
patent_keywords = {}

In [50]:
for label, patent in label2pat.items():
    X, feature_names = calculate_tfidf([patent])
    keywords = extract_keywords(X, feature_names)
    
    patent_keywords[label] = keywords

In [51]:
word_set = set(word for words_list in patent_keywords.values() for word in words_list)

In [52]:
word_count = Counter(word for words_list in patent_keywords.values() for word in words_list)

In [104]:
keyword2patent = {}
for word in word_set:
    patent_list = []
    
    for patent, kw_list in patent_keywords.items():
        if word in kw_list:
            patent_list.append(patent)
    
    patent_str = ', '.join(patent_list)
    keyword2patent[word] = patent_str

In [105]:
patent_df = pd.DataFrame(patent_keywords)
keyword2patent = list(keyword2patent.items())
keyword_df = pd.DataFrame(keyword2patent, columns=["Keyword", "Patents"])

In [106]:
patent_df = patent_df.T

In [109]:
keyword_df.set_index('Keyword', inplace=True)

In [110]:
patent_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
US8053849B2,layer,gate,metal,oxide,dielectric,carbon,invention,amorphous,present,electrode
US8753943B2,gate,layer,metal,dielectric,oxide,invention,present,electrode,carbon,amorphous
US8445975B2,gate,layer,metal,dielectric,oxide,present,invention,electrode,carbon,silicon
US8489898B2,samu,code,data,secure,kernel,processor,application,memory,boot,key
US8904190B2,samu,code,secure,processor,kernel,data,memory,boot,sensitive,aes
...,...,...,...,...,...,...,...,...,...,...
US10354365B2,image,sampling,first,one,pattern,embodiment,second,pixel,processor,lines
US10956044B2,memory,timing,data,parameters,logic,one,region,controller,access,profiling
US11474703B2,memory,timing,data,logic,parameters,one,controller,access,region,profiling
US20230142598A1,memory,timing,logic,parameters,data,controller,one,profiling,system,die


In [111]:
keyword_df

Unnamed: 0_level_0,Patents
Keyword,Unnamed: 1_level_1
ofdm,"US8306148B2, US8625715B2"
addresses,US10019365B2
symbol,"US8306148B2, US8625715B2"
oxide,"US8053849B2, US8753943B2, US8445975B2"
microcode,US7734873B2
...,...
delay,US9679345B2
network,"US9065722B2, US9825843B2"
alloy,"US7893503B2, US8039335B2, US7741167B2, US8..."
bit,US11095910B2


In [112]:
patent_df.to_csv(".\patent_to_keyword.csv")

In [113]:
keyword_df.to_csv(".\keyword_to_patent.csv")