In [33]:
from docx import Document
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\johnt\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [68]:
def read_doc(file_path, patent_start=r'^\s*US[0-9A-Z]*$'):
    doc = Document(file_path)
    labels = []
    patents = []
    current_patent_text = []
    
    for i, paragraph in enumerate(doc.paragraphs):
        if re.search(patent_start, paragraph.text):
            labels.append(paragraph.text)
            if current_patent_text:
                patents.append('\n'.join(current_patent_text))
            current_patent_text = []
        else:
            # If not a patent start, add the paragraph text to the current patent text
            current_patent_text.append(paragraph.text)
            
    # Add the last patent text
    if current_patent_text:
        patents.append('\n'.join(current_patent_text))

    return patents, labels

In [69]:
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [70]:
def calculate_tfidf(corpus):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names_out()
    return tfidf_matrix, feature_names

In [91]:
def extract_keywords(tfidf_matrix, feature_names, top_n=10):
    document_tfidf = tfidf_matrix[0].toarray()[0]
    keywords = [feature_names[i] for i in document_tfidf.argsort()[-top_n:][::-1]]
    return keywords

In [71]:
patents, labels = read_doc(r'C:\Users\johnt\Downloads\20240106021625210-AMD (2).docx')

In [72]:
for i in range(len(patents)):
    patents[i] = preprocess_text(patents[i])

In [74]:
label2pat = dict(zip(labels, patents))

In [98]:
patent_keywords = {}

In [100]:
for label, patent in label2pat.items():
    X, feature_names = calculate_tfidf([patent])
    keywords = extract_keywords(X, feature_names)
    
    patent_keywords[label] = keywords

In [107]:
patent_keywords[labels[95]]

['image',
 'sampling',
 'first',
 'one',
 'pattern',
 'embodiment',
 'second',
 'pixel',
 'processor',
 'lines']