### Loading in the dataset downloaded from PatensView.org. Only for the year 2024.

In [8]:
import pandas as pd
data = pd.read_csv('sum_2024.tsv', sep='\t')

### Slicing the data into a random split of 0.02% of the original. Saving the df as csv file.

In [9]:
df = data.sample(frac=0.02, random_state=42)
df.to_csv('sum_2024_sample.tsv', sep='\t', index=False)
print(df.shape)
print(df.columns)

(4876, 2)
Index(['patent_id', 'summary_text'], dtype='object')


### Inspecting different summaries. Many of them have 'Background' and 'Technical field' which can help the classification and get rid of much of the 'legalese' language that can obscure the classification of the summary.

In [10]:
#df = pd.read_csv('sum_2024_sample.tsv', sep='\t')
#print(df['summary_text'].iloc[3])

### Function to extract 'Technical Field' and 'Background' from summaries.

In [24]:
import pandas as pd
import re

df = pd.read_csv('sum_2024_sample.tsv', sep='\t')

def extract_background_and_technical(text):
    # regex pattern to capture "TECHNICAL FIELD" and "BACKGROUND"
    pattern = r"(TECHNICAL FIELD.*?)(?=BACKGROUND|$)|(BACKGROUND.*?)(?=(\n[A-Z ]+\n)|$)"
    matches = re.findall(pattern, text, re.DOTALL)
    
    extracted_text = " ".join(match[0] + match[1] for match in matches)
    return extracted_text.strip()


# Apply the function to the summary text of the df, and saving it in a new column
# Saving the df to csv file
df['extracted_text'] = df['summary_text'].apply(extract_background_and_technical)
df.to_csv('sum_2024_sample.tsv', sep='\t', index=False)


4876


### Creating a retrieval corpus with predefined descriptions of industries. The corpus is made by ChatGPT from the following promt:

> "Write me retrieval corpus for a patent industry classifier. The corpus should contain a list of dictionaries, 
> where each dictionary has two keys: 'title' and 'text'. The 'title' key should contain the name of an industry, 
> and the 'text' key should contain a description of that industry. 
> The corpus should cover a diverse range of industries."

In [14]:
retrieval_corpus = [
    {
        "title": "Technology", 
        "text": "Breakthroughs in software engineering, cloud computing, artificial intelligence, machine learning algorithms, blockchain, quantum computing, digital communication systems, and virtual/augmented reality applications. Focuses on the design, development, and optimization of digital infrastructures and computing platforms."
    },
    {
        "title": "Healthcare", 
        "text": "Advancements in medical imaging, precision medicine, surgical robotics, telemedicine, wearable health devices, pharmaceutical innovations, genetic therapies, diagnostic tools, and healthcare management systems aimed at improving patient outcomes and reducing costs."
    },
    {
        "title": "Automotive", 
        "text": "Developments in electric and hybrid vehicles, autonomous navigation systems, advanced vehicle safety technologies, engine optimization, sustainable materials for car manufacturing, and innovations in intelligent transportation systems and infrastructure connectivity."
    },
    {
        "title": "Energy", 
        "text": "Progress in renewable energy sources such as solar panels, wind turbines, geothermal systems, battery storage technologies, nuclear energy advancements, smart grids, energy conservation methodologies, and exploration of alternative fuels like hydrogen."
    },
    {
        "title": "Manufacturing", 
        "text": "Innovations in additive manufacturing (3D printing), smart factories, industrial robotics, predictive maintenance, Internet of Things (IoT)-enabled production lines, advanced supply chain logistics, and automated quality control systems for high-precision manufacturing."
    },
    {
        "title": "Consumer Electronics", 
        "text": "Technologies for smart home automation, personal electronics like smartphones and tablets, virtual assistants, wearables, entertainment systems, gaming consoles, IoT-connected devices, and advancements in miniaturization and battery efficiency."
    },
    {
        "title": "Agriculture", 
        "text": "Revolutionary farming practices including drone-assisted crop monitoring, precision irrigation systems, soil analysis technologies, genetically engineered crops, sustainable pest control, vertical farming, and innovations in farm machinery and robotics."
    },
    {
        "title": "Telecommunications", 
        "text": "Innovations in wireless communication, 5G network infrastructure, satellite internet systems, IoT networks, advanced signal processing, fiber-optic technologies, secure communication protocols, and enhanced mobile device connectivity solutions."
    },
    {
        "title": "Aerospace", 
        "text": "Breakthroughs in space exploration technologies, satellite communication systems, aviation safety, unmanned aerial vehicles (drones), propulsion systems, composite materials for aircraft, and development of interplanetary travel solutions."
    },
    {
        "title": "Construction", 
        "text": "Advancements in sustainable building materials, modular construction, smart building technologies, green architecture, 3D-printed structures, construction robotics, structural engineering analysis, and innovations in urban infrastructure planning."
    },
    {
        "title": "Food and Beverage", 
        "text": "Research in food processing technologies, novel food preservation methods, sustainable packaging, nutritional optimization, alternative protein sources like plant-based and lab-grown meat, food safety systems, and beverage formulation innovations."
    },
    {
        "title": "Biotechnology", 
        "text": "Applications of genomics, bioinformatics, genetic engineering, synthetic biology, biopharmaceutical development, regenerative medicine, bioprocess engineering, and the creation of environmentally sustainable bio-based products."
    },
    {
        "title": "Chemicals", 
        "text": "Developments in petrochemicals, green chemistry practices, advanced polymer materials, industrial catalysts, nanomaterials, specialty coatings, chemical recycling technologies, and new methods for reducing the environmental impact of chemical production."
    },
    {
        "title": "Education", 
        "text": "Technologies for online learning platforms, virtual classrooms, interactive educational tools, gamification in learning, artificial intelligence-driven tutoring systems, personalized learning analytics, and advancements in pedagogical methodologies."
    },
    {
        "title": "Finance", 
        "text": "Innovations in digital payment systems, blockchain-based financial solutions, algorithmic trading, risk assessment tools, financial fraud detection, decentralized finance (DeFi) platforms, and advancements in personal finance management apps."
    },
    {
        "title": "Retail", 
        "text": "E-commerce optimization technologies, virtual shopping experiences, inventory management software, last-mile delivery solutions, customer behavior analytics, omnichannel retail strategies, and supply chain efficiency enhancements."
    },
    {
        "title": "Logistics and Transportation", 
        "text": "Advancements in freight management systems, route optimization algorithms, autonomous delivery systems, drone logistics, cold chain technologies, smart warehousing, and sustainable transportation solutions."
    },
    {
        "title": "Environmental Science", 
        "text": "Innovative approaches to renewable resource management, water desalination, air purification systems, carbon capture technologies, waste-to-energy processes, biodiversity conservation methods, and sustainable urban planning initiatives."
    },
    {
        "title": "Defense and Security", 
        "text": "Breakthroughs in cybersecurity frameworks, advanced surveillance technologies, autonomous defense systems, threat detection AI, counter-drone systems, biometric authentication, and innovations in personal and national security equipment."
    },
    {
        "title": "Entertainment and Media", 
        "text": "Innovations in digital content streaming, virtual production tools, immersive virtual reality (VR) and augmented reality (AR) experiences, gaming engines, interactive storytelling, and new distribution platforms for creative content."
    },
    {
        "title": "Textiles and Apparel", 
        "text": "Advances in smart fabrics, wearable technology, sustainable textile manufacturing, recycled materials, 3D knitting technology, fast fashion logistics, and design innovations for performance-oriented clothing."
    },
    {
        "title": "Mining and Materials", 
        "text": "Progress in mineral exploration technologies, sustainable mining practices, advanced metallurgy, composite material development, rare earth element processing, and innovations in resource extraction and refinement."
    },
    {
        "title": "Real Estate", 
        "text": "Smart property management platforms, innovations in real estate investment analysis, green building certifications, virtual property tours, urban planning technologies, and advancements in sustainable housing solutions."
    },
    {
        "title": "Pharmaceuticals", 
        "text": "Research in drug discovery pipelines, biologics manufacturing, personalized medicine, controlled release drug delivery systems, vaccine development, and innovative methods for pharmaceutical formulation and production."
    },
    {
        "title": "Insurance", 
        "text": "Predictive analytics for risk assessment, advancements in insurtech platforms, automated claims processing, customer experience innovations, usage-based insurance models, and AI-powered fraud detection tools."
    },
    {
        "title": "Maritime", 
        "text": "Technological developments in marine navigation, shipbuilding innovations, sustainable shipping practices, ocean exploration equipment, offshore wind energy systems, and advancements in underwater robotics."
    },
    {
        "title": "Sports and Recreation", 
        "text": "Innovations in wearable fitness trackers, smart sports equipment, virtual coaching systems, stadium technologies, e-sports platforms, and advancements in materials for athletic performance optimization."
    }
]

In [28]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import nltk
from dash.exceptions import PreventUpdate
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

df = pd.read_csv('sum_2024_sample.tsv', sep='\t')

retrieval_corpus = [
    {
        "title": "Technology", 
        "text": "Breakthroughs in software engineering, cloud computing, artificial intelligence, machine learning algorithms, blockchain, quantum computing, digital communication systems, and virtual/augmented reality applications. Focuses on the design, development, and optimization of digital infrastructures and computing platforms."
    },
    {
        "title": "Healthcare", 
        "text": "Advancements in medical imaging, precision medicine, surgical robotics, telemedicine, wearable health devices, pharmaceutical innovations, genetic therapies, diagnostic tools, and healthcare management systems aimed at improving patient outcomes and reducing costs."
    },
    {
        "title": "Automotive", 
        "text": "Developments in electric and hybrid vehicles, autonomous navigation systems, advanced vehicle safety technologies, engine optimization, sustainable materials for car manufacturing, and innovations in intelligent transportation systems and infrastructure connectivity."
    },
    {
        "title": "Energy", 
        "text": "Progress in renewable energy sources such as solar panels, wind turbines, geothermal systems, battery storage technologies, nuclear energy advancements, smart grids, energy conservation methodologies, and exploration of alternative fuels like hydrogen."
    },
    {
        "title": "Manufacturing", 
        "text": "Innovations in additive manufacturing (3D printing), smart factories, industrial robotics, predictive maintenance, Internet of Things (IoT)-enabled production lines, advanced supply chain logistics, and automated quality control systems for high-precision manufacturing."
    },
    {
        "title": "Consumer Electronics", 
        "text": "Technologies for smart home automation, personal electronics like smartphones and tablets, virtual assistants, wearables, entertainment systems, gaming consoles, IoT-connected devices, and advancements in miniaturization and battery efficiency."
    },
    {
        "title": "Agriculture", 
        "text": "Revolutionary farming practices including drone-assisted crop monitoring, precision irrigation systems, soil analysis technologies, genetically engineered crops, sustainable pest control, vertical farming, and innovations in farm machinery and robotics."
    },
    {
        "title": "Telecommunications", 
        "text": "Innovations in wireless communication, 5G network infrastructure, satellite internet systems, IoT networks, advanced signal processing, fiber-optic technologies, secure communication protocols, and enhanced mobile device connectivity solutions."
    },
    {
        "title": "Aerospace", 
        "text": "Breakthroughs in space exploration technologies, satellite communication systems, aviation safety, unmanned aerial vehicles (drones), propulsion systems, composite materials for aircraft, and development of interplanetary travel solutions."
    },
    {
        "title": "Construction", 
        "text": "Advancements in sustainable building materials, modular construction, smart building technologies, green architecture, 3D-printed structures, construction robotics, structural engineering analysis, and innovations in urban infrastructure planning."
    },
    {
        "title": "Food and Beverage", 
        "text": "Research in food processing technologies, novel food preservation methods, sustainable packaging, nutritional optimization, alternative protein sources like plant-based and lab-grown meat, food safety systems, and beverage formulation innovations."
    },
    {
        "title": "Biotechnology", 
        "text": "Applications of genomics, bioinformatics, genetic engineering, synthetic biology, biopharmaceutical development, regenerative medicine, bioprocess engineering, and the creation of environmentally sustainable bio-based products."
    },
    {
        "title": "Chemicals", 
        "text": "Developments in petrochemicals, green chemistry practices, advanced polymer materials, industrial catalysts, nanomaterials, specialty coatings, chemical recycling technologies, and new methods for reducing the environmental impact of chemical production."
    },
    {
        "title": "Education", 
        "text": "Technologies for online learning platforms, virtual classrooms, interactive educational tools, gamification in learning, artificial intelligence-driven tutoring systems, personalized learning analytics, and advancements in pedagogical methodologies."
    },
    {
        "title": "Finance", 
        "text": "Innovations in digital payment systems, blockchain-based financial solutions, algorithmic trading, risk assessment tools, financial fraud detection, decentralized finance (DeFi) platforms, and advancements in personal finance management apps."
    },
    {
        "title": "Retail", 
        "text": "E-commerce optimization technologies, virtual shopping experiences, inventory management software, last-mile delivery solutions, customer behavior analytics, omnichannel retail strategies, and supply chain efficiency enhancements."
    },
    {
        "title": "Logistics and Transportation", 
        "text": "Advancements in freight management systems, route optimization algorithms, autonomous delivery systems, drone logistics, cold chain technologies, smart warehousing, and sustainable transportation solutions."
    },
    {
        "title": "Environmental Science", 
        "text": "Innovative approaches to renewable resource management, water desalination, air purification systems, carbon capture technologies, waste-to-energy processes, biodiversity conservation methods, and sustainable urban planning initiatives."
    },
    {
        "title": "Defense and Security", 
        "text": "Breakthroughs in cybersecurity frameworks, advanced surveillance technologies, autonomous defense systems, threat detection AI, counter-drone systems, biometric authentication, and innovations in personal and national security equipment."
    },
    {
        "title": "Entertainment and Media", 
        "text": "Innovations in digital content streaming, virtual production tools, immersive virtual reality (VR) and augmented reality (AR) experiences, gaming engines, interactive storytelling, and new distribution platforms for creative content."
    },
    {
        "title": "Textiles and Apparel", 
        "text": "Advances in smart fabrics, wearable technology, sustainable textile manufacturing, recycled materials, 3D knitting technology, fast fashion logistics, and design innovations for performance-oriented clothing."
    },
    {
        "title": "Mining and Materials", 
        "text": "Progress in mineral exploration technologies, sustainable mining practices, advanced metallurgy, composite material development, rare earth element processing, and innovations in resource extraction and refinement."
    },
    {
        "title": "Real Estate", 
        "text": "Smart property management platforms, innovations in real estate investment analysis, green building certifications, virtual property tours, urban planning technologies, and advancements in sustainable housing solutions."
    },
    {
        "title": "Pharmaceuticals", 
        "text": "Research in drug discovery pipelines, biologics manufacturing, personalized medicine, controlled release drug delivery systems, vaccine development, and innovative methods for pharmaceutical formulation and production."
    },
    {
        "title": "Insurance", 
        "text": "Predictive analytics for risk assessment, advancements in insurtech platforms, automated claims processing, customer experience innovations, usage-based insurance models, and AI-powered fraud detection tools."
    },
    {
        "title": "Maritime", 
        "text": "Technological developments in marine navigation, shipbuilding innovations, sustainable shipping practices, ocean exploration equipment, offshore wind energy systems, and advancements in underwater robotics."
    },
    {
        "title": "Sports and Recreation", 
        "text": "Innovations in wearable fitness trackers, smart sports equipment, virtual coaching systems, stadium technologies, e-sports platforms, and advancements in materials for athletic performance optimization."
    }
]

# NLTK setup for text preprocessing
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()
# Load the pre-trained Sentence Transformer model for embedding text
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to preprocess text by removing stopwords and lemmatizing words
def remove_stopwords_and_lemmatize(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    # Tokenize the text
    words = word_tokenize(text)
    # Lemmatize each word and remove stopwords
    lemmatized_words = [
        lemmatizer.lemmatize(word) 
        for word in words if word.lower() not in stop_words
    ]
    # Return the lemmatized, stopword-free text
    return ' '.join(lemmatized_words)

# There are some empty strings in the 'extracted_text' column, which need to be removed
df_clean = df.dropna(subset=['extracted_text'])  
df_clean = df_clean[df_clean['extracted_text'].str.strip() != '']  

# Using the 'remove_stopwords_and_lemmatize' function to preprocess the patent summaries
patent_summaries = [remove_stopwords_and_lemmatize(text) for text in df_clean['extracted_text'].tolist()]

# Embed the preprocessed patent summaries using the Sentence Transformer model
patent_embeddings = model.encode(patent_summaries)

# Embed the industry descriptions using the Sentence Transformer model
industry_descriptions = [remove_stopwords_and_lemmatize(doc['text']) for doc in retrieval_corpus]
industry_embeddings = model.encode(industry_descriptions)

"""
In order to evaluate the 'goodness' of classification with cosine-similarity I use a 
silhouette score inspired method. The silhouette score is a measure of how similar an object is 
to its own cluster (cohesion) compared to other clusters (separation). In this case, I can
calculate a silhouette score for each patent summary based on its cosine similarity to the top 5
industry descriptions and its separation from the remaining industry descriptions. 

"""

def calculate_cosine_silhouette_score(patent_embedding, top_industries, top_similarities, all_industries, all_similarities):
    # Closeness: mean similarity to the top 5 industry descriptions
    top_closeness = np.mean(top_similarities)  

    # Separation: mean similarity to the remaining industry descriptions (those not in the top 5)
    remaining_similarities = [similarity for i, similarity in enumerate(all_similarities) if industry_titles[i] not in top_industries]
    separation = np.mean(remaining_similarities)  

    # Calculate the cosine similarity-based silhouette score
    silhouette_score = (top_closeness - separation) / max(top_closeness, separation)
    return silhouette_score


silhouette_scores = []
industry_titles = [doc['title'] for doc in retrieval_corpus]  

for patent_embedding in patent_embeddings:
    # Calculate cosine similarities between the patent and all industry descriptions
    similarities = cosine_similarity([patent_embedding], industry_embeddings)[0]

    # Get the top 5 most similar industries and fetch industry titles for them
    top_indices = similarities.argsort()[-5:][::-1]  
    top_industries = [industry_titles[i] for i in top_indices]  
    top_similarities = [similarities[i] for i in top_indices]

    # Get the remaining similarities (industries not in the top 5)
    all_industries = [industry_titles[i] for i in range(len(industry_titles))]
    all_similarities = similarities.tolist()

    # Calculate the silhouette score for the patent summary and append it to the score list
    score = calculate_cosine_silhouette_score(patent_embedding, top_industries, top_similarities, all_industries, all_similarities)
    silhouette_scores.append(score)


df_clean['silhouette_score'] = silhouette_scores
df = df.merge(df_clean[['patent_id', 'silhouette_score']], on='patent_id', how='left')
print(df[['patent_id', 'silhouette_score']])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


     patent_id  silhouette_score
0     11970780          0.522239
1     12094995          0.490203
2     12061367          0.651340
3     12022342          0.503256
4     11945748          0.782131
...        ...               ...
4871   PP36001          0.653154
4872  11906217          0.390187
4873  12087041          0.652972
4874  11952755          0.529128
4875  12031125          0.784419

[4876 rows x 2 columns]


In [29]:
average_silhouette_score = df['silhouette_score'].mean()
print(average_silhouette_score)

0.5356275482558656


### An average silhouette score of 0.54 suggests that the summaries are moderately well classified in terms of their similarities with pre-defined industry descriptions. 