# Patent data extraction
- Uses the PatentsView PatentSearch API
- Request an API key here: https://patentsview-support.atlassian.net/servicedesk/customer/portal/1/group/1/create/18
- Target patents: the on-device AI domain

In [None]:
import json
import os
import time

import pandas as pd
import requests

# The key is read from the PATENTSVIEW_API_KEY environment variable so the
# real secret never has to be written into the notebook; the literal below is
# only the placeholder used when the variable is not set.
API_KEY = os.environ.get("PATENTSVIEW_API_KEY", "actual key")  # private key
# PatentSearch endpoint that every page request is POSTed to.
endpoint = "https://search.patentsview.org/api/v1/patent/"

def _post_page(payload, headers, timeout, max_retries=3):
    """POST one page request, pausing and retrying while the API answers 429.

    Returns the last ``requests.Response`` received; the caller is
    responsible for checking its status.
    """
    attempt = 0
    while True:
        resp = requests.post(
            endpoint, headers=headers, json=payload, timeout=timeout
        )
        if resp.status_code != 429 or attempt >= max_retries:
            return resp
        attempt += 1
        # Use the Retry-After hint when it is a plain number of seconds;
        # otherwise fall back to a short fixed pause.
        try:
            wait = float(resp.headers.get("Retry-After", ""))
        except ValueError:
            wait = 5.0
        time.sleep(max(wait, 0.0))


def fetch_patents_new(query_obj, fields, per_page=1000, max_pages=None,
                      timeout=60):
    """Download the patents matching a query and return them as a DataFrame.

    Results are requested sorted by ``patent_date`` then ``patent_id``
    (ascending). Each follow-up request passes those two values from the last
    record of the previous page as the ``after`` option, so paging continues
    from where the previous page ended.

    Args:
        query_obj: Query object sent as the ``q`` member of the request.
        fields: Field names sent as the ``f`` member. Must contain
            ``patent_date`` and ``patent_id`` whenever a second page is
            requested, because the cursor is built from them.
        per_page: Requested page size; values above 1000 are capped to 1000.
        max_pages: Stop after this many pages. ``None`` or ``0`` means no cap.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        pandas.DataFrame built from all collected patent records (empty when
        nothing matched).

    Raises:
        ValueError: If ``per_page`` is smaller than 1.
        RuntimeError: If the API answers with a non-success HTTP status or a
            payload whose ``error`` member is truthy.
        KeyError: If a record lacks a field needed to build the next cursor.
        requests.RequestException: On network failures or timeouts.
    """
    if per_page < 1:
        raise ValueError(f"per_page must be >= 1, got {per_page}")

    all_patents = []
    page = 0
    size = min(per_page, 1000)
    after = None

    headers = {
        "X-Api-Key": API_KEY,
        "Content-Type": "application/json",
    }

    while True:
        options = {"size": size}
        if after is not None:
            options["after"] = after

        payload = {
            "q": query_obj,
            "f": fields,
            "o": options,
            # Ascending sort on both keys; the cursor below must list its
            # values in this same order.
            "s": [
                {"patent_date": "asc"},
                {"patent_id": "asc"},
            ],
        }

        resp = _post_page(payload, headers, timeout)

        # Without this check an auth or rate-limit response (whose body has
        # no "patents" member) would look like "no more results" and the
        # download would stop early without any error.
        if not resp.ok:
            reason = resp.headers.get("X-Status-Reason", resp.reason)
            raise RuntimeError(
                f"API error: HTTP {resp.status_code} ({reason})"
            )

        data = resp.json()

        if data.get("error"):
            raise RuntimeError(f"API error: {data}")

        batch = data.get("patents", [])
        if not batch:
            break

        all_patents.extend(batch)
        page += 1
        print(f"Fetched batch {page}, total so far: {len(all_patents)}")

        # page limit
        if max_pages and page >= max_pages:
            break

        last = batch[-1]
        try:
            after = [
                last["patent_date"],  # 1st sort key
                last["patent_id"],    # 2nd sort key
            ]
        except KeyError as exc:
            raise KeyError(
                f"'fields' must include {exc.args[0]!r} to page past the "
                "first batch"
            ) from None

        # Pause between pages to stay gentle on the API.
        time.sleep(1)

    return pd.DataFrame(all_patents)


# ================== ondevice ai domain ==================

# PatentSearch text operators used by the query below.
_ANY = "_text_any"
_ALL = "_text_all"
_PHRASE = "_text_phrase"


def _abstract_clause(operator, text):
    """Return one text-search clause applied to the patent abstract."""
    return {operator: {"patent_abstract": text}}


def _abstract_clauses(pairs):
    """Expand (operator, text) pairs into abstract clauses, keeping order."""
    return [_abstract_clause(operator, text) for operator, text in pairs]


def _with_device_context(topic):
    """Combine a ``_text_all`` clause for ``topic`` with a device-word ``_or``."""
    device_words = (
        "device", "edge", "embedded", "processor",
        "sensor", "chip", "controller", "robot",
    )
    return {
        "_and": [
            _abstract_clause(_ALL, topic),
            {"_or": [_abstract_clause(_ANY, word) for word in device_words]},
        ]
    }


# ---- Group 1: device / hardware / embedded signals ----
_DEVICE_SIGNALS = [
    (_PHRASE, "ondevice"),
    (_PHRASE, "on-device"),
    (_PHRASE, "ondevice ai"),
    (_ALL, "edge device"),
    (_ALL, "edge ai"),
    (_ALL, "embedded device"),
    (_ALL, "embedded ai"),
    (_ALL, "IoT device"),
    (_ALL, "mobile device"),
    (_ALL, "wearable device"),
    (_ALL, "ai device"),
    (_ALL, "smart controller"),
    (_ALL, "intelligent controller"),
    (_PHRASE, "autonomous device"),
    (_PHRASE, "wearable AI"),
    (_PHRASE, "on-device vision"),
    (_PHRASE, "vision device"),
    (_ANY, "AIoT"),
    (_ANY, "robot"),
    (_ANY, "sensor"),
    (_ANY, "processor"),
]

# ---- Group 2: inference / model / hardware / low-power signals ----
_CAPABILITY_SIGNALS = [
    # Inference
    (_ALL, "on-device inference"),
    (_ALL, "local inference"),
    (_ALL, "real-time inference"),
    (_ALL, "low-latency inference"),
    (_ALL, "inference accelerator"),
    (_ALL, "distributed inference"),
    (_ANY, "inference"),
    (_ALL, "on-device processing"),
    (_ALL, "device-side inference"),
    (_ALL, "onboard inference"),
    # Light model / ML, DL
    (_ALL, "lightweight model"),
    (_ALL, "quantized model"),
    (_ALL, "quantized neural network"),
    (_ALL, "compressed model"),
    (_ANY, "TinyML"),
    (_ALL, "federated learning"),
    # Model execution / learning on device
    (_ALL, "device-side learning"),
    (_ALL, "onboard computing"),
    (_ALL, "local model execution"),
    (_ALL, "on-device model execution"),
    # Efficient model architectures
    (_ALL, "pruned model"),
    (_ALL, "model pruning"),
    (_ALL, "sparse model"),
    (_ALL, "sparsity"),
    (_ALL, "distilled model"),
    (_ALL, "knowledge distillation"),
    (_ALL, "efficient neural network"),
    (_ALL, "mobile neural network"),
    # Hardware acceleration
    (_ALL, "hardware accelerator"),
    (_ANY, "DSP"),
    (_ANY, "ASIC"),
    (_ALL, "inference ASIC"),
    (_ALL, "FPGA"),
    (_ALL, "FPGA inference"),
    (_PHRASE, "AI chip"),
    (_ANY, "NPU"),
    (_ANY, "TPU"),
    (_PHRASE, "ai accelerator"),
    # Low-power / resource constraints
    (_ALL, "low power inference"),
    (_ALL, "power-efficient"),
    (_ALL, "energy-efficient"),
    (_ALL, "energy-efficient inference"),
    (_ALL, "resource-constrained device"),
    (_ALL, "limited-resource device"),
    (_ALL, "compute-constrained device"),
]

# ---- Group 2 (tail): connectivity signals ----
_CONNECTIVITY_SIGNALS = [
    (_ALL, "offline AI"),
    (_ALL, "disconnected AI"),
    (_ALL, "edge computing"),
]

# An abstract must match at least one clause of each "_or" group and fall
# inside the date window.
query = {
    "_and": [
        {"_or": _abstract_clauses(_DEVICE_SIGNALS)},
        {
            "_or": [
                *_abstract_clauses(_CAPABILITY_SIGNALS),
                # AI + device, then GenAI + device
                _with_device_context("artificial intelligence"),
                _with_device_context("generative ai"),
                *_abstract_clauses(_CONNECTIVITY_SIGNALS),
            ]
        },
        # ========== date ==========
        {"_gte": {"patent_date": "2014-01-01"}},
        {"_lte": {"patent_date": "2024-12-31"}},
    ]
}


# Columns requested for every patent record. patent_date and patent_id are
# also the values fetch_patents_new reads to build its paging cursor.
fields = ["patent_id", "patent_title", "patent_abstract", "patent_date"]


# Run the download with no page cap, then preview the first five rows.
df = fetch_patents_new(query_obj=query, fields=fields, per_page=1000)
print(df.head(n=5))


In [None]:
# Keep only the four columns of interest, in a fixed order.
newdf = df[['patent_id', 'patent_title', 'patent_abstract', 'patent_date']]
# DataFrame.shape is an attribute holding a (rows, columns) tuple, not a
# method: calling it raised "TypeError: 'tuple' object is not callable".
newdf.shape  # (14333, 4)

# Crunchbase data
- The Crunchbase dataset used in this project was obtained through a membership license and therefore cannot be publicly shared. Only the processing scripts are provided.


In [None]:
# Load the Crunchbase export. 'utf-8-sig' strips a leading byte-order mark if
# one is present; encoding_errors='ignore' drops bytes that cannot be decoded
# instead of raising.
df2 = pd.read_csv(
    '/content/new.csv',
    encoding='utf-8-sig',
    encoding_errors='ignore',
)

# The recorded cell output had been fused into the expression, which called
# the shape tuple and raised TypeError; it is kept here as a comment instead.
df2.shape  # (1520, 42)

# Concatenate

In [None]:
# NOTE(review): the name `abs` shadows the builtin abs(); it is kept because
# later cells read it.
_patent_cols = ['patent_id', 'patent_abstract', 'patent_date']
abs = (
    df.dropna(subset=_patent_cols)[_patent_cols]
    .drop_duplicates(subset='patent_id')
)

# Rows must have a non-null 'Description' as well, even though only
# 'Full Description' is carried forward.
_startup_required = ['Organization Name', 'Description', 'Full Description', 'Founded Date']
_startup_cols = ['Organization Name', 'Full Description', 'Founded Date']
cru = (
    df2.dropna(subset=_startup_required)[_startup_cols]
    .drop_duplicates(subset='Organization Name')
)

In [None]:
# Column mappings that bring both tables onto the shared id / text / date schema.
_patent_renames = {'patent_id': 'id', 'patent_abstract': 'text', 'patent_date': 'date'}
_startup_renames = {'Organization Name': 'id', 'Full Description': 'text', 'Founded Date': 'date'}

# Tag each row with its origin before the two tables are stacked.
abs['source'] = 'patent'
abs = abs.rename(_patent_renames, axis='columns')

cru['source'] = 'startup'
cru = cru.rename(_startup_renames, axis='columns')

# Stack patents on top of startups and renumber the index from 0.
combined = pd.concat((abs, cru), ignore_index=True)
combined

In [None]:
# Convert the date column to datetime unless it already has a datetime dtype.
_dates = combined['date']
if not pd.api.types.is_datetime64_any_dtype(_dates):
    combined['date'] = pd.to_datetime(_dates)

# Train window is 2014-01-01 .. 2022-12-31 inclusive; anything later is test.
start_date = pd.Timestamp('2014-01-01')
end_date = pd.Timestamp('2022-12-31')

# Rows dated before the window are only counted, not used in either split.
before_2014_data = combined[combined['date'] < start_date]
train_data = combined[combined['date'].between(start_date, end_date)]
test_data = combined[combined['date'] > end_date]

_splits = (("train", train_data), ("test", test_data))

# Sizes
print(f"before 2014: {before_2014_data.shape}")
for _label, _frame in _splits:
    print(f"{_label} shape: {_frame.shape}")

# Source distribution
for _label, _frame in _splits:
    print(f"\n{_label} source distribution:")
    print(_frame['source'].value_counts())

# Date range
for _label, _frame in _splits:
    print(f"\n{_label} date range:")
    print(f"start: {_frame['date'].min()}")
    print(f"end: {_frame['date'].max()}")

In [None]:
# Write both splits to the working directory without the index column.
for _path, _frame in (('train_data.csv', train_data), ('test_data.csv', test_data)):
    _frame.to_csv(_path, index=False)