In [1]:
import pandas as pd

In [2]:
dtypes = {
    "patent_id": "string",
    "forward_citations": "int64",
}
fwd_citations = pd.read_csv("./data/final_fwdcitation.csv", dtype=dtypes)

fwd_citations.head()

Unnamed: 0,patent_id,patent_type,patent_date,patent_title,wipo_kind,forward_citations
0,10000000,utility,2018-06-19,Coherent LADAR using intra-pixel quadrature de...,B2,13
1,10000001,utility,2018-06-19,Injection molding machine and mold thickness c...,B2,0
2,10000002,utility,2018-06-19,Method for manufacturing polymer film and co-e...,B2,0
3,10000003,utility,2018-06-19,Method for producing a container from a thermo...,B2,2
4,10000004,utility,2018-06-19,"Process of obtaining a double-oriented film, c...",B2,0


In [3]:
dtypes = {
    "patent_id": "string",
    "cpc_sequence": "int64",
}
cpc = pd.read_csv(
    "./data/g_cpc_current.tsv", 
    sep="\t",
    dtype=dtypes,
    )
cpc.head()

Unnamed: 0,patent_id,cpc_sequence,cpc_section,cpc_class,cpc_subclass,cpc_group,cpc_type
0,3950000,0,A,A63,A63C,A63C9/001,inventional
1,3950000,1,A,A63,A63C,A63C9/00,inventional
2,3950000,2,A,A63,A63C,A63C9/002,inventional
3,3950000,3,A,A63,A63C,A63C9/081,inventional
4,3950001,0,A,A63,A63C,A63C9/086,inventional


In [4]:
citations_dtypes = {
    "patent_id": "string",
    "citation_patent_id": "string",
}
citations = pd.read_csv(
    "./data/g_us_patent_citation.tsv", 
    sep="\t",
    dtype=citations_dtypes,
    )
citations.head()

  citations = pd.read_csv(


Unnamed: 0,patent_id,citation_sequence,citation_patent_id,citation_date,record_name,wipo_kind,citation_category
0,10000000,0,5093563,1992-03-01,Small,A,cited by examiner
1,10000000,1,5751830,1998-05-01,Hutchinson,A,cited by applicant
2,10000001,0,7804268,2010-09-01,Park,B2,cited by examiner
3,10000001,1,9022767,2015-05-01,Oono,B2,cited by examiner
4,10000001,2,9090016,2015-07-01,Takeuchi,B2,cited by examiner


In [5]:
fwd_citations = fwd_citations[["patent_id", "forward_citations"]]
cpc = cpc[["patent_id", "cpc_sequence", "cpc_subclass", "cpc_group"]]
citations = citations[["patent_id", "citation_sequence", "citation_patent_id"]]

In [6]:
num_patents = fwd_citations["patent_id"].nunique()
print(f"Number of patents: {num_patents}")

Number of patents: 7507819


In [7]:
from itertools import combinations
import os

#1. Group by patent_id and collect all cpc_subclasses for each patent
# This will create a dataframe where each patent_id maps to a set of its cpc_subclasses
cpc_subclasses = cpc.groupby("patent_id")["cpc_subclass"].apply(set)
cpc_subclasses = pd.merge(
    cpc_subclasses,
    fwd_citations.set_index("patent_id"), 
    left_index=True, 
    right_index=True,
    how="right",
    validate="1:1"
)
cpc_subclasses = cpc_subclasses["cpc_subclass"]
cpc_subclasses = cpc_subclasses.rename("cpc_subclasses")
cpc_subclasses.head()


patent_id
10000000                                  {G01S}
10000001                            {G05B, B29C}
10000002    {B29C, B29D, B29K, B60C, B32B, B29L}
10000003                {B29L, B29C, B29K, B29D}
10000004                      {B29C, B29K, B29L}
Name: cpc_subclasses, dtype: object

In [8]:
# Check the number of patents with no cpc subclasses cited
cpc_subclasses.isna().sum()

763040

In [9]:
import ast
# 2. Generate all unique pairs of cpc_subclasses for each patent
def generate_pairs(cpc_subclass: set) -> list:
    if np.isnan(cpc_subclass):
        return []
    if not isinstance(cpc_subclass, set):
        cpc_subclass = set(cpc_subclass)
    if not cpc_subclass:
        return []
    return list(combinations(sorted(cpc_subclass), 2))

patent_cpc_subclass_pairs = None
if not os.path.exists("./data/cpc_subclass_pairs.csv"):
    patent_cpc_subclass_pairs = cpc_subclasses.apply(generate_pairs)
    patent_cpc_subclass_pairs = patent_cpc_subclass_pairs.rename("cpc_subclass_pairs")
    patent_cpc_subclass_pairs.to_csv("./data/cpc_subclass_pairs.csv")
else:
    patent_cpc_subclass_pairs = pd.read_csv("./data/cpc_subclass_pairs.csv", index_col=0, dtype={"patent_id": "string"})
    patent_cpc_subclass_pairs["cpc_subclass_pairs"] = patent_cpc_subclass_pairs["cpc_subclass_pairs"].apply(ast.literal_eval)
    patent_cpc_subclass_pairs = patent_cpc_subclass_pairs["cpc_subclass_pairs"]
patent_cpc_subclass_pairs.head()

patent_id
10000000                                                   []
10000001                                       [(B29C, G05B)]
10000002    [(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...
10000003    [(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...
10000004           [(B29C, B29K), (B29C, B29L), (B29K, B29L)]
Name: cpc_subclass_pairs, dtype: object

In [10]:
# 3. Flatten the list of pairs into a long-form DataFrame
# Each row maps a patent to one of its cpc subclass pair, or NaN if it has fewer than 2 cpc subclasses cited
patent_cpc_subclass_pairs_long = patent_cpc_subclass_pairs.explode()
patent_cpc_subclass_pairs_long = patent_cpc_subclass_pairs_long.rename("cpc_subclass_pair")
patent_cpc_subclass_pairs_long = patent_cpc_subclass_pairs_long.to_frame().reset_index()
patent_cpc_subclass_pairs_long.head()

Unnamed: 0,patent_id,cpc_subclass_pair
0,10000000,
1,10000001,"(B29C, G05B)"
2,10000002,"(B29C, B29D)"
3,10000002,"(B29C, B29K)"
4,10000002,"(B29C, B29L)"


In [11]:
# 4. Count occurrences of each unique pair
if not os.path.exists("./data/cpc_subclass_pair_counts.csv"):
    pair_counts = patent_cpc_subclass_pairs_long.groupby("cpc_subclass_pair").size()
    pair_counts = pair_counts.to_frame(name="count")
    pair_counts.to_csv("./data/cpc_subclass_pair_counts.csv")
else:
    pair_counts = pd.read_csv("./data/cpc_subclass_pair_counts.csv")
    pair_counts["cpc_subclass_pair"] = pair_counts["cpc_subclass_pair"].apply(ast.literal_eval)
    pair_counts = pair_counts.set_index("cpc_subclass_pair")

pair_counts.head()

Unnamed: 0_level_0,count
cpc_subclass_pair,Unnamed: 1_level_1
"(A01B, A01C)",1662
"(A01B, A01D)",1209
"(A01B, A01F)",133
"(A01B, A01G)",468
"(A01B, A01H)",15


In [12]:
total_pairs = pair_counts.sum()
print(f"Total number of pairs in the dataset: {total_pairs}")

num_unique_pairs = pair_counts.index.nunique()
print(f"Number of unique pairs in the dataset: {num_unique_pairs}")

Total number of pairs in the dataset: count    14580344
dtype: int64
Number of unique pairs in the dataset: 95614


In [13]:
import numpy as np
# 5. Assign each cpc subclass pair an atypicality score
def atypicality_score(n):
    if n > 0:
        return -np.log(n/total_pairs)
    else:
        # This should not happen since the pairs are generated from existing data. If it does, raise an error.
        raise ValueError("Count must be greater than 0")

if not os.path.exists("./data/cpc_subclass_pair_counts_with_atypicality.csv"):
    pair_counts["atypicality_score"] = pair_counts["count"].apply(atypicality_score)
    pair_counts.to_csv("./data/cpc_subclass_pair_counts_with_atypicality.csv")
else: 
    pair_counts = pd.read_csv("./data/cpc_subclass_pair_counts_with_atypicality.csv")
    pair_counts["cpc_subclass_pair"] = pair_counts["cpc_subclass_pair"].apply(ast.literal_eval)
    pair_counts = pair_counts.set_index("cpc_subclass_pair")
pair_counts.head()




Unnamed: 0_level_0,count,atypicality_score
cpc_subclass_pair,Unnamed: 1_level_1,Unnamed: 2_level_1
"(A01B, A01C)",1662,9.079408
"(A01B, A01D)",1209,9.397636
"(A01B, A01F)",133,11.604836
"(A01B, A01G)",468,10.346717
"(A01B, A01H)",15,13.787135


In [14]:
# Atypicality score per cpc subclass pair summary statistics
pair_counts["atypicality_score"].describe()

count    95614.000000
mean        14.105618
std          1.963772
min          4.489449
25%         12.857599
50%         14.415743
75%         15.802038
max         16.495185
Name: atypicality_score, dtype: float64

In [15]:
pair_counts_dict = pair_counts.to_dict(orient="index")

In [16]:
# Step 6: Calculate the atypicality score for each patent
def calculate_atypicality_score(cpc_subclass_pairs):
    if len(cpc_subclass_pairs) == 0:
        return np.nan
    # Get the atypicality score for each cpc_subclass pair
    scores = [pair_counts_dict.get(tuple(pair), {}).get("atypicality_score", 0) for pair in cpc_group_pairs]
    if len(scores) == 0:
        raise RuntimeError("No matching pairs found for the given cpc_group_pairs.")
    
    # Calculate the atypicality score for each patent = average atypicality score for all cpc subclass pairs it cites
    atypicality_score = np.mean(scores)
    
    return atypicality_score

if not os.path.exists("./data/patents_with_atypicality.csv"):
    if not isinstance(patent_cpc_subclass_pairs, pd.DataFrame):
        patent_cpc_subclass_pairs = patent_cpc_subclass_pairs.to_frame()
    # Group by patent_id and calculate the atypicality score for each patent
    patent_cpc_subclass_pairs["atypicality_score"] = patent_cpc_subclass_pairs["cpc_subclass_pairs"].apply(calculate_atypicality_score)
    patent_cpc_subclass_pairs.to_csv("./data/patents_with_atypicality.csv")
else:
    patent_cpc_subclass_pairs = pd.read_csv("./data/patents_with_atypicality.csv", index_col=0)
    patent_cpc_subclass_pairs["cpc_subclass_pairs"] = patent_cpc_subclass_pairs["cpc_subclass_pairs"].apply(ast.literal_eval)


patent_cpc_subclass_pairs.head()

  patent_cpc_subclass_pairs = pd.read_csv("./data/patents_with_atypicality.csv", index_col=0)


Unnamed: 0_level_0,cpc_subclass_pairs,atypicality_score,cpc_subclasses
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10000000,[],,{'G01S'}
10000001,"[(B29C, G05B)]",8.984207,"{'G05B', 'B29C'}"
10000002,"[(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...",8.37534,"{'B29D', 'B29K', 'B29L', 'B32B', 'B60C', 'B29C'}"
10000003,"[(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...",6.990229,"{'B29D', 'B29L', 'B29C', 'B29K'}"
10000004,"[(B29C, B29K), (B29C, B29L), (B29K, B29L)]",6.23967,"{'B29K', 'B29L', 'B29C'}"


In [17]:
# Atypicality score summary statistics
patent_cpc_subclass_pairs["atypicality_score"].describe()

count    4.056170e+06
mean     8.093600e+00
std      1.987437e+00
min      4.489449e+00
25%      6.627396e+00
50%      8.040293e+00
75%      9.427694e+00
max      1.649518e+01
Name: atypicality_score, dtype: float64

In [18]:
# 8. Merge the atypicality scores with the list of cpc subclasses to get the full output
patent_cpc_subclass_pairs_with_atypicality = pd.merge(patent_cpc_subclass_pairs, cpc_subclasses, right_index=True, left_index=True, how="inner", validate="1:1")
patent_cpc_subclass_pairs_with_atypicality.head()

Unnamed: 0_level_0,cpc_subclass_pairs,atypicality_score,cpc_subclasses_x,cpc_subclasses_y
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9921365,"[(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...",7.725865,"{'B29D', 'B29K', 'B29L', 'G02B', 'B29C'}","{B29C, B29D, G02B, B29K, B29L}"
9921366,"[(G02B, G02F), (G02B, G06F), (G02B, H05K), (G0...",7.444244,"{'G02B', 'G06F', 'G02F', 'H05K'}","{G02B, G02F, G06F, H05K}"
9921367,"[(G02B, G02F)]",6.509012,"{'G02B', 'G02F'}","{G02B, G02F}"
9921368,"[(F21S, F21V), (F21S, G02B), (F21V, G02B)]",7.576098,"{'G02B', 'F21S', 'F21V'}","{G02B, F21S, F21V}"
9921369,[],,{'G02B'},{G02B}


In [19]:
# Step 9: Save the final DataFrame to a CSV file
patent_cpc_subclass_pairs_with_atypicality.to_csv("./data/patents_with_atypicality.csv")

In [20]:
# 10. Validate that the output dataset contains the expected number of patents
num_patents = patent_cpc_subclass_pairs_with_atypicality.shape[0]
print(f"Number of patents in the output dataset: {num_patents}")

Number of patents in the output dataset: 823147


In [21]:
patent_cpc_subclass_pairs_with_atypicality.isna().sum()

cpc_subclass_pairs         0
atypicality_score     770922
cpc_subclasses_x      745313
cpc_subclasses_y      745313
dtype: int64

In [25]:
# 11. Validate that all the patents with NaN atypicality scores are indeed patents with fewer than 2 cpc_subclass_pairs
assert patent_cpc_subclass_pairs_with_atypicality[patent_cpc_subclass_pairs_with_atypicality["atypicality_score"].isna()]["cpc_subclass_pairs"].apply(lambda x: len(x) < 2).all(), "There are patents with NaN atypicality scores that have more than 1 cpc_subclass_pairs."
print("All patents with NaN atypicality scores have fewer than 2 cpc_subclass_pairs.")

All patents with NaN atypicality scores have fewer than 2 cpc_subclass_pairs.


In [33]:
# 12. Validate that no cpc_subclass_pairs have been over-counted, i.e. the sum of counts in pair_counts matches the total number of pairs in patent_cpc_subclass_pairs_long
assert pair_counts["count"].sum() == patent_cpc_subclass_pairs_long.dropna(subset=["cpc_subclass_pair"]).shape[0], f"The sum of counts in pair_counts ({pair_counts['count'].sum()}) does not match the total number of pairs in patent_cpc_subclass_pairs_long ({patent_cpc_subclass_pairs_long.dropna(subset=['cpc_subclass_pair']).shape[0]})."
print("All cpc_subclass_pairs have been counted correctly.")

All cpc_subclass_pairs have been counted correctly.


In [39]:
# 13. Random spot check: Validate that the atypicality score for a random patent matches the expected value
import random
def manual_check(patent_id):
    cpc_subclass_pairs = patent_cpc_subclass_pairs.loc[patent_id, "cpc_subclass_pairs"]
    if isinstance(cpc_subclass_pairs, str):
        cpc_subclass_pairs = ast.literal_eval(cpc_subclass_pairs)
    if len(cpc_subclass_pairs) == 0:
        return np.nan
    scores = [pair_counts_dict.get(tuple(pair), {}).get("atypicality_score", 0) for pair in cpc_subclass_pairs]
    atypicality_score = np.mean(scores)
    return atypicality_score

samples = random.sample(patent_cpc_subclass_pairs_with_atypicality.index.tolist(), 10000)
for sample in samples:
    expected_score = manual_check(sample)
    actual_score = patent_cpc_subclass_pairs_with_atypicality.loc[sample, "atypicality_score"]
    assert np.isclose(expected_score, actual_score, equal_nan=True), f"Expected {expected_score}, but got {actual_score} for patent {sample}."
print("All manual checks passed successfully.")


All manual checks passed successfully.


# Technological Leap Score

## Steps:
1. Write the list of CPC groups for the new patent
2. Write the list of CPC groups for all patents it cites
3. Count the number of overlapping CPC groups between these two lists
4. Count the number of total CPC groups in the new patent and all patents it cites (union of both lists)
5. Similarity score = 
$$\text{Similarity Score} = \frac{\text{Number of Overlapping CPC groups}}{\text{Total CPC groups in New Patent and Cited Patents}}$$
6. Technological Leap Score = 1 - Similarity Score

In [66]:
# 1. Create a dictionary mapping patent IDs to a list of patent IDs they cite
citations_list = citations.groupby("patent_id")["citation_patent_id"].apply(list)
citations_dict = citations_list.to_dict()

In [71]:
# 2. Create a dictionary mapping patent IDs to a list of their CPC groups
cpc_groups = cpc.groupby("patent_id")["cpc_group"].apply(set)
cpc_groups = pd.merge(
    cpc_groups,
    fwd_citations.set_index("patent_id"), 
    left_index=True, 
    right_index=True,
    how="right",
    validate="1:1"
)
cpc_groups = cpc_groups["cpc_group"]
cpc_groups = cpc_groups.rename("cpc_groups")
cpc_groups.head()

patent_id
10000000    {G01S17/894, G01S7/4917, G01S7/4865, G01S7/491...
10000001    {B29C45/1751, B29C45/76, G05B2219/45244, B29C4...
10000002    {B29C48/07, B32B27/285, B32B2274/00, B32B27/36...
10000003    {B29C49/20, B29K2101/12, B29C2049/4881, B29C49...
10000004    {B29C48/912, B29C48/10, B29C51/14, B29C48/0018...
Name: cpc_groups, dtype: object

In [77]:
cpc_groups.isna().sum()

763040

In [73]:
cpc_groups_dict = cpc_groups.to_dict()

In [81]:
# 3. Get the intersection of CPC groups between a patent and its backward citations
def get_intersect_cpc_groups(row):
    patent_id = row.name
    patent_cpc_groups = cpc_groups_dict.get(patent_id, {})
    if not patent_cpc_groups or (isinstance(patent_cpc_groups, float) and np.isnan(patent_cpc_groups)):
        return []
    backward_citations = citations_dict.get(patent_id, [])
    if not backward_citations:
        return []
    backward_citations_cpc_groups = []
    for citation in backward_citations:
        citation_cpc_groups = cpc_groups_dict.get(citation, {})
        if not citation_cpc_groups or (isinstance(citation_cpc_groups, float) and np.isnan(citation_cpc_groups)):
            continue
        if citation_cpc_groups:
            backward_citations_cpc_groups.extend(citation_cpc_groups)
    if not backward_citations_cpc_groups:
        return []
    return list(set(patent_cpc_groups) & set(backward_citations_cpc_groups))

patent_cpc_subclass_pairs_with_atypicality["intersect_cpc_groups"] = patent_cpc_subclass_pairs_with_atypicality.apply(get_intersect_cpc_groups, axis=1)
patent_cpc_subclass_pairs_with_atypicality.head()

Unnamed: 0_level_0,cpc_subclass_pairs,atypicality_score,cpc_subclasses,intersect_cpc_groups
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000000,[],,{G01S},[]
10000001,"[(B29C, G05B)]",8.984207,"{G05B, B29C}","[B29C2945/76083, B29C45/1751, B29C45/76]"
10000002,"[(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...",8.37534,"{B29D, B29K, B29L, B32B, B60C, B29C}","[B32B27/08, B29C48/21, B32B27/34]"
10000003,"[(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...",6.990229,"{B29D, B29L, B29C, B29K}","[B29L2031/7172, B29C2049/4881, B29C2049/2017, ..."
10000004,"[(B29C, B29K), (B29C, B29L), (B29K, B29L)]",6.23967,"{B29K, B29L, B29C}",[B29C51/14]


In [82]:
# 4. Get the union of CPC groups between a patent and its backward citations
def get_union_cpc_groups(row):
    patent_id = row.name
    patent_cpc_groups = cpc_groups_dict.get(patent_id, {})
    if not patent_cpc_groups or (isinstance(patent_cpc_groups, float) and np.isnan(patent_cpc_groups)):
        return []
    backward_citations = citations_dict.get(patent_id, [])
    if not backward_citations:
        return patent_cpc_groups
    backward_citations_cpc_groups = []
    for citation in backward_citations:
        citation_cpc_groups = cpc_groups_dict.get(citation, {})
        if not citation_cpc_groups or (isinstance(citation_cpc_groups, float) and np.isnan(citation_cpc_groups)):
            continue
        if citation_cpc_groups:
            backward_citations_cpc_groups.extend(citation_cpc_groups)
    return list(set(patent_cpc_groups) | set(backward_citations_cpc_groups))

patent_cpc_subclass_pairs_with_atypicality["union_cpc_groups"] = patent_cpc_subclass_pairs_with_atypicality.apply(get_union_cpc_groups, axis=1)
patent_cpc_subclass_pairs_with_atypicality.head()

Unnamed: 0_level_0,cpc_subclass_pairs,atypicality_score,cpc_subclasses,intersect_cpc_groups,union_cpc_groups
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000000,[],,{G01S},[],"[G01S17/89, G01S17/95, G01S7/4917, G01S7/4865,..."
10000001,"[(B29C, G05B)]",8.984207,"{G05B, B29C}","[B29C2945/76083, B29C45/1751, B29C45/76]","[B29C2945/76096, B29C45/1751, B29C45/76, G05B2..."
10000002,"[(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...",8.37534,"{B29D, B29K, B29L, B32B, B60C, B29C}","[B32B27/08, B29C48/21, B32B27/34]","[C09J179/08, B29C48/07, B32B27/285, B29C41/28,..."
10000003,"[(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...",6.990229,"{B29D, B29L, B29C, B29K}","[B29L2031/7172, B29C2049/4881, B29C2049/2017, ...","[B29C49/04, B29K2101/12, B29C2791/007, B29C204..."
10000004,"[(B29C, B29K), (B29C, B29L), (B29K, B29L)]",6.23967,"{B29K, B29L, B29C}",[B29C51/14],"[B32B37/02, B32B2323/10, B29C51/082, B32B2439/..."


In [83]:
# 5. Calculate the Technological Leap Score
# Technological Leap Score = 1 - (Jaccard Similarity)
def calculate_tech_leap_score(patent):
    # If a patent and all its backward citations have no CPC groups, this means it has no technological leap.
    if not patent["union_cpc_groups"]:
        return 0.0
    # If a patent has no CPC subclasses in common with its backward citations, it has maximum technological leap.
    elif not patent["intersect_cpc_groups"]:
        return 1.0
    else:
        return 1 - (len(patent["intersect_cpc_groups"]) / len(patent["union_cpc_groups"]))
patent_cpc_subclass_pairs_with_atypicality["tech_leap"] = patent_cpc_subclass_pairs_with_atypicality.apply(calculate_tech_leap_score, axis=1)
patent_cpc_subclass_pairs_with_atypicality.head(10)

Unnamed: 0_level_0,cpc_subclass_pairs,atypicality_score,cpc_subclasses,intersect_cpc_groups,union_cpc_groups,tech_leap
patent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000000,[],,{G01S},[],"[G01S17/89, G01S17/95, G01S7/4917, G01S7/4865,...",1.0
10000001,"[(B29C, G05B)]",8.984207,"{G05B, B29C}","[B29C2945/76083, B29C45/1751, B29C45/76]","[B29C2945/76096, B29C45/1751, B29C45/76, G05B2...",0.90625
10000002,"[(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...",8.37534,"{B29D, B29K, B29L, B32B, B60C, B29C}","[B32B27/08, B29C48/21, B32B27/34]","[C09J179/08, B29C48/07, B32B27/285, B29C41/28,...",0.949153
10000003,"[(B29C, B29D), (B29C, B29K), (B29C, B29L), (B2...",6.990229,"{B29D, B29L, B29C, B29K}","[B29L2031/7172, B29C2049/4881, B29C2049/2017, ...","[B29C49/04, B29K2101/12, B29C2791/007, B29C204...",0.818182
10000004,"[(B29C, B29K), (B29C, B29L), (B29K, B29L)]",6.23967,"{B29K, B29L, B29C}",[B29C51/14],"[B32B37/02, B32B2323/10, B29C51/082, B32B2439/...",0.980769
10000005,"[(B29C, Y10T)]",6.483157,"{B29C, Y10T}","[B29C51/262, B29C51/04, B29C51/082]","[B29C51/262, B29C51/082, B29L2031/3005, Y10S42...",0.75
10000006,"[(B29C, B29K), (B29C, B29L), (B29C, B60R), (B2...",7.613158,"{B29K, B29L, Y10T, B60R, B29C}",[],"{B60R13/02, B29C51/082, B29K2105/256, B29K2101...",1.0
10000007,"[(B29C, B29K), (B29C, B29L), (B29K, B29L)]",6.23967,"{B29K, B29L, B29C}","[B29K2023/0691, B29L2023/22, B29C57/04]","[B21D17/02, F15B20/001, Y10T29/49902, Y10T29/5...",0.968085
10000008,"[(A44C, B29C), (A44C, B29K), (A44C, B29L), (B2...",9.454074,"{A44C, B29L, B29C, B29K}",[],"[Y10S425/044, B29C33/306, B22C9/22, B44C5/00, ...",1.0
10000009,"[(B29C, B29L)]",5.978974,"{B29L, B29C}","[B29C64/106, B29C64/112]","[H02K7/08, B29C2793/009, B29C64/182, H02K2201/...",0.923077


In [84]:
# 6. Save the final DataFrame to a CSV file
patent_cpc_subclass_pairs_with_atypicality.to_csv("./data/patents_with_atypicality_and_tech_leap.csv")

In [85]:
patent_cpc_subclass_pairs_with_atypicality["tech_leap"].describe()

count    7.507819e+06
mean     8.129112e-01
std      3.071087e-01
min      0.000000e+00
25%      8.285714e-01
50%      9.375000e-01
75%      1.000000e+00
max      1.000000e+00
Name: tech_leap, dtype: float64