In [2]:
import dask.dataframe as dd

In [3]:
# Read patent_id as a string and forward_citations as a 64-bit integer.
dtypes = dict(patent_id="string", forward_citations="int64")
fwd_citations = dd.read_csv(
    "./data/final_fwdcitation.csv",
    dtype=dtypes,
)

fwd_citations.head()

Unnamed: 0,patent_id,patent_type,patent_date,patent_title,wipo_kind,forward_citations
0,10000000,utility,2018-06-19,Coherent LADAR using intra-pixel quadrature de...,B2,13
1,10000001,utility,2018-06-19,Injection molding machine and mold thickness c...,B2,0
2,10000002,utility,2018-06-19,Method for manufacturing polymer film and co-e...,B2,0
3,10000003,utility,2018-06-19,Method for producing a container from a thermo...,B2,2
4,10000004,utility,2018-06-19,"Process of obtaining a double-oriented film, c...",B2,0


In [4]:
# Read patent_id as a string and cpc_sequence as a 64-bit integer; the file is tab-separated.
dtypes = dict(patent_id="string", cpc_sequence="int64")
cpc = dd.read_csv("./data/g_cpc_current.tsv", sep="\t", dtype=dtypes)
cpc.head()

Unnamed: 0,patent_id,cpc_sequence,cpc_section,cpc_class,cpc_subclass,cpc_group,cpc_type
0,3950000,0,A,A63,A63C,A63C9/001,inventional
1,3950000,1,A,A63,A63C,A63C9/00,inventional
2,3950000,2,A,A63,A63C,A63C9/002,inventional
3,3950000,3,A,A63,A63C,A63C9/081,inventional
4,3950001,0,A,A63,A63C,A63C9/086,inventional


In [5]:
# Both the citing patent id and the cited patent id are read as strings; the file is tab-separated.
citations_dtypes = dict(patent_id="string", citation_patent_id="string")
citations = dd.read_csv("./data/g_us_patent_citation.tsv", sep="\t", dtype=citations_dtypes)
citations.head()

Unnamed: 0,patent_id,citation_sequence,citation_patent_id,citation_date,record_name,wipo_kind,citation_category
0,10000000,0,5093563,1992-03-01,Small,A,cited by examiner
1,10000000,1,5751830,1998-05-01,Hutchinson,A,cited by applicant
2,10000001,0,7804268,2010-09-01,Park,B2,cited by examiner
3,10000001,1,9022767,2015-05-01,Oono,B2,cited by examiner
4,10000001,2,9090016,2015-07-01,Takeuchi,B2,cited by examiner


In [6]:
# Narrow each table to a fixed subset of its columns.
fwd_citation_columns = ["patent_id", "forward_citations"]
cpc_columns = ["patent_id", "cpc_sequence", "cpc_subclass"]
citation_columns = ["patent_id", "citation_sequence", "citation_patent_id"]

fwd_citations = fwd_citations[fwd_citation_columns]
cpc = cpc[cpc_columns]
citations = citations[citation_columns]

In [17]:
# Count the distinct patent ids present in the CPC table (triggers a Dask computation).
cpc_patent_ids = cpc["patent_id"]
num_patents_in_cpc = cpc_patent_ids.nunique().compute()
print(f"Number of patents in CPC: {num_patents_in_cpc}")

Number of patents in CPC: 8263827


In [75]:
# Count the number of occurrences of each unique pair of CPC subclasses across all patents
from itertools import combinations
import os

# Step 1: Group by patent_id and collect the cpc_subclass values of each patent.
# The result is a Series that maps each patent_id to the *set* of its
# cpc_subclass values, so repeated subclasses within a patent collapse to one.
subclasses_by_patent = cpc.groupby("patent_id")["cpc_subclass"]
cpc_subclasses = subclasses_by_patent.apply(set, meta=('cpc_subclasses', 'object'))
cpc_subclasses.head()


patent_id
10000025    {Y10T, B29C, B29K, B29L, Y02T, B32B}
10000066                                  {B41J}
10000072                            {B41J, H04N}
10000096                                  {B60C}
10000098                                  {B60C}
Name: cpc_subclasses, dtype: object

In [8]:
import ast
# Step 2: Generate all unique pairs of CPC subclasses for each patent
def generate_pairs(cpc_subclasse_list):
    """Return every 2-element combination of the given subclasses.

    The input is sorted first, so each returned tuple is in ascending order
    and the tuples themselves come out in lexicographic order. Fewer than two
    input items yields an empty list.
    """
    ordered = sorted(cpc_subclasse_list)
    return [
        (first, second)
        for position, first in enumerate(ordered)
        for second in ordered[position + 1:]
    ]

# Build the per-patent list of subclass pairs, or load it from the CSV cache
# when the cache file already exists.
cpc_subclass_pairs_path = "./data/cpc_subclass_pairs.csv"
if not os.path.exists(cpc_subclass_pairs_path):
    patent_cpc_subclass_pairs = cpc_subclasses.map(generate_pairs, meta=('pairs', 'object'))
    patent_cpc_subclass_pairs = patent_cpc_subclass_pairs.reset_index()
    patent_cpc_subclass_pairs.columns = ["patent_id", "cpc_subclass_pairs"]
    # index=False keeps the positional row index out of the cache file.
    patent_cpc_subclass_pairs.to_csv(cpc_subclass_pairs_path, single_file=True, index=False)
else:
    # Selecting columns by name works whether or not the cache file carries a
    # leading index column, and reading patent_id as "string" matches the dtype
    # used for patent_id in the other tables of this notebook.
    patent_cpc_subclass_pairs = dd.read_csv(
        cpc_subclass_pairs_path,
        usecols=["patent_id", "cpc_subclass_pairs"],
        dtype={"patent_id": "string"},
    )
    # The pairs are stored as their Python repr; parse them back into lists of tuples.
    patent_cpc_subclass_pairs["cpc_subclass_pairs"] = patent_cpc_subclass_pairs["cpc_subclass_pairs"].apply(
        ast.literal_eval, meta=('cpc_subclass_pairs', 'object')
    )
patent_cpc_subclass_pairs.head()

Unnamed: 0,patent_id,cpc_subclass_pairs
0,10000025,"[(B29C, B29K), (B29C, B29L), (B29C, B32B), (B2..."
1,10000066,[]
2,10000072,"[(B41J, H04N)]"
3,10000096,[]
4,10000098,[]


In [9]:
# Step 3: Flatten the per-patent pair lists into one row per (patent, pair).
# Rows whose exploded value is missing are dropped, the pair column is renamed
# to the singular form, and the index is renumbered.
patent_cpc_subclass_pairs_long = (
    patent_cpc_subclass_pairs
    .explode(column="cpc_subclass_pairs")
    .dropna()
    .rename(columns={"cpc_subclass_pairs": "cpc_subclass_pair"})
    .reset_index(drop=True)
)
patent_cpc_subclass_pairs_long.head()

Unnamed: 0,patent_id,cpc_subclass_pair
0,10000025,"(B29C, B29K)"
1,10000025,"(B29C, B29L)"
2,10000025,"(B29C, B32B)"
3,10000025,"(B29C, Y02T)"
4,10000025,"(B29C, Y10T)"


In [10]:
# Step 4: Count occurrences of each unique pair
# Both branches leave `pair_counts` as an in-memory pandas DataFrame whose
# `cpc_subclass_pair` values are tuples, so later cells behave the same whether
# the counts were just computed or loaded from the cache.
pair_counts_path = "./data/cpc_subclass_pair_counts.csv"
if not os.path.exists(pair_counts_path):
    pair_counts = patent_cpc_subclass_pairs_long.groupby("cpc_subclass_pair").size().compute()
    pair_counts = pair_counts.to_frame(name="count").reset_index()
    pair_counts.columns = ["cpc_subclass_pair", "count"]
    # index=False keeps the positional row index out of the cache file.
    pair_counts.to_csv(pair_counts_path, index=False)
else:
    # Selecting columns by name works whether or not the cache file carries a
    # leading index column.
    pair_counts = (
        dd.read_csv(pair_counts_path, usecols=["cpc_subclass_pair", "count"])
        .compute()
        .reset_index(drop=True)
    )
    # Pairs are stored as their tuple repr; parse them back into tuples.
    pair_counts["cpc_subclass_pair"] = pair_counts["cpc_subclass_pair"].map(ast.literal_eval)

pair_counts.head()

Unnamed: 0,cpc_subclass_pair,count
0,"(A01B, A01C)",1921
1,"(A01B, A01D)",1437
2,"(A01B, A01F)",145
3,"(A01B, A01G)",516
4,"(A01B, A01K)",39


In [11]:
# Summary figures for the pair table: the sum of all counts and the number of distinct pairs.
total_pairs = pair_counts["count"].sum()
num_unique_pairs = pair_counts["cpc_subclass_pair"].nunique()

print(f"Total number of pairs in the dataset: {total_pairs}")
print(f"Number of unique pairs in the dataset: {num_unique_pairs}")

Total number of pairs in the dataset: 16687575
Number of unique pairs in the dataset: 100561


In [12]:
import numpy as np
# Step 5: Assign each pair an atypicality score
def atypicality_score(n):
    """Return the atypicality score of a pair that occurs ``n`` times.

    The score is the negated natural logarithm of the count.

    Raises:
        ValueError: if ``n`` is not greater than 0. Counts come from observed
            pairs, so this is not expected to happen in practice.
    """
    if not n > 0:
        raise ValueError("Count must be greater than 0")
    return -np.log(n)

# Attach an atypicality score to every pair, or load the scored table from the
# CSV cache when it exists. Either way `pair_counts` ends up as an in-memory
# pandas DataFrame whose `cpc_subclass_pair` values are tuples.
atypicality_path = "./data/cpc_subclass_pair_counts_with_atypicality.csv"
if not os.path.exists(atypicality_path):
    # pair_counts may still be a lazy Dask frame if it was loaded from a cache.
    if hasattr(pair_counts, "compute"):
        pair_counts = pair_counts.compute()
    pair_counts["atypicality_score"] = pair_counts["count"].apply(atypicality_score)
    # index=False keeps the positional row index out of the cache file.
    pair_counts.to_csv(atypicality_path, index=False)
else:
    # Selecting columns by name works whether or not the cache file carries a
    # leading index column.
    pair_counts = (
        dd.read_csv(
            atypicality_path,
            usecols=["cpc_subclass_pair", "count", "atypicality_score"],
        )
        .compute()
        .reset_index(drop=True)
    )

# Pairs read back from CSV are the tuple repr as text; parse those into tuples.
pair_counts["cpc_subclass_pair"] = pair_counts["cpc_subclass_pair"].map(
    lambda pair: ast.literal_eval(pair) if isinstance(pair, str) else pair
)
pair_counts.head()




Unnamed: 0,cpc_subclass_pair,count,atypicality_score
0,"(A01B, A01C)",1921,-7.560601
1,"(A01B, A01D)",1437,-7.270313
2,"(A01B, A01F)",145,-4.976734
3,"(A01B, A01G)",516,-6.246107
4,"(A01B, A01K)",39,-3.663562


In [13]:
# Summary statistics of the per-pair atypicality scores.
pair_counts["atypicality_score"].describe()

count    100561.000000
mean         -2.424172
std           1.993113
min         -12.044647
25%          -3.688879
50%          -2.079442
75%          -0.693147
max          -0.000000
Name: atypicality_score, dtype: float64

In [14]:
# Build a lookup from each CPC subclass pair to its row of statistics.
# `pair_counts` can be a lazy Dask DataFrame (which has no `to_dict`) when it was
# loaded from a cache file, and pairs read back from CSV are strings rather than
# tuples; both are normalised here so that tuple lookups succeed.
pair_counts_frame = pair_counts.compute() if hasattr(pair_counts, "compute") else pair_counts
pair_counts_dict = {
    (ast.literal_eval(pair) if isinstance(pair, str) else pair): stats
    for pair, stats in pair_counts_frame.set_index("cpc_subclass_pair").to_dict(orient="index").items()
}

In [15]:
# Step 6: Calculate the atypicality score for each patent
def calculate_atypicality_score(cpc_group_pairs, pair_scores=None):
    """Return the mean atypicality score over a patent's CPC subclass pairs.

    Args:
        cpc_group_pairs: the patent's pairs, either as a sequence of 2-item
            tuples/lists or as the string repr of such a sequence (as read
            back from CSV).
        pair_scores: mapping from pair tuple to a dict that holds an
            ``"atypicality_score"`` entry. Defaults to the notebook-level
            ``pair_counts_dict``.

    Returns:
        ``0.0`` when the patent has no pairs, otherwise the arithmetic mean of
        the scores of its pairs.

    Raises:
        KeyError: if a pair has no entry in ``pair_scores``.
    """
    if pair_scores is None:
        pair_scores = pair_counts_dict

    if isinstance(cpc_group_pairs, str):
        cpc_group_pairs = ast.literal_eval(cpc_group_pairs)

    if len(cpc_group_pairs) == 0:
        return 0.0

    scores = []
    for pair in cpc_group_pairs:
        # Pairs may arrive as lists, which are unhashable; look them up as tuples.
        key = tuple(pair)
        entry = pair_scores.get(key)
        if entry is None:
            raise KeyError(f"No atypicality score recorded for CPC subclass pair {key!r}")
        scores.append(entry["atypicality_score"])

    return np.mean(scores)

# Score every patent from its own list of subclass pairs; the new column is
# attached to patent_cpc_subclass_pairs itself.
patent_cpc_subclass_pairs["atypicality_score"] = patent_cpc_subclass_pairs["cpc_subclass_pairs"].apply(
    calculate_atypicality_score,
    meta=('atypicality_score', 'float64'),
)

scored_columns = ["patent_id", "cpc_subclass_pairs", "atypicality_score"]
pairs = patent_cpc_subclass_pairs[scored_columns]
pairs.head()

Unnamed: 0,patent_id,cpc_subclass_pairs,atypicality_score
0,10000025,"[(B29C, B29K), (B29C, B29L), (B29C, B32B), (B2...",-9.071478
1,10000066,[],0.0
2,10000072,"[(B41J, H04N)]",-8.659387
3,10000096,[],0.0
4,10000098,[],0.0


In [79]:
# Turn the per-patent Series of subclass sets into a two-column DataFrame.
# The guard makes the cell safe to re-run: once cpc_subclasses is already a
# DataFrame, resetting the index again would add a third column and the
# two-name column assignment below would fail.
if isinstance(cpc_subclasses, dd.Series):
    cpc_subclasses = cpc_subclasses.reset_index()
    cpc_subclasses.columns = ["patent_id", "cpc_subclasses"]
cpc_subclasses.head()

Index(['index', 'cpc_subclasses'], dtype='object')

In [82]:
# Attach each patent's subclass set to its pairs and score. The right join keeps
# every row of cpc_subclasses.
scored_columns = ["patent_id", "cpc_subclass_pairs", "atypicality_score"]
pairs = patent_cpc_subclass_pairs[scored_columns].merge(
    cpc_subclasses,
    on="patent_id",
    how="right",
)
pairs.head()

Unnamed: 0,patent_id,cpc_subclass_pairs,atypicality_score,cpc_subclasses
0,10000025,"[[B29C, B29K], [B29C, B29L], [B29C, B32B], [B2...",-9.071478,"{Y10T, B29C, B29K, B29L, Y02T, B32B}"
1,10000066,[],0.0,{B41J}
2,10000072,"[[B41J, H04N]]",-8.659387,"{B41J, H04N}"
3,10000096,[],0.0,{B60C}
4,10000098,[],0.0,{B60C}


In [83]:
# Write the merged per-patent table to a single CSV file (triggers the full Dask computation).
pairs.to_csv("./data/patents_with_atypicality.csv", single_file=True)

['/projectnb/glob-s/Khoa/patents/data/patents_with_atypicality.csv']

In [84]:
# Count the rows of the merged table (triggers a Dask computation).
num_patents = len(pairs)
print(f"Number of patents in the output dataset: {num_patents}")

Number of patents in the output dataset: 8263827


# Technological Leap Score

## Steps:
1. Write the list of CPC groups for the new patent
2. Write the list of CPC groups for all patents it cites
3. Count the number of overlapping CPC groups between these two lists
4. Count the number of total CPC groups in the new patent and all patents it cites (union of both lists)
5. Compute the similarity score as the ratio of the overlap (step 3) to the union (step 4):
$$\text{Similarity Score} = \frac{\text{Number of Overlapping CPC Groups}}{\text{Total CPC Groups in New Patent and Cited Patents}}$$
6. Technological Leap Score = 1 - Similarity Score

In [None]:
# 1. Find the CPC subclasses shared by each patent and the patents it cites
def get_intersect_cpc_subclasses(patent_id):
    """Return the CPC subclasses common to a patent and the patents it cites.

    The patent's own subclasses come from the `cpc` table. The cited patent ids
    come from the `citations` table (`citation_patent_id` for rows whose
    `patent_id` matches); the `fwd_citations` table only stores a citation
    count per patent, so it cannot supply ids.

    NOTE: every call runs up to three separate Dask computations, so applying
    this to many patents is expensive.

    Args:
        patent_id: id of the citing patent.

    Returns:
        A set of cpc_subclass values; empty when the patent has no CPC rows,
        cites nothing, or none of the cited patents has CPC rows.
    """
    own_subclasses = set(
        cpc[cpc["patent_id"] == patent_id]["cpc_subclass"].compute().tolist()
    )
    if not own_subclasses:
        return set()

    cited_patent_ids = (
        citations[citations["patent_id"] == patent_id]["citation_patent_id"]
        .dropna()
        .compute()
        .tolist()
    )
    if not cited_patent_ids:
        return set()

    cited_subclasses = set(
        cpc[cpc["patent_id"].isin(cited_patent_ids)]["cpc_subclass"].compute().tolist()
    )
    return own_subclasses & cited_subclasses

# One row per distinct patent_id. This must be a DataFrame (not the Series that
# `.unique()` returns) so that result columns can be attached to it.
patents = fwd_citations[["patent_id"]].drop_duplicates()
patents["cpc_group_overlap"] = patents["patent_id"].apply(
    get_intersect_cpc_subclasses,
    meta=('cpc_group_overlap', 'object'),
)





In [None]:
def get_union_cpc_subclasses(patent_id):
    """Return the CPC subclasses found in a patent or in any patent it cites.

    The patent's own subclasses come from the `cpc` table. The cited patent ids
    come from the `citations` table (`citation_patent_id` for rows whose
    `patent_id` matches); the `fwd_citations` table only stores a citation
    count per patent, so it cannot supply ids.

    NOTE: every call runs up to three separate Dask computations, so applying
    this to many patents is expensive.

    Args:
        patent_id: id of the citing patent.

    Returns:
        A set of cpc_subclass values. An empty set is returned when the patent
        has no CPC rows, cites nothing, or none of the cited patents has CPC
        rows, so the result always supports `len()`.
    """
    own_subclasses = set(
        cpc[cpc["patent_id"] == patent_id]["cpc_subclass"].compute().tolist()
    )
    if not own_subclasses:
        return set()

    cited_patent_ids = (
        citations[citations["patent_id"] == patent_id]["citation_patent_id"]
        .dropna()
        .compute()
        .tolist()
    )
    if not cited_patent_ids:
        return set()

    cited_subclasses = set(
        cpc[cpc["patent_id"].isin(cited_patent_ids)]["cpc_subclass"].compute().tolist()
    )
    if not cited_subclasses:
        return set()
    return own_subclasses | cited_subclasses


# Apply the helper to the patent_id column so it receives a single id per call;
# this needs `patents` to be a DataFrame that has a `patent_id` column.
patents["cpc_group_union"] = patents["patent_id"].apply(
    get_union_cpc_subclasses,
    meta=('cpc_group_union', 'object'),
)
patents.head()

In [None]:
def calculate_jaccard_similarity(patent):
    """Return the overlap size divided by the union size for one patent row.

    Args:
        patent: a mapping (dict or DataFrame row) with the entries
            ``"cpc_group_overlap"`` and ``"cpc_group_union"``.

    Returns:
        ``0.0`` when the union is empty or is not a sized collection (for
        example a ``0`` placeholder), otherwise ``len(overlap) / len(union)``.
    """
    union = patent["cpc_group_union"]
    overlap = patent["cpc_group_overlap"]

    # Treat anything without a length as "no data" instead of failing in len().
    union_size = len(union) if hasattr(union, "__len__") else 0
    if union_size == 0:
        return 0.0
    overlap_size = len(overlap) if hasattr(overlap, "__len__") else 0
    return overlap_size / union_size

# axis=1 hands each row to the helper; Dask's DataFrame.apply only supports
# row-wise application.
patents["jaccard_similarity"] = patents.apply(
    calculate_jaccard_similarity,
    axis=1,
    meta=('jaccard_similarity', 'float64'),
)
patents.head()

In [None]:
def calculate_technological_leap_score(patent):
    """Return one minus the row's ``"jaccard_similarity"`` value."""
    similarity = patent["jaccard_similarity"]
    return 1 - similarity

# axis=1 hands each row to the helper; Dask's DataFrame.apply only supports
# row-wise application.
patents["technological_leap_score"] = patents.apply(
    calculate_technological_leap_score,
    axis=1,
    meta=('technological_leap_score', 'float64'),
)
# Write the scored table to a single CSV file (triggers the full Dask computation).
patents.to_csv("./data/patents_with_technological_leap_score.csv", single_file=True)
patents.head()