In [10]:
import dask.dataframe as dd

In [11]:
dtypes = {
    "patent_id": "string",
    "forward_citations": "int64",
}
fwd_citations = dd.read_csv("./data/final_fwdcitation.csv", dtype=dtypes)

fwd_citations.head()

Unnamed: 0,patent_id,patent_type,patent_date,patent_title,wipo_kind,forward_citations
0,10000000,utility,2018-06-19,Coherent LADAR using intra-pixel quadrature de...,B2,13
1,10000001,utility,2018-06-19,Injection molding machine and mold thickness c...,B2,0
2,10000002,utility,2018-06-19,Method for manufacturing polymer film and co-e...,B2,0
3,10000003,utility,2018-06-19,Method for producing a container from a thermo...,B2,2
4,10000004,utility,2018-06-19,"Process of obtaining a double-oriented film, c...",B2,0


In [12]:
dtypes = {
    "patent_id": "string",
    "cpc_sequence": "int64",
}
cpc = dd.read_csv(
    "./data/g_cpc_current.tsv", 
    sep="\t",
    dtype=dtypes,
    )
cpc.head()

Unnamed: 0,patent_id,cpc_sequence,cpc_section,cpc_class,cpc_subclass,cpc_group,cpc_type
0,3950000,0,A,A63,A63C,A63C9/001,inventional
1,3950000,1,A,A63,A63C,A63C9/00,inventional
2,3950000,2,A,A63,A63C,A63C9/002,inventional
3,3950000,3,A,A63,A63C,A63C9/081,inventional
4,3950001,0,A,A63,A63C,A63C9/086,inventional


In [13]:
citations_dtypes = {
    "patent_id": "string",
    "citation_patent_id": "string",
}
citations = dd.read_csv(
    "./data/g_us_patent_citation.tsv", 
    sep="\t",
    dtype=citations_dtypes,
    )
citations.head()

Unnamed: 0,patent_id,citation_sequence,citation_patent_id,citation_date,record_name,wipo_kind,citation_category
0,10000000,0,5093563,1992-03-01,Small,A,cited by examiner
1,10000000,1,5751830,1998-05-01,Hutchinson,A,cited by applicant
2,10000001,0,7804268,2010-09-01,Park,B2,cited by examiner
3,10000001,1,9022767,2015-05-01,Oono,B2,cited by examiner
4,10000001,2,9090016,2015-07-01,Takeuchi,B2,cited by examiner


In [14]:
fwd_citations = fwd_citations[["patent_id", "forward_citations"]]
cpc = cpc[["patent_id", "cpc_sequence", "cpc_group"]]
citations = citations[["patent_id", "citation_sequence", "citation_patent_id"]]

In [15]:
# Count the number of occurences of each unique pair of cpc_group accross all patents
from itertools import combinations
import os

# Step 1: Group by patent_id and collect all cpc_groups for each patent
# This will create a dataframe where each patent_id maps to a list of its cpc_groups
cpc_groups = cpc.groupby("patent_id")["cpc_group"].apply(list, meta=('cpc_group', 'object'))



In [16]:
# Step 2: Generate all unique pairs of cpc_group for each patent
def generate_pairs(cpc_group_list):
    return list(combinations(sorted(cpc_group_list), 2))

pairs = []
if not os.path.exists("./data/cpc_group_pairs.csv"):
    pairs = cpc_groups.map(generate_pairs, meta=('pairs', 'object'))
    pairs.to_csv("./data/cpc_group_pairs.csv", single_file=True)
else:
    pairs = dd.read_csv("./data/cpc_group_pairs.csv", dtype={'pairs': 'object'})
pairs.head()

patent_id
10000025    [(B29C70/202, B29C70/30), (B29C70/202, B29K210...
10000066    [(B41J2/14274, B41J2/175), (B41J2/14274, B41J2...
10000072    [(B41J11/008, B41J2/01), (B41J11/008, B41J3/40...
10000096                                                   []
10000098    [(B60C23/0433, B60C23/044), (B60C23/0433, B60C...
Name: cpc_group, dtype: object

In [17]:
# Step 3: Flatten the list of pairs into a DataFrame
pairs_df = pairs.explode().dropna().to_frame(name="pair")
pairs_df.head()

Unnamed: 0_level_0,pair
patent_id,Unnamed: 1_level_1
10000025,"(B29C70/202, B29C70/30)"
10000025,"(B29C70/202, B29K2105/0881)"
10000025,"(B29C70/202, B29L2031/3076)"
10000025,"(B29C70/202, B29L2031/3085)"
10000025,"(B29C70/202, B32B2307/544)"


In [23]:
# Step 4: Count occurrences of each unique pair
pair_counts = []
if not os.path.exists("./data/cpc_group_pair_counts.csv"):
    pair_counts = pairs_df.groupby("pair").size().compute()
    pair_counts.to_csv("./data/cpc_group_pair_counts.csv", single_file=True)
else:
    pair_counts = dd.read_csv("./data/cpc_group_pairs.csv", dtype={'pair': 'object'})

pair_counts.head()

KeyboardInterrupt: 

In [None]:
pair_counts.head()

Unnamed: 0,patent_id,cpc_group
0,10000025,"[('B29C70/202', 'B29C70/30'), ('B29C70/202', '..."
1,10000066,"[('B41J2/14274', 'B41J2/175'), ('B41J2/14274',..."
2,10000072,"[('B41J11/008', 'B41J2/01'), ('B41J11/008', 'B..."
3,10000096,[]
4,10000098,"[('B60C23/0433', 'B60C23/044'), ('B60C23/0433'..."


In [None]:
# Step 5: Merge the pair counts with the original cpc dataframe

pairs_with_counts = pairs_df.merge(pair_counts.rename('count'), on='pair', how='left')
pairs_with_counts.head()

ValueError: Cannot rename index.