# Generational Difference
For patents that cite each other, a significant question is how the characteristics of the prior ("cited") generation affect the behaviour of the future patents that cite them.

1. Assignees - Number of Unique Assignees
1. Inventors - Number of Unique Inventors
1. Locations - Number of Unique Locations
1. Patents - Number of Unique Patents
1. NBER Category - Number of NBER Category
1. Jensen-Shannon Divergence
1. Similarity 
1. Claims 

In [1]:
import neo4j 
import pandas as pd
import random
import numpy as np
import datetime
from credentials import uri, user, pwd
from patent_neo4j.connection import Neo4jConnection
from patent_neo4j.analysis import assign_missing_nber, nber_distribution, js_divergence

In [2]:
df = pd.read_csv('Data/Mined Data/sample_patents_stats.csv')

In [12]:
df.head()

Unnamed: 0,id,num_edge,num_patents,edge_density,unq_assignees,unq_inventors,unq_loc,avg_claims,avg_sim
0,4962875,9,9,1.0,6,25,6,16.555556,0.23215
1,6207286,4,4,1.0,3,10,2,8.0,0.345882
2,4651564,45578,19149,2.380177,3083,22246,1746,20.104657,0.243059
3,5291810,63,52,1.211538,34,79,38,15.27451,0.261469
4,4372077,1829,1251,1.46203,545,1670,421,20.593424,0.247469


Connection for citation tree queries

In [13]:
# "standard" root for looking at how things behave
root = df.loc[4,'id']  # row 4 of the sample table
# "degenerate" as in only 1 child
degenerate_root = df.loc[643,'id']
# Open the Neo4j connection using the uri/user/pwd imported from credentials
conn = Neo4jConnection(uri, user, pwd)
# Fetch the raw citation tree of each root; both frames are cleaned in later cells
citation_tree = conn.query_citation_tree(root)
degen_tree = conn.query_citation_tree(degenerate_root)

In [14]:
citation_tree.head()

Unnamed: 0,id,date,country,claims,kind,assignee,location,inventor,lineage,similarity,nber_lineage
0,4865574,1989-09-12,US,2,A,85c7d672-dd58-4150-ac55-261c1419ca2a,6c9ccb4a-791d-11eb-bfee-121df0c29c1e,fl:t_ln:kobayashi-293,[4372077],[0.3360787630081177],[62]
1,5967874,1999-10-19,US,12,A,4e724615-180a-47d4-87cb-e6875633dbcf,231f75ab-7920-11eb-bfee-121df0c29c1e,fl:k_ln:chan-147,"[4865574, 4372077]","[0.2402147352695465, 0.3360787630081177]","[62, 62]"
2,5967874,1999-10-19,US,12,A,4e724615-180a-47d4-87cb-e6875633dbcf,231f75ab-7920-11eb-bfee-121df0c29c1e,fl:s_ln:shaw-26,"[4865574, 4372077]","[0.2402147352695465, 0.3360787630081177]","[62, 62]"
3,6210248,2001-04-03,US,20,A,9cc2d795-48c8-4b7d-9114-d6b6bc8c6958,cd1ba8a1-791d-11eb-bfee-121df0c29c1e,fl:c_ln:mcadam-1,"[5967874, 4865574, 4372077]","[0.3331000804901123, 0.2402147352695465, 0.336...","[62, 62, 62]"
4,5746637,1998-05-05,US,36,A,de6b86c2-2174-4a9b-b517-3f3d57125d3f,cf1ac6eb-791f-11eb-bfee-121df0c29c1e,fl:w_ln:hunt-23,"[4865574, 4372077]","[0.3883195817470551, 0.3360787630081177]","[62, 62]"


In [15]:
citation_tree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4943 entries, 0 to 4942
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            4943 non-null   object
 1   date          4931 non-null   object
 2   country       4931 non-null   object
 3   claims        4931 non-null   object
 4   kind          4931 non-null   object
 5   assignee      4943 non-null   object
 6   location      4943 non-null   object
 7   inventor      4943 non-null   object
 8   lineage       4943 non-null   object
 9   similarity    4943 non-null   object
 10  nber_lineage  4943 non-null   object
dtypes: object(11)
memory usage: 424.9+ KB


In [16]:
degen_tree.head()

Unnamed: 0,id,date,country,claims,kind,assignee,location,inventor,lineage,similarity,nber_lineage
0,8469200,2013-06-25,US,23,B2,05a3efaa-d144-48c7-be32-ccffcab5c1e1,6c9ccb4a-791d-11eb-bfee-121df0c29c1e,fl:f_ln:wendeln-1,[4714397],[0.14073023200035095],[51]


### Obtain Only the "Last Generation" as the forefront of Invention
Patents in the citation tree could potentially cite their grandparents, which causes a patent to **potentially be classified in different generations**. For example, a patent that cites both a generation 1 and a generation 2 patent is placed in both generations 2 and 3. <br>

Defining generation as a form of a 'forefront' of the technology development, we care about the **latest generation** to classify the specific invention as the **frontier**. 

In [17]:
def get_max_generation(citation_tree):
    '''
    Given a (RAW) citation_tree, keep for every patent only the rows of its
    latest (maximum) generation, i.e. if a patent is in gen 1 and gen 2 it is
    kept as gen 2 only. The similarity list of each kept row is then reduced
    to its first entry (the direct similarity).

    The input frame is left untouched; a new frame is returned.

    Input:
        citation_tree - dataframe with at least the columns 'id' and
                        'similarity' (one list per row)
    Output:
        citation_tree - new dataframe whose first columns are 'id' and 'gen',
                        followed by the remaining input columns, with
                        'similarity' holding a single value per row
    '''
    # Generation = length of the similarity list.
    # NOTE(review): the original comment says this is "based on lineage";
    # presumably lineage and similarity always have the same length - confirm.
    # assign() builds a new frame, so the caller's frame does not gain a column.
    tagged = citation_tree.assign(gen=citation_tree['similarity'].apply(len))

    # A patent appears once per inventor, so collapse to unique (id, gen) pairs
    pairs = tagged.loc[:, ['id', 'gen']].drop_duplicates()

    # Sort by generation and keep the last pair per patent -> its max generation
    # (the idea of the forefront of inventions)
    frontier = pairs.sort_values(by=['gen']).drop_duplicates(subset=['id'], keep='last')

    # Left join back: only the rows belonging to max(gen) of each patent survive
    result = pd.merge(frontier, tagged, on=['id', 'gen'], how='left')

    # Take the direct similarity (first entry of the list) of the max(gen) rows
    result['similarity'] = result['similarity'].apply(lambda path: path[0])

    return result

In [18]:
def fixing_na_nber(citation_tree):
    '''
    Seed every row with the first entry of its 'nber_lineage' list, hand the
    seeded frame to assign_missing_nber, and left-join that function's result
    back onto the tree by 'id'.

    The input frame is left untouched; a new frame is returned.

    Input:
        citation_tree - dataframe with at least 'id' and 'nber_lineage'
    Output:
        citation_tree - the input columns (without any previous 'nber')
                        plus whatever columns assign_missing_nber returns
    '''
    # First entry of nber_lineage becomes the row's starting NBER value.
    # assign() works on a copy, so the caller's frame does not gain a column.
    seeded = citation_tree.assign(
        nber=citation_tree['nber_lineage'].apply(lambda lineage: lineage[0])
    )

    # Strip the seed column before the helper runs, so the join below brings in
    # only the helper's version of it.
    without_seed = seeded.drop(['nber'], axis=1)

    # NOTE(review): assign_missing_nber comes from patent_neo4j.analysis; its
    # result must contain an 'id' column for the join - confirm it has one row per id.
    assigned = assign_missing_nber(seeded)

    # Return Assigned NBER
    return pd.merge(without_seed, assigned, on='id', how='left')

In [19]:
citation_tree = fixing_na_nber(citation_tree)
degen_tree = fixing_na_nber(degen_tree)

In [20]:
citation_tree = get_max_generation(citation_tree)
degen_tree = get_max_generation(degen_tree)

In [21]:
citation_tree.head()

Unnamed: 0,id,gen,date,country,claims,kind,assignee,location,inventor,lineage,similarity,nber_lineage,nber,hops
0,4865574,1,1989-09-12,US,2,A,85c7d672-dd58-4150-ac55-261c1419ca2a,6c9ccb4a-791d-11eb-bfee-121df0c29c1e,fl:t_ln:kobayashi-293,[4372077],0.336079,[62],62,1
1,4712673,1,1987-12-15,US,17,A,793c7320-c632-4a58-a806-a438fadd378b,8887cf46-791f-11eb-bfee-121df0c29c1e,fl:r_ln:moore-39,[4372077],0.263919,[62],62,1
2,4925423,1,1990-05-15,US,7,A,0df3cb01-4937-4bba-a4e9-6d6b4b96a553,33378cbb-7920-11eb-bfee-121df0c29c1e,fl:l_ln:miller-228,[4372077],0.418019,[62],62,1
3,4925423,1,1990-05-15,US,7,A,0df3cb01-4937-4bba-a4e9-6d6b4b96a553,33378cbb-7920-11eb-bfee-121df0c29c1e,0bylm8qum347dggd9q7opsfzt,[4372077],0.418019,[62],62,1
4,5827103,1,1998-10-27,US,1,A,7ec52217-1e18-4fa2-955d-da2b9ea47b45,5d99f20e-7920-11eb-bfee-121df0c29c1e,fl:s_ln:carter-49,[4372077],0.439855,[62],62,1


In [22]:
degen_tree.head()

Unnamed: 0,id,gen,date,country,claims,kind,assignee,location,inventor,lineage,similarity,nber_lineage,nber,hops
0,8469200,1,2013-06-25,US,23,B2,05a3efaa-d144-48c7-be32-ccffcab5c1e1,6c9ccb4a-791d-11eb-bfee-121df0c29c1e,fl:f_ln:wendeln-1,[4714397],0.14073,[51],51,1


### Counting the Numbers
For now, the features we count are:

1. Inventors
1. Assignees
1. Location
1. Patent ID

We are interested in how many of them are in a given citation tree for each generation, and I refer to them as **countables**

In [23]:
"""
Takes a citation_tree, and given an optional parameter countables, count the number of unique countables 
(whatever they are) by generation
Input:
    citation_tree and countables
Output:
    generation - dataframe with counts of countables by generation
"""
def counting_countables(citation_tree, countables=('inventor', 'assignee', 'location', 'id'),
                        generations=(1, 2, 3)):
    """
    Count, per generation, the number of distinct values of each countable column.

    Input:
        citation_tree - dataframe with a 'gen' column and every column in countables
        countables    - column names to count (any iterable of names)
        generations   - generation numbers to report, one output row each;
                        defaults to 1, 2 and 3
    Output:
        generation - dataframe with a 'gen' column and one column per countable;
                     a generation that is absent from citation_tree gets NaN
    """
    generation = pd.DataFrame({'gen': list(generations)})

    for column in countables:
        # Distinct non-null values per generation; same numbers as dropping
        # duplicate (gen, value) pairs and counting what is left.
        unique_per_gen = citation_tree.groupby('gen')[column].nunique().reset_index()
        # Left join keeps every requested generation, even those with no rows
        generation = pd.merge(generation, unique_per_gen, how='left', on='gen')

    return generation

In [24]:
counting_countables(citation_tree).head()

Unnamed: 0,gen,inventor,assignee,location,id
0,1,5,4,4,4
1,2,54,31,31,38
2,3,1621,519,402,1209


In [25]:
counting_countables(degen_tree).head()

Unnamed: 0,gen,inventor,assignee,location,id
0,1,1.0,1.0,1.0,1.0
1,2,,,,
2,3,,,,


### Averaging the values

Taking the average values by generation, we have columns that are:
1. Similarity
1. Claims

And these are the **averageables**

In [26]:
def averaging_averageables(citation_tree, averageables=('similarity', 'claims'),
                           generations=(1, 2, 3)):
    '''
    Takes a citation_tree and, for the columns in averageables, returns the
    mean of each column by generation.

    Inputs:
        citation_tree - dataframe with a 'gen' column and every averageable column
        averageables  - column names that are 'averageable' (any iterable of names)
        generations   - generation numbers to report, one output row each;
                        defaults to 1, 2 and 3
    Output:
        generation - dataframe with a 'gen' column and one mean column per
                     averageable; a generation with no usable rows gets NaN
    '''
    selected = ['gen'] + list(averageables)

    # Collapse repeated (gen, values...) rows, then drop rows with a missing value.
    # NOTE(review): dropna() removes the whole row when ANY selected column is
    # missing, so e.g. a row without claims is also left out of the similarity mean.
    distinct = citation_tree.loc[:, selected].drop_duplicates().dropna()

    # Cast after de-duplicating so the comparison is done on the raw values,
    # then average every column within each generation.
    means = distinct.astype('float64').groupby('gen').agg('mean').reset_index()

    generation = pd.DataFrame({'gen': list(generations)})

    # Left join keeps every requested generation, even those with no rows
    return pd.merge(generation, means, how='left', on='gen')

In [27]:
averaging_averageables(citation_tree)

Unnamed: 0,gen,similarity,claims
0,1,0.364468,6.75
1,2,0.326949,16.525
2,3,0.243972,20.86419


In [28]:
averaging_averageables(degen_tree)

Unnamed: 0,gen,similarity,claims
0,1,0.14073,23.0
1,2,,
2,3,,


### Putting Things Together
Nothing exciting here: merge the countables and the averageables into one table and tag it with the root, so we know which citation tree each row belongs to.

In [20]:
citation_generation = pd.merge(counting_countables(citation_tree),averaging_averageables(citation_tree))

In [21]:
citation_generation['root'] = root

In [22]:
citation_generation.head()

Unnamed: 0,gen,inventor,assignee,location,id,similarity,claims,root
0,1,5,4,4,4,0.364468,6.75,4372077
1,2,54,31,31,38,0.326949,16.525,4372077
2,3,1621,519,402,1209,0.243972,20.86419,4372077


In [23]:
degen_generation = pd.merge(counting_countables(degen_tree),averaging_averageables(degen_tree))

In [24]:
degen_generation['root'] = degenerate_root

In [25]:
degen_generation.head()

Unnamed: 0,gen,inventor,assignee,location,id,similarity,claims,root
0,1,1.0,1.0,1.0,1.0,0.14073,23.0,4714397
1,2,,,,,,,4714397
2,3,,,,,,,4714397


In [26]:
degen_generation.columns

Index(['gen', 'inventor', 'assignee', 'location', 'id', 'similarity', 'claims',
       'root'],
      dtype='object')

## Awful and Sad Loops
This is the part where I hate myself, and my laptop would hate me even more because I am abusing it. BUT, whatever, I couldn't care less.

In [33]:
sad_loop_to_go_around = list(df['id'])

In [29]:
def generational_information(root):
    """
    Build the per-generation summary table for a single root patent.

    Input:
        root - patent id whose citation tree is queried through the
               module-level connection `conn`
    Output:
        dataframe combining the counts from counting_countables and the means
        from averaging_averageables (joined on their shared columns), with an
        extra 'root' column holding the given id
    """
    # Raw tree -> rows of the max generation only -> NBER assigned
    raw_tree = conn.query_citation_tree(root)
    frontier_tree = get_max_generation(raw_tree)
    cleaned_tree = fixing_na_nber(frontier_tree)

    # Per-generation unique counts and per-generation means
    counts = counting_countables(cleaned_tree)
    means = averaging_averageables(cleaned_tree)

    # Combine both summaries and remember which root they belong to
    summary = pd.merge(counts, means)
    summary['root'] = root

    return summary

In [30]:
data = pd.DataFrame(columns = ['gen', 'inventor', 'assignee', 'location', 'id', 'similarity', 'claims','root'])

In [31]:
data.head()

Unnamed: 0,gen,inventor,assignee,location,id,similarity,claims,root


In [31]:
# First 500 roots: append each root's per-generation summary to `data`.
# Re-assigning `data` every iteration keeps the finished roots if a later one fails.
for sad in sad_loop_to_go_around[:500]:
    data = pd.concat([data, generational_information(sad)], ignore_index=True)

StopIteration: 

In [None]:
data.to_csv("generation_pt1.csv", index = False)

In [None]:
# Roots at positions 500-556; the id is printed first so a failing root can be identified.
for sad in sad_loop_to_go_around[500:557]:
    print(sad)
    data = pd.concat([data, generational_information(sad)], ignore_index=True)

In [None]:
data.to_csv("generation_pt2.csv", index = False)

### This is the DEVIL. Not working on my LAPTOP

In [None]:
# Only the root at position 557, run on its own.
for sad in sad_loop_to_go_around[557:558]:
    print(sad)
    data = pd.concat([data, generational_information(sad)], ignore_index=True)

5197970


In [None]:
data.to_csv("generation_pt3.csv", index = False)

In [31]:
# Start again from an empty accumulator with the summary columns
data = pd.DataFrame(columns = ['gen', 'inventor', 'assignee', 'location', 'id', 'similarity', 'claims','root'])

In [32]:
# All remaining roots from position 558 onward; the id is printed before each query.
for sad in sad_loop_to_go_around[558:]:
    print(sad)
    data = pd.concat([data, generational_information(sad)], ignore_index=True)

5010851
6094596
6275463
6021209
5096345
4272417
4981453
6165874
4729553
4558414
5348402
6078742
4181144
4660682
6254553
4299723
5155028
4137792
4271458
5429489
5728178
4632332
4853141
5991616
6177656
4693277
5229370
4713757
4148289
4425504
5832399
5729379
4889938
4436302
4931854
5426776
6003381
5307034
5588730
6259105
5077223
5678805
6211554
4891810
5575500
4421681
5187685
5563203
6080683
5502918
6040079
5704859
5312293
6178711
5220879
5389975
5626579
6005499
5590924
4326363
5909375
4949385
5134655
4278409
5683322
4318563
4621552
4433315
5168832
6330934
5763673
4902922
5153860
5900062
5754365
5679584
5774493
5853245
5399051
4574466
4941175
5842010
5420810
6270798
5338560
4714397
5305697
5345586
4341631
4958648
5088153
4472253
5314729
4856136
5731642
5409650
4886628
5154125
4889959
4840301
6202078
4832616
4312754
5613483
5156237
5037383
6030805
4908267
5309441
4216128
5092631
4969302
5012312
4427255
4219297
5943931
5997929
6106183
6088171
4798041
4425567
4339051
4748657
4758146
4164462


In [33]:
data.to_csv("generation_pt4.csv", index = False)

In [2]:
# Load the saved part files. generation_pt3.csv is not read here:
# df3 holds part 4, not part 3.
df1 = pd.read_csv("generation_pt1.csv")
df2 = pd.read_csv("generation_pt2.csv")
df3 = pd.read_csv("generation_pt4.csv")

In [5]:
# Stack the three loaded parts into one table (this rebinds `df`).
# NOTE(review): `data` does not appear to be reset between writing
# generation_pt1.csv and generation_pt2.csv, so df2 may already contain df1's
# rows - confirm, and de-duplicate if so.
df = pd.concat([df1,df2,df3])

In [7]:
df.shape

(2814, 8)

In [9]:
df.to_csv("generation.csv", index = False)