# Getting Low Hanging Fruits 
Characterizing patents with their citation trees, in this specific part, we look at some easy statistics that can be obtained from the citation trees. As for now, the following attributes of the citation tree is mined:
* Number of Edges (Citing-Cited Relationship)
* Number of Patents
* Edge Density
* Number of Assignees
* Number of Inventors
* Number of Locations
* Average Number of Claims
* Average Similarity of Direct Citing-Cited Relationship

A random sample of 1000 is taken with IDs that range in [4136359,6331415] to control for time (older patents tend to have more citations all else equal). The measures from the important patents list are also included.

In [1]:
import neo4j 
import pandas as pd
import random
from functools import reduce
from credentials import uri, user, pwd
from patent_neo4j.connection import Neo4jConnection

**Important Patent List**

In [2]:
df = pd.read_csv("Data/important_patents_list.csv", usecols = ["id", "issue_year"])
df.head(5)

Unnamed: 0,id,issue_year
0,4136359,1979
1,4229761,1980
2,4237224,1980
3,4363877,1982
4,4371752,1983


**Sampling with Imposed Range**

In [3]:
num_sample = 1000
random_list = random.sample(range(df.loc[0,'id'], df.loc[len(df)-1,'id']), num_sample)

In [4]:
conn = Neo4jConnection(uri, user, pwd)

**Initializing Dataframe**

In [5]:
dataset = pd.DataFrame(columns = ["id","num_edge","num_patents","edge_density",
                                  "unq_assignees","unq_inventors", "unq_loc",
                                  "avg_claims","avg_sim"])

In [6]:
dataset.head()

Unnamed: 0,id,num_edge,num_patents,edge_density,unq_assignees,unq_inventors,unq_loc,avg_claims,avg_sim


**Omnibus Function** <br>
All measures are inner functions of this function, with implementation separated for readability

In [10]:
"""
Ugly omnibus function with way too many inner functions.
Used to look worse but I don't want to have too simple functions 
that have no reuse value and the best way to encapsulate it is as such
INPUT:
    citation_tree and patent_id (of root)
OUTPUT:
    List containing [id (root),num_edge, num_patents, edge_density, unq_assignees, unq_inventors,
    unq_loc, avg_claims, avg_sim]
"""
def omnibus_fx(citation_tree, patent_id):
    def count_num_edge(citation_tree):
        # Make Deep Copy of Citation Tree
        num_edges = citation_tree[['id','lineage']].copy()
        # Take direct ancestor (existent of edge)
        num_edges.loc[:,'lineage'] = num_edges.loc[:,'lineage'].apply(lambda x: x[0])
        # Drop duplicates from assignee/inventor information
        num_edges = num_edges.drop_duplicates()
        
        n = len(num_edges)
        
        del num_edges

        # Return |E|
        return n
    
    def count_num_patents(citation_tree):
        patent_set = set(citation_tree['id'])
        num_patent = len(patent_set)
        
        del patent_set

        return num_patent

    def edge_density(num_edges, num_patents):
        d = num_edges/num_patents

        return d
    
    def num_assignees(citation_tree):
        assignee_set = set(citation_tree['assignee'])
        num_assignees = len(assignee_set)
        
        del assignee_set

        return num_assignees
    
    def num_locations(citation_tree):
        location_set = set(citation_tree['location'])
        num_locations = len(location_set)
        
        del location_set

        return num_locations
    
    def num_inventors(citation_tree):
        inventor_set = set(citation_tree['inventor'])
        num_inventors = len(inventor_set)
        
        del inventor_set

        return num_inventors
    
    def avg_similarity(citation_tree):
        director_sim = list(set(citation_tree['similarity'].apply(lambda x: x[0])))
        avg_similarity = sum(director_sim)/len(director_sim)
        
        del director_sim

        return avg_similarity
    
    def avg_claims(citation_tree):
        claims = citation_tree[['id','claims']]
        claims = claims.drop_duplicates()
        claims = claims.dropna()

        num_claims = list(pd.to_numeric(claims['claims']))
        avg_claims = sum(num_claims)/len(num_claims)
        
        del claims

        return avg_claims

    # Using the functions above
    num_edge = count_num_edge(citation_tree)
    num_patents = count_num_patents(citation_tree)
    e_density = edge_density(num_edge, num_patents)
    unq_assignees = num_assignees(citation_tree)
    unq_inventors = num_inventors(citation_tree)
    unq_loc = num_locations(citation_tree)
    avg_claims = avg_claims(citation_tree)
    avg_sim = avg_similarity(citation_tree)
    
    info_list = [patent_id,num_edge,num_patents,e_density,unq_assignees,unq_inventors,unq_loc,avg_claims,avg_sim]
    
    return info_list

Just for a peak

In [8]:
x = "4296981"
citation_tree = conn.query_citation_tree(x)
citation_tree.head()

Unnamed: 0,id,date,country,claims,kind,assignee,location,inventor,lineage,similarity
0,8887444,2014-11-18,US,8,B2,d77128d3-9a3b-4be9-ab41-8ee8e296abd4,b232b40e-791e-11eb-bfee-121df0c29c1e,fl:a_ln:klien-1,"[4461519, 4296981]","[0.14937755465507507, 0.4043945968151093]"
1,8887444,2014-11-18,US,8,B2,d77128d3-9a3b-4be9-ab41-8ee8e296abd4,b232b40e-791e-11eb-bfee-121df0c29c1e,fl:d_ln:slomski-2,"[4461519, 4296981]","[0.14937755465507507, 0.4043945968151093]"
2,8887444,2014-11-18,US,8,B2,d77128d3-9a3b-4be9-ab41-8ee8e296abd4,b232b40e-791e-11eb-bfee-121df0c29c1e,fl:w_ln:kalempa-1,"[4461519, 4296981]","[0.14937755465507507, 0.4043945968151093]"
3,10450796,2019-10-22,US,19,B2,105ad3b8-c491-4f61-992e-95534b9a2243,0d3c550f-791e-11eb-bfee-121df0c29c1e,fl:d_ln:seuberling-1,"[8887444, 4461519, 4296981]","[0.16314050555229187, 0.14937755465507507, 0.4..."
4,5496105,1996-03-05,US,18,A,a7966fa3-9474-4497-a918-ff3b5fba9043,4f771afb-7920-11eb-bfee-121df0c29c1e,fl:s_ln:strait-3,"[4461519, 4296981]","[0.2783316075801849, 0.4043945968151093]"


**Unexpectedly Long Run** <br>
Obtaining Citation Tree and Running Omnibus Function

In [26]:
data = []
for i in range(num_sample):
    citation_tree = conn.query_citation_tree(random_list[i])
    if len(citation_tree) != 0:
        data.append(omnibus_fx(citation_tree,random_list[i]))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [27]:
dataset = pd.DataFrame(data, columns = ["id","num_edge","num_patents","edge_density",
                                  "unq_assignees","unq_inventors", "unq_loc",
                                  "avg_claims","avg_sim"])

In [28]:
dataset.head()

Unnamed: 0,id,num_edge,num_patents,edge_density,unq_assignees,unq_inventors,unq_loc,avg_claims,avg_sim
0,4962875,9,9,1.0,6,25,6,16.555556,0.23215
1,6207286,4,4,1.0,3,10,2,8.0,0.345882
2,4651564,45578,19149,2.380177,3083,22246,1746,20.104657,0.243059
3,5291810,63,52,1.211538,34,79,38,15.27451,0.261469
4,4372077,1829,1251,1.46203,545,1670,421,20.593424,0.247469


In [29]:
dataset.to_csv("sample_patents_stats.csv", index=False)

In [8]:
important_patents = list(df['id'])

In [19]:
important_data = []
for i in important_patents:
    citation_tree = conn.query_citation_tree(i)
    print(i)
    important_data.append(omnibus_fx(citation_tree,i))
    del citation_tree

4136359
4229761
4237224
4363877
4371752
4399216
4437122
4464652
4468464
4590598
4634665


KeyboardInterrupt: 

In [21]:
dataset_important = pd.DataFrame(important_data, columns = ["id","num_edge","num_patents","edge_density",
                                  "unq_assignees","unq_inventors", "unq_loc",
                                  "avg_claims","avg_sim"])

In [None]:
dataset_important.to_csv("important_patents_stats.csv", index=False)

In [23]:
dataset_important.head(15)

Unnamed: 0,id,num_edge,num_patents,edge_density,unq_assignees,unq_inventors,unq_loc,avg_claims,avg_sim
0,4136359,7636,5018,1.521722,897,6324,561,20.930876,0.226538
1,4229761,614,510,1.203922,191,706,146,20.56778,0.232614
2,4237224,129110,43882,2.942209,6534,47583,2877,20.112501,0.290865
3,4363877,18967,10374,1.828321,1778,11986,1012,20.109875,0.270437
4,4371752,257121,87664,2.933028,9913,92648,3330,22.266791,0.209951
5,4399216,65890,25175,2.617279,4027,28865,1912,20.08501,0.320711
6,4437122,33985,21389,1.588901,2578,23742,1350,20.55988,0.218899
7,4464652,135506,52105,2.600633,6404,59546,2632,21.847053,0.199773
8,4468464,35993,13024,2.76359,2414,15572,1290,20.513907,0.300627
9,4590598,4629,719,6.438108,192,771,150,20.267409,0.437088
