## Get distance from IPC section code only

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import cosine
from matplotlib.colors import LogNorm
from scipy import sparse
from sklearn.metrics import jaccard_similarity_score

%matplotlib inline
%reload_ext line_profiler

In [2]:
# read in cleaned and updated data file with IPC codes etc.

patent_ipc_df = pd.read_csv('pat76_06_ipc_df_modified.csv')

In [3]:
patent_ipc_df.tail().T

Unnamed: 0,4856176,4856177,4856178,4856179,4856180
appyear,2000,2000,2000,2000,2000
cat,2,2,2,2,2
gyear,2006,2006,2006,2006,2006
icl,G06F 17/30,G06F 15/16,G06K 9/00,H04L 9/00,H04K 1/00
icl_class,G06F,G06F,G06K,H04L,H04K
icl_maingroup,17,15,9,9,1
iclnum,2,3,4,5,6
nclass,726,726,726,726,726
numipc,6,6,6,6,6
patent,7155745,7155745,7155745,7155745,7155745


In [4]:
# create list for unique_icl_section. Should just be the A to H sections
unique_icl_section_list = ['A','B','C','D','E','F','G','H']

In [5]:
# create (sorted) list of unique assignee numbers
unique_patents = patent_ipc_df.patent.unique()

In [6]:
# create starter df for feature vectors based on section
feature_vectors_df = pd.DataFrame({'icl_class' : ['A','B','C','D','E','F','G','H']})

In [7]:
feature_vectors_df

Unnamed: 0,icl_class
0,A
1,B
2,C
3,D
4,E
5,F
6,G
7,H


In [8]:
# TRY TO ONLY USE LISTS, TO THE VERY END
# now try and get vectors for each assignee of the number of patents they hold in each section

# num = number of patents to look at (to test code with small samples first)
def get_patent_section_counts_for_patents(num):
    feature_vectors_listoflist =[]
    # first get list of (unique) assignees (try first five first to test code)
    range_of_patents = unique_patents[0:num]
    # then for each one of them, 
    for a_patent in range_of_patents:
        # print('patent number = ',a_patent)
        # look at all the patent sections each patent has
        temp = patent_ipc_df[patent_ipc_df.patent==a_patent].section
        icl_codes_in_current_list = list(temp.values)
        #  and count instances of these if multiple instances of any 
        n = len(unique_icl_section_list)
        icl_codes_per_patent_list = []
        for m in range(0,n):
            current_icl_section = unique_icl_section_list[m]
            counted = icl_codes_in_current_list.count(current_icl_section)
            icl_codes_per_patent_list.append(counted)
        feature_vectors_listoflist.append((icl_codes_per_patent_list))
    return(feature_vectors_listoflist)

In [9]:
%%time
j=10000
# %lprun -f get_patent_section_counts_for_patents feature_lists = get_patent_section_counts_for_patents(j)
section_counts_per_patent = get_patent_section_counts_for_patents(j)

CPU times: user 1min 3s, sys: 8.19 s, total: 1min 11s
Wall time: 1min 11s


In [10]:
section_counts_per_patent[-10:]

[[0, 1, 1, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 2, 0, 0, 0, 0, 0],
 [0, 2, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 4, 0],
 [0, 0, 0, 0, 0, 0, 3, 0],
 [0, 0, 0, 0, 0, 0, 2, 0]]

In [11]:
patent_ipc_df[patent_ipc_df.patent == 3930273].T

Unnamed: 0,2,3
appyear,1975,1975
cat,6,6
gyear,1976,1976
icl,A47D 702,A47C 2100
icl_class,A47D,A47C
icl_maingroup,7,21
iclnum,1,2
nclass,5,5
numipc,2,2
patent,3930273,3930273


In [12]:
section_counts_per_patent_df = pd.DataFrame(section_counts_per_patent)

In [13]:
# add column names to df from unique_icl_section_list names

section_counts_per_patent_df.columns = unique_icl_section_list

In [14]:
section_counts_per_patent_df.tail()

Unnamed: 0,A,B,C,D,E,F,G,H
9995,0,1,0,0,0,0,0,0
9996,0,0,0,0,0,0,1,0
9997,0,0,0,0,0,0,4,0
9998,0,0,0,0,0,0,3,0
9999,0,0,0,0,0,0,2,0


In [89]:
# add patent numbers as index 
# first get required number (function of j, remember), then convert to integer
patents_for_index = unique_patents[:len(section_counts_per_patent_df)]
patents_for_index
# then change index for df
indexed_section_counts_per_patent_df = section_counts_per_patent_df.set_index(patents_for_index)
indexed_section_counts_per_patent_df.tail()

Unnamed: 0,A,B,C,D,E,F,G,H
3940269,0,1,0,0,0,0,0,0
3940270,0,0,0,0,0,0,1,0
3940271,0,0,0,0,0,0,4,0
3940272,0,0,0,0,0,0,3,0
3940273,0,0,0,0,0,0,2,0


In [91]:
indexed_section_counts_per_patent_df_clipped = indexed_section_counts_per_patent_df.clip(0,1)

indexed_section_counts_per_patent_df_clipped.tail()

Unnamed: 0,A,B,C,D,E,F,G,H
3940269,0,1,0,0,0,0,0,0
3940270,0,0,0,0,0,0,1,0
3940271,0,0,0,0,0,0,1,0
3940272,0,0,0,0,0,0,1,0
3940273,0,0,0,0,0,0,1,0


In [16]:
# sanity check on patent 3940271, which should have 4 G-section codes
patent_ipc_df[patent_ipc_df.patent==3940271].T

Unnamed: 0,14269,14270,14271,14272
appyear,1974,1974,1974,1974
cat,1,1,1,1
gyear,1976,1976,1976,1976
icl,G03C 724,G03C 700,G03C 532,G03C 176
icl_class,G03C,G03C,G03C,G03C
icl_maingroup,7,7,5,1
iclnum,1,2,3,4
nclass,430,430,430,430
numipc,4,4,4,4
patent,3940271,3940271,3940271,3940271


In [17]:
def cos_sim_matrix_from_patent_section_counts(num):
    # go through each column header and compare it to the first by cosine similarity
    # NOTE: does not like NaN, so cleared these out earlier

    # set number of patents to work with
    pnum = num
    # create a matrix to hold results
    cos_ang_mat = np.zeros((pnum,pnum))
    for i in range(0,pnum):
        tempi = unique_patents[i]
        for j in range(1,pnum):
            tempj = unique_patents[j]
            cos_similarity = 1-cosine(indexed_section_counts_per_patent_df.loc[tempi], indexed_section_counts_per_patent_df.loc[tempj])
            cos_ang_mat[i,j] = cos_similarity
            # print cos_similarity if above a threshold, and i and j are not pointing to the same patent
            if (cos_similarity>0.9) & (tempi!=tempj):
                print('close for patents',tempi,tempj,', with cos_ang = ',cos_similarity)
    return(cos_ang_mat)

In [18]:
cos_sim_matrix_from_patent_section_counts(100)

close for patents 3930271 3930272 , with cos_ang =  1.0
close for patents 3930271 3930273 , with cos_ang =  1.0
close for patents 3930271 3930275 , with cos_ang =  1.0
close for patents 3930271 3930280 , with cos_ang =  1.0
close for patents 3930271 3930281 , with cos_ang =  1.0
close for patents 3930271 3930282 , with cos_ang =  1.0
close for patents 3930271 3930283 , with cos_ang =  1.0
close for patents 3930271 3930284 , with cos_ang =  1.0
close for patents 3930271 3930289 , with cos_ang =  1.0
close for patents 3930271 3930311 , with cos_ang =  1.0
close for patents 3930271 3930312 , with cos_ang =  1.0
close for patents 3930271 3930322 , with cos_ang =  1.0
close for patents 3930271 3930328 , with cos_ang =  1.0
close for patents 3930271 3930329 , with cos_ang =  1.0
close for patents 3930271 3930330 , with cos_ang =  1.0
close for patents 3930271 3930331 , with cos_ang =  1.0
close for patents 3930271 3930332 , with cos_ang =  1.0
close for patents 3930271 3930333 , with cos_ang

array([[ 0.,  1.,  1., ...,  0.,  0.,  0.],
       [ 0.,  1.,  1., ...,  0.,  0.,  0.],
       [ 0.,  1.,  1., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  1.,  1.,  1.],
       [ 0.,  0.,  0., ...,  1.,  1.,  1.]])

In [38]:
def jacc_sim_matrix_from_patent_section_counts(num):
    # go through each column header and compare it to the first by cosine similarity

    # set number of patents to work with, num, to aid development
    pnum = num
    # create a matrix to hold results
    jacc_sim_mat = np.zeros((pnum,pnum))
    for i in range(0,pnum):
        tempi = unique_patents[i]
        for j in range(0,pnum):
            tempj = unique_patents[j]
            jacc_similarity = jaccard_similarity_score(indexed_section_counts_per_patent_df.loc[tempi], indexed_section_counts_per_patent_df.loc[tempj])
            jacc_sim_mat[i,j] = jacc_similarity
            # print cos_similarity if above a threshold, and i and j are not pointing to the same patent
            # if (jacc_similarity>0.9) & (tempi!=tempj):
                #print('close for patents',tempi,tempj,', with jacc_similarity = ',jacc_similarity)
    return(jacc_sim_mat)

In [39]:
indexed_section_counts_per_patent_df.head(10).T

Unnamed: 0,3930271,3930272,3930273,3930274,3930275,3930276,3930277,3930278,3930279,3930280
A,1,1,2,0,1,0,0,1,0,2
B,0,0,0,1,0,1,0,1,1,0
C,0,0,0,0,0,0,0,0,0,0
D,0,0,0,0,0,0,0,0,0,0
E,0,0,0,0,0,0,1,0,0,0
F,0,0,0,0,0,0,0,0,0,0
G,0,0,0,0,0,0,0,0,0,0
H,0,0,0,0,0,0,0,0,0,0


In [107]:
%%time
test_jacc_sim_sample = jacc_sim_matrix_from_patent_section_counts(50)

CPU times: user 1.13 s, sys: 15.8 ms, total: 1.15 s
Wall time: 1.15 s


In [108]:
test_jacc_sim_sample

array([[ 1.   ,  1.   ,  0.875, ...,  0.75 ,  0.75 ,  0.75 ],
       [ 1.   ,  1.   ,  0.875, ...,  0.75 ,  0.75 ,  0.75 ],
       [ 0.875,  0.875,  1.   , ...,  0.75 ,  0.75 ,  0.75 ],
       ..., 
       [ 0.75 ,  0.75 ,  0.75 , ...,  1.   ,  1.   ,  1.   ],
       [ 0.75 ,  0.75 ,  0.75 , ...,  1.   ,  1.   ,  1.   ],
       [ 0.75 ,  0.75 ,  0.75 , ...,  1.   ,  1.   ,  1.   ]])

In [109]:
# Jaccard similarity, buy using clipped df, i.e. value - 1 for any number of a sections classes
def jacc_sim_matrix_from_patent_section_counts_binary(num):
    # go through each column header and compare it to the first by cosine similarity
    # set number of patents to work with, num, to aid development
    pnum = num
    # create a matrix to hold results
    jacc_sim_mat = np.zeros((pnum,pnum))
    for i in range(0,pnum):
        tempi = unique_patents[i]
        for j in range(0,pnum):
            tempj = unique_patents[j]
            jacc_similarity = jaccard_similarity_score(indexed_section_counts_per_patent_df_clipped.loc[tempi], indexed_section_counts_per_patent_df_clipped.loc[tempj])
            jacc_sim_mat[i,j] = jacc_similarity
            # print cos_similarity if above a threshold, and i and j are not pointing to the same patent
            # if (jacc_similarity>0.9) & (tempi!=tempj):
                #print('close for patents',tempi,tempj,', with jacc_similarity = ',jacc_similarity)
    return(jacc_sim_mat)

In [123]:
%%time
test_jacc_sim_sample_binary = jacc_sim_matrix_from_patent_section_counts_binary(100)

CPU times: user 4.47 s, sys: 36.6 ms, total: 4.51 s
Wall time: 4.54 s


In [124]:
test_jacc_sim_sample_binary

array([[ 1.  ,  1.  ,  1.  , ...,  0.75,  0.75,  0.75],
       [ 1.  ,  1.  ,  1.  , ...,  0.75,  0.75,  0.75],
       [ 1.  ,  1.  ,  1.  , ...,  0.75,  0.75,  0.75],
       ..., 
       [ 0.75,  0.75,  0.75, ...,  1.  ,  1.  ,  1.  ],
       [ 0.75,  0.75,  0.75, ...,  1.  ,  1.  ,  1.  ],
       [ 0.75,  0.75,  0.75, ...,  1.  ,  1.  ,  1.  ]])

In [125]:
test_jacc_dist_sample_binary = 1 - test_jacc_sim_sample_binary

In [126]:
test_jacc_dist_sample_binary[30]

array([ 0.25 ,  0.25 ,  0.25 ,  0.   ,  0.25 ,  0.   ,  0.25 ,  0.125,
        0.   ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.   ,
        0.   ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.125,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.25 ,  0.25 ,  0.   ,  0.   ,
        0.25 ,  0.25 ,  0.   ,  0.   ,  0.125,  0.25 ,  0.25 ,  0.25 ,
        0.25 ,  0.25 ,  0.375,  0.25 ,  0.   ,  0.25 ,  0.25 ,  0.25 ,
        0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,
        0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,
        0.375,  0.25 ,  0.25 ,  0.375,  0.25 ,  0.25 ,  0.25 ,  0.25 ,
        0.25 ,  0.25 ,  0.25 ,  0.25 ])

In [129]:
for z in range(0,20):
    print('z = ', z, max(test_jacc_dist_sample_binary[z]))

z =  0 0.5
z =  1 0.5
z =  2 0.5
z =  3 0.375
z =  4 0.5
z =  5 0.375
z =  6 0.5
z =  7 0.5
z =  8 0.375
z =  9 0.5
z =  10 0.5
z =  11 0.5
z =  12 0.5
z =  13 0.5
z =  14 0.5
z =  15 0.375
z =  16 0.375
z =  17 0.375
z =  18 0.5
z =  19 0.625


In [136]:
test_jacc_dist_sample_binary[19]

array([ 0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.5  ,  0.125,
        0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.5  ,  0.25 ,
        0.25 ,  0.25 ,  0.25 ,  0.   ,  0.5  ,  0.5  ,  0.5  ,  0.25 ,
        0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.375,  0.25 ,  0.25 ,
        0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.5  ,  0.5  ,  0.25 ,  0.25 ,
        0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.375,  0.25 ,  0.5  ,  0.25 ,
        0.25 ,  0.25 ,  0.375,  0.25 ,  0.25 ,  0.5  ,  0.5  ,  0.5  ,
        0.5  ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.   ,
        0.25 ,  0.5  ,  0.5  ,  0.5  ,  0.5  ,  0.25 ,  0.25 ,  0.25 ,
        0.25 ,  0.5  ,  0.25 ,  0.25 ,  0.5  ,  0.5  ,  0.5  ,  0.25 ,
        0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.25 ,  0.5  ,  0.5  ,  0.5  ,
        0.625,  0.5  ,  0.5  ,  0.375,  0.5  ,  0.25 ,  0.25 ,  0.25 ,
        0.25 ,  0.25 ,  0.25 ,  0.25 ])

In [134]:
indexed_section_counts_per_patent_df_clipped.head(20)

Unnamed: 0,A,B,C,D,E,F,G,H
3930271,1,0,0,0,0,0,0,0
3930272,1,0,0,0,0,0,0,0
3930273,1,0,0,0,0,0,0,0
3930274,0,1,0,0,0,0,0,0
3930275,1,0,0,0,0,0,0,0
3930276,0,1,0,0,0,0,0,0
3930277,0,0,0,0,1,0,0,0
3930278,1,1,0,0,0,0,0,0
3930279,0,1,0,0,0,0,0,0
3930280,1,0,0,0,0,0,0,0


patent number 3930290 (from 1976) has the largest distance from the first patent, at 0.625.
What is the number of forward citations for this patent?
It has 5 cites and 5 claims


patent number 3930290 (from 1976) has the largest distance from the first patent, at 0.625.
What is the number of forward citations for this patent?
It has 5 cites and 5 claims


In [76]:
x= [0, 2, 1, 3]
y = [0, 1, 2, 3]
jaccard_similarity_score(x, y)

0.5

In [106]:
x = [1,0,0,0,0,0,0,0]
y = [0,0,0,0,0,0,0,0]
jaccard_similarity_score(x, y)

0.875

In [66]:
for n in range(0,len(test_jacc_sim_sample)):
    x = min(test_jacc_sim_sample[n])
    print ('at n = ',n,' minimum = ',x)


at n =  0  minimum =  0.75
at n =  1  minimum =  0.75
at n =  2  minimum =  0.75
at n =  3  minimum =  0.75
at n =  4  minimum =  0.75
at n =  5  minimum =  0.75
at n =  6  minimum =  0.625
at n =  7  minimum =  0.625
at n =  8  minimum =  0.75
at n =  9  minimum =  0.75


In [54]:
unique_patents[19]

3930290

In [57]:
patent_ipc_df[18:19].T

Unnamed: 0,18
appyear,1973
cat,1
gyear,1976
icl,D01G 916
icl_class,D01G
icl_maingroup,9
iclnum,2
nclass,19
numipc,2
patent,3930285
