## simple user case study example


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import cosine
from matplotlib.colors import LogNorm

%matplotlib inline

In [2]:
pat76_06_ipc_df = pd.read_stata('pat76_06_ipc.dta')


In [3]:
x='BNBV'
x.isupper()

True

In [4]:
for n in range(0,500):
    state = pat76_06_ipc_df.icl_class[n].isupper()
    if state:
        continue
    else:
        print('icl-class = ',pat76_06_ipc_df.icl_class[n])
        print('patent number = ',pat76_06_ipc_df.patent[n] )

icl-class =  F01k
patent number =  3930371
icl-class =  G10h
patent number =  3930430
icl-class =  G10h
patent number =  3930430
icl-class =  5 3
patent number =  3930439
icl-class =  F02m
patent number =  3930475
icl-class =  B29h
patent number =  3930529
icl-class =  F28f
patent number =  3930536
icl-class =  F28f
patent number =  3930537
icl-class =  A01b
patent number =  3930543
icl-class =  A01b
patent number =  3930543
icl-class =  B60p
patent number =  3930548
icl-class =  E04b
patent number =  3930557
icl-class =  B04b
patent number =  3930608


In [5]:
# create new column for the IPC section letter of the classification
pat76_06_ipc_df['section'] = pat76_06_ipc_df['icl_class'].astype(str).str[0]

pat76_06_ipc_G_section_df =pat76_06_ipc_df[pat76_06_ipc_df.section == 'G']

pat76_06_ipc_G_section_df.head().T

Unnamed: 0,65,69,77,78,125
appyear,1974,1975,1974,1974,1974
cat,6,6,6,6,6
gyear,1976,1976,1976,1976,1976
icl,G01C 1902,G09B 306,G09D 306,G09F 708,G04C 2116
icl_class,G01C,G09B,G09D,G09F,G04C
icl_maingroup,19,3,3,7,21
iclnum,1,1,1,1,1
nclass,33,434,40,40,368
numipc,1,2,1,1,1
patent,3930317,3930321,3930326,3930327,3930358


In [6]:
# create a new df in which duplicates -- only in icl_class and patent -- are removed
# this reduces the file size from about 4.8 million rows to 3.8 million.

pat76_06_ipc_G_section_df_dupless = pat76_06_ipc_G_section_df.drop_duplicates(['icl_class','patent'])

In [7]:
print(len(pat76_06_ipc_df))
len(pat76_06_ipc_G_section_df_dupless)

4857833


794593

In [8]:
### start looking at final example
#get a typical patent 
#a_patent = pat76_06_ipc_df[4002:4003].patent.values[0]
a_patent = 6133365 # use one we know has 2 classes attached
a_patent

6133365

In [9]:
# get all icl_classes involved in this patent
classes = pat76_06_ipc_G_section_df_dupless[pat76_06_ipc_G_section_df_dupless.patent==a_patent].icl_class.values.tolist()
classes

[]

In [10]:
#
# So now add each other class drawn from the list of all available
section_G_icl_classes = pat76_06_ipc_G_section_df_dupless.icl_class.unique().tolist()

In [11]:
# add first from this array to the classes in a temporary variable
temp = classes.copy()
temp.append('A41D')


temp

['A41D']

In [12]:
# get all patents that have all three


In [13]:
# Need to create new data structures aggregating icl_classes for each patent number
pat76_06_ipc_G_section_df_dupless.head().T

Unnamed: 0,65,69,77,78,125
appyear,1974,1975,1974,1974,1974
cat,6,6,6,6,6
gyear,1976,1976,1976,1976,1976
icl,G01C 1902,G09B 306,G09D 306,G09F 708,G04C 2116
icl_class,G01C,G09B,G09D,G09F,G04C
icl_maingroup,19,3,3,7,21
iclnum,1,1,1,1,1
nclass,33,434,40,40,368
numipc,1,2,1,1,1
patent,3930317,3930321,3930326,3930327,3930358


In [14]:
# get list of (unique) patents
all_patents = pat76_06_ipc_G_section_df_dupless.patent.values
all_patents[0]

3930317

In [15]:
%%time
list_of_patents = []
list_of_list_of_classes = []

for j in range(0,100000):
    a_patent = all_patents[j]
    list_of_patents.append(a_patent)
    list_of_classes = pat76_06_ipc_G_section_df_dupless[pat76_06_ipc_G_section_df_dupless.patent == a_patent].icl_class.values
    list_of_list_of_classes.append(list_of_classes)

CPU times: user 3min 21s, sys: 4.17 s, total: 3min 25s
Wall time: 3min 41s


In [16]:
len(list_of_list_of_classes)

100000

In [17]:
temp_dict = dict( patent = list_of_patents, icl_classes = list_of_list_of_classes )
df = pd.DataFrame.from_dict(temp_dict, orient='index').T

In [18]:
# cannot use drop_duplicates on icl_classes contains a list, so drop on patnet
df1=df.drop_duplicates(['patent'])
df1.tail(10)

Unnamed: 0,icl_classes,patent
99988,[G03G],4486502
99989,"[G03F, G03C]",4486518
99991,[G03G],4486519
99992,[G03G],4486520
99993,[G03G],4486521
99994,[G03G],4486522
99995,[G03G],4486523
99996,[G03G],4486524
99997,[G03G],4486525
99998,"[G03C, G03F]",4486526


In [22]:
# get example patent
zz = pat76_06_ipc_G_section_df_dupless[pat76_06_ipc_G_section_df_dupless.patent==4486518].icl_class.values
zz1 = zz.tolist()
zz1

['G03F', 'G03C']

In [23]:
pat76_06_ipc_G_section_df_dupless[pat76_06_ipc_G_section_df_dupless.patent==4486518].T

Unnamed: 0,825853,825854
appyear,1982,1982
cat,1,1
gyear,1984,1984
icl,G03F 100,G03C 300
icl_class,G03F,G03C
icl_maingroup,1,3
iclnum,1,2
nclass,430,430
numipc,3,3
patent,4486518,4486518


In [24]:

pat76_06_ipc_G_section_df_dupless[pat76_06_ipc_G_section_df_dupless.patent==4486518].T

Unnamed: 0,825853,825854
appyear,1982,1982
cat,1,1
gyear,1984,1984
icl,G03F 100,G03C 300
icl_class,G03F,G03C
icl_maingroup,1,3
iclnum,1,2
nclass,430,430
numipc,3,3
patent,4486518,4486518


In [25]:
# find others with exactly the same classes

exact_match = df1[(df1['icl_classes'].astype(str).str.contains('G03F')) & (df1['icl_classes'].astype(str).str.contains('G03C'))  ]

%%time
for k in range(0,len(section_G_icl_classes)):
    test_class = section_G_icl_classes[k]
    print(test_class)
    temp_df = df1[(df1['icl_classes'].astype(str).str.contains('G03F')) & (df1['icl_classes'].astype(str).str.contains('G03C')) & (df1['icl_classes'].astype(str).str.contains(test_class))  ]
    print(temp_df.patent)
    print('......')
print('complete')

## Get 'distance' between patents to get measure of x-axis in U-shaped curve

In [26]:
# create list for unique_icl_class, rather than numpy array
unique_icl_class = 
unique_icl_class_list = unique_icl_class.tolist()

NameError: name 'unique_icl_class' is not defined