## Clean and prepare IPC data file for later use
Convert the IPC codes to upper case.

Create columns for the IPC section and for the intermediate-level IPC code (class plus main group)

Save to a CSV file

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import cosine
from matplotlib.colors import LogNorm

%matplotlib inline

In [2]:
# load the main patent data (IPC classes etc.) from the Stata file
ipc_stata_file = 'pat76_06_ipc.dta'
pat76_06_ipc_df = pd.read_stata(ipc_stata_file)


In [3]:
# convert any IPC classes to upper case if they are lower case
# (the vectorised .str.upper() avoids a Python-level call per row and leaves
# missing entries as NaN instead of raising, as x.upper() would on a non-string)
icl_uc = pat76_06_ipc_df.icl.str.upper()
icl_class_uc = pat76_06_ipc_df.icl_class.str.upper()

# store the upper-case versions alongside the original columns
pat76_06_ipc_df['icl_uc'] = icl_uc
pat76_06_ipc_df['icl_class_uc'] = icl_class_uc

In [4]:
# the IPC section is the first character of the (upper-case) class code;
# store it in its own column
class_code_text = pat76_06_ipc_df['icl_class_uc'].astype(str)
pat76_06_ipc_df['section'] = class_code_text.str.get(0)



In [5]:
# build the intermediate IPC code by concatenating the upper-case class code
# with the main group number (cast to int, then to text)
class_codes = pat76_06_ipc_df.icl_class_uc.values
maingroup_text = pat76_06_ipc_df.icl_maingroup.values.astype(int).astype(str)
ipc_inter = class_codes + maingroup_text

In [6]:
# add this new intermediate IPC code (class code + main group number) to the df
# as the column 'ipc_inter'
pat76_06_ipc_df['ipc_inter'] = ipc_inter

In [7]:
# drop those rows where the section is not one of the valid sections (A to H)
keep_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
has_valid_section = pat76_06_ipc_df.section.isin(keep_list)
pat76_06_ipc_df_modified = pat76_06_ipc_df[has_valid_section]

# row count before filtering (printed) and after filtering (cell output)
print(len(pat76_06_ipc_df))
len(pat76_06_ipc_df_modified)

4857833


4856181

In [8]:
# visual sanity check of the filtered dataframe: first rows, transposed so that
# each column is displayed as a row
pat76_06_ipc_df_modified.head().transpose()

Unnamed: 0,0,1,2,3,4
appyear,1974,1974,1975,1975,1974
cat,6,6,6,6,5
gyear,1976,1976,1976,1976,1976
icl,A41D 1900,A47D 701,A47D 702,A47C 2100,B63B 708
icl_class,A41D,A47D,A47D,A47C,B63B
icl_maingroup,19,7,7,21,7
iclnum,1,1,1,2,1
nclass,2,5,5,5,114
numipc,1,1,2,2,1
patent,3930271,3930272,3930273,3930273,3930274


In [9]:
%%time
# write the filtered dataframe to CSV, leaving out the row index
output_csv = 'pat76_06_ipc_df_modified.csv'
pat76_06_ipc_df_modified.to_csv(output_csv, index=False)

CPU times: user 52.6 s, sys: 999 ms, total: 53.6 s
Wall time: 54 s


In [10]:
# check that datafile is saved: long-format listing of the CSV files in the
# current working directory
!ls -l *.csv

-rw-r--r--  1 ahuxor  staff  606712645 11 Aug 21:24 pat76_06_assg_minus_singular_ipc_inter.csv
-rw-r--r--  1 ahuxor  staff  108491068  8 Aug 12:02 pat76_06_citesnum_hjtwt_nclaims_patent.csv
-rw-r--r--  1 ahuxor  staff  510687103 19 Aug 13:56 pat76_06_ipc_df_modified.csv
