In [1]:
import pandas as pd
import glob
import csv
import gc
from tqdm.notebook import tqdm
from scipy.spatial import distance
import numpy as np

# Directory holding the zipped PATSTAT table parts (tls201*.zip, tls206*.zip, ...).
# The glob patterns below are appended by plain string concatenation, so the
# trailing "/" is required.
datadir = "/mnt/ResearchData/04_PATSTAT/00_PATSTAT_2019SP/"

In [2]:
# Load the tls201 table from every zip part found under datadir, reading only
# the columns listed here.
tls201_columns = ['appln_id', 'appln_auth', 'appln_nr', 'appln_kind',
                  'appln_filing_year', 'granted']
tls201_files = glob.glob(datadir + "tls201*.zip")
print(tls201_files)
# Parts are stacked in the order glob returned them; per-part row indices are kept.
tls201 = pd.concat([
    pd.read_csv(zip_path, compression="zip", usecols=tls201_columns)
    for zip_path in tls201_files
])

['/mnt/ResearchData/04_PATSTAT/00_PATSTAT_2019SP/tls201_part01.zip', '/mnt/ResearchData/04_PATSTAT/00_PATSTAT_2019SP/tls201_part02.zip']


  after removing the cwd from sys.path.


In [3]:
# Keep only applications whose kind code is exactly "A " (note the trailing
# space); the original note describes these as "Patent only".
tls201 = tls201.loc[tls201["appln_kind"].eq("A ")]

In [4]:
# Load the tls209 table (application id -> IPC class symbol) from every zip part.
tls209_columns = ['appln_id', 'ipc_class_symbol']
tls209_files = glob.glob(datadir + "tls209*.zip")
print(tls209_files)
tls209 = pd.concat([
    pd.read_csv(zip_path, compression="zip", usecols=tls209_columns)
    for zip_path in tls209_files
])

['/mnt/ResearchData/04_PATSTAT/00_PATSTAT_2019SP/tls209_part01.zip', '/mnt/ResearchData/04_PATSTAT/00_PATSTAT_2019SP/tls209_part02.zip']


In [5]:
# Reduce every IPC symbol to its first whitespace-separated token, then drop
# the (appln_id, symbol) rows that became identical after the truncation.
tls209 = tls209.assign(
    ipc_class_symbol=tls209["ipc_class_symbol"].str.split().str.get(0)
).drop_duplicates()

In [6]:
# Load the tls207 table (person <-> application links) from every zip part,
# keeping all of its columns.
tls207_files = glob.glob(datadir + "tls207*.zip")
print(tls207_files)
tls207 = pd.concat([
    pd.read_csv(zip_path, compression="zip")
    for zip_path in tls207_files
])

['/mnt/ResearchData/04_PATSTAT/00_PATSTAT_2019SP/tls207_part01.zip']


In [7]:
# applt_seq_nr == 0 means the person is not an applicant of that application;
# invt_seq_nr == 0 means the person is not an inventor of that application.
# Keep the distinct (person, application) pairs for each role separately.
person_appln_cols = ["person_id", "appln_id"]
tls207_applt = tls207.loc[tls207["applt_seq_nr"].gt(0), person_appln_cols].drop_duplicates()
tls207_invt = tls207.loc[tls207["invt_seq_nr"].gt(0), person_appln_cols].drop_duplicates()

In [8]:
# Load the person table (tls206), keeping only each person's id and country code.
tls206_columns = ["person_id", "person_ctry_code"]
tls206_files = glob.glob(datadir + "tls206*.zip")
print(tls206_files)
tls206 = pd.concat([
    pd.read_csv(zip_path, compression="zip", usecols=tls206_columns)
    for zip_path in tls206_files
])

['/mnt/ResearchData/04_PATSTAT/00_PATSTAT_2019SP/tls206_part01.zip', '/mnt/ResearchData/04_PATSTAT/00_PATSTAT_2019SP/tls206_part02.zip']


In [9]:
# Attach each person's country code to the applicant and inventor link tables.
# The join is an inner join on whatever columns the two frames share, so
# persons missing from tls206 are dropped.
tls207_applt = pd.merge(tls207_applt, tls206, how="inner")
tls207_invt = pd.merge(tls207_invt, tls206, how="inner")

In [16]:
# Build one row per (application, person, IPC symbol): first join the person
# links with the IPC symbols, then join the result with the application table.
# Both joins are inner joins on the shared columns.
applt_table = pd.merge(tls201, pd.merge(tls207_applt, tls209, how="inner"), how="inner")
invt_table = pd.merge(tls201, pd.merge(tls207_invt, tls209, how="inner"), how="inner")

In [17]:
# Collapse both tables to one row per (country, IPC symbol, filing year) with
# the number of appln_id entries in that group, stored in a "count" column.
group_keys = ["person_ctry_code", "ipc_class_symbol", "appln_filing_year"]

applt_table = (
    applt_table.groupby(group_keys)["appln_id"]
    .count()
    .fillna(0)
    .reset_index()
    .rename(columns={"appln_id": "count"})
    .drop_duplicates()
)
invt_table = (
    invt_table.groupby(group_keys)["appln_id"]
    .count()
    .fillna(0)
    .reset_index()
    .rename(columns={"appln_id": "count"})
    .drop_duplicates()
)

In [66]:
# Year-by-year pairwise similarity between countries, based on the raw
# applicant counts per IPC symbol. Every ordered pair (A, B) with A != B is
# recorded, so each unordered pair appears twice.
reslist = []
for current_year in tqdm(range(1950, 2020)):
    # Country x IPC matrix for this filing year: missing combinations become 0
    # and an IPC column labelled "-" is discarded when present.
    in_year = applt_table["appln_filing_year"].eq(current_year)
    yearly_counts = applt_table[in_year].set_index(["person_ctry_code", "ipc_class_symbol"])["count"]
    current = yearly_counts.unstack().fillna(0).drop("-", axis=1, errors='ignore')
    # Turn each row into (country, vector) once instead of once per pair.
    profiles = [(record[0], np.array(record[1:])) for record in current.itertuples()]
    for ctry1, vec1 in profiles:
        for ctry2, vec2 in profiles:
            if ctry1 == ctry2:
                continue  # no self-comparisons
            cossim = 1 - distance.cosine(vec1, vec2)
            # NOTE(review): jaccard receives count vectors, not boolean ones - confirm this is intended.
            jacsim = 1 - distance.jaccard(vec1, vec2)
            reslist.append((ctry1, ctry2, current_year, cossim, jacsim))

df_sim01 = pd.DataFrame(reslist, columns=["ctry01", "ctry02", "year", "cosinesim", "jaccardsim"])
# Discard pairs where either side carries the two-space (blank) country code.
has_blank_code = df_sim01["ctry01"].eq("  ") | df_sim01["ctry02"].eq("  ")
df_sim01 = df_sim01[~has_blank_code]
df_sim01.to_csv("./SupplementaryDataGenerate/output/patstat_applt_ipc4_similarity_unnormalized.tsv", sep="\t", index=None)

  0%|          | 0/70 [00:00<?, ?it/s]

In [67]:
# Year-by-year pairwise similarity between countries, based on the raw
# inventor counts per IPC symbol. Every ordered pair (A, B) with A != B is
# recorded, so each unordered pair appears twice.
reslist2 = []
for current_year in tqdm(range(1950, 2020)):
    # Country x IPC matrix for this filing year: missing combinations become 0
    # and an IPC column labelled "-" is discarded when present.
    current = invt_table[invt_table["appln_filing_year"]==current_year].set_index(["person_ctry_code", "ipc_class_symbol"])["count"].unstack().fillna(0).drop("-", axis=1, errors='ignore')
    # Turn each row into (country, vector) once instead of once per pair.
    profiles = [(row[0], np.array(row[1:])) for row in current.itertuples()]
    for ctry1, vec1 in profiles:
        for ctry2, vec2 in profiles:
            if ctry1 == ctry2:
                continue  # no self-comparisons
            cossim = 1 - distance.cosine(vec1, vec2)
            # NOTE(review): jaccard receives count vectors, not boolean ones - confirm this is intended.
            jacsim = 1 - distance.jaccard(vec1, vec2)
            reslist2.append((ctry1, ctry2, current_year, cossim, jacsim))

# Build the frame from reslist2 (the inventor results). The previous code passed
# `reslist`, which silently wrote the applicant similarities to the inventor file.
df_sim02 = pd.DataFrame(reslist2, columns=["ctry01", "ctry02", "year", "cosinesim", "jaccardsim"])
# Discard pairs where either side carries the two-space (blank) country code.
df_sim02 = df_sim02[(df_sim02["ctry01"] != "  ") & (df_sim02["ctry02"] !="  ")]
df_sim02.to_csv("./SupplementaryDataGenerate/output/patstat_invt_ipc4_similarity_unnormalized.tsv", sep="\t", index=None)

  0%|          | 0/70 [00:00<?, ?it/s]

In [68]:
# Year-by-year pairwise similarity between countries, based on applicant
# counts that are first normalised per IPC symbol. Every ordered pair (A, B)
# with A != B is recorded, so each unordered pair appears twice.
reslist3 = []
for current_year in tqdm(range(1950, 2020)):
    # Country x IPC matrix for this filing year: missing combinations become 0
    # and an IPC column labelled "-" is discarded when present.
    in_year = applt_table["appln_filing_year"].eq(current_year)
    yearly_counts = applt_table[in_year].set_index(["person_ctry_code", "ipc_class_symbol"])["count"]
    current = yearly_counts.unstack().fillna(0).drop("-", axis=1, errors='ignore')
    # Divide every IPC column by its total over all countries, so each entry
    # becomes a country's share of that IPC symbol in this year.
    current = current.div(current.sum(), axis="columns")
    # Turn each row into (country, vector) once instead of once per pair.
    profiles = [(record[0], np.array(record[1:])) for record in current.itertuples()]
    for ctry1, vec1 in profiles:
        for ctry2, vec2 in profiles:
            if ctry1 == ctry2:
                continue  # no self-comparisons
            cossim = 1 - distance.cosine(vec1, vec2)
            # NOTE(review): jaccard receives share vectors, not boolean ones - confirm this is intended.
            jacsim = 1 - distance.jaccard(vec1, vec2)
            reslist3.append((ctry1, ctry2, current_year, cossim, jacsim))

df_sim03 = pd.DataFrame(reslist3, columns=["ctry01", "ctry02", "year", "cosinesim", "jaccardsim"])
# Discard pairs where either side carries the two-space (blank) country code.
has_blank_code = df_sim03["ctry01"].eq("  ") | df_sim03["ctry02"].eq("  ")
df_sim03 = df_sim03[~has_blank_code]
df_sim03.to_csv("./SupplementaryDataGenerate/output/patstat_applt_ipc4_similarity_ipc4normalized.tsv", sep="\t", index=None)

  0%|          | 0/70 [00:00<?, ?it/s]

In [70]:
# Year-by-year pairwise similarity between countries, based on inventor
# counts that are first normalised per IPC symbol. Every ordered pair (A, B)
# with A != B is recorded, so each unordered pair appears twice.
reslist4 = []
for current_year in tqdm(range(1950, 2020)):
    # Country x IPC matrix for this filing year: missing combinations become 0
    # and an IPC column labelled "-" is discarded when present.
    current = invt_table[invt_table["appln_filing_year"]==current_year].set_index(["person_ctry_code", "ipc_class_symbol"])["count"].unstack().fillna(0).drop("-", axis=1, errors='ignore')
    # Divide every IPC column by its total over all countries, so each entry
    # becomes a country's share of that IPC symbol in this year.
    current = current / current.sum()
    # Turn each row into (country, vector) once instead of once per pair.
    profiles = [(row[0], np.array(row[1:])) for row in current.itertuples()]
    for ctry1, vec1 in profiles:
        for ctry2, vec2 in profiles:
            if ctry1 == ctry2:
                continue  # no self-comparisons
            cossim = 1 - distance.cosine(vec1, vec2)
            # NOTE(review): jaccard receives share vectors, not boolean ones - confirm this is intended.
            jacsim = 1 - distance.jaccard(vec1, vec2)
            reslist4.append((ctry1, ctry2, current_year, cossim, jacsim))

df_sim04 = pd.DataFrame(reslist4, columns=["ctry01", "ctry02", "year", "cosinesim", "jaccardsim"])
# Discard pairs where either side carries the two-space (blank) country code.
# The previous filter tested ctry01 twice, so rows with a blank ctry02 were kept.
df_sim04 = df_sim04[(df_sim04["ctry01"] != "  ") & (df_sim04["ctry02"] != "  ")]
df_sim04.to_csv("./SupplementaryDataGenerate/output/patstat_invt_ipc4_similarity_ipc4normalized.tsv", sep="\t", index=None)

  0%|          | 0/70 [00:00<?, ?it/s]