# Baseline model

Just predict true if tf-idf cosin distance is closer than threshold.

This is intended for simplest end-to-end solution.

### Download data of training-test set 1000. (prerequisite)

Created by Paul_to_2000_dataset.ipynb.
gsutil is easier to use outside from docker container.


```
gsutil cp gs://karino2-uspatent/citations_info_2000.df.gz ../data/
gsutil cp gs://karino2-uspatent/testset_app_1000.df.gz ../data/
gsutil cp gs://karino2-uspatent/training_app_1000.df.gz ../data/
gsutil cp gs://karino2-uspatent/grants_for_2000.df.gz ../data/

```

### Load data set

In [1]:
import pandas as pd
import numpy as np

In [2]:
citations_info_target = pd.read_pickle("../data/citations_info_2000.df.gz")
training_app_df = pd.read_pickle("../data/training_app_1000.df.gz")
testset_app_df = pd.read_pickle("../data/testset_app_1000.df.gz")
grants_target_df = pd.read_pickle("../data/grants_for_2000.df.gz")

In [3]:
citations_info_target.shape

(4179, 41)

In [4]:
training_app_df.head().app_id

0    14222691
1    12515852
2    12033424
3    12402344
4    12155425
Name: app_id, dtype: int64

In [5]:
testset_app_df.head().app_id

0    14307191
1    13137006
2    12741959
3    12643447
4    14200253
Name: app_id, dtype: int64

In [6]:
grants_target_df.shape

(2524, 2)

### Retrieve just claim. Remove all tags.

This utility function might necessary for any mode.

In [7]:
import re

In [8]:
CLAIM_PAT = re.compile(r'<claims[^>]*>(.*)</claims>',re.MULTILINE|re.DOTALL)

In [9]:
TAG_PAT = re.compile(r"<.*?>")

In [10]:
def whole_xml_to_claim_xml(whole):
    mat = CLAIM_PAT.search(whole)
    return mat.group(1)

In [11]:
def whole_xml_to_claim(whole):
    return TAG_PAT.sub(' ', whole_xml_to_claim_xml(whole))

# Model evaluation

In [12]:
citations_info_target.head()

Unnamed: 0,app_id,app_fnm,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa,...,rejection_103,rejection_112,rejection_dp,objection,allowed_claims,cite102_gt1,cite103_gt3,cite103_eq1,cite103_max,signature_type
0,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,7391316,7391316,H20LX5QGPXXIFW4,103.0,a,1,0,1,...,1,0,1,0,0,0,0,1,2,0
1,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,6992580,6992580,H20LX5QGPXXIFW4,102.0,a,1,1,1,...,1,0,1,0,0,0,0,1,2,0
2,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,6992580,6992580,H20LX5QGPXXIFW4,103.0,a,1,1,1,...,1,0,1,0,0,0,0,1,2,0
3,13371769,/work/data/apps/2012/ipa120607/F_2322.xml,7774833,7774833,H20LX5QGPXXIFW4,103.0,a,1,1,1,...,1,0,1,0,0,0,0,1,2,0
4,12282000,/work/data/apps/2009/ipa090312/F_1385.xml,7411209,7411209,G9LENRJ8PPOPPY5,102.0,a,0,1,1,...,1,0,0,0,0,1,0,1,1,3


In [13]:
def set_one_answer_appid(labeldf, oneappid):
    cited_patids = citations_info_target[citations_info_target.app_id == oneappid].parsed
    labeldf.loc[oneappid] = labeldf.columns.isin(cited_patids)

In [14]:
def create_label_df():
    label_df = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    for appid in testset_app_df.app_id:
        set_one_answer_appid(label_df, appid)
    return label_df

In [15]:
label_df = create_label_df()

In [16]:
label_df.shape

(1000, 2524)

### Cofirm label df is correct

In [17]:
testset_app_df.head()

Unnamed: 0,app_id,xml
0,14307191,"<us-patent-application lang=""EN"" dtd-version=""..."
1,13137006,"<us-patent-application lang=""EN"" dtd-version=""..."
2,12741959,"<us-patent-application lang=""EN"" dtd-version=""..."
3,12643447,"<us-patent-application lang=""EN"" dtd-version=""..."
4,14200253,"<us-patent-application lang=""EN"" dtd-version=""..."


In [18]:
label_df.head()

Unnamed: 0,6837383,6837647,6837799,6837893,6837910,6838140,6838207,6838507,6838812,6838925,...,8334161,8334431,8334887,8336128,8336158,8336789,8336964,8337193,8339697,8340894
14307191,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
13137006,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12741959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12643447,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14200253,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
label_df.loc[14307191].idxmax()

7576688

In [20]:
citations_info_target[citations_info_target["app_id"]==14307191].parsed

698    7576688
Name: parsed, dtype: int64

In [21]:
label_df.loc[14307191].loc[7576688]

True

In [22]:
label_df.loc[14307191].sum()

1

### Predict test set and print summary

In [23]:
def predict_test_set(predict_func):
    """
    predict_func(claims) return NxM of boolean. N is len(claims). M is rownum of grants_target_df.
            value indicate n claim is cite patent of m row of grants_all_df.
    """
    predictdf = pd.DataFrame(columns=grants_target_df.parsed.values, dtype=np.bool)
    res = predict_func(testset_app_df["xml"].map(whole_xml_to_claim))
    for idx, appid in enumerate(testset_app_df.app_id):
        predictdf.loc[appid] = res[idx, :]
    return predictdf

In [24]:
def calc_TPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_FPs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]]) for one_appid in testset_app_df.app_id])

def calc_TNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][~labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_FNs(preddf, labeldf):
    return sum([sum(preddf.loc[one_appid][labeldf.loc[one_appid]] == False) for one_appid in testset_app_df.app_id])

def calc_TFPNs(preddf, labeldf):
    return calc_TPs(preddf, labeldf), calc_FPs(preddf, labeldf), calc_TNs(preddf, labeldf), calc_FNs(preddf, labeldf)

In [25]:
def calc_summary_TFPNs(TP, FP, TN, FN):
    "return acc, prec, recall, f1."
    return pd.DataFrame(columns=["acc", "prec", "recall", "f1"], data=[[(TP+TN)/(TP+FP+TN+FN), TP/(TP+FP), TP/(TP+FN), 2*TP/(2*TP+FP+FN)]])
    
def calc_summary(preddf, labeldf):
    TP, FP, TN, FN = calc_TFPNs(preddf, labeldf)
    return calc_summary_TFPNs(TP, FP, TN, FN)

### Sample evaluation code for baseline model

predict_tfidf_model is defined below. eval here later.

This is because these cells are model independent and use for other models, while predict_tfidf_model is model dependent codes.

In [100]:
pred_df = predict_test_set(predict_tfidf_model)

In [101]:
calc_summary(pred_df, label_df)

Unnamed: 0,acc,prec,recall,f1
0,0.891564,0.004106,0.901679,0.008175


# Start baseline model dependent code from here

Now common part is done.
Start model specific cells.

In [27]:
grants_target_df.head()["xml"].map(whole_xml_to_claim)

0    \n \n 1. A tool organizer for mounting to and ...
1    \n \n 1. A modular crowd and traffic control b...
2    \n \n 1. A method of aligning a putter, compri...
3    \n \n 1. Apparatus for inserting a surgical fa...
4    \n \n 1. An apparatus for gasifying a liquid o...
Name: xml, dtype: object

In [28]:
grants_target_df["claim"] = grants_target_df["xml"].map(whole_xml_to_claim)

In [29]:
grants_target_df.head()

Unnamed: 0,parsed,xml,claim
0,6837383,"<us-patent-grant lang=""EN"" dtd-version=""v40 20...",\n \n 1. A tool organizer for mounting to and ...
1,6837647,"<us-patent-grant lang=""EN"" dtd-version=""v40 20...",\n \n 1. A modular crowd and traffic control b...
2,6837799,"<us-patent-grant lang=""EN"" dtd-version=""v40 20...","\n \n 1. A method of aligning a putter, compri..."
3,6837893,"<us-patent-grant lang=""EN"" dtd-version=""v40 20...",\n \n 1. Apparatus for inserting a surgical fa...
4,6837910,"<us-patent-grant lang=""EN"" dtd-version=""v40 20...",\n \n 1. An apparatus for gasifying a liquid o...


# Convert to feature vectors and retrieve vocabulary

Doing similar things to scikit learn example  
http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

Also, this document is helpful.  
http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

## (only once) Calculate tf-idf for grants

In [31]:
import random
random.seed(1234)

In [32]:
vectorizer = TfidfVectorizer(stop_words='english', max_df = 0.5)

In [33]:
grants_features = vectorizer.fit_transform(grants_target_df["claim"])

In [34]:
grants_features.shape

(2524, 20124)

In [35]:
vocab = vectorizer.vocabulary_

In [36]:
idfvec = vectorizer.idf_

In [37]:
len(vocab.keys())

20124

In [38]:
list(vocab.items())[0:5]

[('surgeon', 17655),
 ('dimple', 5583),
 ('dependencies', 5133),
 ('xml', 20025),
 ('vofdm', 19539)]

In [39]:
len(idfvec)

20124

In [40]:
idfvec[0:5]

array([7.73538405, 4.86370443, 8.14084916, 8.14084916, 8.14084916])

### (only once) Save features, vocabulary, idf vector

In [41]:
import pickle

In [42]:
with open("../data/grants_target_tfidf_features.dat", "wb") as f:
    pickle.dump(grants_features, f)

In [43]:
with open("../data/grants_target_vocab_idf_dict.dat", "wb") as f:
    pickle.dump({"vocabulary": vocab, "idf": idfvec}, f)

### Load code

In [44]:
import pickle

In [45]:
with open("../data/grants_target_tfidf_features.dat", 'rb') as f:
    grants_features = pickle.load(f)

In [46]:
with open("../data/grants_target_vocab_idf_dict.dat", 'rb') as f:
    dic = pickle.load(f)
    vocab, idfvec = dic["vocabulary"], dic["idf"]

### Calculate tf-idf manually using vocabulary and idf vector, and check whether it's coinside.

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
one_claim = grants_target_df.iloc[0]["claim"]

In [49]:
count_vec = CountVectorizer(vocabulary=vocab, stop_words="english", max_df = 0.5)

In [50]:
res = count_vec.fit_transform([one_claim])

In [51]:
res_arr = res.toarray()

In [52]:
res.shape

(1, 20124)

In [53]:
tf = res_arr[0]

In [54]:
answer = grants_features[0, :].toarray()

In [55]:
answer = answer[0]

In [56]:
def print_nonzero_index(arr, maxcount):
    count = 0

    for i, v in enumerate(arr):
        if v != 0:
            count+=1
            print(i)
            if count > maxcount:
                break

In [57]:
print_nonzero_index(answer, 5)

679
694
838
856
904
997


In [58]:
answer[997]

0.043562029896483084

In [59]:
print_nonzero_index(tf, 5)

679
694
838
856
904
997


In [60]:
sumtf = sum(tf)

In [61]:
unnormalized = [tf[i]*idfvec[i]/sumtf for i, _ in enumerate(tf)]

In [62]:
unnormalized[997]/np.linalg.norm(unnormalized)

0.04356202989648308

Try two claim for generarization

In [63]:
tfcsr = count_vec.fit_transform(grants_target_df.iloc[0:2]["claim"])

In [64]:
tf = tfcsr.toarray()

In [65]:
tf.shape

(2, 20124)

In [66]:
unnormalized = np.multiply(tf, idfvec)

In [67]:
lpnorms = np.linalg.norm(unnormalized, axis=1)

In [68]:
manual_tfidf = unnormalized/lpnorms[:, np.newaxis]

In [69]:
manual_tfidf[0, 997]

0.04356202989648307

In [70]:
all(abs(manual_tfidf[0, :] - grants_features[0].toarray()[0]) < 0.00001)

True

In [71]:
all(abs(manual_tfidf[1] - grants_features[1].toarray()[0]) < 0.00001)

True

Now make calculate tf-idf function

In [72]:
def claims_to_tfidfs(claimarr, count_vec, idfvec):
    tfcsr = count_vec.fit_transform(claimarr)
    tf = tfcsr.toarray()
    unnormalized = np.multiply(tf, idfvec)
    lpnorms = np.linalg.norm(unnormalized, axis=1)
    return unnormalized/lpnorms[:, np.newaxis]

In [73]:
manu3 = claims_to_tfidfs(grants_target_df.iloc[0:2]["claim"], count_vec, idfvec)

In [74]:
all(manu3[0] == manual_tfidf[0]), all(manu3[1] == manual_tfidf[1])

(True, True)

### It's time to calculate tfidf for training set.

In [75]:
training_app_df["claim"] = training_app_df["xml"].map(whole_xml_to_claim)

In [76]:
training_features = claims_to_tfidfs(training_app_df["claim"], count_vec, idfvec)

Calculate one cosine distance

In [77]:
one_appid = training_app_df.iloc[0].app_id

In [78]:
citations_info_target[citations_info_target.app_id == one_appid]

Unnamed: 0,app_id,app_fnm,citation_pat_pgpub_id,parsed,ifw_number,action_type,action_subtype,form892,form1449,citation_in_oa,...,rejection_103,rejection_112,rejection_dp,objection,allowed_claims,cite102_gt1,cite103_gt3,cite103_eq1,cite103_max,signature_type
1635,14222691,/work/data/apps/2014/ipa140925/F_4331.xml,8179692,8179692,I9X14IR6PXXIFW4,102.0,a,1,0,1,...,1,0,0,1,0,0,0,0,3,0
1636,14222691,/work/data/apps/2014/ipa140925/F_4331.xml,8179692,8179692,I9X14IR6PXXIFW4,103.0,,1,0,1,...,1,0,0,1,0,0,0,0,3,0
1637,14222691,/work/data/apps/2014/ipa140925/F_4331.xml,8206188,8206188,I9X14IR6PXXIFW4,102.0,a,1,0,1,...,1,0,0,1,0,0,0,0,3,0
1638,14222691,/work/data/apps/2014/ipa140925/F_4331.xml,8206188,8206188,I9X14IR6PXXIFW4,103.0,,1,0,1,...,1,0,0,1,0,0,0,0,3,0
1639,14222691,/work/data/apps/2014/ipa140925/F_4331.xml,8177561,8177561,I9X14IR6PXXIFW4,103.0,,1,0,1,...,1,0,0,1,0,0,0,0,3,0


In [79]:
answer_patids = set(citations_info_target[citations_info_target.app_id == one_appid].parsed.astype(int))

In [80]:
answer_patids

{8177561, 8179692, 8206188}

In [81]:
type(grants_target_df.iloc[0].parsed)

numpy.int64

In [82]:
answer_idxs = grants_target_df[grants_target_df.parsed.isin(answer_patids)].index

In [83]:
answer_idxs[0]

2380

In [84]:
grants_target_df.iloc[2380].parsed

8177561

In [85]:
answer_patent_features = grants_features[answer_idxs[0], :].toarray()[0]

In [86]:
import scipy

In [87]:
scipy.spatial.distance.cdist(training_features[0, :][np.newaxis, :], grants_features[answer_idxs[0], :].toarray(), 'cosine')

array([[0.67340827]])

### Calculate 20 cosine distance for check

In [88]:
training_app_df.head()

Unnamed: 0,app_id,xml,claim
0,14222691,"<us-patent-application lang=""EN"" dtd-version=""...",\n \n 1 . A terminal comprising:\n an upper a...
1,12515852,"<us-patent-application lang=""EN"" dtd-version=""...",\n \n 1 . A method for increasing seed yield ...
2,12033424,"<us-patent-application lang=""EN"" dtd-version=""...","\n \n 1 . An image forming apparatus, compris..."
3,12402344,"<us-patent-application lang=""EN"" dtd-version=""...","\n \n 1 . A tunable polarization rotator, com..."
4,12155425,"<us-patent-application lang=""EN"" dtd-version=""...",\n \n 1 . An illumination control circuit com...


In [89]:
training_app_df[training_app_df.app_id == 12515852].index[0]

1

In [90]:
def calc_cosin_for_one_app(appid):
    answer_patids = set(citations_info_target[citations_info_target.app_id == appid].parsed.astype(int))
    answer_idxs = grants_target_df[grants_target_df.parsed.isin(answer_patids)].index
    answer_patent_features = grants_features[answer_idxs, :].toarray()
    training_features_idx = training_app_df[training_app_df.app_id == appid].index[0]
    return scipy.spatial.distance.cdist(training_features[training_features_idx, :][np.newaxis, :], answer_patent_features, 'cosine')[0]


In [91]:
calc_cosin_for_one_app(12515852)

array([0.4220731])

In [92]:
calc_cosin_for_one_app(12402344)

array([0.97559903, 0.89965045, 0.82466478, 0.63398944, 0.68337952])

In [93]:
[calc_cosin_for_one_app(appid) for appid in training_app_df[0:20].app_id]

[array([0.67340827, 0.76916371, 0.8174568 ]),
 array([0.4220731]),
 array([0.88221407]),
 array([0.97559903, 0.89965045, 0.82466478, 0.63398944, 0.68337952]),
 array([0.07625181]),
 array([0.92330727, 0.79996471, 0.92896083, 0.91987076]),
 array([0.90553403]),
 array([0.95056575]),
 array([0.70165577]),
 array([0.76213644]),
 array([0.99283104]),
 array([0.63671942]),
 array([0.84019419]),
 array([0.85744319, 0.65621906]),
 array([0.74489745]),
 array([0.39235124, 0.43900574]),
 array([0.58476531]),
 array([0.94180023, 0.97665386]),
 array([0.51579826, 0.67111682]),
 array([0.40108371])]

In [94]:
calc_cosin_for_one_app(training_app_df.iloc[5].app_id)

array([0.92330727, 0.79996471, 0.92896083, 0.91987076])

In [95]:
calc_cosin_for_one_app(training_app_df.iloc[5].app_id).mean() < 0.95

True

### Compare with random pair cosdistance

In [96]:
scipy.spatial.distance.cdist(training_features[0:5, :], grants_features[0:5, :].toarray(), 'cosine')

array([[0.96073889, 0.96715009, 0.9661195 , 0.96547838, 0.98200139],
       [0.9565299 , 0.94845901, 0.99889384, 0.95047153, 0.97128588],
       [0.99269734, 0.99249212, 0.99575977, 0.97310476, 0.96170229],
       [0.99960154, 0.99653818, 0.98222205, 0.99531781, 0.96584516],
       [0.85252285, 0.82698041, 0.97140084, 0.83295536, 0.90456516]])

In [97]:
training_features = claims_to_tfidfs(training_app_df["claim"], count_vec, idfvec)

In [98]:
grants_features_arr = grants_features.toarray()

In [99]:
TFIDF_MODEL_THRESHOLD=0.95
# TFIDF_MODEL_THRESHOLD=0.8

def predict_tfidf_model(claims):
    """
    return: NxM of boolean. N is len(claims). M is rownum of grants_all_df.
            value indicate n claim is cite patent of m row of grants_all_df.
    """
    features = claims_to_tfidfs(claims, count_vec, idfvec)
    dists = scipy.spatial.distance.cdist(features, grants_features_arr, 'cosine')
    return dists < TFIDF_MODEL_THRESHOLD


Model cells are over.
Please go back to predict_test_set(predict_tfidf_model) cell.


# Below here is for BUG investigation.

manual recalculation.

### Calc recall of 100

In [175]:
res = predict_tfidf_model(training_app_df[0:100]["claim"])

In [176]:
res.shape

(100, 2524)

In [177]:
all_pred_of_labeltrue = np.array([], dtype=np.bool)

In [178]:
for idx in range(0, 100):
    one_appid = training_app_df.iloc[idx].app_id
    pred_oneres = res[idx]
    label_patids = citations_info_target[citations_info_target.app_id == one_appid].parsed
    label_idxs = grants_target_df.parsed[grants_target_df.parsed.isin(label_patids)].index
    pred_of_labeltrue = pred_oneres[label_idxs]
    all_pred_of_labeltrue = np.concatenate([all_pred_of_labeltrue, pred_of_labeltrue])

In [179]:
sum(all_pred_of_labeltrue)/len(all_pred_of_labeltrue)

0.8970588235294118

### Check result by hand (seems correct)

In [180]:
training_app_df.iloc[0]

app_id                                             14222691
xml       <us-patent-application lang="EN" dtd-version="...
claim     \n \n  1 . A terminal comprising:\n an upper a...
Name: 0, dtype: object

In [181]:
pred_oneres = res[0]

In [182]:
pred_oneres.sum()

421

In [183]:
citations_info_target[citations_info_target.app_id == 14222691].parsed

1635    8179692
1636    8179692
1637    8206188
1638    8206188
1639    8177561
Name: parsed, dtype: int64

In [185]:
grants_target_df.parsed[grants_target_df.parsed == 8179692].index

Int64Index([2381], dtype='int64')

In [186]:
pred_oneres[2381]

True

### Why test set recall is so bad?

In [187]:
res = predict_tfidf_model(testset_app_df["xml"].map(whole_xml_to_claim))

In [188]:
testset_app_df.shape

(1000, 2)

In [189]:
all_pred_of_labeltrue = np.array([], dtype=np.bool)

In [190]:
for idx in range(0, testset_app_df.shape[0]):
    one_appid = testset_app_df.iloc[idx].app_id
    pred_oneres = res[idx]
    label_patids = citations_info_target[citations_info_target.app_id == one_appid].parsed
    label_idxs = grants_target_df.parsed[grants_target_df.parsed.isin(label_patids)].index
    pred_of_labeltrue = pred_oneres[label_idxs]
    all_pred_of_labeltrue = np.concatenate([all_pred_of_labeltrue, pred_of_labeltrue])

In [191]:
sum(all_pred_of_labeltrue)/len(all_pred_of_labeltrue)

0.095163806552262087

In [196]:
training_app_df.iloc[0]

app_id                                             14222691
xml       <us-patent-application lang="EN" dtd-version="...
claim     \n \n  1 . A terminal comprising:\n an upper a...
Name: 0, dtype: object

In [195]:
testset_app_df.iloc[0]

app_id                                             14222691
xml       <us-patent-application lang="EN" dtd-version="...
Name: 0, dtype: object

In [193]:
pred_oneres = res[0]

In [194]:
pred_oneres.sum()

460

In [183]:
citations_info_target[citations_info_target.app_id == 14222691].parsed

1635    8179692
1636    8179692
1637    8206188
1638    8206188
1639    8177561
Name: parsed, dtype: int64

In [185]:
grants_target_df.parsed[grants_target_df.parsed == 8179692].index

Int64Index([2381], dtype='int64')

In [186]:
pred_oneres[2381]

True