# Cleaning Patent Titles: 

In [46]:
import pandas as pd
import nltk
import spacy
import glob
from tqdm import tqdm

In [47]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

In [35]:
path = '../data/scraped/'

In [36]:
files = glob.glob(f'{path}*.csv')
len(files)

40

In [37]:
def is_prpn_nltk(x):
    '''
    Helper function to detect proper nouns from pos tags in text
    '''
    for word, pos in x: 
        if pos == 'NNP':
            return 1
    return 0

In [38]:
def build_nltk_pos_tags(df):
    # converting title to str type
    df['title'] = df['title'].astype(str)
    # splitting title into tokens
    df['tokens'] = df['title'].apply(lambda x: x.split())
    # getting nltk pos tags from tokens
    df['nltk_pos'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))
    return df

In [39]:
nlp = spacy.load("en_core_web_sm")

In [40]:
def spacy_pos(x):
    doc = nlp(x)
    return [token.pos_ for token in doc]

def spacy_label(x):
    doc = nlp(x)
    return [token.label_ for token in doc.ents]

In [41]:
def build_spacy_pos_tag(df):
    # converting title to str type
    df['title'] = df['title'].astype(str)
    df['spacy_pos'] = df['title'].apply(spacy_pos)
    df['spacy_label'] = df['title'].apply(spacy_label)
    df['is_prpn_spacy'] = df['spacy_pos'].apply(lambda x: 1 if 'PROPN' in x else 0) # checking if tokens are proper nouns
    df['is_person_spacy'] = df['spacy_label'].apply(lambda x: 1 if 'PERSON' in x else (1 if 'ORG' in x else 0)) # checking if tokens are person or organization entities
    return df

In [49]:
save_path = '../data/preprocessed/'

for file in tqdm(files):
    name = file.split('/')[-1]
    df = pd.read_csv(file)
    print(df.head())
    df = build_nltk_pos_tags(df)
    df = build_spacy_pos_tag(df)
    df.to_csv(f'{save_path}{name}', index=False)
    print(df.head())
    break

  0%|                                                                                                                                  | 0/40 [00:00<?, ?it/s]

  patent_number        date                                 title
0    US1600000A  1924-11-17    Feeler mechanism for looms \n     
1    US1600001A  1921-12-21            Sand-blast machine \n     
2    US1600002A  1925-03-24         Meat-mangling machine \n     
3    US1600003A  1925-08-14               Steam condenser \n     
4    US1600004A  1925-05-28  Stuffing box for compressors \n     


  0%|                                                                                                                                  | 0/40 [00:06<?, ?it/s]


KeyboardInterrupt: 

### NLTK POS Tags:

In [182]:
# converting title to str type
df['title'] = df['title'].astype(str)

# splitting title into tokens
df['tokens'] = df['title'].apply(lambda x: x.split())

# getting nltk pos tags from tokens
df['nltk_pos'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))

In [183]:
df.head()

Unnamed: 0,patent_number,date,title,tokens,nltk_pos
0,US100000A,1870-02-22,Improved sun-bonnet for horses,"[Improved, sun-bonnet, for, horses]","[(Improved, VBN), (sun-bonnet, NN), (for, IN), (horses, NNS)]"
1,US100001A,1870-02-22,Improvement in seed-palnter-s and fertilizer-distributers,"[Improvement, in, seed-palnter-s, and, fertilizer-distributers]","[(Improvement, NN), (in, IN), (seed-palnter-s, JJ), (and, CC), (fertilizer-distributers, NNS)]"
2,US100002A,1870-02-22,Improvement in printing-presses,"[Improvement, in, printing-presses]","[(Improvement, NN), (in, IN), (printing-presses, NNS)]"
3,US100003A,1870-02-22,bessemer,[bessemer],"[(bessemer, NN)]"
4,US100004A,1870-02-22,Improvement in hanging crank-shafts,"[Improvement, in, hanging, crank-shafts]","[(Improvement, NN), (in, IN), (hanging, VBG), (crank-shafts, NNS)]"


In [184]:
def is_prpn_nltk(x):
    '''
    Helper function to detect proper nouns from pos tags in text
    '''
    for word, pos in x: 
        if pos == 'NNP':
            return 1
    return 0

In [185]:
df['is_prpn_nltk'] = df['nltk_pos'].apply(is_prpn_nltk)

In [186]:
df.head()

Unnamed: 0,patent_number,date,title,tokens,nltk_pos,is_prpn_nltk
0,US100000A,1870-02-22,Improved sun-bonnet for horses,"[Improved, sun-bonnet, for, horses]","[(Improved, VBN), (sun-bonnet, NN), (for, IN), (horses, NNS)]",0
1,US100001A,1870-02-22,Improvement in seed-palnter-s and fertilizer-distributers,"[Improvement, in, seed-palnter-s, and, fertilizer-distributers]","[(Improvement, NN), (in, IN), (seed-palnter-s, JJ), (and, CC), (fertilizer-distributers, NNS)]",0
2,US100002A,1870-02-22,Improvement in printing-presses,"[Improvement, in, printing-presses]","[(Improvement, NN), (in, IN), (printing-presses, NNS)]",0
3,US100003A,1870-02-22,bessemer,[bessemer],"[(bessemer, NN)]",0
4,US100004A,1870-02-22,Improvement in hanging crank-shafts,"[Improvement, in, hanging, crank-shafts]","[(Improvement, NN), (in, IN), (hanging, VBG), (crank-shafts, NNS)]",0


In [187]:
df[df['is_prpn_nltk'] == 1][['title']]

Unnamed: 0,title
5,Makcar wahbam beylikgy
17,Joseph m
24,Stove gkate
35,William b
37,Austin d
...,...
99196,Henry j
99420,John balmore
99712,Samuel a
99771,Mobeis matt son


In [188]:
df[df['is_prpn_nltk'] == 0][['title']]

Unnamed: 0,title
0,Improved sun-bonnet for horses
1,Improvement in seed-palnter-s and fertilizer-distributers
2,Improvement in printing-presses
3,bessemer
4,Improvement in hanging crank-shafts
...,...
99923,Improvement in harrows
99924,Improvement in boot-jacks
99925,Improvement in ozone-generators
99926,Improvement in temporary binders


### Spacy POS Tags:

In [189]:
# nlp = spacy.load("en_core_web_sm")
# doc = nlp('Peter Jackson is a great director')

# for ent in doc.ents:
#     print(ent.label_)

In [190]:
nlp = spacy.load("en_core_web_sm")

In [191]:
def spacy_pos(x):
    doc = nlp(x)
    return [token.pos_ for token in doc]

In [192]:
def spacy_label(x):
    doc = nlp(x)
    return [token.label_ for token in doc.ents]

In [194]:
df['spacy_pos'] = df['title'].apply(spacy_pos)
df['spacy_label'] = df['title'].apply(spacy_label)

In [195]:
df.head()

Unnamed: 0,patent_number,date,title,tokens,nltk_pos,is_prpn_nltk,spacy_pos,spacy_label
0,US100000A,1870-02-22,Improved sun-bonnet for horses,"[Improved, sun-bonnet, for, horses]","[(Improved, VBN), (sun-bonnet, NN), (for, IN), (horses, NNS)]",0,"[ADJ, NOUN, PUNCT, NOUN, ADP, NOUN]",[]
1,US100001A,1870-02-22,Improvement in seed-palnter-s and fertilizer-distributers,"[Improvement, in, seed-palnter-s, and, fertilizer-distributers]","[(Improvement, NN), (in, IN), (seed-palnter-s, JJ), (and, CC), (fertilizer-distributers, NNS)]",0,"[NOUN, ADP, NOUN, PUNCT, NOUN, PUNCT, NOUN, CCONJ, NOUN, PUNCT, NOUN]",[]
2,US100002A,1870-02-22,Improvement in printing-presses,"[Improvement, in, printing-presses]","[(Improvement, NN), (in, IN), (printing-presses, NNS)]",0,"[NOUN, ADP, NOUN, PUNCT, NOUN]",[]
3,US100003A,1870-02-22,bessemer,[bessemer],"[(bessemer, NN)]",0,[NOUN],[]
4,US100004A,1870-02-22,Improvement in hanging crank-shafts,"[Improvement, in, hanging, crank-shafts]","[(Improvement, NN), (in, IN), (hanging, VBG), (crank-shafts, NNS)]",0,"[NOUN, ADP, VERB, NOUN, PUNCT, NOUN]",[]


In [196]:
df[df['spacy_label'].apply(len).gt(0)]

Unnamed: 0,patent_number,date,title,tokens,nltk_pos,is_prpn_nltk,spacy_pos,spacy_label
5,US100005A,1870-02-22,Makcar wahbam beylikgy,"[Makcar, wahbam, beylikgy]","[(Makcar, NNP), (wahbam, NN), (beylikgy, NN)]",1,"[PROPN, NOUN, NOUN]",[PERSON]
9,US100009A,1870-02-22,briggs,[briggs],"[(briggs, NNS)]",0,[PROPN],[PERSON]
17,US100017A,1870-02-22,Joseph m,"[Joseph, m]","[(Joseph, NNP), (m, NN)]",1,"[PROPN, VERB]",[PERSON]
35,US100035A,1870-02-22,William b,"[William, b]","[(William, NNP), (b, NN)]",1,"[PROPN, NOUN]",[PERSON]
37,US100037A,1870-02-22,Austin d,"[Austin, d]","[(Austin, NNP), (d, NN)]",1,"[PROPN, X]",[PERSON]
...,...,...,...,...,...,...,...,...
99494,US199566A,1878-01-22,Improvement in mosquito-net frames,"[Improvement, in, mosquito-net, frames]","[(Improvement, NN), (in, IN), (mosquito-net, NN), (frames, NNS)]",0,"[NOUN, ADP, NOUN, PUNCT, NOUN, NOUN]",[DATE]
99591,US199663A,1878-01-29,Improvement in electro-galvanic chairs,"[Improvement, in, electro-galvanic, chairs]","[(Improvement, NN), (in, IN), (electro-galvanic, JJ), (chairs, NNS)]",0,"[NOUN, ADP, PROPN, PUNCT, ADJ, NOUN]",[GPE]
99623,US199695A,1878-01-29,Improvement in portable mosquito-bars,"[Improvement, in, portable, mosquito-bars]","[(Improvement, NN), (in, IN), (portable, JJ), (mosquito-bars, NNS)]",0,"[NOUN, ADP, ADJ, NOUN, PUNCT, NOUN]",[ORDINAL]
99771,US199843A,1878-01-29,Mobeis matt son,"[Mobeis, matt, son]","[(Mobeis, NNP), (matt, PRP), (son, NN)]",1,"[PROPN, PROPN, NOUN]","[PERSON, PERSON]"


In [197]:
df['is_prpn_spacy'] = df['spacy_pos'].apply(lambda x: 1 if 'PROPN' in x else 0) # checking if tokens are proper nouns
df['is_person_spacy'] = df['spacy_label'].apply(lambda x: 1 if 'PERSON' in x else (1 if 'ORG' in x else 0)) # checking if tokens are person or organization entities

In [198]:
df.head()

Unnamed: 0,patent_number,date,title,tokens,nltk_pos,is_prpn_nltk,spacy_pos,spacy_label,is_prpn_spacy,is_person_spacy
0,US100000A,1870-02-22,Improved sun-bonnet for horses,"[Improved, sun-bonnet, for, horses]","[(Improved, VBN), (sun-bonnet, NN), (for, IN), (horses, NNS)]",0,"[ADJ, NOUN, PUNCT, NOUN, ADP, NOUN]",[],0,0
1,US100001A,1870-02-22,Improvement in seed-palnter-s and fertilizer-distributers,"[Improvement, in, seed-palnter-s, and, fertilizer-distributers]","[(Improvement, NN), (in, IN), (seed-palnter-s, JJ), (and, CC), (fertilizer-distributers, NNS)]",0,"[NOUN, ADP, NOUN, PUNCT, NOUN, PUNCT, NOUN, CCONJ, NOUN, PUNCT, NOUN]",[],0,0
2,US100002A,1870-02-22,Improvement in printing-presses,"[Improvement, in, printing-presses]","[(Improvement, NN), (in, IN), (printing-presses, NNS)]",0,"[NOUN, ADP, NOUN, PUNCT, NOUN]",[],0,0
3,US100003A,1870-02-22,bessemer,[bessemer],"[(bessemer, NN)]",0,[NOUN],[],0,0
4,US100004A,1870-02-22,Improvement in hanging crank-shafts,"[Improvement, in, hanging, crank-shafts]","[(Improvement, NN), (in, IN), (hanging, VBG), (crank-shafts, NNS)]",0,"[NOUN, ADP, VERB, NOUN, PUNCT, NOUN]",[],0,0


In [199]:
len(df[df['is_person_spacy'] == 1])

1596

In [200]:
len(df[df['is_prpn_spacy'] == 1])

3793

In [201]:
len(df[df['is_prpn_nltk'] == 1])

2473

We can see that Spacy POS Tags are better since it catches more proper nouns. 

In [202]:
# Saving df for future use
df.to_csv(f'../data/cleaned/{file}', index=False)