# Testing Naive Bayes Classifier

In this notebook, we use a supervised learning method to attempt to classify patents into predefined categories.

In [1]:
# quick eda
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# 1. load data in
# NOTE(review): this is a machine-specific absolute path; it must be adjusted
# before the notebook can be run from another checkout or machine.
SUBSIDIARY_PATENTS_CSV = "/Users/lilahduboff/Documents/Duke NLP/Final Project/Natural_Language_Processing_Final_Project/Data/subsidiary_patents.csv"
df = pd.read_csv(SUBSIDIARY_PATENTS_CSV)

# Display (rows, columns) of the freshly loaded frame.
df.shape

(99106, 13)

In [3]:
# Peek at 20 random rows; the fixed seed keeps the displayed sample identical across re-runs.
df.sample(20, random_state=42)

Unnamed: 0,SYMBOL,SUBSIDIARY,patent_id,patent_title,patent_date,patent_year,patent_type,withdrawn,assignee_organization,ipc_sections,ipc_classes,ipc_subclasses,patent_abstract
64106,PG,The Procter & Gamble Company,9084699,Absorbent article,2015-07-21,2015,utility,False,The Procter & Gamble Company,A,61,F,A disposable absorbent article that includes a...
10224,DIS,"Disney Enterprises, Inc.",11991417,Systems and methods for intelligent media cont...,2024-05-21,2024,utility,False,"Disney Enterprises, Inc.",G;H,11;4,B;N,There is provided a system including a non-tra...
66310,PG,The Procter & Gamble Company,D531509,Container and a cap,2006-11-07,2006,design,False,The Procter & Gamble Company,,,,
58587,PG,The Procter & Gamble Company,5968496,Cosmetic compositions comprising an imidazoliu...,1999-10-19,1999,utility,False,The Procter & Gamble Company,A,61,K,An aqueous foaming cosmetic composition compri...
88041,CSCO,Splunk Inc.,D963676,Display screen with graphical user interface,2022-09-13,2022,design,False,SPLUNK Inc.,,,,
50804,PG,BRAUN GmbH,6820941,Clamping devices,2004-11-23,2004,utility,False,Braun GMBH,A,46,D,A clamping device for restraining filament tuf...
91175,HON,UOP LLC,4050315,Remotely actuated sampling apparatus,1977-09-27,1977,utility,False,UOP LLC,G,01,N,Apparatus for taking liquid samples in a well ...
59739,PG,The Procter & Gamble Company,6409615,Golf ball with non-circular shaped dimples,2002-06-25,2002,utility,False,The Procter & Gamble Company,A,63,B,In a non-limiting exemplary embodiment of the ...
77667,VZ,Verizon Patent and Licensing Inc.,8350721,Geographically specific emergency notification,2013-01-08,2013,utility,False,Verizon Patent and Licensing Inc.,G,01;08,C;G,A mobile device is associated with navigationa...
61608,PG,The Procter & Gamble Company,7322967,Pant-type disposable garment,2008-01-29,2008,utility,False,The Procter & Gamble Company,A,61,F,A pant-type disposable garment having a waist ...


In [6]:
# Count the rows whose abstract is missing.
print(df["patent_abstract"].isna().sum())

16304


In [7]:
# Keep only the patents that have an abstract.
df = df[df["patent_abstract"].notna()]

In [9]:
# Print the column dtypes and non-null counts of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82802 entries, 0 to 99105
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   SYMBOL                 82802 non-null  object
 1   SUBSIDIARY             82802 non-null  object
 2   patent_id              82802 non-null  object
 3   patent_title           82802 non-null  object
 4   patent_date            82802 non-null  object
 5   patent_year            82802 non-null  int64 
 6   patent_type            82802 non-null  object
 7   withdrawn              82802 non-null  bool  
 8   assignee_organization  82802 non-null  object
 9   ipc_sections           82782 non-null  object
 10  ipc_classes            82782 non-null  object
 11  ipc_subclasses         82782 non-null  object
 12  patent_abstract        82802 non-null  object
dtypes: bool(1), int64(1), object(11)
memory usage: 8.3+ MB


In [10]:
# Convert the patent_date column to datetimes; values that cannot be parsed
# become NaT because of errors="coerce".
df["patent_date"] = df["patent_date"].pipe(pd.to_datetime, errors="coerce")

In [12]:
# Maximum number of patents kept per subsidiary (previously a bare literal in head()).
MAX_PATENTS_PER_SUBSIDIARY = 500

# Order subsidiaries alphabetically and, within each subsidiary, newest patent
# first, so that head() below keeps each subsidiary's most recent patents.
df = df.sort_values(["SUBSIDIARY", "patent_date"], ascending=[True, False])
top500_subset = df.groupby("SUBSIDIARY").head(MAX_PATENTS_PER_SUBSIDIARY)

# Display (rows, columns) of the reduced frame.
top500_subset.shape

(21644, 13)

In [13]:
# Preview the first rows of the per-subsidiary subset.
top500_subset.head()

Unnamed: 0,SYMBOL,SUBSIDIARY,patent_id,patent_title,patent_date,patent_year,patent_type,withdrawn,assignee_organization,ipc_sections,ipc_classes,ipc_subclasses,patent_abstract
25759,MMM,"3M COGENT, INC.",8411916,Bio-reader device with ticket identification,2013-04-02,2013,utility,False,"3M Cogent, Inc.",G,06,K,A method and device for determining a concentr...
25758,MMM,"3M COGENT, INC.",8379982,System and method for fast biometric pattern m...,2013-02-19,2013,utility,False,"3M Cogent, Inc.",G,06,K,A method and system for matching two biometric...
25757,MMM,"3M COGENT, INC.",8275179,Apparatus for capturing a high quality image o...,2012-09-25,2012,utility,False,"3M Cogent, Inc.",G,06,K,An apparatus for capturing the image of a wet/...
25756,MMM,"3M COGENT, INC.",8254728,Method and apparatus for two dimensional image...,2012-08-28,2012,utility,False,"3M Cogent, Inc.",G,06,K,"In one embodiment, the present invention is a ..."
25755,MMM,"3M COGENT, INC.",8131477,Method and device for image-based biological d...,2012-03-06,2012,utility,False,"3M Cogent, Inc.",G,01;06,F;K;N,A device and method for determining a concentr...


In [14]:
# export to data folder
# to_csv does not create missing directories, so make sure the relative
# `data/` folder exists before writing (otherwise the export raises an OSError).
output_path = Path("data/top500_patents.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
top500_subset.to_csv(output_path, index=False)

In [None]:
#1. load data in 

#2. preprocess data
#   - remove special characters
#   - remove stop words
#   - convert to lowercase

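#3. vectorize text with TfidfVectorizer: computes Term Frequency-Inverse Document Frequency (TF-IDF) weights, which give more weight to words that are frequent in a specific document but uncommon across the entire corpus (note: only CountVectorizer is imported above, so TfidfVectorizer still needs to be imported)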

#4. split data into training and testing sets

#5. train model

#6. evaluate model

#7. make predictions



In [None]:
# 8. visualize results

