A method for visualizing the patent landscape.

- Search Google Patents for a subject of interest.

- Narrow the search for: 

- The target term or a combination of terms;

- U.S. Patents;

- In English Language;

- Patents that have a status of Granted;

- and any other parameters of interest.

Objective: Visually understand general themes and clusters of patented technology

In [None]:
#Google Patents Search https://patents.google.com/?q=crispr&country=US&status=GRANT&language=ENGLISH&type=PATENT
#download results as gp-search-20200124-065711.csv
#24 JAN 2020

In [None]:
import pandas as pd  # DataFrame handling

# Read the CSV file containing raw text data retrieved from the patent data source (Google)
df = pd.read_csv('gp-search-20200124-065711-raw_with_claims.csv', index_col=0)

import time  # NOTE(review): nothing in this cell uses `time`; pandas does the date parsing

# Convert the date columns so pandas recognizes them as datetimes
for date_column in ('priority date', 'filing/creation date', 'publication date'):
    df[date_column] = pd.to_datetime(df[date_column])

# Grant date: values that cannot be parsed become NaT. errors='coerce' is used
# because errors='ignore' is deprecated in recent pandas and hands back the
# whole column unconverted as soon as one value fails to parse.
df['grant date'] = pd.to_datetime(df['grant date'], errors='coerce')

In [None]:
df.head()  # show the first rows to verify the dataset (expected to be already cleaned and processed)

In [None]:
# Domain-specific stop words (patent boilerplate) to remove during text cleaning.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python, so a missing comma between 'composition' and
# 'configure' would yield the single useless entry 'compositionconfigure'.
# The list follows a base-form/inflection pattern ('comprise', 'comprises', ...),
# so 'include' is listed next to 'includes'; exact duplicates are omitted.
stop_list = [
    'abstract', 'abstracts', 'acceptable', 'apparatus', 'apparatuses',
    'body', 'cancel', 'claim', 'claims', 'classification',
    'classifications', 'comprise', 'comprises', 'comprising', 'composition',
    'configure', 'configured', 'dependent', 'desire', 'description',
    'device', 'devices', 'disclose', 'disclosed', 'discloses',
    'embodiment', 'embodiments', 'example', 'examples',
    'for example', 'herein', 'hide', 'include', 'includes', 'invention',
    'inventions', 'method', 'produce', 'present', 'provide',
    'provided', 'provides', 'said', 'say', 'system', 'systems',
    'thereof', 'user', 'subject', 'subsequent',
]

print(stop_list)

In [None]:
#import libraries and prep to clean and tokenize raw text


# spaCy must be installed (see the spaCy documentation)
import spacy

# English language class from spaCy
from spacy.lang.en import English

# string constants (source of the punctuation characters below)
import string

# regular expressions, used to clean text
import re

# scikit-learn's built-in English stop words. Imported from the public
# `feature_extraction.text` module, which works on old and new releases;
# the private `feature_extraction.stop_words` module was deprecated and
# later removed from scikit-learn.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# list of the punctuation characters from the string module
punctuation = list(string.punctuation)

# spaCy English language object, used as the text parser
parser = English()

In [None]:
# Create a function to clean and tokenize raw text
def tokenizeText(text):
    """Clean raw text with regexes, lemmatize it and drop stop words.

    Relies on the module-level ``parser`` (spaCy language object) and ``stop``
    (set of stop words); both must be defined before this function is called.

    Args:
        text: Raw text string, e.g. the claims of one patent.

    Returns:
        list of str: the lemmas that are longer than one character and are
        not contained in ``stop``.
    """
    # Patterns that get replaced by a single space, applied in order.
    # Each entry is comma-separated so that '[^a-z ]' and '[\s]+' remain two
    # distinct patterns (adjacent string literals would be concatenated into
    # one), and regex patterns are raw strings so '\s', '\w' and '\d' are not
    # treated as invalid string escapes.
    separators = [
        "\xa0\xa0\xa0\xa0", "\r", "\n",
        "\t", "n't", "'m", "'ll", r'[^a-z ]',
        r'[\s]+', r'[^\w]', r'^\d+\s|\s\d+\s|\s\d+$',
    ]

    # Lowercase once up front: the substitutions only ever insert spaces.
    text = text.lower()
    for pattern in separators:
        text = re.sub(pattern, " ", text)

    # Parse with spaCy and keep the stripped lemma of every token
    lemmas = [tok.lemma_.strip() for tok in parser(text)]

    # Drop empty strings, single characters and stop words
    return [tok for tok in lemmas if len(tok) > 1 and tok not in stop]

In [None]:
# Turn one raw text into a single cleaned, space-separated string
def text_processing(corp):
    """Tokenize ``corp`` with ``tokenizeText`` and join the tokens with spaces.

    Args:
        corp: Raw text to clean.

    Returns:
        str: the cleaned tokens separated by single spaces.
    """
    cleaned_tokens = tokenizeText(corp)
    joined = ' '.join(cleaned_tokens)
    return joined

In [None]:
# Combined stop set: custom patent terms + scikit-learn English stop words + punctuation
stop = set(stop_list).union(ENGLISH_STOP_WORDS, punctuation)

In [None]:
# Clean the 'claim' column: text_processing is called once per row value and
# the string it returns replaces the original text.
df['claim'] = df['claim'].apply(text_processing)

In [None]:
# Apply the same cleaning to every row of the 'classifications' column
df['classifications'] = df['classifications'].apply(text_processing)

In [None]:
# Inspect the cleaned text: both columns sliced with [5:], separated by a blank line
print(df['claim'][5:], '', df['classifications'][5:], sep='\n')

In [None]:
# Trim leading/trailing whitespace from both cleaned text columns, just to be sure
for text_column in ('claim', 'classifications'):
    df[text_column] = df[text_column].str.strip()

In [None]:
# Reset the index; drop=True discards the old index instead of keeping it
# as a column.
df = df.reset_index(drop=True)
df.head()  # show the first rows after the reset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# TF-IDF weighting of the cleaned claims text
tfidf_vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.9,
                                   max_features=5000, stop_words='english')

# Learn the vocabulary and build the document-term matrix in one step
tfidf_matrix = tfidf_vectorizer.fit_transform(df['claim'])
type(tfidf_matrix)

In [None]:
#https://medium.com/@dmitriy.kavyazin/principal-component-analysis-and-k-means-clustering-to-visualize-a-high-dimensional-dataset-577b2a7a5fe2
from sklearn.decomposition import PCA

# Reduce the TF-IDF matrix to 20 principal components.
# The sparse matrix is densified with .toarray(), which returns a plain
# ndarray; .todense() returns an np.matrix, which recent scikit-learn
# versions refuse as estimator input.
pca = PCA(n_components=20)
principalComponents = pca.fit_transform(tfidf_matrix.toarray())

# One row per document, one integer-labelled column per component
PCA_components = pd.DataFrame(principalComponents)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the explained-variance ratio of each fitted principal component
features = range(pca.n_components_)  # one x position per component
plt.bar(features, pca.explained_variance_ratio_, color='blue')
plt.xlabel('PCA features')
plt.ylabel('variance %')  # NOTE(review): the plotted values are ratios, not percentages
plt.xticks(features);

In [None]:
#run elbow analysis
# Fits MiniBatchKMeans for k = 1..9 on the first three PCA components and
# plots the inertia of each fit so the "elbow" can be read off the curve.
from sklearn.cluster import MiniBatchKMeans

ks = range(1, 10)
inertias = []
for k in ks:
    # Create a KMeans instance with k clusters: model
    # NOTE(review): no random_state is passed here, so the curve may differ
    # between runs.
    model = MiniBatchKMeans(n_clusters=k)

    # Fit model to samples (only the first three PCA components are used)
    model.fit(PCA_components.iloc[:,:3])

    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)

# NOTE: after the loop `model` stays bound to the last fit (k=9); the later
# "calculate centers" cell reads `model.cluster_centers_`.
plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.title('Elbow Curve - CRISPR Patent Research')

# Save the figure to disk before showing it
plt.savefig('PatentViz-CRISPR-Elbow.png')
plt.show()

In [None]:
#use elbow analysis to determine what to set n_clusters to
import numpy as np

# Final clustering: 4 clusters fitted on the full TF-IDF matrix (the elbow
# loop used the first three PCA components instead). fit_predict returns the
# cluster labels for the rows of tfidf_matrix; random_state is fixed at 20.
clusters = MiniBatchKMeans(n_clusters=4, init_size=1024, batch_size=2048, random_state=20).fit_predict(tfidf_matrix)

In [None]:
def get_top_keywords(data, clusters, labels, n_terms):
    """Print the ``n_terms`` highest-weighted terms of every cluster.

    Args:
        data: Sparse document-term matrix (must provide ``toarray()``).
        clusters: Cluster label for each row of ``data``.
        labels: Sequence mapping a column index of ``data`` to its term.
        n_terms: Number of terms to print per cluster.

    Returns:
        None. For each cluster a header line and one comma-joined line of
        terms is printed; terms are in ascending order of mean weight, so
        the strongest term comes last.
    """
    # Mean weight of every term within each cluster (one row per cluster label)
    cluster_means = pd.DataFrame(data.toarray()).groupby(clusters).mean()

    for cluster_id, mean_weights in cluster_means.iterrows():
        ranked = np.argsort(mean_weights.to_numpy())
        # Slice from an explicit start index: a plain [-n_terms:] would return
        # every term when n_terms is 0.
        top_terms = ranked[max(len(ranked) - n_terms, 0):]
        print('\nCluster {}'.format(cluster_id))
        print(','.join(labels[t] for t in top_terms))

In [None]:
#return clusters and top words
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

get_top_keywords(tfidf_matrix, clusters, feature_names, 4)

In [None]:
#calculate centers
# NOTE(review): `model` is whatever MiniBatchKMeans instance was last bound to
# that name. When the cells run in order this is the final elbow-loop fit
# (n_clusters=9, fitted on the first three PCA components), not the 4-cluster
# fit that produced `clusters` -- confirm this is the intended source.
centers = np.array(model.cluster_centers_)
#https://matplotlib.org/gallery/shapes_and_collections/scatter.html

In [None]:
#3d plot clusters
import plotly.express as px
import plotly.graph_objects as go

# Locate every cluster in the plotted space: the mean of its members' first
# three PCA components. groupby sorts the labels, so row i belongs to cluster
# label i and the markers/annotations below line up with `clusters` (the
# labels that also colour the points).
cluster_centers = PCA_components.iloc[:, :3].groupby(clusters).mean().to_numpy()

fig = px.scatter_3d(x=PCA_components[0], y=PCA_components[1], z=PCA_components[2],
                    color=clusters, hover_name=clusters,
                    color_continuous_scale=["tomato", "forestgreen", "cornflowerblue", "gold"]
                    )

fig.update_layout(
    height=600,
    title_text='PCA Cluster Plot - CRISPR Patent Search 3D')

# One black marker per cluster center. The mode must include 'markers',
# otherwise marker_color/marker_size are ignored and a line is drawn instead.
fig.add_trace(go.Scatter3d(
    x=cluster_centers[:, 0],
    y=cluster_centers[:, 1],
    z=cluster_centers[:, 2],
    name='Cluster Center',
    marker_color="black",
    marker_size=16,
    mode='markers'
))

fig.update_traces(textfont_size=12)


#update legend bar title
fig.update_layout(coloraxis_colorbar=dict(
    title="Cluster"))


# Keyword label for each cluster, anchored at that cluster's center.
# NOTE(review): the keyword strings are hard-coded per cluster label
# (presumably copied from the get_top_keywords output) -- re-check them
# whenever the clustering changes.
arrow_style = dict(textangle=0, arrowcolor="black", arrowsize=3,
                   arrowwidth=1, arrowhead=1)
annotation_specs = [
    # (cluster label, keywords, placement/arrow options)
    (0, "progeny,seed,plant,soybean", dict(ax=0, ay=-75, **arrow_style)),
    (1, "acid,sequence,seq,pron", dict(ax=-50, ay=-75, **arrow_style)),
    (2, "maize,variety,seed,plant", dict(ax=20, ay=-15, arrowhead=1,
                                         xanchor="left", yanchor="bottom")),
    (3, "gene,target,sequence,cell", dict(ax=45, ay=-75, **arrow_style)),
]
cluster_annotations = [
    dict(
        x=cluster_centers[label][0],
        y=cluster_centers[label][1],
        z=cluster_centers[label][2],
        text=keywords,
        font=dict(color="black", size=12),
        **placement
    )
    for label, keywords, placement in annotation_specs
]

fig.update_layout(
    scene=go.layout.Scene(
        aspectratio=dict(x=1, y=1, z=1),
        annotations=cluster_annotations
    ),
)


fig.update_layout(scene = dict(
                    xaxis_title='PCA 1',
                    yaxis_title='PCA 2',
                    zaxis_title='PCA 3'))

fig.show();


Alternative ways to visualize the same data 

In [None]:
# Plain black matplotlib scatter of the first two principal components,
# drawn with alpha=.1.
plt.scatter(PCA_components[0], PCA_components[1], alpha=.1, color='black')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('PCA Cluster Plot - CRISPR Patent Search B/W')
# NOTE(review): no file extension is given -- presumably matplotlib falls back to its default output format
plt.savefig('PatentViz-CRISPR-bw');

In [None]:
# 2D plot of the clusters on the first two principal components
import plotly.express as px
import plotly.graph_objects as go

fig = px.scatter(
    x=PCA_components[0],
    y=PCA_components[1],
    color=clusters,
    hover_name=clusters,
    color_continuous_scale=["tomato", "forestgreen", "cornflowerblue", "gold"],
)

# Figure height, title and the legend (colour bar) title in one layout update
fig.update_layout(
    height=600,
    title_text='PCA Cluster Plot - CRISPR Patent Search 2D',
    coloraxis_colorbar=dict(title="Cluster"),
)

fig.show()


Next Project - Deploying graphs to production with Dash.

References: 

https://plot.ly/python/

https://medium.com/plotly/introducing-plotly-express-808df010143d

https://plot.ly/python/text-and-annotations/

https://medium.com/@dmitriy.kavyazin/principal-component-analysis-and-k-means-clustering-to-visualize-a-high-dimensional-dataset-577b2a7a5fe2

https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html

https://plot.ly/python/v3/ipython-notebooks/color-scales/

https://community.plot.ly/t/plotly-colours-list/11730/3
