In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
import nltk

import warnings
warnings.filterwarnings("ignore")

In [156]:
def build_nltk_pos_tags(df):
    """Add whitespace tokens and NLTK part-of-speech tags for each title.

    The frame is modified in place and the same object is returned.
    Three columns are written:

    * ``title``    - the existing column cast with ``astype(str)``
    * ``tokens``   - ``str.split()`` applied to every title
    * ``nltk_pos`` - ``nltk.pos_tag`` applied to every token list
    """
    # Cast to str before splitting.
    titles = df['title'].astype(str)
    df['title'] = titles

    # Whitespace tokenisation, one list of tokens per row.
    token_lists = titles.map(lambda title: title.split())
    df['tokens'] = token_lists

    # Part-of-speech tags for each token list.
    df['nltk_pos'] = token_lists.map(lambda toks: nltk.pos_tag(toks))
    return df

# NOTE(review): in file order this cell runs before `df` is created (the
# read_csv cell comes later), and the loaded frame shows a `patent_title`
# column rather than `title` - confirm the intended execution order and
# column name before relying on this cell after a kernel restart.
df = build_nltk_pos_tags(df)
df.head()

In [5]:
# Load the tab-separated patent table from the relative data directory.
df = pd.read_csv('../data/g_patent.tsv', sep='\t')
# Preview the first rows.
df.head()

Unnamed: 0,patent_id,patent_type,patent_date,patent_title,patent_abstract,wipo_kind,num_claims,withdrawn,filename
0,10000000,utility,2018-06-19,Coherent LADAR using intra-pixel quadrature de...,A frequency modulated (coherent) laser detecti...,B2,20,0,ipg180619.xml
1,10000001,utility,2018-06-19,Injection molding machine and mold thickness c...,The injection molding machine includes a fixed...,B2,12,0,ipg180619.xml
2,10000002,utility,2018-06-19,Method for manufacturing polymer film and co-e...,The present invention relates to: a method for...,B2,9,0,ipg180619.xml
3,10000003,utility,2018-06-19,Method for producing a container from a thermo...,The invention relates to a method for producin...,B2,18,0,ipg180619.xml
4,10000004,utility,2018-06-19,"Process of obtaining a double-oriented film, c...",The present invention relates to provides a do...,B2,6,0,ipg180619.xml


In [6]:
# Keep only the columns used downstream. `.copy()` makes `df` an independent
# frame, so later column assignments (such as the `patent_date` conversion)
# do not write into a slice derived from the full table, which is what
# triggers pandas' SettingWithCopyWarning.
df = df[['patent_id', 'patent_date', 'patent_title']].copy()

In [7]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,patent_id,patent_date,patent_title
0,10000000,2018-06-19,Coherent LADAR using intra-pixel quadrature de...
1,10000001,2018-06-19,Injection molding machine and mold thickness c...
2,10000002,2018-06-19,Method for manufacturing polymer film and co-e...
3,10000003,2018-06-19,Method for producing a container from a thermo...
4,10000004,2018-06-19,"Process of obtaining a double-oriented film, c..."


In [158]:
def get_ngrams(df, ngram_from=2, ngram_to=2, n=None, max_features=20000):
    """Count n-grams in ``df['title']`` and return them by descending frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``title`` column of strings.
    ngram_from, ngram_to : int
        Lower and upper n-gram size passed to ``CountVectorizer`` as
        ``ngram_range``.
    n : int or None
        Number of leading entries to return; ``None`` returns every entry.
    max_features : int
        Vocabulary size cap passed to ``CountVectorizer``.

    Returns
    -------
    list of (str, count)
        ``(ngram, total occurrences)`` pairs sorted by count, highest first.
    """
    additional_stop_words = ['below', 'having', 'over', 'employing', 'as well as', 'incorporating', 'containing', 'wherein', 'and/or', 'over', 'based on', 'comprising']
    # NOTE(review): the multi-word entries ('as well as', 'based on', 'and/or')
    # are presumably never matched, since stop words are compared with single
    # tokens - confirm against the CountVectorizer documentation.
    # A list is passed because recent scikit-learn releases validate
    # `stop_words` as 'english', a list or None.
    stop_words = list(text.ENGLISH_STOP_WORDS.union(additional_stop_words))

    # Fix: the title column itself used to be passed as `stop_words`, so the
    # stop-word set built above was never applied.
    vec = CountVectorizer(ngram_range=(ngram_from, ngram_to),
                          max_features=max_features,
                          stop_words=stop_words)
    # Fit and transform in one pass instead of analysing the corpus twice.
    bag_of_words = vec.fit_transform(df['title'])

    # Column sums give the corpus-wide count of each vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = sorted(
        ((word, sum_words[0, i]) for word, i in vec.vocabulary_.items()),
        key=lambda x: x[1],
        reverse=True,
    )

    # Slicing with n=None keeps the whole list.
    return words_freq[:n]

In [8]:
# Preview df again (same call as the In [7] cell).
df.head()

Unnamed: 0,patent_id,patent_date,patent_title
0,10000000,2018-06-19,Coherent LADAR using intra-pixel quadrature de...
1,10000001,2018-06-19,Injection molding machine and mold thickness c...
2,10000002,2018-06-19,Method for manufacturing polymer film and co-e...
3,10000003,2018-06-19,Method for producing a container from a thermo...
4,10000004,2018-06-19,"Process of obtaining a double-oriented film, c..."


In [160]:
# import spacy
# nlp = spacy.load("en_core_web_sm")

# df['date'] = pd.to_datetime(df['date'])

# def get_ncs(title):
#     doc = nlp(title)
#     return [chunk.text for chunk in doc.noun_chunks]
    
# def get_ngrams_by_year(df, year, ngram_from=2, ngram_to=3, n=None, max_features=20000):
#     # get n-grams based on year of df
#     df['date'] = pd.to_datetime(df['date'])
#     df_year = df[df['date'].dt.year == year][['patent_number', 'date', 'title', 'title_singular']]
#     if len(df_year) > 0:
#         additional_stop_words = ['below', 'having', 'over', 'employing', 'as well as', 'incorporating', 'containing', 'wherein', 'and/or', 'over', 'method', 'based on', 'comprising', 'machine', 'apparatus', 'device', 'mechanism', 'process', 'making', 'attachment', 'thereof']
#         stop_words = text.ENGLISH_STOP_WORDS.union(additional_stop_words)
#         vec = CountVectorizer(ngram_range = (ngram_from, ngram_to), 
#                             max_features = max_features, 
#                             stop_words=stop_words).fit(df_year['title_singular'])
#         bag_of_words = vec.transform(df_year['title_singular'])
#         sum_words = bag_of_words.sum(axis = 0)
#         # words_freq = [(word, sum_words[0, i]) for word, i in vec.vocabulary_.items()]
#         # words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)

#         df_year['noun_chunks'] = df_year['title_singular'].apply(get_ncs)
#         df_year['noun_chunks_freq'] =  df_year['noun_chunks'].apply(lambda ncs: sorted([(nc, sum_words[0, vec.vocabulary_[nc]]) for nc in ncs if nc in vec.vocabulary_], key = lambda x: x[1], reverse = True))
#         return df_year
#     return list()

In [9]:
import spacy
# Load spaCy's "en_core_web_sm" pipeline; get_ncs calls `nlp` to extract noun chunks.
nlp = spacy.load("en_core_web_sm")

# Parse the date column so the `.dt` accessor can be used on it later.
df['patent_date'] = pd.to_datetime(df['patent_date'])

def get_ncs(title):
    """Return the text of each noun chunk found in ``title``.

    ``title`` is passed through the module-level ``nlp`` pipeline and the
    ``.text`` of every item in ``doc.noun_chunks`` is collected, in the
    order the chunks are yielded.
    """
    chunk_texts = []
    for span in nlp(title).noun_chunks:
        chunk_texts.append(span.text)
    return chunk_texts
    
def get_ngrams_by_year(df, year, ngram_from=2, ngram_to=3, n=None, max_features=20000):
    """Annotate one year's patents with noun chunks and their n-gram counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``patent_date`` and ``patent_title`` columns. It is not
        modified.
    year : int
        Calendar year to select, compared against ``patent_date``'s year.
    ngram_from, ngram_to : int
        Lower and upper n-gram size passed to ``CountVectorizer``.
    n : int or None
        Accepted for signature compatibility; not used by the body.
    max_features : int
        Vocabulary size cap passed to ``CountVectorizer``.

    Returns
    -------
    pandas.DataFrame or list
        A copy of the rows for ``year`` (with ``patent_date`` parsed as
        datetimes) plus two columns: ``noun_chunks`` (output of ``get_ncs``)
        and ``noun_chunks_freq`` (``(chunk, count)`` pairs for chunks present
        in the year's n-gram vocabulary, highest count first). An empty list
        is returned when no row falls in ``year``.
    """
    # Parse into a local Series. Assigning the parsed dates back into `df`
    # (as before) mutated the caller's frame, which is often a filtered slice.
    patent_dates = pd.to_datetime(df['patent_date'])
    in_year = patent_dates.dt.year == year
    if not in_year.any():
        return list()

    # Explicit copy: new columns are added below and must not be written
    # into a view of the caller's data.
    df_year = df.loc[in_year].copy()
    df_year['patent_date'] = patent_dates.loc[in_year]

    additional_stop_words = ['below', 'having', 'over', 'employing', 'as well as', 'incorporating', 'containing', 'wherein', 'and/or', 'over', 'method', 'based on', 'comprising', 'machine', 'apparatus', 'device', 'mechanism', 'process', 'making', 'attachment', 'thereof']
    # A list is passed because recent scikit-learn releases validate
    # `stop_words` as 'english', a list or None (a frozenset is rejected).
    stop_words = list(text.ENGLISH_STOP_WORDS.union(additional_stop_words))
    vec = CountVectorizer(ngram_range=(ngram_from, ngram_to),
                          max_features=max_features,
                          stop_words=stop_words)
    # Fit and transform in one pass instead of analysing the titles twice.
    bag_of_words = vec.fit_transform(df_year['patent_title'])
    # Column sums: total occurrences of each vocabulary entry within the year.
    sum_words = bag_of_words.sum(axis=0)
    vocabulary = vec.vocabulary_

    def _chunk_counts(chunks):
        """Pair each chunk found in the vocabulary with its yearly count."""
        # CountVectorizer lowercases text by default, so the lookup key must
        # be lowercased too; otherwise capitalised chunks can never match.
        # The chunk is reported with its original casing.
        counted = [
            (nc, sum_words[0, vocabulary[nc.lower()]])
            for nc in chunks
            if nc.lower() in vocabulary
        ]
        return sorted(counted, key=lambda pair: pair[1], reverse=True)

    df_year['noun_chunks'] = df_year['patent_title'].apply(get_ncs)
    df_year['noun_chunks_freq'] = df_year['noun_chunks'].apply(_chunk_counts)
    return df_year

In [14]:
from tqdm import tqdm

# df_dict = {}
# for year in tqdm(list(df['date'].dt.year.unique())):
#     df_year = get_ngrams_by_year(df[~df['title_singular'].isna()], year, ngram_from=2, ngram_to=3)
#     # if len(year_word_freq) > 0:
#     # year_words = [w[:-1] if (w.endswith('s') and (nltk.pos_tag(w.split())[-1][-1] == 'NNS')) else w for w, _ in year_word_freq ]
#     # df_year = pd.DataFrame(list(set(year_words)))
#     # df_dict[year] = df_year[0]
#     df_year.to_csv(f'../data/by_year/ngrams_by_year_{year}')


# NOTE(review): nothing in this loop adds entries to df_dict, so cells that
# build df_years from it get an empty frame unless it is filled elsewhere.
df_dict = {}

# Drop untitled rows once, instead of re-filtering the full frame for every year.
titled_patents = df[~df['patent_title'].isna()]

# NOTE(review): `[1:]` skips the first year returned by unique(); presumably
# that year was processed separately - confirm this is intended.
for year in tqdm(list(df['patent_date'].dt.year.unique())[1:]):
    df_year = get_ngrams_by_year(titled_patents, year, ngram_from=2, ngram_to=3)
    # get_ngrams_by_year returns an empty list when no row matches the year;
    # a list has no .to_csv, so skip such years instead of crashing mid-run.
    if len(df_year) == 0:
        continue
    # if len(year_word_freq) > 0:
    # year_words = [w[:-1] if (w.endswith('s') and (nltk.pos_tag(w.split())[-1][-1] == 'NNS')) else w for w, _ in year_word_freq ]
    # df_year = pd.DataFrame(list(set(year_words)))
    # df_dict[year] = df_year[0]
    df_year.to_csv(f'../data/by_year/ngrams_by_year_{year}')

100%|██████████| 46/46 [16:39:51<00:00, 1304.17s/it]    


In [11]:
# Show df_year (pandas abbreviates long frames in the rendered output).
df_year

Unnamed: 0,patent_id,patent_date,patent_title,noun_chunks,noun_chunks_freq
0,10000000,2018-06-19,Coherent LADAR using intra-pixel quadrature de...,"[Coherent LADAR, intra-pixel quadrature detect...",[]
1,10000001,2018-06-19,Injection molding machine and mold thickness c...,"[Injection molding machine, mold thickness con...",[]
2,10000002,2018-06-19,Method for manufacturing polymer film and co-e...,"[Method, polymer film, co-extruded film]","[(polymer film, 31)]"
3,10000003,2018-06-19,Method for producing a container from a thermo...,"[Method, a container, a thermoplastic]",[]
4,10000004,2018-06-19,"Process of obtaining a double-oriented film, c...","[Process, a double-oriented film, low thicknes...",[]
...,...,...,...,...,...
8167340,RE47179,2018-12-25,"Polarization device, method of manufacturing t...","[Polarization device, method, the same, liquid...",[]
8167341,RE47180,2018-12-25,Apparatus and method for generating a bandwidt...,"[Apparatus, method, a bandwidth extended signal]",[]
8167342,RE47181,2018-12-25,Light emitting device,[Light emitting device],[]
8167343,RE47182,2018-12-25,Methods and apparatus for fast and energy-effi...,"[Methods, apparatus, fast and energy-efficient...",[]


In [91]:
# d = {}
# for k, v in df_dict.items():
#     l = []
#     for w in v:
#         if (w.endswith('s') and (nltk.pos_tag(w.split())[-1][-1] == 'NNS')):
#             l.append(w[:-1])
#         else:
#             l.append(w)
#     d[k] = l

In [92]:
# Build a frame with one row per key of df_dict (orient='index').
# NOTE(review): the visible per-year loop never adds entries to df_dict, so on
# a fresh run this frame would be empty - confirm where df_dict is populated.
df_years = pd.DataFrame.from_dict(df_dict, orient='index')
df_years.head(10)

In [102]:
# Persist df_years, writing its index as the first CSV column.
df_years.to_csv('../data/top_technologies_by_year_bi&trigrams_final.csv', index=True)

In [103]:
# Show the last five rows of df_years.
df_years.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1972,flame retardant,internal combustion engine,liquid crystal,solid state,combustion engine,gas turbine,cathode ray,control valve,motor vehicle,sheet material
1973,exhaust gas,internal combustion engine,liquid crystal,solid state,combustion engine,gas turbine,cathode ray,control valve,motor vehicle,sheet material
1974,internal combustion engine,liquid crystal,solid state,combustion engine,rotary engine,gas turbine,flow control,control valve,cathode ray,motor vehicle
1975,power transmission,internal combustion engine,solid state,combustion engine,gas turbine,flow control,control valve,cathode ray,disc brake,sheet material
1976,vinyl chloride,type recovery,smoke retardant vinyl,cathode ray,recovery tool,type recovery tool,polymer composition,chloride polymer,high temperature,window sash


In [96]:
# Show the last five rows of df_years (same call as the In [103] cell).
df_years.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1972,flame retardant,internal combustion engine,liquid crystal,solid state,combustion engine,gas turbine,cathode ray,control valve,motor vehicle,sheet material
1973,exhaust gas,internal combustion engine,liquid crystal,solid state,combustion engine,gas turbine,cathode ray,control valve,motor vehicle,sheet material
1974,internal combustion engine,liquid crystal,solid state,combustion engine,rotary engine,gas turbine,flow control,control valve,cathode ray,motor vehicle
1975,power transmission,internal combustion engine,solid state,combustion engine,gas turbine,flow control,control valve,cathode ray,disc brake,sheet material
1976,vinyl chloride,type recovery,smoke retardant vinyl,cathode ray,recovery tool,type recovery tool,polymer composition,chloride polymer,high temperature,window sash


In [55]:
# Number of rows in df_years.
len(df_years)

77

In [89]:
# Select the row at positional index 3 of df_years.
df_years.iloc[3]

0            rotary engine
1      alternating current
2            steam turbine
3              type writer
4              railway car
5             water heater
6          electric switch
7         electric railway
8           sewing machine
9    non refillable bottle
Name: 1903, dtype: object