In [1]:
import re
import numpy as np
import pandas as pd

from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#### Loading content

In [2]:
# Read the patents table from CSV and display its (rows, columns) shape.
patents = pd.read_csv('patents_data.csv')
patents.shape

(50, 6)

In [3]:
# Preview the first rows of the patents table.
patents.head()

Unnamed: 0,idx,title_raw,text_raw,title,text,content
0,5969222,\n\n Corn inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5153bm3 is di...,corn inbred lines for dairy cattle feed an inb...
1,6114609,\n\n Corn inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5253bm3 is di...,corn inbred lines for dairy cattle feed an inb...
2,5859353,\n\n Corn Inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5252bm3 is di...,corn inbred lines for dairy cattle feed an inb...
3,5714670,\n\n Soybeans having low li...,\n A novel soybean seed and plant d...,soybeans having low linolenic acid and low pal...,a novel soybean seed and plant designated ax77...,soybeans having low linolenic acid and low pal...
4,5763745,\n\n Soybeans having low li...,\n Methods are described for the pr...,soybeans having low linolenic acid content and...,methods are described for the production of so...,soybeans having low linolenic acid content and...


In [4]:
# Show the complete 'content' string of the first patent.
patents['content'].tolist()[0]

'corn inbred lines for dairy cattle feed an inbred corn line designated ar5153bm3 is disclosed. ar5153bm3 is useful for producing f1 hybrid corn seed and plants therefrom. such corn plants exhibit the brown midrib phenotype and yield a silage that, when fed to dairy cattle, increases milk production.'

#### Loading dictionary

In [5]:
# TODO: drop the first column when the dictionary is generated, so it need not be stripped here.

# Load the dictionary and keep every column except the first one.
dictionary = pd.read_csv('dictionary.csv').iloc[:, 1:]
dictionary.shape

(954, 3)

In [6]:
# Preview the first rows of the dictionary table.
dictionary.head()

Unnamed: 0,topic,term,feature
0,composition_cyanamide_use,composition,composition
1,composition_cyanamide_use,composition,composing
2,composition_cyanamide_use,composition,constitution
3,composition_cyanamide_use,composition,physical_composition
4,composition_cyanamide_use,composition,makeup


In [7]:
# TODO: move this stemming step into the dictionary-generation stage.

# Add a 'feature_stem' column holding the Porter stem of each dictionary feature.
porter = PorterStemmer()
dictionary['feature_stem'] = dictionary['feature'].map(porter.stem)
dictionary.head()

Unnamed: 0,topic,term,feature,feature_stem
0,composition_cyanamide_use,composition,composition,composit
1,composition_cyanamide_use,composition,composing,compos
2,composition_cyanamide_use,composition,constitution,constitut
3,composition_cyanamide_use,composition,physical_composition,physical_composit
4,composition_cyanamide_use,composition,makeup,makeup


In [8]:
def preprocess_text(word_list, stop_words=None, stemmer=None):
    """Filter and stem a list of tokens.

    A token is kept only if it is purely alphabetic (``str.isalpha``) and is
    not a stop word; every kept token is replaced by its stem. Token order is
    preserved and repeated tokens are not collapsed.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to process.
    stop_words : iterable of str, optional
        Words to discard. Defaults to NLTK's English stop-word list.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to each kept token. Defaults to the notebook-level
        ``porter`` instance.

    Returns
    -------
    list of str
        Stems of the tokens that passed both filters.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once per call: re-reading the stop-word list for every
    # token (as the loop used to do) dominates the running time.
    stop_words = frozenset(stop_words)

    if stemmer is None:
        stemmer = porter

    return [
        stemmer.stem(word)
        for word in word_list
        if word.isalpha() and word not in stop_words
    ]

In [9]:
# Tokenise and normalise every patent, building two parallel lists:
# idx_[i] is the id of the patent that produced the stemmed token tokens_[i].
idx_    = []
tokens_ = []
for idx, content in zip(patents['idx'].tolist(), patents['content'].tolist()):
    doc_tokens = preprocess_text(tokenize.word_tokenize(content))
    # Extend in place: `list = list + other` copies the whole accumulated list
    # on every iteration, which makes the loop quadratic in the token count.
    idx_.extend([idx] * len(doc_tokens))
    tokens_.extend(doc_tokens)

# One row per distinct (patent id, stemmed token) pair.
df_idx_tokens = pd.DataFrame({'idx': idx_, 'token': tokens_}, columns = ['idx', 'token'])
df_idx_tokens = df_idx_tokens.drop_duplicates()

In [17]:
# Keep only the patent tokens whose stem equals a stemmed dictionary feature,
# attaching the dictionary columns (including 'topic') to each match.
df_tokens_dictionary = pd.merge(df_idx_tokens, dictionary,
                                how = 'inner',
                                left_on = 'token',
                                right_on= 'feature_stem')
df_tokens_dictionary = df_tokens_dictionary.drop(['term', 'feature', 'feature_stem'], axis = 1)
# De-duplicate AFTER removing the dictionary detail columns: several features of
# one topic can share a stem, and those rows only become identical once the
# columns that distinguish them are gone.
df_tokens_dictionary = df_tokens_dictionary.drop_duplicates()
df_tokens_dictionary.shape

(2118, 3)

In [18]:
# Preview the first token-to-topic matches.
df_tokens_dictionary.head()

Unnamed: 0,idx,token,topic
0,5969222,corn,plant_inbreed_corn
1,5969222,corn,corn_plant_seed
2,5969222,corn,inbreed_plant_corn
3,5969222,corn,corn_plant_group
4,5969222,corn,plant_corn_inbreed


In [25]:
# Distinct patent ids present in the token/dictionary matches, in order of first appearance.
idx_unique = df_tokens_dictionary['idx'].unique().tolist()

In [44]:
# Collapse the token-level matches to one row per patent: the patent's distinct
# topics, in order of first appearance, joined into a single '; '-separated string.
# A single groupby pass replaces re-filtering the whole frame once per patent id;
# sort=False keeps the patents in the order in which they first appear.
df_idx_topic = (
    df_tokens_dictionary
    .groupby('idx', sort=False)['topic']
    .agg(lambda topics: '; '.join(topics.unique()))
    .reset_index()
)
df_idx_topic.shape

(50, 2)

In [45]:
# Preview the per-patent topic strings.
df_idx_topic.head()

Unnamed: 0,idx,topic
0,5969222,plant_inbreed_corn; corn_plant_seed; inbreed_p...
1,6114609,plant_inbreed_corn; corn_plant_seed; inbreed_p...
2,5859353,plant_inbreed_corn; corn_plant_seed; inbreed_p...
3,4069614,plant_inbreed_corn; corn_plant_seed; inbreed_p...
4,5706603,plant_inbreed_corn; corn_plant_seed; inbreed_p...


In [47]:
# Join the topics onto the patents (inner join on the patent id 'idx').
df_patents = (
    patents
    .merge(df_idx_topic, on='idx', how='inner')
    .drop_duplicates()
)
df_patents.shape

(50, 7)

In [48]:
# Preview the patents table with the joined 'topic' column.
df_patents.head()

Unnamed: 0,idx,title_raw,text_raw,title,text,content,topic
0,5969222,\n\n Corn inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5153bm3 is di...,corn inbred lines for dairy cattle feed an inb...,plant_inbreed_corn; corn_plant_seed; inbreed_p...
1,6114609,\n\n Corn inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5253bm3 is di...,corn inbred lines for dairy cattle feed an inb...,plant_inbreed_corn; corn_plant_seed; inbreed_p...
2,5859353,\n\n Corn Inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5252bm3 is di...,corn inbred lines for dairy cattle feed an inb...,plant_inbreed_corn; corn_plant_seed; inbreed_p...
3,5714670,\n\n Soybeans having low li...,\n A novel soybean seed and plant d...,soybeans having low linolenic acid and low pal...,a novel soybean seed and plant designated ax77...,soybeans having low linolenic acid and low pal...,composition_cyanamide_use; plant_invention_pot...
4,5763745,\n\n Soybeans having low li...,\n Methods are described for the pr...,soybeans having low linolenic acid content and...,methods are described for the production of so...,soybeans having low linolenic acid content and...,acid_rice_content; soybean_content_acid; acid_...


In [49]:
# Rename the 'topic' column to '_topic_'.
df_patents = df_patents.rename(columns={'topic': '_topic_'})
df_patents.head()

Unnamed: 0,idx,title_raw,text_raw,title,text,content,_topic_
0,5969222,\n\n Corn inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5153bm3 is di...,corn inbred lines for dairy cattle feed an inb...,plant_inbreed_corn; corn_plant_seed; inbreed_p...
1,6114609,\n\n Corn inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5253bm3 is di...,corn inbred lines for dairy cattle feed an inb...,plant_inbreed_corn; corn_plant_seed; inbreed_p...
2,5859353,\n\n Corn Inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5252bm3 is di...,corn inbred lines for dairy cattle feed an inb...,plant_inbreed_corn; corn_plant_seed; inbreed_p...
3,5714670,\n\n Soybeans having low li...,\n A novel soybean seed and plant d...,soybeans having low linolenic acid and low pal...,a novel soybean seed and plant designated ax77...,soybeans having low linolenic acid and low pal...,composition_cyanamide_use; plant_invention_pot...
4,5763745,\n\n Soybeans having low li...,\n Methods are described for the pr...,soybeans having low linolenic acid content and...,methods are described for the production of so...,soybeans having low linolenic acid content and...,acid_rice_content; soybean_content_acid; acid_...


In [52]:
# Write the final patents table to disk; index=False leaves the row index out of the file.
df_patents.to_csv('database.csv', index = False)