In [1]:
import re
import numpy as np
import pandas as pd

from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#### Loading content

In [2]:
# Read the patents table from CSV; the trailing expression displays (rows, columns).
patents = pd.read_csv(filepath_or_buffer='patents_data.csv')
patents.shape

(300, 6)

In [3]:
# Preview the first five patent rows.
patents.head(n=5)

Unnamed: 0,idx,title_raw,text_raw,title,text,content
0,387659,\n\n SYSTEMS AND METHODS FO...,\n The present disclosure provides ...,systems and methods for adjusting the output o...,the present disclosure provides systems and me...,systems and methods for adjusting the output o...
1,10729058,\n\n Systems and methods fo...,\n The present disclosure provides ...,systems and methods for adjusting the output o...,the present disclosure provides systems and me...,systems and methods for adjusting the output o...
2,6745128,\n\n Methods and systems fo...,\n Methods and systems for characte...,methods and systems for managing farmland,methods and systems for characterizing and man...,methods and systems for managing farmland meth...
3,6549852,\n\n Methods and systems fo...,\n Methods and systems for characte...,methods and systems for managing farmland,methods and systems for characterizing and man...,methods and systems for managing farmland meth...
4,18431,\n\n METHODS AND SYSTEMS FO...,\n Methods and systems for characte...,methods and systems for managing farmland,methods and systems for characterizing and man...,methods and systems for managing farmland meth...


In [4]:
# Display the full 'content' value of the first patent (positional lookup).
patents['content'].iloc[0]

'systems and methods for adjusting the output of a field measurement system to conform to agronomy measurements the present disclosure provides systems and methods for adjusting the output of a field measurement system to conform to agronomy measurements. in particular, the present subject matter is directed to a calibration process and system that uses a calibration model to convert field measurement data expressed according to an automatic system metric into agronomy data that is expressed according to an agronomy metric.'

#### Loading dictionary

In [5]:
# TODO: remove the first column when the dictionary is created, in the dictionary-generation step.
# Until then, the leading column is discarded right after loading.
dictionary = pd.read_csv('dictionary.csv')
dictionary = dictionary.drop(columns=dictionary.columns[0])
dictionary.shape

(974, 3)

In [6]:
# Preview the first five dictionary rows.
dictionary.head(n=5)

Unnamed: 0,topic,term,feature
0,water_soil_plant,water,water
1,water_soil_plant,water,H2O
2,water_soil_plant,water,body_of_water
3,seed_plant_method,water,water
4,seed_plant_method,water,H2O


In [7]:
# TODO: move this into the dictionary-generation step.
# Adds a 'feature_stem' column holding the Porter stem of each dictionary
# feature; it is the join key used later against the stemmed patent tokens.
porter = PorterStemmer()
dictionary['feature_stem'] = dictionary['feature'].map(porter.stem)
dictionary.head()

Unnamed: 0,topic,term,feature,feature_stem
0,water_soil_plant,water,water,water
1,water_soil_plant,water,H2O,h2o
2,water_soil_plant,water,body_of_water,body_of_wat
3,seed_plant_method,water,water,water
4,seed_plant_method,water,H2O,h2o


In [8]:
def preprocess_text(word_list):
    """Filter and stem a list of tokens.

    A token is kept only if it is purely alphabetic (``str.isalpha``) and is
    not in NLTK's English stop-word list; every kept token is replaced by its
    Porter stem (using the notebook-level ``porter`` stemmer).

    Parameters
    ----------
    word_list : iterable of str
        Tokens of a single document, in order.

    Returns
    -------
    list of str
        Stems of the retained tokens, in their original order (duplicates kept).
    """
    # Build the stop-word set once per call: the original evaluated
    # stopwords.words('english') for every single token and then scanned the
    # resulting list linearly.
    english_stopwords = set(stopwords.words('english'))

    return [
        porter.stem(word)
        for word in word_list
        if word.isalpha() and word not in english_stopwords
    ]

In [9]:
# Build a long (idx, token) table: one row per distinct stemmed token per patent.
idx_    = []
tokens_ = []
idx_content = zip(patents['idx'].tolist(), patents['content'].tolist())
for idx, content in idx_content:
    doc_tokens = preprocess_text(tokenize.word_tokenize(content))
    # extend() grows the lists in place; `a = a + b` re-copied the whole
    # accumulated list on every document (quadratic in total token count).
    idx_.extend([idx] * len(doc_tokens))
    tokens_.extend(doc_tokens)

# A token repeated inside one document collapses to a single (idx, token) row.
df_idx_tokens = pd.DataFrame({'idx': idx_, 'token': tokens_}).drop_duplicates()

In [10]:
# Inner-join patent tokens to dictionary rows on the stemmed feature, so only
# tokens that occur in the dictionary survive; then keep idx / token / topic.
# NOTE(review): de-duplication runs before the column drop, so identical
# (idx, token, topic) rows may remain if they came from different term/feature
# entries. Confirm whether that is intended.
df_tokens_dictionary = (
    df_idx_tokens
    .merge(dictionary, how='inner', left_on='token', right_on='feature_stem')
    .drop_duplicates()
    .drop(columns=['term', 'feature', 'feature_stem'])
)
df_tokens_dictionary.shape

(10233, 3)

In [11]:
# Preview the first five token/topic matches.
df_tokens_dictionary.head(n=5)

Unnamed: 0,idx,token,topic
0,387659,system,method_crop_use
1,387659,system,plurality_equipment_datum
2,387659,system,level_feature_maintain
3,10729058,system,method_crop_use
4,10729058,system,plurality_equipment_datum


In [12]:
# Distinct patent ids that matched at least one dictionary entry, in first-seen order.
idx_unique = df_tokens_dictionary['idx'].drop_duplicates().tolist()

In [13]:
# Collapse the matches into one row per patent: its distinct topics, in
# first-seen order, joined with '; '.
# A single groupby pass replaces re-filtering the whole frame with a boolean
# mask once per patent id; sort=False keeps patents in first-seen order, which
# is the same order the per-id loop produced.
idx_topic = [
    [idx, '; '.join(matches['topic'].unique().tolist())]
    for idx, matches in df_tokens_dictionary.groupby('idx', sort=False)
]
df_idx_topic = pd.DataFrame(idx_topic, columns=['idx', 'topic'])
df_idx_topic.shape

(300, 2)

In [14]:
# Preview the first five patent-to-topics rows.
df_idx_topic.head(n=5)

Unnamed: 0,idx,topic
0,387659,method_crop_use; plurality_equipment_datum; le...
1,10729058,method_crop_use; plurality_equipment_datum; le...
2,6745128,method_crop_use; plurality_equipment_datum; le...
3,6549852,method_crop_use; plurality_equipment_datum; le...
4,18431,method_crop_use; plurality_equipment_datum; le...


In [15]:
# Joining the topics to the patents (inner join on 'idx', so a patent with no
# entry in df_idx_topic would be left out of the result).
df_patents = patents.merge(df_idx_topic, how='inner', on='idx').drop_duplicates()
df_patents.shape

(300, 7)

In [16]:
# Preview the first five patents with their topic string attached.
df_patents.head(n=5)

Unnamed: 0,idx,title_raw,text_raw,title,text,content,topic
0,387659,\n\n SYSTEMS AND METHODS FO...,\n The present disclosure provides ...,systems and methods for adjusting the output o...,the present disclosure provides systems and me...,systems and methods for adjusting the output o...,method_crop_use; plurality_equipment_datum; le...
1,10729058,\n\n Systems and methods fo...,\n The present disclosure provides ...,systems and methods for adjusting the output o...,the present disclosure provides systems and me...,systems and methods for adjusting the output o...,method_crop_use; plurality_equipment_datum; le...
2,6745128,\n\n Methods and systems fo...,\n Methods and systems for characte...,methods and systems for managing farmland,methods and systems for characterizing and man...,methods and systems for managing farmland meth...,method_crop_use; plurality_equipment_datum; le...
3,6549852,\n\n Methods and systems fo...,\n Methods and systems for characte...,methods and systems for managing farmland,methods and systems for characterizing and man...,methods and systems for managing farmland meth...,method_crop_use; plurality_equipment_datum; le...
4,18431,\n\n METHODS AND SYSTEMS FO...,\n Methods and systems for characte...,methods and systems for managing farmland,methods and systems for characterizing and man...,methods and systems for managing farmland meth...,method_crop_use; plurality_equipment_datum; le...


In [17]:
# Rename the topic column to '_topic_' before export.
df_patents = df_patents.rename(columns={'topic': '_topic_'})
df_patents.head(n=5)

Unnamed: 0,idx,title_raw,text_raw,title,text,content,_topic_
0,387659,\n\n SYSTEMS AND METHODS FO...,\n The present disclosure provides ...,systems and methods for adjusting the output o...,the present disclosure provides systems and me...,systems and methods for adjusting the output o...,method_crop_use; plurality_equipment_datum; le...
1,10729058,\n\n Systems and methods fo...,\n The present disclosure provides ...,systems and methods for adjusting the output o...,the present disclosure provides systems and me...,systems and methods for adjusting the output o...,method_crop_use; plurality_equipment_datum; le...
2,6745128,\n\n Methods and systems fo...,\n Methods and systems for characte...,methods and systems for managing farmland,methods and systems for characterizing and man...,methods and systems for managing farmland meth...,method_crop_use; plurality_equipment_datum; le...
3,6549852,\n\n Methods and systems fo...,\n Methods and systems for characte...,methods and systems for managing farmland,methods and systems for characterizing and man...,methods and systems for managing farmland meth...,method_crop_use; plurality_equipment_datum; le...
4,18431,\n\n METHODS AND SYSTEMS FO...,\n Methods and systems for characte...,methods and systems for managing farmland,methods and systems for characterizing and man...,methods and systems for managing farmland meth...,method_crop_use; plurality_equipment_datum; le...


In [18]:
# Write the enriched patents table to disk, without the DataFrame index column.
df_patents.to_csv(path_or_buf='database.csv', index=False)