In [1]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# import xgboost as xgb

from sklearn.feature_selection import RFE
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords



In [2]:
database = pd.read_csv('database.csv')
database.head()

Unnamed: 0,idx,title_raw,text_raw,title,text,content,_topic_
0,5969222,\n\n Corn inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5153bm3 is di...,corn inbred lines for dairy cattle feed an inb...,plant_inbreed_corn; corn_plant_seed; inbreed_p...
1,6114609,\n\n Corn inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5253bm3 is di...,corn inbred lines for dairy cattle feed an inb...,plant_inbreed_corn; corn_plant_seed; inbreed_p...
2,5859353,\n\n Corn Inbred lines for ...,\n An inbred corn line designated A...,corn inbred lines for dairy cattle feed,an inbred corn line designated ar5252bm3 is di...,corn inbred lines for dairy cattle feed an inb...,plant_inbreed_corn; corn_plant_seed; inbreed_p...
3,5714670,\n\n Soybeans having low li...,\n A novel soybean seed and plant d...,soybeans having low linolenic acid and low pal...,a novel soybean seed and plant designated ax77...,soybeans having low linolenic acid and low pal...,composition_cyanamide_use; plant_invention_pot...
4,5763745,\n\n Soybeans having low li...,\n Methods are described for the pr...,soybeans having low linolenic acid content and...,methods are described for the production of so...,soybeans having low linolenic acid content and...,acid_rice_content; soybean_content_acid; acid_...


In [3]:
database_train = database[database['_topic_'].notna()]
# database_ = database[not database['_topic_'].isna()]

In [4]:
vectorizer = TfidfVectorizer()
doc_vec = vectorizer.fit_transform(database_train['content'])

In [5]:
df_doc_vec = pd.DataFrame(doc_vec.toarray(), columns = vectorizer.get_feature_names())
df_doc_vec.shape

(50, 884)

In [6]:
df_doc_vec.head()

Unnamed: 0,01dhd10,09dsq1,10,102,108,11,110,13,14,15,...,would,wqds2,xanthomonas,yield,yielding,yields,zea,zn,zone,μmol
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.127243,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.127243,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.127243,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.039744,0.0,0.039744,0.036761,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
X = df_doc_vec
y = database_train['_topic_'].to_numpy()

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [52]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(37, 884) (13, 884) (37,) (13,)


In [53]:
# teste com random forest

In [54]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.6153846153846154

In [55]:
rf_scores = cross_val_score(rf, X, y, cv=10)
print(rf_scores)
print(np.mean(rf_scores))



[0.6 0.6 0.4 0.2 0.2 0.2 0.4 0.2 0.4 0.4]
0.36


In [56]:
# teste com Naive Bayes

In [57]:
nb = GaussianNB()
nb.fit(X_train, y_train)
nb.score(X_test, y_test)

0.6153846153846154

In [58]:
nb_scores = cross_val_score(nb, X, y, cv=10)
print(nb_scores)
print(np.mean(nb_scores))

[0.6 0.6 0.4 0.2 0.2 0.2 0.4 0.2 0.4 0.4]
0.36




In [59]:
# teste com SVM

In [60]:
svm = SVC(C=15, random_state=150, probability=True)
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.6153846153846154

In [61]:
svm_scores = cross_val_score(svm, X, y, cv=10)
print(svm_scores)
print(np.mean(svm_scores))



[0.6 0.6 0.4 0.2 0.2 0.2 0.4 0.2 0.4 0.4]
0.36


In [62]:
# Removendo colunas que sejam stopwords

In [63]:
column_names = df_doc_vec.columns.tolist()

In [64]:
keep = []
for column_name in column_names:
    keep.append(column_name not in stopwords.words('english'))
print(len(keep), sum(keep))

884 825


In [65]:
df_doc_vec_filtered = df_doc_vec[df_doc_vec.columns[keep]]

In [26]:
X1 = df_doc_vec_filtered
X1.shape

(50, 825)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=42)

In [77]:
# teste com random forest

In [78]:
rf = RandomForestClassifier(random_state=185)

In [79]:
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.38461538461538464

In [80]:
rf_scores = cross_val_score(rf, X1, y, cv=10)
print(rf_scores)
print(np.mean(rf_scores))



[0.6 0.6 0.4 0.2 0.2 0.2 0.4 0.2 0.4 0.4]
0.36


In [32]:
# Removendo caracteristicas

In [81]:
model = RandomForestClassifier(n_estimators=100)

In [34]:
rfe = RFE(model, n_features_to_select=5)
rfe.fit(X1, y)

RFE(estimator=RandomForestClassifier(), n_features_to_select=5)

In [82]:
X1.columns[rfe.support_]

Index(['invention', 'plant', 'plants', 'provided', 'seed'], dtype='object')

In [83]:
X1_rfe = X1[X1.columns[rfe.support_]]
X1_rfe.shape

(50, 5)

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X1_rfe, y, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(37, 5) (13, 5) (37,) (13,)


In [38]:
# teste com random forest

In [39]:
rf = RandomForestClassifier(random_state=185)

In [40]:
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.29411764705882354

In [41]:
rf_scores = cross_val_score(rf, X1_rfe, y, cv=10)
print(rf_scores)
print(np.mean(rf_scores))



[0.6 0.6 0.4 0.2 0.2 0.2 0.4 0.2 0.4 0.4]
0.36
