In [4]:
import sys
import pandas

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import ensemble
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
# Se verfica entorno de ejecución
in_colab = "google.colab" in sys.modules

if in_colab:
    from google.colab import drive

    drive.mount("/content/drive")
    BASE_DIR = "/content/drive/My Drive/Diplo2020 Mentoria/"
else:
    BASE_DIR = "../"

In [6]:
train_data = BASE_DIR + "corpus3.csv"
dataset = pandas.read_csv(train_data)
dataset.head()

Unnamed: 0.1,Unnamed: 0,TEXTO,DOCUMENTO,TIPO
0,0,sala electoral comp.originaria tribunal superi...,../Datos/Electoral//A 001-2018 COOPI C MUN CAR...,AUTO
1,1,sala electoral comp.originaria tribunal superi...,../Datos/Electoral//A 002-2018 Denuncia R R RA...,AUTO
2,2,sala electoral comp.originaria tribunal superi...,../Datos/Electoral//A 003-2018 VEDIA FLORES Cr...,AUTO
3,3,sala electoral comp.originaria tribunal superi...,../Datos/Electoral//A 004-2018 FERNANDEZ Favio...,AUTO
4,4,sala electoral comp.originaria tribunal superi...,../Datos/Electoral//A 005-2018 ATANOR ADI ref.pdf,AUTO


In [7]:
dataset['TIPO'].value_counts()

AUTO         125
SENTENCIA     25
Name: TIPO, dtype: int64

Vectorizamos los textos

In [8]:
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(dataset['TEXTO'])

<150x19220 sparse matrix of type '<class 'numpy.float64'>'
	with 204569 stored elements in Compressed Sparse Row format>

In [9]:
# División entre instancias vectorizadas y etiquetas
X, y = vectorizer.transform(dataset["TEXTO"]), dataset["TIPO"]

In [10]:
# división entre entrenamiento y evaluación
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True)

Entrenamos con el modelo Random Forest

In [11]:
  model = ensemble.RandomForestClassifier(random_state=0)
  model.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [12]:
y_train_pred = model.predict(X_train)

In [13]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

        AUTO       0.99      1.00      0.99        86
   SENTENCIA       1.00      0.95      0.97        19

   micro avg       0.99      0.99      0.99       105
   macro avg       0.99      0.97      0.98       105
weighted avg       0.99      0.99      0.99       105



In [14]:
tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
print(f'Matriz de Confusión \n\tTP:{tp:{3}} \tFP:{fp:{3}} \n\tTN:{tn:{3}} \tFN:{fn:{3}}')

Matriz de Confusión 
	TP: 18 	FP:  0 
	TN: 86 	FN:  1


Valido el modelo

In [15]:
y_test_pred = model.predict(X_test)

In [16]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

        AUTO       0.95      1.00      0.97        39
   SENTENCIA       1.00      0.67      0.80         6

   micro avg       0.96      0.96      0.96        45
   macro avg       0.98      0.83      0.89        45
weighted avg       0.96      0.96      0.95        45



In [17]:
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
print(f'Matriz de Confusión \n\tTP:{tp:{3}} \tFP:{fp:{3}} \n\tTN:{tn:{3}} \tFN:{fn:{3}}')    

Matriz de Confusión 
	TP:  4 	FP:  0 
	TN: 39 	FN:  2


Ahora quiero predecir un documento nuevo... por ejemplo tomo el primer documento 

In [18]:
dataset[0:1].TIPO, dataset[0:1].TEXTO

(0    AUTO
 Name: TIPO, dtype: object,
 0    sala electoral comp.originaria tribunal superi...
 Name: TEXTO, dtype: object)

El texto ya vectorizado anteriormente es el siguiente:

In [19]:
X_train[0:1]

<1x19220 sparse matrix of type '<class 'numpy.float64'>'
	with 1029 stored elements in Compressed Sparse Row format>

Si predigo ese documento:

In [20]:
model.predict(X_train[0:1])

array(['AUTO'], dtype=object)

Predice correctamente.

Ahora bien, que pasa si quiero predecir un documento que no está en los ya vectorizados. Tengo que primero vectorizarlo para poder aplicarle el modelo de clasificación.

In [21]:
vectorizer.transform(dataset[0:1].TEXTO)

<1x19220 sparse matrix of type '<class 'numpy.float64'>'
	with 767 stored elements in Compressed Sparse Row format>

In [22]:
model.predict(vectorizer.transform(dataset[0:1].TEXTO))

array(['AUTO'], dtype=object)