In [1]:
import sys

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import ensemble
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Se verfica entorno de ejecución
in_colab = "google.colab" in sys.modules

if in_colab:
    from google.colab import drive

    drive.mount("/content/drive")
    BASE_DIR = "/content/drive/My Drive/Diplo2020 Mentoria/"
else:
    BASE_DIR = "../"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
train_data = BASE_DIR + "corpus3.csv"
dataset = pandas.read_csv(train_data)
dataset.head()

Unnamed: 0.1,Unnamed: 0,TEXTO,DOCUMENTO,TIPO
0,0,sala electoral comp.originaria tribunal superi...,/content/drive/My Drive/Diplo2020 Mentoria/Dat...,AUTO
1,1,sala electoral comp.originaria tribunal superi...,/content/drive/My Drive/Diplo2020 Mentoria/Dat...,SENTENCIA
2,2,sala electoral comp.originaria tribunal superi...,/content/drive/My Drive/Diplo2020 Mentoria/Dat...,AUTO
3,3,sala electoral comp.originaria tribunal superi...,/content/drive/My Drive/Diplo2020 Mentoria/Dat...,AUTO
4,4,sala electoral comp.originaria tribunal superi...,/content/drive/My Drive/Diplo2020 Mentoria/Dat...,AUTO


In [4]:
dataset['TIPO'].value_counts()

AUTO         125
SENTENCIA     25
Name: TIPO, dtype: int64

Vectorizamos los textos

In [5]:
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(dataset['TEXTO'])

<150x19224 sparse matrix of type '<class 'numpy.float64'>'
	with 204582 stored elements in Compressed Sparse Row format>

In [6]:
# División entre instancias vectorizadas y etiquetas
X, y = vectorizer.transform(dataset["TEXTO"]), dataset["TIPO"]

In [7]:
# división entre entrenamiento y evaluación
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, shuffle=True)

Entrenamos con el modelo Random Forest

In [8]:
  model = ensemble.RandomForestClassifier(random_state=0)
  model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [9]:
y_train_pred = model.predict(X_train)

In [10]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

        AUTO       1.00      1.00      1.00        87
   SENTENCIA       1.00      1.00      1.00        18

    accuracy                           1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105



In [11]:
tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()
print(f'Matriz de Confusión \n\tTP:{tp:{3}} \tFP:{fp:{3}} \n\tTN:{tn:{3}} \tFN:{fn:{3}}')

Matriz de Confusión 
	TP: 18 	FP:  0 
	TN: 87 	FN:  0


Valido el modelo

In [12]:
y_test_pred = model.predict(X_test)

In [13]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

        AUTO       0.95      1.00      0.97        38
   SENTENCIA       1.00      0.71      0.83         7

    accuracy                           0.96        45
   macro avg       0.97      0.86      0.90        45
weighted avg       0.96      0.96      0.95        45



In [14]:
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
print(f'Matriz de Confusión \n\tTP:{tp:{3}} \tFP:{fp:{3}} \n\tTN:{tn:{3}} \tFN:{fn:{3}}')    

Matriz de Confusión 
	TP:  5 	FP:  0 
	TN: 38 	FN:  2


Ahora quiero predecir un documento nuevo... por ejemplo tomo el primer documento 

In [15]:
dataset[0:1].TIPO, dataset[0:1].TEXTO

(0    AUTO
 Name: TIPO, dtype: object,
 0    sala electoral comp.originaria tribunal superi...
 Name: TEXTO, dtype: object)

El texto ya vectorizado anteriormente es el siguiente:

In [18]:
X_train[0:1]

<1x19224 sparse matrix of type '<class 'numpy.float64'>'
	with 1148 stored elements in Compressed Sparse Row format>

Si predigo ese documento:

In [17]:
model.predict(X_train[0:1])

array(['AUTO'], dtype=object)

Predice correctamente.

Ahora bien, que pasa si quiero predecir un documento que no está en los ya vectorizados. Tengo que primero vectorizarlo para poder aplicarle el modelo de clasificación.

In [19]:
vectorizer.transform(dataset[0:1].TEXTO)

<1x19224 sparse matrix of type '<class 'numpy.float64'>'
	with 767 stored elements in Compressed Sparse Row format>

In [20]:
model.predict(vectorizer.transform(dataset[0:1].TEXTO))

array(['AUTO'], dtype=object)