In [1]:
# Librerias

import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


#python -m nltk.downloader all
from nltk.tokenize import word_tokenize
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

**Cargar los datos y separarlos**

In [3]:
file = pd.read_excel('Train_textosODS.xlsx')
file.head(5)

Unnamed: 0,textos,ODS
0,"""Aprendizaje"" y ""educación"" se consideran sinó...",4
1,No dejar clara la naturaleza de estos riesgos ...,6
2,"Como resultado, un mayor y mejorado acceso al ...",13
3,Con el Congreso firmemente en control de la ju...,16
4,"Luego, dos secciones finales analizan las impl...",5


In [7]:
print(f'Duplicados: {file.duplicated().sum()}')
print(f'Nulos: {file.isna().sum()}')

Duplicados: 0
Nulos: textos    0
ODS       0
dtype: int64


In [9]:
ODS = file['ODS']
textos = file['textos']

**Clase para procesar/preparar el texto**

In [42]:
class TextProcessor(BaseEstimator, TransformerMixin):
    '''
    A text processing class for text data. This transformer handles tokenization, 
    stop word removal, and stemming (optional), and transforms text data into 
    a bag-of-words representation.
    '''
    def __init__(self, token_cue = r'\w+', language = 'spanish', stemming_switch = True):
        self.language = language
        self.token_cue = token_cue
        self.stop_words_list = stopwords.words(self.language)
        self.tokenizer = RegexpTokenizer(self.token_cue)
        self.stemming_switch = stemming_switch
        self.stemmer = SnowballStemmer(self.language)
        self.vectorizer = TfidfVectorizer()       
        
    def fit(self, text, y = None):
        self.vectorizer.fit(text)
        return self

    def tokenize(self,text):
        return text.apply(lambda x: self.tokenizer.tokenize(x))
        
    def remove_stop_words(self,text):
        return text.apply(lambda x: [token for token in x if token not in self.stop_words_list])
        	
    def stemming(self,text):
        #return text.apply(lambda x: [self.stemmer.stem(token) for token in x])
        return text.apply(lambda x: [str(self.stemmer.stem(token)) for token in x])

    def BOW_transform(self,stems):  
        return self.vectorizer.fit_transform(stems.apply(lambda x: ' '.join(x)))

    def transform(self, text, y=None):
        if self.stemming_switch:
            processed_text = self.stemming(self.remove_stop_words(self.tokenize(text)))
        else:
            processed_text = self.remove_stop_words(self.tokenize(text))

        # Usar el vectorizador ajustado para transformar los textos procesados
        return self.vectorizer.transform([' '.join(tokens) for tokens in processed_text])


**Definir el Pipeline**

In [45]:
tsvd = TruncatedSVD(n_components=100)
logreg = LogisticRegression(max_iter=500)
pipeline = Pipeline([
    ('text_processor', TextProcessor()),
    ('tsvd', TruncatedSVD(n_components=100)),
    ('logreg', LogisticRegression(max_iter=500))
])

entre 1


**Entrenar el modelo**

In [48]:
# Separar los datos en train y test
X_train, X_test, y_train, y_test = train_test_split(textos, ODS, test_size = 0.2, random_state=0)
# Fit el pipeline
pipeline.fit(X_train,y_train)

entre 2
entre 7
entre 3
entre 4
entre 5


**Resultados**

In [51]:
print(X_train.shape)
print(X_test.shape)
# predecir los datos de prueba
y_pred = pipeline.predict(X_test)
# sacar metricas
print(classification_report(y_test, y_pred))

(7724,)
(1932,)
entre 7
entre 3
entre 4
entre 5
              precision    recall  f1-score   support

           1       0.36      0.49      0.41        87
           2       0.22      0.09      0.13        64
           3       0.78      0.71      0.74       184
           4       0.47      0.81      0.60       209
           5       0.85      0.78      0.82       189
           6       0.34      0.43      0.38       147
           7       0.57      0.65      0.61       183
           8       0.44      0.22      0.30        94
           9       0.52      0.18      0.27        77
          10       0.43      0.18      0.26        66
          11       0.61      0.57      0.59       117
          12       0.64      0.10      0.17        72
          13       0.28      0.21      0.24        91
          14       0.63      0.45      0.53        73
          15       0.78      0.54      0.64        70
          16       0.61      0.87      0.72       209

    accuracy                    

In [53]:
confusion_matrix(y_test, y_pred)

array([[ 43,   1,   6,  11,   1,   3,   3,   2,   0,   1,   2,   0,   3,
          1,   1,   9],
       [ 11,   6,   3,  12,   1,   8,  10,   1,   0,   1,   2,   0,   4,
          0,   1,   4],
       [  3,   0, 131,  13,   8,   4,   3,   3,   0,   3,   2,   0,   2,
          1,   0,  11],
       [  5,   2,   3, 169,   0,   4,   3,   4,   0,   0,   4,   0,   5,
          0,   1,   9],
       [  6,   0,   3,   4, 148,   4,   0,   2,   1,   1,   1,   0,   3,
          1,   0,  15],
       [  6,   4,   6,  15,   0,  63,  14,   1,   1,   4,  10,   0,   8,
          4,   2,   9],
       [  3,   0,   3,  20,   0,  16, 119,   1,   2,   2,   5,   0,   6,
          2,   0,   4],
       [  6,   2,   3,  20,  12,   6,   8,  21,   3,   1,   2,   0,   2,
          1,   0,   7],
       [  1,   3,   1,  19,   0,   5,   9,   4,  14,   1,   6,   0,   3,
          3,   0,   8],
       [ 15,   2,   1,  17,   0,   2,   1,   4,   1,  12,   2,   0,   3,
          1,   0,   5],
       [  6,   0,   3,  10,   

**Celdas de Prueba**

In [None]:
# tokens = procesador.tokenize(X_train[0:20])
# no_stop = procesador.remove_stop_words(tokens)
# stems = procesador.stemming(no_stop)
# BOW = procesador.BOW_transform(stems)
# BOW = procesador.BOW_transform(stems)
# BOW[0]

In [None]:
# svd = TruncatedSVD(n_components=100)
# BOW_reduced = svd.fit_transform(BOW)

In [None]:
# reg = LogisticRegression(max_iter=500)
# reg.fit(BOW_reduced,y_train[:20])