# Cuaderno de trabajo: Procesamiento de texto

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
# Let pandas show up to 200 characters per cell so long strings are not truncated in DataFrame output.
pd.options.display.max_colwidth = 200
%matplotlib inline

Comencemos definiendo nuestro corpus documental. En este caso, son respuestas a la pregunta "¿Qué hiciste durante el puente?"

In [4]:
# Toy corpus: eight short Spanish answers, one document per element,
# stored as a NumPy array of strings.
corpus = np.array([
    "Fui a Cajicá a almorzar con un amigo",
    "Fui a Radio Berlin a bailar",
    "Jugué tennis",
    "Salí a bailar con amigos",
    "Trabajar en tesis y trabajar en el curso de Inteligencia de Negocios",
    "Lavé la ropa e hice mercado",
    "Lavé la ropa y estuve con mi hijo",
    "Pinté mi cuarto",
])



Vamos a proceder con un procesamiento sencillo del corpus para eliminar espacios sobrantes, caracteres especiales, números y stopwords, y para stemmizar/lematizar los documentos.

In [5]:
from nltk.stem import PorterStemmer


In [6]:
# Make sure the NLTK stopword lists are available locally (the call logs its status below).
nltk.download('stopwords')
# Spanish stopword list; normalize_documents drops every token that appears in it.
stop_words = nltk.corpus.stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Tokenizer used by normalize_documents to split a cleaned document into tokens.
wpt = nltk.WordPunctTokenizer()
# Stemmer applied to every token that survives stopword filtering.
# NOTE(review): PorterStemmer is an English stemmer while this corpus is Spanish;
# nltk.stem.SnowballStemmer('spanish') looks like the better fit — confirm before
# switching, since it would change every recorded output below.
ps = PorterStemmer()

In [8]:
def normalize_documents(doc):
  """Normalize one raw document into a space-separated string of stemmed tokens.

  Steps, in order: remove every character that is not an ASCII letter or
  whitespace, lowercase, trim surrounding whitespace, tokenize with ``wpt``,
  drop tokens found in ``stop_words``, stem the remaining tokens with ``ps``
  and join them with single spaces.

  Args:
    doc: The document text.

  Returns:
    The normalized document as one string; an empty string when no token
    survives the filtering.

  Note:
    Relies on the notebook-level names ``wpt``, ``ps`` and ``stop_words``.
    Accented letters are outside ``[a-zA-Z]`` and are therefore removed, so
    the final "á" of "Cajicá" is dropped.
  """
  # The flags must be passed by keyword: the fourth positional parameter of
  # re.sub is `count`, so a positional `re.I|re.A` is read as a substitution
  # limit (258) and the flags themselves are silently ignored.
  # With re.A in effect, \s matches ASCII whitespace only.
  doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
  doc = doc.lower()
  doc = doc.strip()
  tokens = wpt.tokenize(doc)
  # Stopwords are filtered before stemming so the lookup sees unstemmed tokens.
  filtered_token = [ps.stem(token) for token in tokens if token not in stop_words]
  doc = ' '.join(filtered_token)
  return doc

In [9]:
# Wrap the per-document function with np.vectorize so it can be called on the whole corpus array.
normalize_corpus = np.vectorize(normalize_documents)

In [10]:
# Normalize every document of the corpus; the result is an array of cleaned strings.
norm_corpus = normalize_corpus(corpus)

In [11]:
# Display the normalized corpus.
norm_corpus

array(['cajic almorzar amigo', 'radio berlin bailar', 'jugu tenni',
       'sal bailar amigo',
       'trabajar tesi trabajar curso inteligencia negocio',
       'lav ropa hice mercado', 'lav ropa hijo', 'pint cuarto'],
      dtype='<U49')

# Representación Bag of Words


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model over the normalized corpus. min_df=0.0 and max_df=1.0 are
# proportions of documents, so no term is discarded for being too rare or too common.
cv = CountVectorizer(min_df=0.0, max_df=1.0)
# Learn the vocabulary and build the sparse document-term count matrix in one step.
cv_matrix = cv.fit_transform(norm_corpus)
# Display the sparse matrix summary (shape and number of stored elements).
cv_matrix

<8x21 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [13]:
# Terms learned by the vectorizer; used below as the column labels for cv_matrix.
vocab = cv.get_feature_names_out()

In [14]:
# Printing the sparse matrix lists each stored entry as "(document, term index)  count".
print(cv_matrix)

  (0, 4)	1
  (0, 0)	1
  (0, 1)	1
  (1, 15)	1
  (1, 3)	1
  (1, 2)	1
  (2, 10)	1
  (2, 18)	1
  (3, 1)	1
  (3, 2)	1
  (3, 17)	1
  (4, 20)	2
  (4, 19)	1
  (4, 6)	1
  (4, 9)	1
  (4, 13)	1
  (5, 11)	1
  (5, 16)	1
  (5, 7)	1
  (5, 12)	1
  (6, 11)	1
  (6, 16)	1
  (6, 8)	1
  (7, 14)	1
  (7, 5)	1


In [15]:
# Dense view of the counts: one row per document, one column per vocabulary term.
pd.DataFrame(cv_matrix.toarray(), columns=vocab)

Unnamed: 0,almorzar,amigo,bailar,berlin,cajic,cuarto,curso,hice,hijo,inteligencia,...,lav,mercado,negocio,pint,radio,ropa,sal,tenni,tesi,trabajar
0,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,1,2
5,0,0,0,0,0,0,0,1,0,0,...,1,1,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


# Representación N-Grams




In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Count bigrams and trigrams (n-grams of length 2 to 3) instead of single terms.
bv = CountVectorizer(ngram_range=(2,3))
# Fit on the normalized corpus and densify the counts in a single expression.
bv_matrix = bv.fit_transform(norm_corpus).toarray()
# NOTE: `vocab` is rebound here to the n-gram vocabulary.
vocab = bv.get_feature_names_out()
# One row per document, one column per n-gram.
pd.DataFrame(bv_matrix, columns=vocab)

Unnamed: 0,almorzar amigo,bailar amigo,berlin bailar,cajic almorzar,cajic almorzar amigo,curso inteligencia,curso inteligencia negocio,hice mercado,inteligencia negocio,jugu tenni,...,ropa hice mercado,ropa hijo,sal bailar,sal bailar amigo,tesi trabajar,tesi trabajar curso,trabajar curso,trabajar curso inteligencia,trabajar tesi,trabajar tesi trabajar
0,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
4,0,0,0,0,0,1,1,0,1,0,...,0,0,0,0,1,1,1,1,1,1
5,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Representación TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

# Column labels: the unigram vocabulary of the bag-of-words vectorizer.
vocab = cv.get_feature_names_out()
# TF-IDF re-weighting of the raw counts, with each document row L2-normalized.
tt = TfidfTransformer(norm='l2', use_idf=True)
# Fit on the count matrix and convert the result to a dense array in one step.
tt_matrix = tt.fit_transform(cv_matrix).toarray()
# One row per document, one column per vocabulary term.
pd.DataFrame(tt_matrix, columns=vocab)

Unnamed: 0,almorzar,amigo,bailar,berlin,cajic,cuarto,curso,hice,hijo,inteligencia,...,lav,mercado,negocio,pint,radio,ropa,sal,tenni,tesi,trabajar
0,0.608313,0.509814,0.0,0.0,0.608313,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.509814,0.608313,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.608313,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0
3,0.0,0.540443,0.540443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.644859,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.353553,...,0.0,0.0,0.353553,0.0,0.0,0.0,0.0,0.0,0.353553,0.707107
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541948,0.0,0.0,...,0.454195,0.541948,0.0,0.0,0.0,0.454195,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.644859,0.0,...,0.540443,0.0,0.0,0.0,0.0,0.540443,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0


# Clasificación por tópicos

In [18]:
from sklearn.decomposition import LatentDirichletAllocation
# Four-topic LDA model fitted on the raw term counts; random_state=0 is passed
# so repeated runs use the same seed.
lda = LatentDirichletAllocation(n_components = 4, max_iter = 10000, random_state=0)
# Document-topic matrix returned by the fit (documents in rows, topics in columns).
dt_matrix = lda.fit_transform(cv_matrix)
# Build the topic labels (T1, T2, ...) from the matrix width rather than
# hard-coding four names, so changing n_components cannot cause a column-count mismatch.
features = pd.DataFrame(dt_matrix, columns=[f'T{i}' for i in range(1, dt_matrix.shape[1] + 1)])

In [19]:
# Topic-term weights of the fitted LDA model, one row per topic.
# NOTE: this rebinds `tt_matrix`, which held the TF-IDF array before this cell.
tt_matrix = lda.components_
for topic_weights in tt_matrix:
  # Keep only the (term, weight) pairs whose weight exceeds 0.6 ...
  topic = [(token, weight) for token, weight in zip(vocab, topic_weights) if weight > 0.6]
  # ... and order them from heaviest to lightest (stable, so ties keep vocabulary order).
  topic.sort(key=lambda pair: pair[1], reverse=True)
  print(topic)

[('amigo', 2.2495973130324085), ('bailar', 1.2515878937668807), ('almorzar', 1.2495557645269024), ('cajic', 1.2495557645269024), ('sal', 1.249547678133768), ('cuarto', 1.2493040842461312), ('pint', 1.2493040842461312)]
[]
[('trabajar', 2.24980879971218), ('curso', 1.2497871970981298), ('inteligencia', 1.2497871970981298), ('negocio', 1.2497871970981298), ('tesi', 1.2497871970981298), ('jugu', 1.249303980618064), ('tenni', 1.249303980618064)]
[('lav', 2.2495879804602734), ('ropa', 2.2495879804602734), ('hice', 1.2496113596980774), ('mercado', 1.2496113596980774), ('hijo', 1.249471225951912), ('berlin', 1.2494537487233317), ('radio', 1.2494537487233317), ('bailar', 1.2476332490961637)]
