In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import openpyxl
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [8]:
path = '../data/Consulta_JMA.xlsx'

df = pd.read_excel(path)
df.head()

Unnamed: 0,CodigoTarea,IDEmail,Cuerpo,FechaCreacion,Categoria,Idioma
0,1094561,1379091,"Buenos días Manuel,_x000d_\n_x000d_\nPasarme ...",2023-06-02,OMOD,E
1,1194837,1391986,"Hola,_x000d_\n_x000d_\n¿Espero que estés bien?...",2024-02-16,I,E
2,1225091,1391986,"Hola,_x000d_\n_x000d_\n¿Espero que estés bien?...",2024-04-25,I,E
3,1034918,1464101,"Buenos días,_x000d_\n_x000d_\n _x000d_\n_x000d...",2023-01-01,OMOD,E
4,1034932,1464115,Adjunto les enviamos PEDIDOS (PDF)_x000d_\n_x0...,2023-01-02,PMOD,E


In [9]:
df['Categoria'] = df['Categoria'].fillna("sin categoría")
df['Cuerpo'] = df['Cuerpo'].fillna("sin cuerpo")

# Agrupar por CodigoTarea y unir los correos electrónicos en una sola entrada
df = df.groupby(by='CodigoTarea').agg({
    'IDEmail': 'first',
    'Cuerpo': 'first',
    'FechaCreacion': 'first',
    'Categoria': 'first',
    'Idioma': 'first'
}).reset_index()

# df['Categoria'] = df['Categoria'].replace({'PREP': 'P', 'PMOD': 'P', 'OMOD' : 'O', 'OREP' : 'O', 'SAT' : 'I'})

# Asegúrate de descargar las stopwords
nltk.download('stopwords')

# Función para limpiar el texto
def clean_text(text):
    # Eliminar 'xd'
    text = re.sub(r'\bxd\b', '', text)

    # Eliminar caracteres no deseados (como 'x000d')
    text = re.sub(r'\bx000d\b', '', text)

    # Eliminar URLs y correos electrónicos
    text = re.sub(r'http\S+|www\S+|https\S+|mailto:\S+', '', text)

    # Convertir a minúsculas
    text = text.lower()

    # Eliminar caracteres no alfabéticos y espacios extra
    text = re.sub(r'[^a-záéíóúñü\s]', '', text)

    # Dividir el texto en palabras
    words = text.split()
    
    words = word_tokenize(text)

    # Filtrar stopwords
    spanish_stopwords = set(stopwords.words('spanish'))
    filtered_words = [word for word in words if word not in spanish_stopwords]
    

    return ' '.join(filtered_words)

# Supongamos que df_grouped ya está definido
# Aplica la función a cada fila de la columna 'Cuerpo'
df['Cuerpo'] = df['Cuerpo'].apply(clean_text)
df['Cuerpo'] = df['Cuerpo'].str.split().apply(lambda x: [word for word in x if word != 'xd'])


# Selección de las columnas relevantes
df = df[['Cuerpo', 'Categoria']]

# Codificación de etiquetas
label_encoders = {}
for column in ['Categoria']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

[nltk_data] Downloading package stopwords to /home/jose-
[nltk_data]     manuel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
df.head()

Unnamed: 0,Cuerpo,Categoria
0,"[buenos, díasxd, solicito, precio, componente,...",1
1,"[adjunto, enviamos, pedidos, pdfxd, otis, spai...",3
2,"[peticion, oferta, imem, n, somgi, estimados, ...",1
3,"[proveedor, hidral, sa, dirección, carmendiezh...",3
4,"[bom, dia, sei, podes, ajudar, com, uma, duvid...",1


In [21]:
from gensim.models import Word2Vec
# define training data
sentences = df['Cuerpo']
# train model
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.key_to_index)
# access vector for one word
print(model.wv['oferta'])
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec<vocab=52533, vector_size=100, alpha=0.025>
[-0.70918477  1.7366476   1.3769567  -1.5275235  -3.7530987  -1.1250762
  1.0280809  -0.76175     0.94437337 -2.7023125   1.3878888  -0.5721025
 -1.6235687  -3.5281098  -3.1166475   0.8023179   0.01779636  1.4832716
 -0.9585906  -1.8566952  -2.5905366   0.41293246 -1.0643734   0.25002658
 -2.3564994   1.2244786   1.1763015  -0.1824914   2.4347992   0.07404983
  2.91333     4.187071    1.1667479   4.2646446  -1.0798124  -2.1421797
  4.4323783  -4.338553   -0.34522575 -0.43930784 -0.74427074 -3.296041
  2.778061   -2.0689127   0.99807626 -4.34527     3.6624355  -0.02799573
 -3.4973288   0.4645225  -1.9981389   0.7800911  -1.342496   -0.6330312
 -0.14618926  0.80533695  0.384414    1.6666781   1.15139     1.2063346
  2.6750088  -0.8523143  -0.75807846 -3.1338742   2.5507047  -1.7071701
 -0.6120548  -4.1701026  -1.4717503   0.22496778 -0.25541556  0.5648664
  2.1334996  -0.97845346 -2.8250895   2.2825236   2.443203    1.2997766
 -0.784933