In [2]:
import pandas as pd
import numpy as np
import re
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#### Leer y procesar archivo de texto

In [3]:
df = pd.read_excel("../80s.xlsx")

FileNotFoundError: [Errno 2] No such file or directory: '../80s.xlsx'

In [None]:
df.head()

In [None]:
dataset = df[df['EVALUACION']!=3].reset_index(drop=True)
dataset['EVALUACION'][dataset["EVALUACION"]<=2] = 0
dataset['EVALUACION'][dataset["EVALUACION"]>=4] = 1

In [None]:
dataset['EVALUACION'].unique() #ok

##### Limpieza basica de comentarios

In [None]:
def limpieza(texto):
    texto = texto.lower()
    tokens = re.findall(r'[a-zA-Z]+', texto)
    out = [w for w in tokens if len(w)>=3]
    return " ".join(out)

In [None]:
dataset['Comentariomin'] = dataset['Comentariomin'].apply(limpieza)

In [None]:
dataset.head()

In [None]:
corpus = dataset['Comentariomin'].values.tolist()

### Vectorización

In [None]:
vect = TfidfVectorizer(max_features=100)
FV = vect.fit_transform(corpus).toarray()

In [None]:
FV.shape

In [None]:
dataset['FV_TF-idf'] = FV.tolist() #to list solo para visualizacion
dataset.head()

In [None]:
vectTF = CountVectorizer(max_features=100)
FVTf = vect.fit_transform(corpus).toarray()

In [None]:
dataset['FV_TF'] = FVTf.tolist() #to list solo para visualizacion
dataset.head()

In [None]:
import gensim
from gensim.models import word2vec as w2v
from multiprocessing import cpu_count
from gensim import matutils

In [None]:
with open("corpus.txt", "w", encoding='utf-8') as file:
    for topic in corpus:
        file.write(topic+'\n')

In [None]:
#hyperparams
VECTOR_SIZE = 100
workers = cpu_count()-4
epochs = 10
window_size = 5

In [None]:
corpus_data = w2v.LineSentence("corpus.txt")
model = w2v.Word2Vec(corpus_data, vector_size=VECTOR_SIZE, min_count=1, workers=workers, epochs=epochs, window=window_size)

In [None]:
#Ejemplo de procesamiento NLP con contexto
model.wv.most_similar("rapido")

In [None]:
def document_to_vector(topic, model):
    words = topic.split()    
    document = list()
    for w in words:
        try:
            document.append(model.wv[w])
        except:
            document.append(np.zeros(100,))
    if len(words) == 0:
        document = np.zeros((1,100))
    return np.mean(np.array(document),axis=0)

In [None]:
dataset['w2v'] = (dataset['Comentariomin'].apply(lambda x: document_to_vector(x, model))).tolist()

In [None]:
dataset.columns

### Evaluación de Modelos

### Modelo LSTM con attention

In [529]:
dataset = dataset.sample(frac=1).reset_index(drop=True)

In [530]:
dataset = dataset[dataset['Comentariomin']!=''] #Filter out empty docs

In [531]:
test_size = math.ceil(dataset.shape[0]/10)

In [532]:
test_data = dataset[:test_size].reset_index(drop=True)
train_data = dataset[test_size:].reset_index(drop=True)

In [533]:
train_data.head()

Unnamed: 0,EVALUACION,Comentariomin,FV_TF-idf,FV_TF,w2v
0,1,por atencion,"[0.0, 0.0, 0.0, 0.6164846047093092, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.6164846047093092, 0.0, 0.0, ...","[-0.71931493, 0.38060495, 0.06382955, 0.001248..."
1,1,por rapidez atencion,"[0.0, 0.0, 0.0, 0.3616011214294089, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.3616011214294089, 0.0, 0.0, ...","[-0.8334515, 0.50286883, 0.004782031, -0.07957..."
2,1,atencion fue oportuna,"[0.0, 0.0, 0.0, 0.5486859751731602, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.5486859751731602, 0.0, 0.0, ...","[-0.7373402, 0.29470503, -0.082568355, 0.10921..."
3,1,cumplio con que esperaba,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.22858247, -0.07508833, 0.0916019, 0.012127..."
4,1,por que brindaron una muy buena atencion sobre...,"[0.0, 0.0, 0.0, 0.20317396030915646, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.20317396030915646, 0.0, 0.0,...","[-0.54640853, 0.08720608, 0.08424382, 0.100999..."


In [534]:
#sort train data by length of documents
#small batch size

s = train_data.Comentariomin.str.len().sort_values().index
train_data  = train_data.reindex(s).reset_index(drop=True)

In [535]:
train_data

Unnamed: 0,EVALUACION,Comentariomin,FV_TF-idf,FV_TF,w2v
0,1,qqq,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0045857383, 0.009256487, -0.005697112, -0...."
1,1,xxx,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.005458796, 0.009516866, -0.008407385, -0.00..."
2,1,todo,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.66196555, 0.07995832, -0.22037236, 0.06317..."
3,1,dddd,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0016660905, 0.0054979753, -0.006782217, -0...."
4,1,porq,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.27370843, 0.057843916, 0.007948949, 0.0933..."
...,...,...,...,...,...
8270,0,porque fui cerrar una cuenta ahorros previo el...,"[0.19320125849157913, 0.0, 0.05746333064579146...","[0.19320125849157913, 0.0, 0.05746333064579146...","[-0.30018118, 0.27072042, 0.08980753, 0.092213..."
8271,0,bueno ante todo ese dia estaba muy apurada que...,"[0.0, 0.0, 0.0, 0.023906675632376355, 0.0, 0.0...","[0.0, 0.0, 0.0, 0.023906675632376355, 0.0, 0.0...","[-0.27259967, 0.019342648, 0.0852872, 0.128235..."
8272,0,bueno antes era inclusive peor porque encargad...,"[0.0, 0.0, 0.0, 0.0, 0.07236400279931039, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.07236400279931039, 0.0,...","[-0.25830707, 0.13551225, 0.06709037, 0.122485..."
8273,0,hice uso del servicio ventanilla para realizar...,"[0.24417321972259404, 0.0, 0.0, 0.027609959849...","[0.24417321972259404, 0.0, 0.0, 0.027609959849...","[-0.3063431, 0.2536616, 0.07838951, 0.13197464..."


In [536]:
def load_batch(train_data, starting_idx, batch_size, model):
    documents = train_data["Comentariomin"].tolist()
    documents = documents[starting_idx:min(starting_idx+batch_size,len(documents))]
    
    targets = train_data["EVALUACION"].tolist()
    targets = torch.tensor(targets[starting_idx:min(starting_idx+batch_size,len(targets))])
    
    batch_data = []
    seq_len = []
    for d in documents:
        doc = []
        for w in d.split():
            try:
                doc.append(model.wv[w])
            except:
                doc.append(np.zeros(VECTOR_SIZE))
                
        
        if len(d) == 0:
            doc = np.zeros((1,100))
            seq_len.append(1)
        else:
            seq_len.append(len(d.split()))
            
        batch_data.append(torch.tensor(doc))
        
        
    return batch_data, seq_len, max(seq_len), targets

### Prueba de carga de batch a LSTM

In [537]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [538]:
batch_size = 8

In [539]:
batch, seq_len, max_seq, batch_y = load_batch(train_data, 0, batch_size,model)

In [540]:
batch[0].shape

torch.Size([1, 100])

In [541]:
train_data['Comentariomin'][0:batch_size].tolist(), batch_y

(['qqq', 'xxx', 'todo', 'dddd', 'porq', 'todo', 'bien', 'bien'],
 tensor([1, 1, 1, 1, 1, 1, 1, 1]))

In [542]:
def get_accuracy(logits, target):
        probs = torch.softmax(logits,dim=1)
        predictions = torch.argmax(probs, dim=1)
        return (torch.sum(predictions==target).item())/float(target.size()[0])

In [548]:
class AttentionRNN(nn.Module):
    def __init__(self, hidden, num_classes):
        super().__init__()        
        
        self.lstm = nn.LSTM(input_size=VECTOR_SIZE, hidden_size=hidden, batch_first=True, bidirectional=True)
        
        self.fc1 = nn.Linear(2*hidden, 256)
        self.fc2 = nn.Linear(256, 512)
        self.fc3 = nn.Linear(512, 1024)
        
        #Attention weights        
        self.att_W = nn.Linear(1024, 6*hidden, bias = False)
        self.att_V = nn.Linear(1024, 6*hidden, bias = False)
        
        
        self.fc4 = nn.Linear(1024,512)
        self.fc5 = nn.Linear(512,256)
        self.drop = nn.Dropout(0.2)
        
        self.out = nn.Linear(256, num_classes)
        
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=3e-4)

    def forward(self, x, max_seq):
        output, _ = self.lstm(x.float())
        
        output, output_lens = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True, total_length=max_seq)
        #bs, t, h
        
        output = F.leaky_relu(self.fc1(output))
        output = F.relu(self.fc2(output)) 
        output = F.leaky_relu(self.fc3(output)) #bs, t, 1024
        
        #Self Attention
        w_matrix = self.att_W(output) #bs, t , h'
        v_matrix = self.att_V(output) #bs, t , h'
        att_weights =  F.softmax(torch.bmm(w_matrix,v_matrix.transpose(1,2)),dim=-1) #bs,t,t
        output = torch.bmm(att_weights,output) #bs, t, h
        
        output = F.leaky_relu(self.fc4(output)) 
        output = F.leaky_relu(self.fc5(output)) 
        
        output = output.mean(dim=1)

        x = self.out(output)
        return x
    
    def train_step(self, batch_x, max_seq, batch_y):
        
        #Forward
        logits = self.forward(batch_x, max_seq)
        
        #loss
        self.optimizer.zero_grad()
        loss = self.criterion(logits,batch_y)
        loss.backward()
        self.optimizer.step()
        
        batch_loss = loss.to('cpu').item()
        batch_accuracy = get_accuracy(logits, batch_y)
        
        return batch_loss, batch_accuracy
    
    def save_model(self, path):
        torch.save(self.state_dict(), path)
        
    def load_model(self, path):
        self.load_state_dict(torch.load(path))

In [549]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [550]:
BATCH_SIZE = 256
NUMBER_OF_BATCHES = math.ceil(train_data.shape[0]/BATCH_SIZE) # Nro de batches
EPOCHS = 40
n_unique_categories = 2

In [551]:
attention_rnn = AttentionRNN(256,2).to(device)

In [552]:
max_train_accuracy = 0

current_loss = 0
current_accuracy = 0

for e in range(EPOCHS):
      
    for i in range(NUMBER_OF_BATCHES):
        torch.cuda.empty_cache()
        batch, seq_len, max_seq, batch_y = load_batch(train_data, i*BATCH_SIZE, BATCH_SIZE,model)
        
        #padding
        padded_seq_batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)
        packed_seq_batch = torch.nn.utils.rnn.pack_padded_sequence(padded_seq_batch, lengths=seq_len, batch_first=True, enforce_sorted=False)
        
        loss, accuracy = attention_rnn.train_step(packed_seq_batch.to(device), max_seq, batch_y.to(device))
        #print(loss,accuracy)
        current_loss+=loss
        current_accuracy+=accuracy
    print(f'Epoch:  {e+1} | Train Loss: {(current_loss/NUMBER_OF_BATCHES):.6f} | Train Accuracy: {(current_accuracy/NUMBER_OF_BATCHES)*100:.2f} %')
    
    current_loss = 0
    current_accuracy = 0
    
#attention_rnn.save_model('model_parameters')

Epoch:  1 | Train Loss: 0.602374 | Train Accuracy: 62.94 %
Epoch:  2 | Train Loss: 0.506042 | Train Accuracy: 82.21 %
Epoch:  3 | Train Loss: 0.412366 | Train Accuracy: 84.56 %
Epoch:  4 | Train Loss: 0.369041 | Train Accuracy: 85.22 %
Epoch:  5 | Train Loss: 0.340861 | Train Accuracy: 85.69 %
Epoch:  6 | Train Loss: 0.345535 | Train Accuracy: 84.63 %
Epoch:  7 | Train Loss: 0.389221 | Train Accuracy: 82.46 %
Epoch:  8 | Train Loss: 0.363779 | Train Accuracy: 84.37 %
Epoch:  9 | Train Loss: 0.324092 | Train Accuracy: 85.65 %
Epoch:  10 | Train Loss: 0.306255 | Train Accuracy: 86.58 %
Epoch:  11 | Train Loss: 0.295091 | Train Accuracy: 86.71 %
Epoch:  12 | Train Loss: 0.286421 | Train Accuracy: 87.22 %
Epoch:  13 | Train Loss: 0.294897 | Train Accuracy: 87.25 %
Epoch:  14 | Train Loss: 0.281588 | Train Accuracy: 87.47 %
Epoch:  15 | Train Loss: 0.280955 | Train Accuracy: 87.64 %
Epoch:  16 | Train Loss: 0.290688 | Train Accuracy: 87.14 %
Epoch:  17 | Train Loss: 0.275751 | Train Accurac

### Evaluation

In [553]:
attention_rnn.load_model("model_parameters")

In [554]:
batch, seq_len, max_seq, batch_y = load_batch(test_data, 0, test_data.shape[0], model)

In [555]:
attention_rnn.to('cpu')

AttentionRNN(
  (lstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=1024, bias=True)
  (att_W): Linear(in_features=1024, out_features=1536, bias=False)
  (att_V): Linear(in_features=1024, out_features=1536, bias=False)
  (fc4): Linear(in_features=1024, out_features=512, bias=True)
  (fc5): Linear(in_features=512, out_features=256, bias=True)
  (drop): Dropout(p=0.2, inplace=False)
  (out): Linear(in_features=256, out_features=2, bias=True)
  (criterion): CrossEntropyLoss()
)

In [556]:
#padding
padded_seq_batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)
packed_seq_batch = torch.nn.utils.rnn.pack_padded_sequence(padded_seq_batch, lengths=seq_len, batch_first=True, enforce_sorted=False)

with torch.no_grad():
    logits =  attention_rnn.forward(packed_seq_batch, max_seq)

In [557]:
#Test accuracy
get_accuracy(logits, batch_y)

0.95

In [558]:
def get_predictions(logits):
    probs = torch.softmax(logits,dim=1)
    predictions = torch.argmax(probs, dim=1).numpy()
    return predictions

In [559]:
from sklearn.metrics import accuracy_score, classification_report

In [560]:
print(classification_report((get_predictions(logits)), batch_y.numpy()))

              precision    recall  f1-score   support

           0       0.88      0.93      0.91       236
           1       0.98      0.96      0.97       684

    accuracy                           0.95       920
   macro avg       0.93      0.94      0.94       920
weighted avg       0.95      0.95      0.95       920

