In [1]:
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.9.1-py3-none-any.whl (11.6 MB)
[K     |████████████████████████████████| 11.6 MB 15.1 MB/s 
[?25hCollecting markdown-it-py[linkify,plugins]
  Downloading markdown_it_py-2.1.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.5 MB/s 
Collecting fastapi
  Downloading fastapi-0.87.0-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 4.1 MB/s 
Collecting uvicorn
  Downloading uvicorn-0.19.0-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 5.1 MB/s 
[?25hCollecting h11<0.13,>=0.11
  Downloading h11-0.12.0-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 1.3 MB/s 
[?25hCollecting pycryptodome
  Downloading pycryptodome-3.15.0-cp35-abi3-manylinux2010_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 44.3 MB/s 
[?25hCollecting python-

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import json
from torch.utils.data import Dataset
import numpy as np

#Para gráficos
import seaborn as sns
import matplotlib.pyplot as plt

# Train and dataset
from sklearn.model_selection import train_test_split
from torch.utils.data.dataset import random_split

# Para tokenizacion
#importar libreria nltk
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
misstop = stopwords.words('spanish')

from nltk.tokenize import word_tokenize
nltk.download("punkt")

# para vocabulario
import re
from collections import Counter, OrderedDict

# para indexar tokens 
from torchtext.vocab import vocab

## transformación pre modelo 
import torch
import torch.nn as nn

#data loader

from torch.utils.data import DataLoader

# My Drive connection
from google.colab import drive
drive.mount('/content/drive')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


In [3]:
import gradio as gr

# Organización para generar el clasificador

In [4]:
class RNN_GRU_BI(nn.Module):
    """Bidirectional GRU text classifier.

    Pipeline: embedding -> bidirectional GRU -> linear -> ReLU -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows of the embedding table (index 0 is the padding row).
        embed_dim: Size of each token embedding.
        rnn_hidden_size: Hidden size of the GRU, per direction.
        fc_hidden_size: Width of the hidden fully connected layer.
        num_classes: Number of output scores per sample. Defaults to 5, the value
            that was previously hard-coded, so existing callers are unaffected.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size,
                 num_classes=5):
        super().__init__()
        # padding_idx=0: the embedding of index 0 is kept at zero and gets no gradient.
        self.embedding = nn.Embedding(vocab_size,
                                      embed_dim,
                                      padding_idx=0)
        self.rnn = nn.GRU(embed_dim, rnn_hidden_size,
                          batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence twice the GRU hidden size.
        self.fc1 = nn.Linear(rnn_hidden_size * 2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        """Score a batch of padded token-index sequences.

        Args:
            text: Integer tensor of token indices, batch first, padded with 0.
            lengths: Tensor holding the unpadded length of every sequence; every
                length must be greater than zero (required by
                ``pack_padded_sequence``).

        Returns:
            Tensor with one row per sample and ``num_classes`` sigmoid scores per
            row (independent values in [0, 1], not a softmax distribution).
        """
        embedded = self.embedding(text)
        # Packing lets the GRU skip padded positions; the lengths must be on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), enforce_sorted=False, batch_first=True)
        _, hidden = self.rnn(packed)
        # The last two entries of the final hidden state belong to the two
        # directions of the GRU; join them into one feature vector per sample.
        features = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        features = self.relu(self.fc1(features))
        return self.sigmoid(self.fc2(features))
 

In [5]:
# Location, on the mounted Google Drive, of the serialized model loaded in the next cell.
path = '/content/drive/MyDrive/javeriana/nlp/final_project/model_gru_ex_1.pt'

In [6]:
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints that come from a trusted source.
# map_location places the model on the GPU when one is available and on the CPU
# otherwise, so a checkpoint saved on a GPU machine also loads on a CPU-only runtime.
model = torch.load(
    path,
    map_location=torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'))
# Switch to inference mode; the cell displays the module summary.
model.eval()

RNN_GRU_BI(
  (embedding): Embedding(4221, 10, padding_idx=0)
  (rnn): GRU(10, 10, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=20, out_features=10, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=10, out_features=5, bias=True)
  (sigmoid): Sigmoid()
)

In [7]:
class TextData(Dataset):
    """Minimal map-style dataset over a pipe-separated, UTF-8 encoded text file.

    Only the ``evaluation`` and ``text`` columns of the file are kept, in that
    order. Indexing returns the pair ``(evaluation, text)`` of the selected row.
    """

    def __init__(self, filename):
        super().__init__()
        frame = pd.read_csv(filename, sep='|', encoding='utf-8')
        self.df = frame[["evaluation", "text"]]

    def __len__(self):
        # Number of rows read from the file.
        return len(self.df)

    def __getitem__(self, index):
        # Column 0 is "evaluation" (the label), column 1 is "text".
        label = self.df.iloc[index, 0]
        text = self.df.iloc[index, 1]
        return label, text

In [8]:
# NOTE: `path` is reused here; from this cell on it points to the reviews file,
# no longer to the model file.
path = '/content/drive/MyDrive/javeriana/nlp/final_project/clean_reviews.csv'
df = TextData(path)

# First split: 70 % of the samples for training, the remainder for testing.
# torch.manual_seed(1) reseeds and returns the default generator, which makes
# the split reproducible.
n_train_full = int(len(df)*0.7)
train_dataset, test_dataset = random_split(
    df, [n_train_full, len(df) - n_train_full], torch.manual_seed(1))

# Second split: 80 % of the training part stays as training data, the remainder
# becomes the validation set. list(...) materializes the samples first.
n_train = int(len(train_dataset)*0.8)
train_dataset, valid_dataset = random_split(
    list(train_dataset),
    [n_train, len(train_dataset) - n_train],
    torch.manual_seed(1))

In [9]:
#funcion de limpieza de tweets

def tokenizacion(text):
  """Clean a raw text and return its tokens without Spanish stopwords.

  The text goes through a fixed sequence of regex substitutions, is lower-cased,
  split with ``word_tokenize`` and finally filtered against ``misstop``.

  NOTE(review): the vocabulary is built from this function's output, so the
  order and content of the rules are kept exactly as they were; presumably the
  stored model depends on it - confirm before changing any rule.
  """
  # (pattern, replacement) pairs, applied strictly in this order.
  cleaning_rules = (
    (r'@_[A-Za-z0-9]+_', ''),              # @_mentions_
    (r'@[A-Za-z0-9]+_[A-Za-z0-9]+', ''),   # @mentions_with_underscore
    (r'@[A-Za-z0-9]+', ''),                # @mentions
    (r'https?:\/\/\S+', ''),               # hyperlinks
    (r'RT[\s]+', ''),                      # retweet markers
    (r'[^\w\s]', ""),                      # punctuation
    (r'#[A-Za-z0-9]+', ''),                # hashtags; '#' is already gone at this point, so this never matches
    (r'[0-9]', ''),                        # digits
    (r'[^\w\s]', ""),                      # punctuation (second pass)
    (r'\n|\t', ' '),                       # line breaks and tabs
    (r'\@', ' '),                          # stray '@'; already removed with the punctuation
    (r'[\s]+', ' '),                       # collapse runs of whitespace into one space
    (r"^[\s]", ' '),                       # a leading whitespace character becomes a single space
  )

  cleaned = text
  for pattern, replacement in cleaning_rules:
    cleaned = re.sub(pattern, replacement, cleaned)
  cleaned = cleaned.lower()

  # Tokenize and drop the stopwords.
  return [word for word in word_tokenize(cleaned) if word not in misstop]

In [10]:
# Creación del vocabulario
# Count how often every token occurs in the training split.
token_counts = Counter()
for _label, line in train_dataset:
    token_counts.update(tokenizacion(line))

print('Vocab-size:', len(token_counts))

# Index the tokens by descending frequency; most_common() keeps the first-seen
# order among tokens with equal counts.
sorted_by_freq_tuples = token_counts.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab_ = vocab(ordered_dict)

# Index 0 is given to the empty string (the padding slot); tokens that are not in
# the vocabulary resolve to index 1.
vocab_.insert_token("", 0)
vocab_.set_default_index(1)

# Quick sanity check with a few sample words.
sample_words = ['manejo', 'literatura', 'utliza', 'arte']
print([vocab_[word] for word in sample_words])

Vocab-size: 4220
[671, 121, 1, 135]


In [11]:
# Global settings.
# Seed value; it is not referenced by any of the cells visible here.
seed_ = 12345
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [12]:
def text_pipeline(x):
    """Tokenize the text ``x`` and map every token to its vocabulary index."""
    return [vocab_[token] for token in tokenizacion(x)]


def label_pipeline(x):
    """Shift a label by +2 (so that -2 becomes 0 and 2 becomes 4)."""
    return x + 2

# Transformation and encoding function used to assemble a batch.
def collate_batch(batch):
    """Turn a list of ``(label, text)`` pairs into padded tensors on ``device``.

    Returns:
        A tuple ``(padded_texts, labels, lengths)``: the token-index sequences
        padded (batch first) to the longest one, the shifted labels, and the
        unpadded length of every sequence.
    """
    # Encode each sample once: shifted label plus its tensor of token indices.
    encoded = [
        (label_pipeline(raw_label),
         torch.tensor(text_pipeline(raw_text), dtype=torch.int64))
        for raw_label, raw_text in batch
    ]
    labels = torch.tensor([label for label, _ in encoded])
    sequences = [tokens for _, tokens in encoded]
    lengths = torch.tensor([tokens.size(0) for tokens in sequences])
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
    return padded.to(device), labels.to(device), lengths.to(device)

In [13]:
### FUNCION PREDICTORA 

def predict_(ex_text_str):
  """Classify one text with the loaded model and describe the result in Spanish.

  Args:
    ex_text_str: Raw text to score. ``None`` is treated like an empty string.

  Returns:
    A Spanish sentence with the predicted score on the -2..2 scale, or a hint
    asking for more text when no token is left after cleaning.
  """
  # The label 0 is only a placeholder: collate_batch expects (label, text) pairs.
  text_batch, _, lengths = collate_batch([(0, ex_text_str or "")])

  # The model packs the sequence, and packing rejects zero-length sequences.
  # Empty input or input made only of stopwords would therefore raise.
  if int(lengths[0]) == 0:
    return 'No se encontraron palabras para analizar. Escriba un texto más descriptivo.'

  # Inference only: no gradients are needed.
  with torch.no_grad():
    scores = model(text_batch, lengths)

  # Undo the +2 shift applied to the labels by label_pipeline.
  y_pred = int(torch.argmax(scores, 1)) - 2
  return f'En una escala de -2 a 2, con -2 odio, 2 positivo, el resultado es {y_pred}'
  



In [14]:
# Wrap predict_ in a Gradio interface: one text box in, one text box out.
demo = gr.Interface(fn=predict_, inputs="text", outputs='text')

# Start the app; the cell shows Gradio's start-up messages and the embedded UI.
demo.launch()   

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`

Using Embedded Colab Mode (NEW). If you have issues, please use share=True and file an issue at https://github.com/gradio-app/gradio/
Note: opening the browser inspector may crash Embedded Colab Mode.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

(<gradio.routes.App at 0x7f990de2c950>, 'http://127.0.0.1:7860/', None)

# Comentarios para probar

In [16]:
# `.dataset` is the full list that the validation subset indexes into; show only
# the first few (label, text) pairs instead of dumping every review into the output.
valid_dataset.dataset[:5]

[(-2,
  'El artículo describe algunos aspectos generales de la aplicación de minería de datos a redes sociales.  La revisión de información no es completa y dista mucho de ser considerada como el estado del arte del tema.  Sugiero primero revisar artículos de revistas indexadas en el tema, identificar un problema de investigación concreto, donde se pueda realizar un aporte en el tema, y plantear una propuesta de investigación.'),
 (-1,
  '• Los títulos no van con “.” • Resumen   o“El objetivo de este documento es intentar recolectar…” --> sugiero quitar la palabra intentar •Abstract   oRevisar la gramática ya que se encuentran falta de artículos y algunas frases mal redactadas por ejemplo:    -\tThe images emerging technologies -->  the images technology emerging    -\tsuch as machine vision systems and scanning --> such as the machine vision systems and the scanning…    -\t…The objective of this paper is to try to collect… --> The objective of this paper is to try to collect    -\t th