# 1. Connect to the GPU

In [1]:
import torch
import tensorflow as tf

# Pick the compute device once; every later `.to(device)` call relies on it.
# Falling back to the CPU keeps the notebook runnable (slowly) on machines
# without CUDA instead of failing on the very first cell.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available; falling back to the CPU.")

Tesla V100-SXM2-32GB


# 2. Import the training data set

In [2]:
# Import the dataset
import numpy as np
import pandas as pd

# Read the training reviews and rename the raw '0' column to 'Opinion'.
# rename() returns a new frame, so the two steps are chained in one expression.
df = pd.read_csv('/PATH.csv').rename(columns={'0': 'Opinion'})
# Bare last expression: displays the frame as the cell output.
df

Unnamed: 0.1,Unnamed: 0,Title,Opinion,Place,Gender,Age,Country,Date,Label
0,0,"""¡Momias, demasiado impresionante!""","""Las momias están en muy buen estado de conser...",Museo de las Momias,Male,53,México,22/10/2016,1
1,1,Comida cara,Tienen carteles con comida cortida de 40 y no ...,Mercado Hidalgo,N/I,-1,México,2018,1
2,2,"""No coman ahí""","""Creo que es muy insalubre, hay basura por tod...",Mercado Hidalgo,Female,61,México,15/01/2013,1
3,3,"""Momificado""","""Para mí gusto no vale la pena... tristemente ...",Museo de las Momias,Male,38,Colombia,11/05/2017,1
4,4,"""Incómodo y cero romántico""","""Es un lugar poco interesante y que se conoce ...",Callejón del Beso,Female,38,Francia,28/11/2017,1
...,...,...,...,...,...,...,...,...,...
5189,5192,"""Verdadera joya arquitectónica""","""Es una construcción majestuosa, creo que de l...",Teatro Juárez,Male,68,México,24/02/2017,5
5190,5193,"""Romántico""","""Muy al estilo de Romeo y Julieta es este siti...",Callejón del Beso,Male,41,Colombia,31/10/2015,5
5191,5194,"""Parece un castillo""","""Ideal para subir las escalinatas y divisar su...",Universidad de Guanajuato,Male,41,México,12/11/2016,5
5192,5195,"""Imperdible""","""Es imperdible, de ahí puedes ver muy bien la ...",Monumento Pípila,Male,46,Chile,19/05/2017,5


In [3]:
# Check the distribution of the labels in the data set.
# value_counts() tallies how many reviews carry each 'Label' value; as the last
# expression of the cell, the resulting Series is what gets displayed.
df['Label'].value_counts()

5    2688
4    1595
3     686
2     145
1      80
Name: Label, dtype: int64

# 3. BETO Model 

The following section performs these steps:
   
- 3.1  Data pre-processing for BETO (removal of quotes from the reviews in the dataframe).
    
- 3.2 Load the model (BETO) and the tokenizer into memory.

- 3.3 Turn the reviews in the DataFrame into the tokens (input ids and attention masks) that the model requires as input.

- 3.4 Create a DataLoader object for passing the input ids and the attention masks into the model.

- 3.5 Pass the tokens through the model.

- 3.6 Extract the embeddings created by the model, and save them into a list.

- 3.7 Create a Dataframe with these newly created embeddings.

## 3.1 Data pre-processing

In [4]:
# Remove the quotes in the text
def remove_quotes(dataframe):
    """Strip leading and trailing double quotes from every review, in place.

    The 'Opinion' column of ``dataframe`` is overwritten with the cleaned
    values. Entries that are not strings (for example missing values read
    as NaN) are kept unchanged instead of raising ``AttributeError``.

    Args:
        dataframe: DataFrame with an 'Opinion' column holding the review texts.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    dataframe['Opinion'] = [
        op.strip('"') if isinstance(op, str) else op
        for op in dataframe['Opinion']
    ]

In [5]:
# Remove the quotes in the text inside the Dataframe.
# remove_quotes() overwrites the 'Opinion' column of `df` in place; its return
# value is not used here.
remove_quotes(df)

## 3.2 Load the model (BETO) and the tokenizer into memory

In [6]:
# Create the tokenizer and the model (BETO in this case)
from transformers import BertModel, BertTokenizer, AdamW

# Load the pretrained BETO encoder; attention weights and per-layer hidden
# states are not requested.
model = BertModel.from_pretrained('dccuchile/bert-base-spanish-wwm-cased',
                                  output_attentions = False, # The model won't return the attentions weights
                                  output_hidden_states = False)
# Move the weights to the same `device` the input batches are sent to, rather
# than hard-coding CUDA, so model and inputs can never end up on different devices.
model.to(device)
# The model is only used to extract embeddings, never trained: make evaluation
# mode explicit so dropout stays disabled.
model.eval()

# Tokenizer that matches the checkpoint loaded above.
tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3.3 Tokenize the reviews

In [7]:
# Tokenize all of the sentences and map the tokens to their word IDs
opiniones = df['Opinion'].values

# One batched tokenizer call replaces the per-review `encode_plus` loop and the
# final `torch.cat`: with padding='max_length' every row is padded/truncated to
# 512 tokens, so the call directly returns one (n_reviews, 512) tensor per field.
encoded_batch = tokenizer(
    list(opiniones),               # Sentences to encode
    add_special_tokens=True,       # Add the '[CLS]' and '[SEP]' tokens
    max_length=512,                # Pad and truncate all sequences
    truncation=True,
    padding='max_length',
    return_attention_mask=True,    # Get the attention masks
    return_tensors='pt',           # Return pytorch tensors
    verbose=True,                  # Print warnings
)

input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']

## 3.4 Create the DataLoader for feeding the tokens into the model

In [8]:
# Create a DataLoader for feeding the tensors created above into the model.
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Pair the two tensors so that item i is (input_ids[i], attention_masks[i]).
dataset = TensorDataset(input_ids, attention_masks)

# One review per batch, visited in dataset order (sequential sampler).
dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=1)

## 3.5 Pass the tokens into the model and save the output into a list

In [9]:
# Collects, per review, the model's last hidden state as a nested Python list
# (tokens x hidden units); later cells index into it.
# NOTE(review): only the first token of each entry is read downstream, so keeping
# every token as Python floats is memory-hungry — consider storing just that row.
output_list = []

# Pure inference: disable autograd so no computation graph is built or kept
# alive on the device for each forward pass.
with torch.no_grad():
    for batch in dataloader:
        # Batch-local names: the full `input_ids` tensor built by the
        # tokenization cell is no longer overwritten, so the earlier cells
        # stay safely re-runnable.
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        outputs = model(batch_input_ids,
                        token_type_ids=None,
                        attention_mask=batch_attention_mask,
                        return_dict=True)
        # Index 0 selects the first (and, with batch_size = 1, only) sequence.
        output_list.append(outputs.last_hidden_state[0].tolist())

### Check the dimensions of the output

In [10]:
# Report the three nested sizes of `output_list`: number of reviews, tokens per
# review, and values per token vector.
print(f"The output has {len(output_list)} elements, each one corresponding to each sentence in the original dataframe.")
print(f"Each sequence has {len(output_list[0])} tokens.")
# output_list[0][0] is the vector of the first token of the first review.
print(f"Each sentence has a vector with {len(output_list[0][0])} elements in it.")

The output has 5194 elements, each one corresponding to each sentence in the original dataframe.
Each sequence has 512 tokens.
Each sentence has a vector with 768 elements in it.


## 3.6 Create a list with the embeddings for each encoded sentence

In [11]:
# Keep, for every review, only the vector at token position 0 of its output.
embeddings = [sequence_output[0] for sequence_output in output_list]

## 3.7 Create a dataframe with the embeddings extracted from BETO

In [12]:
# Build a DataFrame from `embeddings` (one list entry per row).
df_BERT = pd.DataFrame(embeddings)
# Bare last expression: displays the frame as the cell output.
df_BERT

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.100722,-0.513502,-0.111594,0.448818,-0.466021,0.142489,0.228598,0.030338,-0.365344,-0.200068,...,0.003040,-1.268783,0.053861,0.087202,-0.412110,-0.297560,-0.321638,-1.161446,-0.630026,0.249530
1,0.537293,0.437717,0.259103,0.366199,0.062330,0.693132,0.476319,-0.106528,-0.266705,0.396034,...,0.104844,-0.971615,-0.640315,-0.127146,-0.314339,-0.237429,0.102838,-0.946722,-0.078620,0.115184
2,0.177042,0.283975,-0.306809,-0.246421,-0.075344,0.137682,0.173271,0.732921,-0.387721,-0.271484,...,0.418992,-0.690087,-0.420569,0.418654,-0.963163,0.151083,-0.037838,-0.847361,-0.593877,-0.005196
3,0.168886,-0.239445,-0.369098,0.286953,0.280967,0.066748,0.244342,-0.246206,-0.279504,-0.197400,...,0.081249,-0.327408,0.019463,-0.005029,-0.746119,0.328063,0.077538,-0.805399,-0.040993,0.118009
4,0.037849,-0.249693,-0.692661,0.079971,-0.354050,-0.024417,-0.464972,-0.335644,-0.578367,-0.438497,...,0.262042,-0.947012,0.070426,-0.004722,-0.774869,-0.035387,0.229826,-0.912594,-0.494465,0.863951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5189,0.427815,-0.066462,-0.182661,-0.182542,-0.367275,0.482871,0.571338,0.057494,-0.570252,-0.295871,...,0.107437,-1.077025,0.206952,0.386056,0.578838,-0.270711,0.273910,-0.757631,-0.633818,-0.407240
5190,0.342039,0.543797,0.486739,0.678449,0.032057,0.245350,0.167865,-0.125825,-0.457089,-0.376417,...,0.702219,-0.488916,-0.016850,-0.120353,-0.037926,-0.950023,0.374648,-0.276405,-0.234301,-0.643547
5191,0.867902,0.389410,0.209463,-0.307495,-0.219263,0.111309,0.494184,0.264251,-0.555974,-0.257724,...,0.628443,-0.249435,-0.196823,0.326543,0.046936,0.220092,0.472319,-1.158151,0.262810,-0.404702
5192,0.373302,0.013885,-0.662630,-0.149125,-0.507021,0.360729,0.174792,-0.062389,-0.229902,-1.087377,...,0.446199,-1.459899,-0.174872,0.600132,-0.098209,-0.443329,0.161391,-0.469137,0.048491,-0.057459


# 4. TF-IDF

The following section performs these steps:

- 4.1 Create a tokenizer function using Spacy.
- 4.2 Create the embeddings using TF-IDF.
- 4.3 Convert the matrix created by the TF-IDF method into an array.

## 4.1 Create a tokenizer 

In [13]:
# Load the Spanish model for accuracy
import spacy

# Load spaCy's "es_dep_news_trf" Spanish pipeline; `nlp` is what the tokenizer
# function defined in the next cell relies on.
nlp = spacy.load("es_dep_news_trf")

In [14]:
# Tokenize each word using the Spacy tokenizer, and save the tokens into a list
def my_tokenizer(sentence):
    """Split ``sentence`` into tokens with the spaCy tokenizer.

    Only the token texts are needed, so the text goes through
    ``nlp.make_doc`` (tokenization only) rather than the whole ``nlp``
    pipeline, whose remaining components would otherwise run for every
    review without their results ever being read.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list with the text of every token, in order of appearance.
    """
    return [token.text for token in nlp.make_doc(sentence)]

## 4.2 Create the TF-IDF embeddings

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that splits text with `my_tokenizer` and keeps the original
# casing (lowercase=False).
vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, lowercase=False)
# Fit the vectorizer on the training reviews and transform them in one step.
# NOTE(review): despite its name, `column` holds the whole TF-IDF result (it is
# converted with .toarray() in the next cell) — consider a clearer name.
column = vectorizer.fit_transform(df['Opinion'])

## 4.3 Turn the matrix with the TF-IDF embeddings into an array

In [16]:
# Materialise the TF-IDF result as an array via .toarray().
# NOTE(review): presumably `column` is a SciPy sparse matrix, so this allocates
# one value per (review, vocabulary term) — confirm the memory cost is acceptable.
matrix = column.toarray()

# 5. Combine the embeddings and pass them through a classifier

- 5.1 Create a new DataFrame. Its columns contain the embeddings (from BETO and from TF-IDF), and each row corresponds to one review in the training set.
- 5.2 Train the classifier. In this case, logistic regression.

## 5.1 Create a new DataFrame with the embeddings from BETO and the embeddings from TF-IDF

In [17]:
# Place the BETO embeddings and the TF-IDF features side by side (axis=1).
# NOTE(review): both frames appear to use default integer column labels, so the
# labels 0..767 presumably occur twice in the result — confirm this is harmless
# for downstream code, or consider ignore_index=True.
df_final_embeddings = pd.concat([df_BERT, pd.DataFrame(matrix)], axis=1)
# Bare last expression: displays the combined frame as the cell output.
df_final_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13880,13881,13882,13883,13884,13885,13886,13887,13888,13889
0,-0.100722,-0.513502,-0.111594,0.448818,-0.466021,0.142489,0.228598,0.030338,-0.365344,-0.200068,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.537293,0.437717,0.259103,0.366199,0.062330,0.693132,0.476319,-0.106528,-0.266705,0.396034,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.177042,0.283975,-0.306809,-0.246421,-0.075344,0.137682,0.173271,0.732921,-0.387721,-0.271484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.168886,-0.239445,-0.369098,0.286953,0.280967,0.066748,0.244342,-0.246206,-0.279504,-0.197400,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.037849,-0.249693,-0.692661,0.079971,-0.354050,-0.024417,-0.464972,-0.335644,-0.578367,-0.438497,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5189,0.427815,-0.066462,-0.182661,-0.182542,-0.367275,0.482871,0.571338,0.057494,-0.570252,-0.295871,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5190,0.342039,0.543797,0.486739,0.678449,0.032057,0.245350,0.167865,-0.125825,-0.457089,-0.376417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5191,0.867902,0.389410,0.209463,-0.307495,-0.219263,0.111309,0.494184,0.264251,-0.555974,-0.257724,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5192,0.373302,0.013885,-0.662630,-0.149125,-0.507021,0.360729,0.174792,-0.062389,-0.229902,-1.087377,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5.2 Train the classifier (logistic regression)

In [18]:
# Import the necessary modules for the logistic regression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline

# Logistic-regression classifier with built-in cross-validation; every
# hyper-parameter is left at its default.
classifier = LogisticRegressionCV()

# Single-step pipeline wrapping the classifier.
pipe = Pipeline(steps=[('classifier', classifier)])

# Fit on the combined BETO + TF-IDF features against the review labels.
pipe.fit(X=df_final_embeddings, y=df['Label'])

Pipeline(memory=None,
         steps=[('classifier',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=None,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='auto',
                                      n_jobs=None, penalty='l2',
                                      random_state=None, refit=True,
                                      scoring=None, solver='lbfgs', tol=0.0001,
                                      verbose=0))],
         verbose=False)

# 6. Class predictions on the official evaluation set

- 6.1 Load the test set.
- 6.2 Pre-process the dataset before doing the prediction of the classes using the model trained above. This includes removing the quotes in the reviews and getting this set's embeddings (as done above).
- 6.3 Predict the classes for the reviews.
- 6.4 Create the final output file (.txt).

## 6.1 Load the test set


In [19]:
# Load the evaluation reviews. The path literal previously started with a stray
# single quote ("'/PATH.xlsx"), which made it point to a non-existent file.
df_pred = pd.read_excel('/PATH.xlsx')
# Bare last expression: displays the frame as the cell output.
df_pred

Unnamed: 0,index,Title,Opinion,Place,Gender,Age,Country,Date
0,1,"""Sucio""",La verdad esperaba mucho más de éste recinto. ...,Casa de Diego Rivera,Female,-1,México,2016
1,2,"""NO LO VISITEN LOS 19 DE MARZO CERRADO""","Guanajuato tiene todo , no podían faltar los m...",Casa de Diego Rivera,N/I,-1,México,2016
2,3,"""Horrible""",The Diego Rivera house gives you a good idea o...,Casa de Diego Rivera,Male,-1,México,2016
3,4,No vale la pena perder ni un minuto,Interesante. Se pueden apreciar muebles de la ...,Casa de Diego Rivera,Female,-1,N/I,2016
4,5,"""Ir a Guanajuato y que este cerrado""",En si hay muy poco de Diego Rivera.. pero quis...,Casa de Diego Rivera,N/I,-1,México,2016
...,...,...,...,...,...,...,...,...
2211,2212,"""Históricamente recomendable""","""El término alhóndiga proviene del árabe y sig...",Alhóndiga,Female,25,México,2018-05-29 00:00:00
2212,2213,"""Vista a Guanajuato""","""Conociendo la historia de Juan José de los Re...",Monumento Pípila,Female,28,Estados Unidos,2018-06-18 00:00:00
2213,2214,"""Increíble vista de Guanajuato""","""Fue construida a finales del siglo XVIII, en ...",Alhóndiga,Male,59,México,2018-06-19 00:00:00
2214,2215,"""¡Lugar icónico y romántico!""","""Parada obligatoria en tu visita a la ciudad d...",Alhóndiga,Male,32,México,2018-06-19 00:00:00


## 6.2 Pre-process the dataset 

In [20]:
# Strip the quotes from the 'Opinion' column of the prediction set, as was done
# for the training set; `df_pred` is modified in place.
remove_quotes(df_pred)

In [21]:
# Tokenize all of the sentences and map the tokens to their word IDs
opiniones = df_pred['Opinion'].values

# One batched tokenizer call replaces the per-review `encode_plus` loop and the
# final `torch.cat`: with padding='max_length' every row is padded/truncated to
# 512 tokens, so the call directly returns one (n_reviews, 512) tensor per field.
encoded_batch = tokenizer(
    list(opiniones),
    add_special_tokens=True,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt',
    verbose=True,
)

input_ids = encoded_batch['input_ids']
attention_masks = encoded_batch['attention_mask']

# Create a DataLoader for feeding the tensors created above into the model
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(
            dataset,
            sampler = SequentialSampler(dataset), # Select the batches sequentially
            batch_size = 1)

# Create a list with the embeddings from BETO
embeddings = []

# Pure inference: disable autograd so no computation graph is built or kept
# alive on the device for each forward pass.
with torch.no_grad():
    for batch in dataloader:
        # Batch-local names so the full `input_ids` tensor is not overwritten.
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        outputs = model(batch_input_ids,
                        token_type_ids=None,
                        attention_mask=batch_attention_mask,
                        return_dict=True)
        # Keep only the vector of the first token of the (single) sequence in
        # the batch; converting every token's hidden state to Python floats
        # just to discard all but the first one wastes a lot of memory.
        embeddings.append(outputs.last_hidden_state[0, 0].tolist())

df_BERT_pred = pd.DataFrame(embeddings)

In [22]:
# The vectorizer was already fitted on the training reviews, so only transform()
# is called here to get the TF-IDF features of the prediction set.
tfidf_pred = vectorizer.transform(df_pred['Opinion'])
matrix_pred = tfidf_pred.toarray()

# Same layout as the training features: BETO columns first, TF-IDF columns after.
df_final = pd.concat([df_BERT_pred, pd.DataFrame(matrix_pred)], axis=1)
# Bare last expression: displays the combined frame as the cell output.
df_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13880,13881,13882,13883,13884,13885,13886,13887,13888,13889
0,0.195528,0.156670,-0.443622,0.195192,-0.760902,-0.265523,0.369810,0.474678,-0.663961,-0.641090,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.590135,-0.458063,0.175340,0.684648,-0.331125,0.124798,0.532284,0.546939,-0.789807,-0.194467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.411958,-0.111570,0.235532,-0.402516,0.264954,-0.175784,1.191339,0.508916,-0.873340,-0.054844,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.536105,0.109472,-0.320521,-0.369643,0.129396,0.469136,0.355744,0.210789,-0.140114,-0.426512,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.451311,-0.205837,0.260806,0.483721,-0.336173,-0.285577,0.681388,0.737064,-0.445331,-0.103577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2211,0.198151,0.589778,-0.028214,-0.127158,-0.427993,0.039850,-0.579404,-0.516421,-0.483683,-0.441637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2212,1.024327,0.122803,0.160388,-0.130503,-0.410822,-0.270486,0.014006,0.370058,-0.580365,-0.076135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2213,0.646842,0.397886,-0.046559,0.182240,-0.777834,-0.474051,0.176925,-0.582197,-0.348780,-0.854576,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2214,0.849721,0.358379,-0.159667,-0.106498,-0.298375,0.457378,0.527721,0.455168,-0.958972,-0.276436,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 6.3 Prediction of the classes

In [23]:
# Predict a label for every row of the prediction-set feature frame with the
# pipeline fitted in section 5.2.
predictions = pipe.predict(df_final)

## 6.4 Create the final output file (.txt)

In [24]:
# Wrap the predictions in a single-column DataFrame named 'Label'.
df_output = pd.DataFrame(predictions, columns=['Label'])

In [25]:
# String versions of the predicted labels, in prediction order.
classes = [str(label) for label in df_output['Label']]

# String positions 0 .. len(predictions) - 1, one per prediction.
# NOTE(review): these are 0-based row positions, whereas the prediction set
# displayed above has an 'index' column that starts at 1 — confirm which
# identifier the submission format expects.
indexes = [str(position) for position in range(len(predictions))]

In [28]:
# Create the final output
# Each record is: "sentiment"<TAB>"<index>"<TAB>"<class>" followed by a newline.
with open('output_file.txt', 'w') as f:
    for clas, i in zip(classes, indexes):
        # write() adds nothing of its own, so every record ends with exactly the
        # one "\n" below; print(..., file=f) would append a second newline and
        # leave a blank line after each record.
        f.write('"sentiment"\t' + '"' + i + '"' + "\t" + '"' + clas + '"' + "\n")