[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/1Cs3lrvLtbGqZ-pDcXAzdDXyp2lHSctrj/view?usp=sharing)

## Install dependencies

In [None]:
!pip install transformers --quiet

In [1]:
from transformers import pipeline, logging

logging.set_verbosity_error()
import pandas as pd
import numpy as np

In [2]:
# Add dataset path
path = "./data/"
folder = "raw/"
filename = "Rest_Mex_Sentiment_Analysis_2023_Train.xlsx"
# reading the data
df = pd.read_excel(path + folder + filename)

### Name Entity Recognition

In [3]:
lengths = df["Review"].str.len()

review_lenghts = pd.DataFrame({"Polarity": df["Polarity"], "length": lengths})
mn = int(np.mean(review_lenghts["length"]))

# ner_tagger = pipeline("ner", aggregation_strategy="simple")
ner_tagger = pipeline(
    "ner",
    model="jplu/tf-xlm-r-ner-40-lang",
    aggregation_strategy="simple",
    tokenizer=("jplu/tf-xlm-r-ner-40-lang", {"use_fast": True}),
    framework="tf",
)
# translator = pipeline("translation_es_to_en", model="Helsinki-NLP/opus-mt-es-en")


def ner_extract_all(text):
    # text = translator(text, clean_up_tokenization_spaces=True, min_length=100)[0]['translation_text']
    output = ner_tagger(text)
    return pd.DataFrame(output)


print(f"1) Example of use.")
print("-------------------------------------------------------")
ner_extract_all(df.iloc[0, 1])

1) Example of use.
-------------------------------------------------------


Unnamed: 0,entity_group,score,word,start,end
0,ORG,0.537052,Botero,39,46


In [4]:
def create_location(x):
    try:
        result = ner_extract_all(x)
        if len(result) >= 1:
            return result.where(result["entity_group"] == "LOC").dropna().sample(1)["word"]
        else:
            return "None"
    except:
        return "None"

In [5]:
# we carried out a representative sampling using the following:
# https://online.stat.psu.edu/stat415/lesson/6/6.3


def n_(N, Z=1.96, e=0.01, p=0.5):
    return (N * (Z**2) * p * (1 - p)) / ((e**2) * (N - 1) + (Z**2) * p * (1 - p))


def to_weights(x):
    return 1.0 / x


def check_bool(x, value):
    if str(x).find(value) != -1:
        return True
    else:
        return False

$n = \frac{N\cdot Z_{\alpha}^{2}\cdot p\cdot \left ( 1-p \right )}{\epsilon^{2}\cdot \left ( N-1 \right )+Z_{\alpha }^{2}\cdot p\cdot \left ( 1-p \right )}$

In [6]:
# we iterate over the rows and save the results
n = int(n_(len(df)))
mode = True

if mode:
    print(f"Stratified sampling mode : ")
    df["freq"] = df.groupby("Polarity")["Polarity"].transform("count")
    df_sample = df.sample(n=n, replace=False, weights=df["freq"].apply(to_weights))
else:
    df_sample = df.sample(n)
print(f"Total size of the sample : ", "{:,}".format(len(df_sample)))
print("-------------------------------------------------------")
df_sample["LOC"] = df_sample["Review"].apply(create_location)

Stratified sampling mode : 
Total size of the sample :  9,251
-------------------------------------------------------


In [7]:
print(f"The values of the first 15 rows are : ")
df_sample.head(15)

The values of the first 15 rows are : 


Unnamed: 0,Title,Review,Polarity,Country,Type,freq,LOC
114333,No lo puedo recomendar,Un lugar muy poco cuidado. El aire acondiciona...,1,Cuba,Restaurant,5772,
84963,Tan bien decorada!,Que realmente se vuelve fascinante museo de hi...,5,Colombia,Attractive,157095,
17085,el lugar esta muy pequeño,Lastima que el dia que llegué estaba lloviendo...,3,Colombia,Attractive,21656,
61887,Necesita mejorar,"las habitaciones son viejas, mal aisladas, con...",3,Cuba,Hotel,21656,
12402,Con sabor a historia,Si te gusta la historia y el pasado de una ciu...,4,Mexico,Attractive,60227,
225413,Fue agradable!,Está bien. Acabamos de volver de nuestro prime...,4,Mexico,Hotel,60227,
160584,"Hotel histórico, pero un poco cansado",La entrada es preciosa y los bonitos jardines....,3,Colombia,Hotel,21656,"0 Toucan Name: word, dtype: object"
154202,Pasará por allí,trae un poco de pan con que está llena de aves...,3,Cuba,Attractive,21656,"0 la Habana vieja Name: word, dtype: object"
41841,Una pesadilla,"Es el peor hotel que he visto, viejo, sucio, s...",1,Cuba,Hotel,5772,
42039,No recomendable,"A primera vista parece un buen hotel, la ubica...",1,Mexico,Hotel,5772,"0 Puebla Name: word, dtype: object"


In [8]:
n_count = len(df_sample.where(df_sample["LOC"].apply(check_bool, value="None")).dropna())
print(f"Number of times we get the tag 'None' : ", "{:,}".format(n_count))
print("-------------------------------------------------------")

Number of times we get the tag 'None' :  5,439
-------------------------------------------------------
