# 1. Logistic Regression 

In [1]:
# Read the official dataset from CSV into a DataFrame and display it.
import pandas as pd

# NOTE(review): absolute, machine-specific location; consider making the data
# directory configurable so the notebook can run elsewhere.
DATA_CSV = '/001/usuarios/juanmvs/anaconda3/NLP/Datos/DatosOficiales.csv'

df_train = pd.read_csv(DATA_CSV)
df_train

Unnamed: 0.1,Unnamed: 0,Title,Opinion,Place,Gender,Age,Country,Date,Label
0,0,"""¡Momias, demasiado impresionante!""","""Las momias están en muy buen estado de conser...",Museo de las Momias,Male,53,México,22/10/2016,1
1,1,Comida cara,Tienen carteles con comida cortida de 40 y no ...,Mercado Hidalgo,N/I,-1,México,2018,1
2,2,"""No coman ahí""","""Creo que es muy insalubre, hay basura por tod...",Mercado Hidalgo,Female,61,México,15/01/2013,1
3,3,"""Momificado""","""Para mí gusto no vale la pena... tristemente ...",Museo de las Momias,Male,38,Colombia,11/05/2017,1
4,4,"""Incómodo y cero romántico""","""Es un lugar poco interesante y que se conoce ...",Callejón del Beso,Female,38,Francia,28/11/2017,1
...,...,...,...,...,...,...,...,...,...
5189,5192,"""Verdadera joya arquitectónica""","""Es una construcción majestuosa, creo que de l...",Teatro Juárez,Male,68,México,24/02/2017,5
5190,5193,"""Romántico""","""Muy al estilo de Romeo y Julieta es este siti...",Callejón del Beso,Male,41,Colombia,31/10/2015,5
5191,5194,"""Parece un castillo""","""Ideal para subir las escalinatas y divisar su...",Universidad de Guanajuato,Male,41,México,12/11/2016,5
5192,5195,"""Imperdible""","""Es imperdible, de ahí puedes ver muy bien la ...",Monumento Pípila,Male,46,Chile,19/05/2017,5


In [2]:
# Function for removing quotes 
def remove_quotes(dataframe):
    """Strip leading and trailing double quotes from the 'Opinion' column.

    The cleaned values are written back to ``dataframe['Opinion']``, so the
    frame passed in is modified in place and nothing is returned.

    Entries that are not strings (for example the NaN pandas produces for an
    empty CSV field) are left untouched instead of raising AttributeError.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing an 'Opinion' column with the review texts.
    """
    # Only the outer quote characters are removed; quotes inside the text stay.
    dataframe['Opinion'] = [
        op.strip('"') if isinstance(op, str) else op
        for op in dataframe['Opinion']
    ]

In [3]:
# Strip the surrounding double quotes from every review in df_train['Opinion'].
# remove_quotes assigns the cleaned values back to the frame it receives, so
# df_train itself is modified by this call.
remove_quotes(df_train)

In [4]:
# spaCy provides the pipeline object used to tokenize the reviews.
import spacy
# NOTE(review): spacy_transformers is not referenced by name in this cell;
# presumably it is imported so the transformer-based model can be loaded — confirm.
import spacy_transformers 

# Load the Spanish "es_dep_news_trf" pipeline; the model name was taken from
# https://spacy.io/models and chosen for accuracy.
nlp = spacy.load("es_dep_news_trf") # Copied from https://spacy.io/models for accuracy.

In [5]:
# Tokenizer function
def my_tokenizer(sentence):
    """Split a text into a list of token strings with the spaCy tokenizer.

    Only the tokenizer of the module-level ``nlp`` pipeline is run (via
    ``nlp.make_doc``). Calling ``nlp(sentence)`` would additionally run every
    pipeline component on each review, work whose results are discarded here
    because only ``token.text`` is read.

    NOTE(review): the token strings match those of ``nlp(sentence)`` as long as
    no pipeline component merges or splits tokens — confirm for the loaded model.

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list of str
        The text of each token, in document order.
    """
    return [token.text for token in nlp.make_doc(sentence)]

In [6]:
# Hold out 20% of the reviews for testing; the remaining 80% are used for training.
import sklearn
from sklearn.model_selection import train_test_split

# Features are the review texts, targets are their labels.
X, y = df_train['Opinion'], df_train['Label']

# A fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build the TF-IDF vectorizer that turns each review into the feature vector
# used to train the models below.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    tokenizer=my_tokenizer,  # tokenization is delegated to the spaCy-based helper
    lowercase=True,
)

In [8]:
# Modules needed for the logistic-regression model
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline

# Logistic regression with built-in cross-validation; seeded for repeatability
classifier = LogisticRegressionCV(random_state=42)

# Chain the TF-IDF vectorizer and the classifier so raw text can be passed in
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])

# Fit vectorizer and classifier on the training reviews
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                                 tokenizer=<function my_tokenizer at 0x7f31fd083440>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=None,
                                      dual=Fal

In [9]:
# Evaluation utilities. Only mean_absolute_error is used in this cell;
# the `metrics` module itself is not referenced here.
from sklearn import metrics
from sklearn.metrics import mean_absolute_error

# Predict labels for the held-out reviews with the pipeline fitted above
predict = pipe.predict(X_test)

# Mean absolute error between the true labels and the predicted labels
mae = mean_absolute_error(y_test, predict)

print('MAE: %.4f' % mae)

MAE: 0.7238


# 2. SVM 

In [10]:
# Modules needed for the SVM model
from sklearn import svm
from sklearn.pipeline import Pipeline

# Linear support-vector classifier, seeded for repeatability
classifier = svm.LinearSVC(random_state=42)

# Same TF-IDF vectorizer as before, followed by the SVM
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])

# Fit on the training reviews, then predict labels for the held-out reviews
pred = pipe.fit(X_train, y_train).predict(X_test)

# Mean absolute error between true and predicted labels
mae = mean_absolute_error(y_test, pred)
print('MAE SVM: %.4f' % mae)

MAE SVM: 0.5592


# 3. Linear Regression 


In [11]:
# Modules needed for the linear-regression model
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Ordinary least-squares regressor (kept under the name `classifier` for
# consistency with the other sections, although it is a regression model)
classifier = LinearRegression()

# Same TF-IDF vectorizer as before, followed by the regressor
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])

# Fit vectorizer and regressor on the training reviews
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function my_tokenizer at 0x7f31fd083440>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                  

In [12]:
import numpy as np

# The regressor outputs real numbers; round them to the nearest integer so
# they can be compared with the integer labels.
pred = np.rint(pipe.predict(X_test))

# Mean absolute error between true labels and rounded predictions
mae = mean_absolute_error(y_test, pred)
print('MAE Linear regression: %.4f' % mae)

MAE Linear regression: 0.8499


# 4. Linear Regression - ElasticNet

In [13]:
from sklearn.linear_model import ElasticNet

# Elastic-net regularized linear regressor (kept under the name `classifier`
# for consistency with the other sections); seeded because selection='random'.
classifier = ElasticNet(
    alpha=0.01,
    l1_ratio=0.9,
    selection='random',
    random_state=42,
)

# Same TF-IDF vectorizer as before, followed by the regressor
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])

# Fit vectorizer and regressor on the training reviews
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function my_tokenizer at 0x7f31fd083440>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True,
                         

In [14]:
# Round the real-valued predictions to the nearest integer before scoring
pred = np.rint(pipe.predict(X_test))

# Mean absolute error between true labels and rounded predictions
mae = mean_absolute_error(y_test, pred)
print('MAE ElasticNet regression: %.4f' % mae)

MAE ElasticNet regression: 0.7709


# 5. Linear Regression - Lasso

In [15]:
from sklearn.linear_model import Lasso

# L1-regularized linear regressor (kept under the name `classifier` for
# consistency with the other sections); seeded because selection='random'.
# NOTE(review): positive=True constrains the fitted coefficients' sign, so
# tokens cannot lower the predicted label — confirm this is intended.
classifier = Lasso(
    alpha=0.01,
    precompute=True,
    positive=True,
    selection='random',
    random_state=42,
)

# Same TF-IDF vectorizer as before, followed by the regressor
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('classifier', classifier)])

# Fit vectorizer and regressor on the training reviews
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function my_tokenizer at 0x7f31fd083440>,
                                 use_idf=True, vocabulary=None)),
                ('classifier',
                 Lasso(alpha=0.01, copy_X=True, fit_intercept=True,
                       max_ite

In [16]:
# Predict with the fitted Lasso pipeline and round the real-valued outputs
# to the nearest integer so they can be compared with the integer labels.
pred = pipe.predict(X_test)
pred = np.rint(pred)

# Mean absolute error between true labels and rounded predictions.
# NOTE(review): the value printed below is identical to the ElasticNet result;
# worth checking whether both models produce (near-)constant predictions.
mae = mean_absolute_error(y_test, pred)
print('MAE Lasso regression: %.4f' % mae)

MAE Lasso regression: 0.7709
