# Reto 2 - Máquinas de Vectores de Soporte
**Nombre:** Juan Manuel Gutiérrez Gómez  **Código:** 2260563

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt

In [2]:
# Training CSV, given as a bare file name (so it is resolved relative to the
# notebook's working directory).
data_source_url = "Rest_Mex_2022_Sentiment_Analysis_Track_Train.csv"
# Parse the whole file into a DataFrame using pandas' default options.
tourist_opinions = pd.read_csv(filepath_or_buffer=data_source_url)

In [3]:
# Preview the first five rows to check the columns that were parsed.
tourist_opinions.head(n=5)

Unnamed: 0,Title,Opinion,Polarity,Attraction
0,Pésimo lugar,"Piensen dos veces antes de ir a este hotel, te...",1,Hotel
1,No vayas a lugar de Eddie,Cuatro de nosotros fuimos recientemente a Eddi...,1,Restaurant
2,Mala relación calidad-precio,seguiré corta y simple: limpieza\n- bad. Tengo...,1,Hotel
3,Minusválido? ¡No te alojes aquí!,Al reservar un hotel con multipropiedad Mayan ...,1,Hotel
4,Es una porqueria no pierdan su tiempo,"No pierdan su tiempo ni dinero, venimos porque...",1,Hotel


## Objetivo 1: Opinión vs Atracción

**Conjunto de Características**

Extraemos las características que analizaremos con el siguiente script:

In [4]:
# Columns are selected by position: 1 is used as the model input text and 3 as
# the target (the preview below shows attraction types such as 'Hotel').
opinion_series = tourist_opinions.iloc[:, 1]
attraction_series = tourist_opinions.iloc[:, 3]
features, labels = opinion_series.values, attraction_series.values
labels

array(['Hotel', 'Restaurant', 'Hotel', ..., 'Attractive', 'Attractive',
       'Attractive'], dtype=object)

Una vez que separamos los datos en características y etiquetas, podemos preprocesar el texto para limpiarlo. Para ello utilizaremos expresiones regulares (para más información, véase la documentación del módulo `re` de Python).

In [5]:
def clean_opinion(raw_text):
    """Normalize one opinion into lowercase text separated by single spaces.

    Steps, in order:
      1. Replace every non-word character (punctuation, symbols) with a space.
      2. Drop isolated ASCII letters surrounded by whitespace.
      3. Drop an isolated ASCII letter at the very start of the text.
      4. Collapse every run of whitespace into one space.
      5. Lowercase the result.

    Args:
        raw_text: The opinion to clean. It is passed through ``str()`` first,
            so non-string values (e.g. NaN for a missing opinion) are accepted.

    Returns:
        The cleaned string. It may keep a single leading or trailing space.
    """
    text = re.sub(r'\W', ' ', str(raw_text))
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # The caret must be unescaped to anchor at the start of the string. An
    # escaped caret only matches a literal '^', which step 1 has already
    # replaced, so that form can never match. This rule also covers a stray
    # leading 'b', so no separate step is needed for it.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()


# One cleaned string per opinion, in the same order as `features`.
processed_features = [clean_opinion(opinion) for opinion in features]

### TF-IDF
Aplicamos el algoritmo TF-IDF. La idea detrás de este enfoque es que las palabras que aparecen con frecuencia en un documento individual, pero en pocos documentos del corpus, contribuyen más a la clasificación.

In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: keep at most 2500 terms, ignore terms found in fewer than
# 7 documents or in more than 80% of them, and drop NLTK's Spanish stop words.
vectorizer = TfidfVectorizer(
    max_features=2500,
    min_df=7,
    max_df=0.8,
    stop_words=stopwords.words('spanish'),
)
tfidf_sparse = vectorizer.fit_transform(processed_features)
# NOTE: `processed_features` is rebound from the list of cleaned strings to a
# dense matrix, so re-running this cell without re-running the cleaning cell
# feeds the matrix back into the vectorizer.
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split performed in a later cell.
processed_features = tfidf_sparse.toarray()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JuanMa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


En aras de probar la salida de nuestro clasificador, dividiremos los datos en un conjunto de entrenamiento y prueba:

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical between runs.
split_parts = train_test_split(processed_features, labels, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

### SVM: Kernel Lineal

In [8]:
from sklearn.svm import SVC

# Linear-kernel SVM. Regularization strength is inversely proportional to C,
# so C=1e10 leaves the model almost unregularized.
model = SVC(C=1e10, kernel="linear").fit(X_train, y_train)
model

SVC(C=10000000000.0, kernel='linear')

In [9]:
# Predict a class for every held-out row with the linear-kernel model.
predictions = model.predict(X=X_test)

In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print, in order: the confusion matrix (rows = true class, columns =
# predicted class), the per-class precision/recall/F1 report, and accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

[[1041    4   10]
 [  21 3128   99]
 [  25  106 1609]]
              precision    recall  f1-score   support

  Attractive       0.96      0.99      0.97      1055
       Hotel       0.97      0.96      0.96      3248
  Restaurant       0.94      0.92      0.93      1740

    accuracy                           0.96      6043
   macro avg       0.95      0.96      0.96      6043
weighted avg       0.96      0.96      0.96      6043

0.9561476088035744


### SVM: Kernel RBF (Función de Base Radial)

In [11]:
# RBF-kernel SVM with the same very large C as the other kernels; SVC must
# already have been imported by an earlier cell.
model = SVC(C=1e10, kernel="rbf").fit(X_train, y_train)
model

SVC(C=10000000000.0)

In [12]:
# Predict a class for every held-out row with the RBF-kernel model.
predictions = model.predict(X=X_test)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the RBF model: confusion matrix, per-class report, then accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

[[1043    2   10]
 [   7 3182   59]
 [   2   59 1679]]
              precision    recall  f1-score   support

  Attractive       0.99      0.99      0.99      1055
       Hotel       0.98      0.98      0.98      3248
  Restaurant       0.96      0.96      0.96      1740

    accuracy                           0.98      6043
   macro avg       0.98      0.98      0.98      6043
weighted avg       0.98      0.98      0.98      6043

0.9769981797120636


### SVM: Kernel Polinomial

In [14]:
# Polynomial-kernel SVM; no degree is passed, so the library default applies.
# SVC must already have been imported by an earlier cell.
model = SVC(C=1e10, kernel="poly").fit(X_train, y_train)
model

SVC(C=10000000000.0, kernel='poly')

In [15]:
# Predict a class for every held-out row with the polynomial-kernel model.
predictions = model.predict(X=X_test)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the polynomial model: confusion matrix, per-class report, accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

[[1051    1    3]
 [ 560 2670   18]
 [ 577   61 1102]]
              precision    recall  f1-score   support

  Attractive       0.48      1.00      0.65      1055
       Hotel       0.98      0.82      0.89      3248
  Restaurant       0.98      0.63      0.77      1740

    accuracy                           0.80      6043
   macro avg       0.81      0.82      0.77      6043
weighted avg       0.89      0.80      0.81      6043

0.7981135197749463


### SVM: Kernel Función Sigmoide

In [17]:
from sklearn.svm import SVC

# Sigmoid-kernel SVM with the same very large C as the other kernels.
model = SVC(C=1e10, kernel="sigmoid").fit(X_train, y_train)
model

SVC(C=10000000000.0, kernel='sigmoid')

In [18]:
# Predict a class for every held-out row with the sigmoid-kernel model.
predictions = model.predict(X=X_test)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the sigmoid model: confusion matrix, per-class report, accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

[[1012   18   25]
 [  46 3098  104]
 [  55  109 1576]]
              precision    recall  f1-score   support

  Attractive       0.91      0.96      0.93      1055
       Hotel       0.96      0.95      0.96      3248
  Restaurant       0.92      0.91      0.91      1740

    accuracy                           0.94      6043
   macro avg       0.93      0.94      0.94      6043
weighted avg       0.94      0.94      0.94      6043

0.9409233824259474


## Objetivo 2: Opinión vs Sentimiento

**Conjunto de Características**

Extraemos las características que analizaremos con el siguiente script:

In [3]:
# Columns are selected by position: 1 is used as the model input text and 2 as
# the target (the preview below shows integer polarity values).
opinion_series = tourist_opinions.iloc[:, 1]
polarity_series = tourist_opinions.iloc[:, 2]
features, labels = opinion_series.values, polarity_series.values
labels

array([1, 1, 1, ..., 5, 5, 5], dtype=int64)

In [4]:
# Clean every opinion into lowercase text separated by single spaces.
# The result keeps one entry per opinion, in the same order as `features`.
processed_features = []

for opinion in features:
    # Replace every non-word character (punctuation, symbols) with a space.
    # str() lets non-string values such as NaN pass through.
    cleaned = re.sub(r'\W', ' ', str(opinion))

    # Drop isolated ASCII letters surrounded by whitespace.
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Drop an isolated ASCII letter at the very start of the text. The caret
    # must be unescaped to anchor at the start: an escaped caret only matches a
    # literal '^', which the first substitution has already replaced. This rule
    # also covers a stray leading 'b', so no separate step is needed for it.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Collapse every run of whitespace into a single space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    processed_features.append(cleaned.lower())

In [5]:
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: keep at most 2500 terms, ignore terms found in fewer than
# 7 documents or in more than 80% of them, and drop NLTK's Spanish stop words.
vectorizer = TfidfVectorizer(
    max_features=2500,
    min_df=7,
    max_df=0.8,
    stop_words=stopwords.words('spanish'),
)
tfidf_sparse = vectorizer.fit_transform(processed_features)
# NOTE: `processed_features` is rebound from the list of cleaned strings to a
# dense matrix, so re-running this cell without re-running the cleaning cell
# feeds the matrix back into the vectorizer.
# NOTE(review): the vectorizer is fitted on the full corpus, before the
# train/test split performed in a later cell.
processed_features = tfidf_sparse.toarray()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JuanMa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical between runs.
split_parts = train_test_split(processed_features, labels, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

### SVM: Kernel Lineal

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM for the polarity target, with C=1e4.
model = SVC(C=1e4, kernel="linear").fit(X_train, y_train)
model

In [None]:
# Predict a polarity for every held-out row with the linear-kernel model.
predictions = model.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print, in order: the confusion matrix (rows = true class, columns =
# predicted class), the per-class precision/recall/F1 report, and accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

### SVM: Kernel RBF (Función de Base Radial)

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM for the polarity target, with C=10.
model = SVC(C=1e1, kernel="rbf").fit(X_train, y_train)
model

In [None]:
# Predict a polarity for every held-out row with the RBF-kernel model.
predictions = model.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the RBF model: confusion matrix, per-class report, then accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

### SVM: Kernel Polinomial

In [None]:
from sklearn.svm import SVC

# Polynomial-kernel SVM for the polarity target, with C=1e4; no degree is
# passed, so the library default applies.
model = SVC(C=1e4, kernel="poly").fit(X_train, y_train)
model

In [None]:
# Predict a polarity for every held-out row with the polynomial-kernel model.
predictions = model.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the polynomial model: confusion matrix, per-class report, accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

### SVM: Kernel Función Sigmoide

In [7]:
from sklearn.svm import SVC

# Sigmoid-kernel SVM for the polarity target. Regularization strength is
# inversely proportional to C, so C=1e10 leaves the model almost unregularized.
model = SVC(C=1e10, kernel="sigmoid").fit(X_train, y_train)
model

SVC(C=10000000000.0, kernel='sigmoid')

In [8]:
# Predict a polarity for every held-out row with the sigmoid-kernel model.
predictions = model.predict(X=X_test)

In [9]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Report the sigmoid model: confusion matrix, per-class report, accuracy.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predictions))

[[  36   25   21   14    8]
 [  23   38   46   19   19]
 [  40   54  114  110  104]
 [  30   37  136  355  605]
 [  38   51  126  531 3463]]
              precision    recall  f1-score   support

           1       0.22      0.35      0.27       104
           2       0.19      0.26      0.22       145
           3       0.26      0.27      0.26       422
           4       0.34      0.31      0.32      1163
           5       0.82      0.82      0.82      4209

    accuracy                           0.66      6043
   macro avg       0.37      0.40      0.38      6043
weighted avg       0.67      0.66      0.66      6043

0.6629157703127586
