# Análisis de Sentimientos utilizando Comentarios de Facebook

## Cargar el archivo
Se extrajeron 1,569 comentarios de septiembre y octubre año 2020 en los que se menciona a CitiBanamex

In [44]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD 
import string
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('punkt')
stop_words_sp = set(stopwords.words('spanish'))

In [45]:
df = pd.read_csv('../data/BanamexFace.csv')
df.head()

Unnamed: 0,pub,usr,likes,com,fec,hor,sent,tag
0,20200805,Dayana Tinajero,18,"no inventen, acabo de llamar al call center y ...",6/08/2020,11:37:00,n,seguridad
1,20200805,Salma Madrid,5,"pesimo servicio de call center, contestan con ...",6/08/2020,12:56:00,n,atencion_call
2,20200805,Dayana Tinajero,3,"mejor voy a sucursal, ya no me da confianza na...",6/08/2020,13:20:00,n,seguridad
3,20200805,Lissy Jimenez,0,citibanamex podrían decirme dónde puedo llamar...,6/08/2020,16:04:00,o,consulta
4,20200805,Lissy Jimenez,0,citibanamex gracias si me gustaría saber cuand...,6/08/2020,16:23:00,o,consulta


In [46]:
df.sent.value_counts()

o    834
n    525
p    209
Name: sent, dtype: int64

#### Limpieza del texto (puntuación, tokenización y eliminado de stopwords en español)

In [47]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stop_words_sp]
    return text

#### Creamos la Matriz Documento-Término (DTM) con puntuaciones TF-IDF

In [48]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['com'])

In [49]:
X_features = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names())
X_features.head()

Unnamed: 0,Unnamed: 1,0,00,000001801410,000001857311,001q,0040,005717,01,01q,...,ñoño,órdenes,ósea,últimamente,último,últimos,única,únicas,único,únicos
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.149853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
X_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1568 entries, 0 to 1567
Columns: 4331 entries,  to únicos
dtypes: float64(4331)
memory usage: 51.8 MB


#### Removemos las columnas de términos redundantes y los que son poco frecuentes

In [51]:
column = pd.DataFrame(X_features.sum())
column.head()

Unnamed: 0,0
,36.406436
0.0,0.751896
0.0,0.68268
1801410.0,0.268676
1857311.0,0.620076


In [52]:
X_features.drop(columns=['', 'citibanamex', 'abierta', 'abren', 'aclarar', 'alta', 'amado', 'amo', 'anualidaaaaaaaaaaaaad', 'anualidades', 'asco', 'audiomático', 'cajeros', 'cambiar', 'casi', 'cdmx', 'cobrando', 'convenio', 'créditos', 'debito', 'deja', 'desparcializar', 'enviar', 'hizo', 'inbox', 'liquidar', 'llega', 'martinez', 'mínimo', 'paso', 'plática', 'presentación', 'pude', 'puedan', 'quieres', 'quiten', 'regresen', 'responder', 'saludos', 'sola', 'sábado', 'targeta', 'tips', 'toño', 'transferir', 'transmisión', 'unan', 'ventanilla', 'viene', 'x'], inplace=True)

In [53]:
stop = list(column[column[0]<2].index)

In [54]:
len(stop)

3886

In [55]:
X_features.drop(columns=stop, inplace=True)

In [56]:
X_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1568 entries, 0 to 1567
Columns: 395 entries, 1 to vía
dtypes: float64(395)
memory usage: 4.7 MB


#### Aplicamos la descomposición SVD para poder trabajar con menos variables (20)

In [57]:
truncatedSVD=TruncatedSVD(30)
X_svd = truncatedSVD.fit_transform(X_features)
X_svd[:10] 

array([[ 6.55030221e-03,  1.22901372e-01, -2.98777583e-02,
         6.06886589e-02, -8.06483028e-02, -3.61852453e-02,
        -1.45412396e-02,  6.16875570e-03,  5.00210942e-02,
        -2.86536362e-02,  9.43614078e-03, -3.44176344e-03,
         9.14240993e-03, -2.78784909e-02,  2.78876059e-04,
        -1.72081179e-02, -3.77846780e-02, -7.11666084e-05,
         8.71184551e-03,  5.80573102e-03,  2.74970151e-02,
        -1.78604941e-02, -1.50724751e-02,  5.33141724e-03,
        -9.39213484e-04, -3.37833887e-02,  4.10132190e-02,
         6.53265320e-03, -1.80648729e-02, -3.19168263e-02],
       [ 4.06549543e-03,  7.78607124e-02, -3.91862782e-02,
         1.04816150e-01, -9.87417441e-03,  1.54801059e-01,
         3.27816592e-02,  2.48766787e-02, -4.94173098e-02,
        -6.42840802e-02, -5.51828660e-02, -7.23429395e-02,
         1.99221387e-02,  1.70595402e-02,  1.84685135e-02,
         1.49938279e-01, -5.13733935e-02, -6.96928771e-02,
        -3.42011878e-02, -3.81419786e-02,  5.09494704e-

#### Creamos un Clasificador de Comentarios utilizando RandomForest

In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

Particionamos las muestras

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X_svd,
                                                    df['sent'],
                                                    test_size=0.2)

Entrenamos el modelo y generamos las predicciones sobre la muestra de evaluación

In [60]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

#### Métricas de Diagnóstico

In [61]:
print('Matriz de Confusión:')
confusion_matrix(y_test, y_pred)

Matriz de Confusión:


array([[ 71,  33,   1],
       [ 22, 143,   5],
       [  2,   8,  29]], dtype=int64)

In [62]:
print('Accuracy: %.2f%%' %(accuracy_score(y_test, y_pred)*100))  
print('Precision: %.2f%%' % (precision_score(y_test, y_pred, average= 'weighted')*100))
print('Recall: %.2f%%' % (recall_score(y_test, y_pred, average= 'weighted')*100))

Accuracy: 77.39%
Precision: 77.36%
Recall: 77.39%


#### También podemos probar con un modelo Naive Bayes Gausiano

In [63]:
from sklearn.naive_bayes import GaussianNB

In [64]:
nb = GaussianNB(priors=None)
nb_model = nb.fit(X_train, y_train)
y_pred = nb_model.predict(X_test)

In [65]:
print('Matriz de Confusión:')
confusion_matrix(y_test, y_pred)

Matriz de Confusión:


array([[79, 22,  4],
       [66, 97,  7],
       [ 2,  7, 30]], dtype=int64)

In [66]:
print('Accuracy: %.2f%%' %(accuracy_score(y_test, y_pred)*100))  
print('Precision: %.2f%%' % (precision_score(y_test, y_pred, average= 'weighted')*100))
print('Recall: %.2f%%' % (recall_score(y_test, y_pred, average= 'weighted')*100))

Accuracy: 65.61%
Precision: 68.74%
Recall: 65.61%


#### Ahora podemos implementar el mejor modelo, utilizando el otro conjunto de datos

In [67]:
dfs = pd.read_csv('../data/BanamexFaceNew.csv')
dfs.head()

Unnamed: 0,pub,usr,likes,com,fec,hor
0,20200918,Martin Bernal,0,Desde el día 27 de agosto solicité reposición ...,20/09/2020,13:20:00
1,20200918,Irma Treviño,0,Como localizo mi reposicion de tarjeta citiban...,20/09/2020,10:36:00
2,20200918,Francisco León,0,"Citibanamex, no me respetaron la promoción del...",20/09/2020,10:20:00
3,20200918,Manolo Ruiz Farrera,0,Muchísimas gracias citibanamex,20/09/2020,09:57:00
4,20200918,Iv San Roman De Proudinat,0,Mil gracias por esta platica,20/09/2020,07:28:00


Preparar los datos para el scoring

In [68]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
S_tfidf = tfidf_vect.fit_transform(dfs['com'])

In [69]:
S_features = pd.DataFrame(S_tfidf.toarray(), columns=tfidf_vect.get_feature_names())
S_features = S_features[X_features.columns]
S_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361 entries, 0 to 360
Columns: 395 entries, 1 to vía
dtypes: float64(395)
memory usage: 1.1 MB


In [70]:
XS_svd = truncatedSVD.fit_transform(S_features)

Aplicamos el modelo

In [71]:
y_score = rf_model.predict(XS_svd)
dfs['sent_model'] = y_score
dfs.head(20)

Unnamed: 0,pub,usr,likes,com,fec,hor,sent_model
0,20200918,Martin Bernal,0,Desde el día 27 de agosto solicité reposición ...,20/09/2020,13:20:00,n
1,20200918,Irma Treviño,0,Como localizo mi reposicion de tarjeta citiban...,20/09/2020,10:36:00,o
2,20200918,Francisco León,0,"Citibanamex, no me respetaron la promoción del...",20/09/2020,10:20:00,o
3,20200918,Manolo Ruiz Farrera,0,Muchísimas gracias citibanamex,20/09/2020,09:57:00,p
4,20200918,Iv San Roman De Proudinat,0,Mil gracias por esta platica,20/09/2020,07:28:00,p
5,20200918,Oscar Javier Aguilar,0,Necesito ayuda recibí una invitación para una ...,18/09/2020,09:52:00,n
6,20200918,Oscar Javier Aguilar,0,"Citibanamex si, me dieron el apoyo por 4 meses...",18/09/2020,10:27:00,n
7,20200918,Oscar Javier Aguilar,0,"Citibanamex Citibanamex si lo hice en linea, y...",18/09/2020,11:13:00,n
8,20200918,Oscar Javier Aguilar,0,Citibanamex necesitas mi numero de folio ?,18/09/2020,11:36:00,o
9,20200918,Oscar Javier Aguilar,0,"Citibanamex No, para darle seguimiento a la e...",18/09/2020,13:50:00,o


Elaborado por Luis Cajachahua bajo licencia MIT (2022)