# Análisis de Sentimientos utilizando Comentarios de Facebook

## Cargar el archivo
Se extrajeron 1,569 comentarios de septiembre y octubre del año 2020 en los que se menciona a CitiBanamex.

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.corpus import stopwords
# One-time setup: uncomment on first run if the NLTK data is not installed locally.
#nltk.download('stopwords')
#nltk.download('punkt')
# Spanish stop words, kept as a set because clean_text tests membership per token.
stop_words_sp = set(stopwords.words('spanish'))

In [None]:
# Load the comments used for training; later cells read its 'com' (text)
# and 'sent' (target label) columns.
df = pd.read_csv('../data/BanamexFace.csv')
df.head()

In [None]:
# Keep a reference to the column index and display it to inspect the fields.
lisco = df.columns
lisco

In [None]:
# Number of comments per label in the 'sent' column.
df['sent'].value_counts()

#### Limpieza del texto (puntuación, tokenización y eliminación de stopwords en español)

In [None]:
def clean_text(text, stop_words=None):
    """Normalize a comment and split it into tokens for the TF-IDF analyzer.

    Characters listed in ``string.punctuation`` are deleted (not replaced by
    spaces, so ``"a.b"`` becomes ``"ab"``), the remaining characters are
    lower-cased, the result is split on runs of non-word characters, and
    stop words are discarded.

    Args:
        text: Raw comment string.
        stop_words: Optional collection of tokens to discard. Defaults to the
            module-level ``stop_words_sp`` set.

    Returns:
        List of tokens in order of appearance. An empty-string token is kept
        when the text is empty or begins/ends with a separator.
    """
    if stop_words is None:
        stop_words = stop_words_sp
    # Iterating a string yields characters, so punctuation is filtered per char.
    no_punct = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    return [token for token in tokens if token not in stop_words]

#### Creamos la Matriz Documento-Término (DTM) con puntuaciones TF-IDF

In [None]:
# clean_text is passed as the analyzer, so it handles all preprocessing and
# tokenization; fit_transform then learns the vocabulary and TF-IDF weights
# from the 'com' column and returns the document-term matrix.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['com'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older releases that lack it.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()
# Densify the sparse matrix into a DataFrame with one column per term.
X_features = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)
X_features.head()

In [None]:
# Print a summary of the feature matrix before any columns are removed.
X_features.info()

#### Removemos las columnas de términos redundantes y los que son poco frecuentes

In [None]:
# Total TF-IDF score of each term across all comments, wrapped in a DataFrame
# whose single column is labelled 0 (the cells below filter on column[0]).
column = pd.DataFrame(X_features.sum())
column.head()

In [None]:
# Terms whose total TF-IDF score exceeds 20.
column[column[0]>20].index

In [None]:
# Remove, in place, the empty-string token and the 'banamex'/'citibanamex' terms.
X_features.drop(columns=['', 'banamex', 'citibanamex'], inplace=True)

In [None]:
# Remove, in place, a hand-picked list of terms (drop raises KeyError if any is absent).
X_features.drop(columns=['alta', 'anualidaaaaaaaaaaaaad', 'cajeros', 'cobrando', 'créditos', 'liquidar', 'llega', 'mínimo', 'plática', 'presentación', 'puedan', 'quiten', 'saludos', 'tips', 'toño', 'x'], inplace=True)

In [None]:
# Terms whose total TF-IDF score (as stored in `column`) is below 3.
stop = column.loc[column[0] < 3].index.tolist()

In [None]:
# How many low-scoring terms were selected for removal.
len(stop)

In [None]:
# `stop` is derived from sums computed before the manual drops above, so it
# may name columns that are already gone; errors='ignore' skips those
# instead of raising KeyError.
X_features.drop(columns=stop, inplace=True, errors='ignore')

In [None]:
# Print the summary again to see the feature matrix after the column drops.
X_features.info()

#### Creamos un Clasificador de Comentarios utilizando RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

Particionamos las muestras

In [None]:
# Hold out 20% of the rows for evaluation. A fixed seed keeps the split, and
# therefore the reported metrics, the same from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    df['sent'],
    test_size=0.2,
    random_state=42,
)

Entrenamos el modelo y generamos las predicciones sobre la muestra de evaluación

In [None]:
# Seed the forest so repeated runs build the same trees and give the same predictions.
rf = RandomForestClassifier(random_state=42)
# fit() returns the estimator itself, so rf_model and rf are the same object.
rf_model = rf.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

#### Métricas de Diagnóstico

In [None]:
# Confusion matrix for the held-out set: true labels first, predictions second.
print('Matriz de Confusión:')
confusion_matrix(y_test, y_pred)

In [None]:
# Report accuracy plus support-weighted precision and recall as percentages.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % accuracy_pct)
precision_pct = precision_score(y_test, y_pred, average='weighted') * 100
print('Precision: %.2f%%' % precision_pct)
recall_pct = recall_score(y_test, y_pred, average='weighted') * 100
print('Recall: %.2f%%' % recall_pct)

#### También podemos probar con un modelo Naive Bayes Gaussiano

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Train a Gaussian Naive Bayes model on the same split as the random forest.
nb = GaussianNB(priors=None)
nb_model = nb.fit(X_train, y_train)
# y_pred is overwritten, so the metric cells that follow evaluate this model.
y_pred = nb_model.predict(X_test)

In [None]:
# Confusion matrix for the held-out set: true labels first, predictions second.
print('Matriz de Confusión:')
confusion_matrix(y_test, y_pred)

In [None]:
# Report accuracy plus support-weighted precision and recall as percentages.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % accuracy_pct)
precision_pct = precision_score(y_test, y_pred, average='weighted') * 100
print('Precision: %.2f%%' % precision_pct)
recall_pct = recall_score(y_test, y_pred, average='weighted') * 100
print('Recall: %.2f%%' % recall_pct)

#### Ahora podemos implementar el mejor modelo, utilizando el otro conjunto de datos

In [None]:
# Load the second set of comments to be scored; its 'com' column is vectorized below.
dfs = pd.read_csv('../data/BanamexFaceNew.csv')
dfs.head()

Preparar los datos para el scoring

In [None]:
# Reuse the vectorizer fitted on the training comments: transform() keeps the
# vocabulary and IDF weights the classifier was trained with, whereas fitting
# a new vectorizer here would weight terms differently and could omit terms
# the model expects.
S_tfidf = tfidf_vect.transform(dfs['com'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older releases that lack it.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    score_feature_names = tfidf_vect.get_feature_names_out()
else:
    score_feature_names = tfidf_vect.get_feature_names()
S_features = pd.DataFrame(S_tfidf.toarray(), columns=score_feature_names)
# Align to the training columns in the training order; a term missing from
# the scoring matrix gets a zero score instead of raising KeyError.
S_features = S_features.reindex(columns=X_features.columns, fill_value=0.0)
S_features.info()

Aplicamos el modelo

In [None]:
# Score the new comments with the random forest and store the predicted
# label next to each comment in a new 'sent_model' column.
y_score = rf_model.predict(S_features)
dfs['sent_model'] = y_score
dfs.head(20)

Elaborado por Luis Cajachahua bajo licencia MIT (2022)