# Explore here

In [171]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score , classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
total_data = pd.read_csv(url, sep= ',')  

total_data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [157]:
target = 'review'
# Variables numéricas
numericas = total_data.select_dtypes(include=['number']).columns.tolist()
print("Variables numéricas:", numericas)

# Variables no numéricas (categóricas, texto, etc.)
no_numericas = total_data.select_dtypes(exclude=['number']).columns.tolist()
print("Variables no numéricas:", no_numericas)

Variables numéricas: ['polarity']
Variables no numéricas: ['package_name', 'review']


In [158]:
total_data["review"] = total_data["review"].str.strip().str.lower()
total_data


Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offlin...,0
1,com.facebook.katana,"messenger issues ever since the last update, i...",0
2,com.facebook.katana,profile any time my wife or anybody has more t...,0
3,com.facebook.katana,the new features suck for those of us who don'...,0
4,com.facebook.katana,forced reload on uploading pic on replying com...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it ...,1
887,com.rovio.angrybirds,all time legendary game the birthday party lev...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad reviews...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoyi...,1


In [159]:
total_data.shape

(891, 3)

In [160]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [161]:
# VERIFICO SI HAY DATOS NULOS 
total_data.isnull().sum().sort_values(ascending=False)

package_name    0
review          0
polarity        0
dtype: int64

In [162]:
duplicados = total_data[total_data.duplicated(keep=False)]

print(duplicados)

                   package_name  \
212  com.supercell.clashofclans   
222  com.supercell.clashofclans   

                                                review  polarity  
212  this update sucks!!! i can't open the game any...         0  
222  this update sucks!!! i can't open the game any...         0  


In [163]:
#VARIABLES A ELIMINAR 
total_data.drop(['package_name'], axis = 1, inplace = True)
total_data.shape

(891, 2)

In [164]:
x = total_data.drop(columns=[target])
y = total_data[target]

x_train_total_data, x_test_total_data, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(x_train_total_data)
print(x_test_total_data)


     polarity
331         0
733         0
382         0
704         1
813         1
..        ...
106         0
270         0
860         1
435         0
102         0

[712 rows x 1 columns]
     polarity
709         0
439         1
840         1
720         0
39          0
..        ...
433         0
773         0
25          0
84          0
10          0

[179 rows x 1 columns]


In [165]:
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(x_train_total_data).toarray()
X_test = vec_model.transform(x_test_total_data).toarray()

text_data = total_data['review']

X_counts = vec_model.fit_transform(text_data)

print("Matriz de recuento de palabras (shape):", X_counts.shape)
print(X_counts[:5,:10].toarray())

Matriz de recuento de palabras (shape): (891, 3721)
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]


In [166]:
# Crear columna 'sentiment' aleatoria para ejemplo (positivo/negativo)
np.random.seed(42)
total_data['sentiment'] = np.random.choice(['positivo', 'negativo'], size=len(total_data))
total_data

Unnamed: 0,review,polarity,sentiment
0,privacy at least put some option appear offlin...,0,positivo
1,"messenger issues ever since the last update, i...",0,negativo
2,profile any time my wife or anybody has more t...,0,positivo
3,the new features suck for those of us who don'...,0,positivo
4,forced reload on uploading pic on replying com...,0,positivo
...,...,...,...
886,loved it i loooooooooooooovvved it because it ...,1,positivo
887,all time legendary game the birthday party lev...,1,negativo
888,ads are way to heavy listen to the bad reviews...,0,negativo
889,fun works perfectly well. ads aren't as annoyi...,1,positivo


In [167]:

# GaussianNB

total_data_GaussianNB = total_data.copy()

# Vectorizar texto
vectorizer = CountVectorizer()
X_counts_GaussianNB = vectorizer.fit_transform(total_data_GaussianNB['review'])

# Convertir matriz dispersa a densa para GaussianNB
X_dense_GaussianNB = X_counts_GaussianNB.toarray()

# Dividir datos
x_train, x_test, y_train, y_test = train_test_split(X_dense_GaussianNB, total_data_GaussianNB['sentiment'], test_size=0.2, random_state=42)

# Entrenar GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)

# Evaluar
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.5586592178770949
Reporte de clasificación:
              precision    recall  f1-score   support

    negativo       0.57      0.60      0.58        92
    positivo       0.55      0.52      0.53        87

    accuracy                           0.56       179
   macro avg       0.56      0.56      0.56       179
weighted avg       0.56      0.56      0.56       179

              precision    recall  f1-score   support

    negativo       0.57      0.60      0.58        92
    positivo       0.55      0.52      0.53        87

    accuracy                           0.56       179
   macro avg       0.56      0.56      0.56       179
weighted avg       0.56      0.56      0.56       179



In [168]:
# MultinomialNB
total_data_MultinomialNB = total_data.copy()

# Vectorizar texto
vectorizer = CountVectorizer()
X_counts_MultinomialNB = vectorizer.fit_transform(total_data_MultinomialNB['review'])

# Dividir datos
x_train, x_test, y_train, y_test = train_test_split(
    X_counts_MultinomialNB, total_data_MultinomialNB['sentiment'], test_size=0.2, random_state=42)

# Entrenar MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_pred = mnb.predict(x_test)

# Evaluar
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred))

Accuracy: 0.49162011173184356
Reporte de clasificación:
              precision    recall  f1-score   support

    negativo       0.51      0.51      0.51        92
    positivo       0.48      0.47      0.47        87

    accuracy                           0.49       179
   macro avg       0.49      0.49      0.49       179
weighted avg       0.49      0.49      0.49       179



In [169]:
# BernoulliNB

total_data_BernoulliNB = total_data.copy()

# Vectorizar texto
vectorizer = CountVectorizer(binary=True)  # BernoulliNB usa datos binarios
X_counts_BernoulliNB = vectorizer.fit_transform(total_data_BernoulliNB['review'])

# Dividir datos
x_train_BernoulliNB, x_test_BernoulliNB, y_train, y_test = train_test_split(
    X_counts_BernoulliNB, total_data_BernoulliNB['sentiment'], test_size=0.2, random_state=42)

# Entrenar BernoulliNB
bnb = BernoulliNB()
bnb.fit(x_train_BernoulliNB, y_train)
y_pred = bnb.predict(x_test_BernoulliNB)

# Evaluar
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred))

Accuracy: 0.5139664804469274
Reporte de clasificación:
              precision    recall  f1-score   support

    negativo       0.52      0.60      0.56        92
    positivo       0.50      0.43      0.46        87

    accuracy                           0.51       179
   macro avg       0.51      0.51      0.51       179
weighted avg       0.51      0.51      0.51       179



Analisis de resultados: usaremos GaussianNB que tuvo un mayor accurasy con 56% de acertividad con datos ficticios solo de entrenamiento.

In [None]:
# Hiperparametrizacion de GaussianNB

# Definir el modelo
gnb = GaussianNB()


param_grid = {'var_smoothing': np.logspace(0,-9, num=100)}

# Configurar GridSearchCV
grid_search = GridSearchCV(estimator=gnb, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Ejecutar búsqueda
grid_search.fit(x_train_dense, y_train_dense)

# Mejor parámetro y mejor score

print("Mejor accuracy en validación:", grid_search.best_score_)

# Evaluar en test con el mejor modelo
best_gnb = grid_search.best_estimator_
y_pred = best_gnb.predict(x_test_dense)
print("Accuracy en test:", accuracy_score(y_test_dense, y_pred))

Mejor accuracy en validación: 0.5280311238057717
Accuracy en test: 0.4860335195530726
