# Explore here

In [113]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score , classification_report
from sklearn.model_selection import GridSearchCV

In [114]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
total_data = pd.read_csv(url, sep= ',')  

total_data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [115]:
target = 'review'
# Variables numéricas
numericas = total_data.select_dtypes(include=['number']).columns.tolist()
print("Variables numéricas:", numericas)

# Variables no numéricas (categóricas, texto, etc.)
no_numericas = total_data.select_dtypes(exclude=['number']).columns.tolist()
print("Variables no numéricas:", no_numericas)

Variables numéricas: ['polarity']
Variables no numéricas: ['package_name', 'review']


In [116]:
total_data["review"] = total_data["review"].str.strip().str.lower()
total_data


Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offlin...,0
1,com.facebook.katana,"messenger issues ever since the last update, i...",0
2,com.facebook.katana,profile any time my wife or anybody has more t...,0
3,com.facebook.katana,the new features suck for those of us who don'...,0
4,com.facebook.katana,forced reload on uploading pic on replying com...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it ...,1
887,com.rovio.angrybirds,all time legendary game the birthday party lev...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad reviews...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoyi...,1


In [117]:
total_data.shape

(891, 3)

In [118]:
total_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [119]:
# VERIFICO SI HAY DATOS NULOS 
total_data.isnull().sum().sort_values(ascending=False)

package_name    0
review          0
polarity        0
dtype: int64

In [120]:
duplicados = total_data[total_data.duplicated(keep=False)]

print(duplicados)

                   package_name  \
212  com.supercell.clashofclans   
222  com.supercell.clashofclans   

                                                review  polarity  
212  this update sucks!!! i can't open the game any...         0  
222  this update sucks!!! i can't open the game any...         0  


In [121]:
total_data = total_data.drop_duplicates()
print(len(total_data))

890


In [122]:
#VARIABLES A ELIMINAR 
total_data.drop(['package_name'], axis = 1, inplace = True)
total_data.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_data.drop(['package_name'], axis = 1, inplace = True)


(890, 2)

In [123]:
x = total_data.drop(columns=[target])
y = total_data[target]

x_train_total_data, x_test_total_data, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(x_train_total_data)
print(x_test_total_data)


     polarity
709         0
240         0
382         0
793         1
673         1
..        ...
106         0
271         0
861         0
436         0
102         0

[712 rows x 1 columns]
     polarity
281         0
435         0
39          0
418         0
586         1
..        ...
433         0
773         0
25          0
84          0
10          0

[178 rows x 1 columns]


In [124]:
vec_model = CountVectorizer(stop_words = "english" , max_features=10000, lowercase=True, token_pattern=r'\b\w+\b' ) 
X_train = vec_model.fit_transform(x_train_total_data).toarray()
X_test = vec_model.transform(x_test_total_data).toarray()

text_data = total_data['review']

X_counts = vec_model.fit_transform(text_data)

print("Matriz de recuento de palabras (shape):", X_counts.shape)
print(X_counts[:5,:10].toarray())

Matriz de recuento de palabras (shape): (890, 3768)
[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]


In [125]:
# ESTUDIO DE MODELOS 

# Vectorizar texto
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(total_data['review'])

# Dividir datos
x_train, x_test, y_train, y_test = train_test_split(X_counts, total_data['polarity'], test_size=0.2, random_state=42)

# MultinomialNB
mnb = MultinomialNB() # stratify=y balancea clases
mnb.fit(x_train, y_train)
y_pred_mnb = mnb.predict(x_test)
acc_mnb = accuracy_score(y_test, y_pred_mnb)

# BernoulliNB (usa datos binarios)
vectorizer_bin = CountVectorizer(binary=True)
X_counts_bin = vectorizer_bin.fit_transform(total_data['review'])
x_train_bin, x_test_bin, y_train_bin, y_test_bin = train_test_split(X_counts_bin, total_data['polarity'], test_size=0.2, random_state=42)
bnb = BernoulliNB()
bnb.fit(x_train_bin, y_train_bin)
y_pred_bnb = bnb.predict(x_test_bin)
acc_bnb = accuracy_score(y_test_bin, y_pred_bnb)

# GaussianNB (requiere matriz densa)
X_dense = X_counts.toarray()  # convertir a matriz densa
x_train_dense, x_test_dense, y_train_dense, y_test_dense = train_test_split(X_dense, total_data['polarity'], test_size=0.2, random_state=42)
gnb = GaussianNB()
gnb.fit(x_train_dense, y_train_dense)
y_pred_gnb = gnb.predict(x_test_dense)
acc_gnb = accuracy_score(y_test_dense, y_pred_gnb)

print("Accuracy MultinomialNB:", acc_mnb)
print("Accuracy BernoulliNB:", acc_bnb)
print("Accuracy GaussianNB:", acc_gnb)

Accuracy MultinomialNB: 0.8314606741573034
Accuracy BernoulliNB: 0.8426966292134831
Accuracy GaussianNB: 0.8033707865168539


Analisis de resultados: usaremos BernoulliNB que tuvo un mayor accurasy con 84,2% 

In [130]:
# Modelo y grid de hiperparámetros MultinomialNB
mnb = MultinomialNB()
param_grid = {
    'alpha': np.logspace(-3, 1, 20),   # suavizado Laplace
    'fit_prior': [True, False]         # usar o no priors
}

grid = GridSearchCV(
    estimator=mnb,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid.fit(x_train, y_train)# Entrenar 

# Resultados
print("Mejor accuracy en validación (CV):", grid.best_score_)

# Evaluación en test
best_mnb = grid.best_estimator_
y_pred = best_mnb.predict(x_test)
print("Accuracy en test:", accuracy_score(y_test, y_pred))
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred))

Mejor accuracy en validación (CV): 0.8201910765291046
Accuracy en test: 0.848314606741573
Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89       128
           1       0.72      0.76      0.74        50

    accuracy                           0.85       178
   macro avg       0.81      0.82      0.82       178
weighted avg       0.85      0.85      0.85       178



Analisis de resultado: El Modelo mejoro a un 85% de acertividad