# Explore here

In [3]:
# Your code here
import pandas as pd

total_data = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv")
total_data.head(5)

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [4]:
total_data = total_data.drop(columns = ["package_name"], axis = 1)

In [5]:
print("Total missing values for each column:")
print(total_data.isnull().sum(axis=0))

Total missing values for each column:
review      0
polarity    0
dtype: int64


In [6]:
print(f"Dimensions before dropping duplicates: {total_data.shape}")

total_data = total_data.drop_duplicates()

print(f"Dimensions after dropping duplicates: {total_data.shape}")

Dimensions before dropping duplicates: (891, 2)
Dimensions after dropping duplicates: (891, 2)


Removing spaces and converting the text to lowercase:

In [7]:
total_data["review"] = total_data["review"].str.strip().str.lower()

Divide the dataset into train and test:

In [8]:
from sklearn.model_selection import train_test_split

X = total_data["review"]
y = total_data["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

Transform the text into a word count matrix

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

### Model Implementation (experimental)

strong speculation this model is best suited for a Multinomial distribution as the features seem to be discrete, we will test the accuracy score of each

In [10]:
# initiating models

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

g_nb = GaussianNB()
g_nb.fit(X_train, y_train)

m_nb = MultinomialNB()
m_nb.fit(X_train, y_train)

b_nb = BernoulliNB()
b_nb.fit(X_train, y_train)


Acuuracy predictions on Base setting Models

In [11]:
g_y_pred = g_nb.predict(X_test)

m_y_pred = m_nb.predict(X_test)

b_y_pred = b_nb.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score

g_y_accuracy = accuracy_score(y_test, g_y_pred)
print(f'Gaussian Accuracy: {g_y_accuracy}')

m_y_accuracy = accuracy_score(y_test, m_y_pred)
print(f'Multinomial Accuracy: {m_y_accuracy}')

b_y_accuracy = accuracy_score(y_test, b_y_pred)
print(f'Bernoulli Accuracy: {b_y_accuracy}')

Gaussian Accuracy: 0.8212290502793296
Multinomial Accuracy: 0.8212290502793296
Bernoulli Accuracy: 0.7653631284916201


In [13]:
# multinomial vs gaussian percentage accuracy

print((m_y_accuracy-g_y_accuracy)* 100)

0.0


In [17]:
# confirming data type

print(X_train.dtype)

print("int64, are indeed discrete, as they represent exact, countable values without any intermediate values between them")

int64
int64, are indeed discrete, as they represent exact, countable values without any intermediate values between them


We can see that the Multinomial and Gaussian models are equally compatible with this data set,
we will still opt for Multinomial as it specifically works with discrete data

### Gaussian Hyperparameter Tuning

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}
g_nb = GaussianNB()

grid_search = GridSearchCV(estimator=g_nb, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'var_smoothing': 1e-09}


note: we learn, after documentation from the sklearn files, that the best var_smoothing settings are in fact the default, therefore, we accept the model

In [19]:
print(f'Gaussian Accuracy: {g_y_accuracy}')

Gaussian Accuracy: 0.8212290502793296


Step 6: Explore other alternatives

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Gradient Boosting
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print("Gradient Boosting Accuracy:", gb_accuracy)

# Logistic Regression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy:", lr_accuracy)

Gradient Boosting Accuracy: 0.7932960893854749
Logistic Regression Accuracy: 0.8547486033519553
